In [1]:
# Import necessary libraries
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.neighbors import KNeighborsClassifier


Load and Preprocess the Data

In [2]:
# Load the breast cancer dataset and separate features from labels
data = load_breast_cancer()
X, y = data.data, data.target  # y: 0 = malignant, 1 = benign

# Hold out 20% of the samples for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit the scaler on the training split only, then apply that same scaling to both splits
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)


Define the Generalized Social Distance Function

In [7]:
# Define the generalized social distance function
def generalized_social_distance(x, y, data_set=None, k=2):
    """Rank-based dissimilarity between ``x`` and ``y`` relative to ``data_set``.

    Let ``d`` be the Euclidean distance between ``x`` and ``y``. Four counts
    are taken over the rows of ``data_set``:

    * ``mx_y``  -- rows whose distance to ``x`` is strictly between 0 and ``d``
    * ``mx_eq`` -- rows whose distance to ``x`` is exactly ``d``
    * ``my_x``  -- rows whose distance to ``y`` is strictly between 0 and ``d``
    * ``my_eq`` -- rows whose distance to ``y`` is exactly ``d``

    They are combined into::

        L = (mx_y**k + mx_eq**k) / (mx_y + mx_eq)
          + (my_x**k + my_eq**k) / (my_x + my_eq)

    and the function returns ``L / (1 + L)``.

    Parameters
    ----------
    x, y : array-like
        The two feature vectors to compare.
    data_set : array-like
        Reference points, one per row. Required despite the ``None`` default,
        which is kept only so existing keyword-style calls keep working.
    k : int or float, default 2
        Exponent applied to each count.

    Returns
    -------
    float
        ``L / (1 + L)``, or ``float('inf')`` when ``mx_y + mx_eq`` or
        ``my_x + my_eq`` is zero (no reference row falls inside the relevant
        radius).
        NOTE(review): a point that is not itself a row of ``data_set`` (for
        example a held-out sample) can hit the ``inf`` branch; confirm that
        downstream consumers accept non-finite distances.

    Raises
    ------
    TypeError
        If ``data_set`` is ``None``.
    """
    if data_set is None:
        raise TypeError(
            "generalized_social_distance() requires data_set: the reference "
            "points used to rank x and y against each other"
        )

    x = np.asarray(x)
    y = np.asarray(y)
    # C-contiguous rows so every row is summed in the same order as the
    # single x-y distance below; the exact-equality tie counts rely on that.
    points = np.ascontiguousarray(data_set)
    if points.ndim == 1:
        # Scalar points (or an empty list): treat each entry as one row.
        points = points[:, np.newaxis]

    def distances_from(center, rows):
        # Euclidean distance from `center` to each row of `rows`.
        return np.sqrt(np.sum((rows - center) ** 2, axis=1))

    # Computed once, through the same code path as the per-row distances.
    d_xy = distances_from(x, np.atleast_2d(y))[0]
    dist_to_x = distances_from(x, points)
    dist_to_y = distances_from(y, points)

    # Rank counts; the `> 0` bound excludes rows that coincide with the anchor.
    mx_y = int(np.count_nonzero((dist_to_x > 0) & (dist_to_x < d_xy)))
    mx_eq = int(np.count_nonzero(dist_to_x == d_xy))
    my_x = int(np.count_nonzero((dist_to_y > 0) & (dist_to_y < d_xy)))
    my_eq = int(np.count_nonzero(dist_to_y == d_xy))

    # Either denominator being zero would make L undefined.
    if mx_y + mx_eq == 0 or my_x + my_eq == 0:
        return float('inf')

    # Calculate Lk distance
    lk_distance = (mx_y**k + mx_eq**k) / (mx_y + mx_eq) + (my_x**k + my_eq**k) / (my_x + my_eq)

    # Map L to L / (1 + L)
    return lk_distance / (1 + lk_distance)


Compute the Distance Matrix

In [None]:
# Precompute the generalized social distance matrix
def compute_distance_matrix(X_train):
    """Return the symmetric pairwise generalized-social-distance matrix.

    Entry ``[i, j]`` is ``generalized_social_distance(X_train[i], X_train[j],
    data_set=X_train)``. Only pairs with ``i < j`` are evaluated; each value
    is mirrored to ``[j, i]`` and the diagonal stays at zero.
    """
    sample_count = X_train.shape[0]
    pairwise = np.zeros((sample_count, sample_count))

    # Strict upper triangle, visited in row-major order
    upper_rows, upper_cols = np.triu_indices(sample_count, k=1)
    for row, col in zip(upper_rows, upper_cols):
        pairwise[row, col] = pairwise[col, row] = generalized_social_distance(
            X_train[row], X_train[col], data_set=X_train
        )

    return pairwise

# Compute the distance matrix for X_train.
# Every unordered pair of training rows triggers one generalized_social_distance
# call, and each call scans all of X_train again, so the work grows roughly
# cubically with the number of training rows.
distance_matrix_train = compute_distance_matrix(X_train)


Use Precomputed Distance Matrix with scikit-learn's KNN

In [None]:
# Function to evaluate KNN with precomputed distances for different n_neighbors
def _test_to_train_distances():
    """Return the (n_test, n_train) generalized-social-distance matrix.

    The matrix does not depend on ``n_neighbors``, so it is computed once and
    reused for as long as the module-level ``X_test`` and ``X_train`` are the
    same objects. Rebinding either name (e.g. re-running the split cell)
    triggers a recomputation on the next call.
    """
    cached = getattr(_test_to_train_distances, "_cache", None)
    if cached is not None and cached[0] is X_test and cached[1] is X_train:
        return cached[2]

    n_test, n_train = X_test.shape[0], X_train.shape[0]
    distances = np.zeros((n_test, n_train))
    for i in range(n_test):
        for j in range(n_train):
            # NOTE(review): generalized_social_distance can return inf when a
            # rank count is empty; presumably scikit-learn rejects non-finite
            # precomputed distances at predict time -- confirm.
            distances[i, j] = generalized_social_distance(X_test[i], X_train[j], data_set=X_train)

    # Hold references to the inputs so the identity check above stays valid.
    _test_to_train_distances._cache = (X_test, X_train, distances)
    return distances


def evaluate_knn_with_precomputed(n_neighbors):
    """Fit a precomputed-metric KNN and print its test-set metrics.

    The classifier is fitted on the module-level ``distance_matrix_train`` and
    ``y_train``, then predicts from the test-to-train distance matrix. Accuracy,
    the confusion matrix and the classification report against ``y_test`` are
    printed.

    Parameters
    ----------
    n_neighbors : int
        Number of neighbours passed to ``KNeighborsClassifier``.

    Returns
    -------
    None
    """
    # Initialize scikit-learn's KNeighborsClassifier with precomputed distances
    knn = KNeighborsClassifier(n_neighbors=n_neighbors, metric='precomputed')

    # Fit the model (using the precomputed distance matrix)
    knn.fit(distance_matrix_train, y_train)

    # Test-to-train distances are shared across calls instead of being rebuilt
    distance_matrix_test = _test_to_train_distances()

    # Make predictions
    y_pred = knn.predict(distance_matrix_test)

    # Evaluate the model
    accuracy = accuracy_score(y_test, y_pred)
    confusion = confusion_matrix(y_test, y_pred)
    report = classification_report(y_test, y_pred)

    # Print the results
    print(f"KNN Accuracy with n_neighbors={n_neighbors}: {accuracy}")
    print("Confusion Matrix:")
    print(confusion)
    print("Classification Report:")
    print(report)


Cell 6: Fit the Model and Make Predictions

# Use the precomputed distance matrix for training
# NOTE(review): PrecomputedDistanceKNN is neither imported nor defined in the
# visible notebook; unless it is provided elsewhere, this line raises NameError
# on a fresh kernel. Its constructor/fit/predict contract cannot be checked here.
knn = PrecomputedDistanceKNN(n_neighbors=3, distance_matrix=distance_matrix_train)

# Fit the model using the precomputed distance matrix
# NOTE(review): fit receives the raw feature matrix X_train, not
# distance_matrix_train -- confirm that is what the class expects.
knn.fit(X_train, y_train)

# Make predictions
# NOTE(review): predict likewise receives raw X_test features; y_pred_knn is
# not read by any other visible cell.
y_pred_knn = knn.predict(X_test)


Cell 7: Evaluate the Model

In [None]:
# Evaluate several neighbourhood sizes, each fitted on the same precomputed
# training distance matrix (distance_matrix_train)
for neighbor_count in (3, 5, 7):
    evaluate_knn_with_precomputed(n_neighbors=neighbor_count)
