In [1]:
import numpy as np
import json
from sklearn.datasets import load_breast_cancer, load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

class GeneralizedSocialDistanceKNN:
    def __init__(self, n_neighbors=3, k=2, X=None, y=None, file_name="accuracy_result.json"):
        self.n_neighbors = n_neighbors
        self.k = k
        self.knn = None
        self.file_name = file_name  # File name to save results

        # Use passed dataset or default to Iris dataset
        if X is None or y is None:
            data = load_iris()  # Load the iris dataset
            self.X = data.data
            self.y = data.target
        else:
            self.X = X  # Features
            self.y = y  # Labels

        # Split dataset into training and testing sets
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(self.X, self.y, test_size=0.1, random_state=42)

        # Standardize the dataset
        self.scaler = StandardScaler()
        self.X_train = self.scaler.fit_transform(self.X_train)
        self.X_test = self.scaler.transform(self.X_test)

    # Generalized social distance function
    def generalized_social_distance(self, x, y, data_set=None):
        def euclidean_distance(p1, p2):
            return np.sqrt(np.sum((p1 - p2) ** 2))

        distances_x = [euclidean_distance(x, point) for point in data_set]
        distances_y = [euclidean_distance(y, point) for point in data_set]

        mx_y = sum(1 for dx in distances_x if 0 < dx < euclidean_distance(x, y))
        mx_eq = sum(1 for dx in distances_x if dx == euclidean_distance(x, y))
        my_x = sum(1 for dy in distances_y if 0 < dy < euclidean_distance(x, y))
        my_eq = sum(1 for dy in distances_y if dy == euclidean_distance(x, y))

        if mx_y + mx_eq == 0 or my_x + my_eq == 0:
            return float('inf')

        lk_distance = (mx_y**self.k + mx_eq**self.k) / (mx_y + mx_eq) + (my_x**self.k + my_eq**self.k) / (my_x + my_eq)
        generalized_social_distance = lk_distance / (1 + lk_distance)

        return generalized_social_distance

    # Wrapper for the KNN distance metric
    def generalized_social_distance_wrapper(self, x, y):
        return self.generalized_social_distance(x, y, data_set=self.X_train)

    # Train the KNN model
    def train_knn(self):
        self.knn = KNeighborsClassifier(n_neighbors=self.n_neighbors, metric=self.generalized_social_distance_wrapper,n_jobs=-1)
        self.knn.fit(self.X_train, self.y_train)

    # Predict and evaluate the model
    def evaluate(self):
        y_pred_knn = self.knn.predict(self.X_test)
        knn_accuracy = accuracy_score(self.y_test, y_pred_knn)
        return knn_accuracy

    # Save accuracy to the specified JSON file
    def save_accuracy_to_json(self):
        accuracy = self.evaluate()

        try:
            # Read the existing data in the JSON file
            with open(self.file_name, 'r') as file:
                results = json.load(file)
        except FileNotFoundError:
            # If file doesn't exist, create an empty dictionary
            results = {}

        # Add the current accuracy for the specific `n_neighbors` value under the dataset key (using the file name as the key)
        dataset_key = self.file_name.split('.')[0]  # Extract dataset name from file name
        if dataset_key not in results:
            results[dataset_key] = {}

        results[dataset_key][str(self.n_neighbors)] = accuracy

        # Write the updated results back to the JSON file
        with open(self.file_name, 'w') as file:
            json.dump(results, file, indent=4)

# Usage Example: Run KNN with n_neighbors from 1 to 15 and save accuracies to a custom JSON file


In [2]:
import pandas as pd
from ucimlrepo import fetch_ucirepo

def fetch_and_prepare_data(dataset_id):
    # Fetch dataset
    dataset = fetch_ucirepo(id=dataset_id)
    
    # Extract features and targets
    X = dataset.data.features
    y = dataset.data.targets.squeeze()
    
    # Ensure X is a DataFrame
    if not isinstance(X, pd.DataFrame):
        X = pd.DataFrame(X)
    
    # Convert categorical columns to numerical
    for column in X.columns:
        if X[column].dtype == 'object':
            X = pd.get_dummies(X, columns=[column], drop_first=True)
    
    return X, y


In [3]:
def run_knn(n_neighbors, X, y, file_name):
    print(f"Running KNN with n_neighbors={n_neighbors}")
    model = GeneralizedSocialDistanceKNN(n_neighbors=n_neighbors, X=X, y=y, file_name=file_name)
    model.train_knn()
    model.save_accuracy_to_json()


In [4]:
from joblib import Parallel, delayed

def parallel_knn(dataset_id, file_name, n_jobs=-1):
    # Fetch and prepare data
    X, y = fetch_and_prepare_data(dataset_id)
    
    # Run KNN in parallel
    results = Parallel(n_jobs=n_jobs)(
        delayed(run_knn)(k, X, y, file_name) for k in range(1, 16)
    )


In [None]:
%%time
parallel_knn(dataset_id=176, file_name="blood_transfusion.json")


In [18]:
%%time
for k in range(1, 16):
    print(f"Running KNN with n_neighbors={k}")
    model = GeneralizedSocialDistanceKNN(n_neighbors=k, file_name="iris.json")
    model.train_knn()
    model.save_accuracy_to_json()


Running KNN with n_neighbors=1
Running KNN with n_neighbors=2
Running KNN with n_neighbors=3
Running KNN with n_neighbors=4
Running KNN with n_neighbors=5
Running KNN with n_neighbors=6
Running KNN with n_neighbors=7
Running KNN with n_neighbors=8
Running KNN with n_neighbors=9
Running KNN with n_neighbors=10
Running KNN with n_neighbors=11
Running KNN with n_neighbors=12
Running KNN with n_neighbors=13
Running KNN with n_neighbors=14
Running KNN with n_neighbors=15
CPU times: user 1min 13s, sys: 970 ms, total: 1min 14s
Wall time: 1min 12s


In [8]:
%%time
from joblib import Parallel, delayed
import json

# Assuming GeneralizedSocialDistanceKNN is defined elsewhere

# Time the execution of the KNN models


# Function to train the model and save accuracy
def run_knn(n_neighbors):
    print(f"Running KNN with n_neighbors={n_neighbors}")
    model = GeneralizedSocialDistanceKNN(n_neighbors=n_neighbors, file_name="iris.json")
    model.train_knn()
    model.save_accuracy_to_json()
 

# Use Parallel for parallel processing
results = Parallel(n_jobs=-1)(delayed(run_knn)(k) for k in range(1, 16))

# # Optionally, save results to a JSON file after the loop
# with open("knn_results.json", "w") as f:
#     json.dump(results, f)

        # Optionally, you can append results to a list if you want to keep track of them
       



JSONDecodeError: Expecting value: line 1 column 1 (char 0)

In [17]:
%%time

from ucimlrepo import fetch_ucirepo 
  
# fetch dataset 
ecoli = fetch_ucirepo(id=39) 
  
# data (as pandas dataframes) 
X = ecoli.data.features 
y = np.squeeze(ecoli.data.targets)

for k in range(1, 16):
    print(f"Running KNN with n_neighbors={k}")
    model = GeneralizedSocialDistanceKNN(n_neighbors=k,  X=X, y=y,file_name="ecoli.json")
    model.train_knn()
    model.save_accuracy_to_json()

Running KNN with n_neighbors=1
Running KNN with n_neighbors=2
Running KNN with n_neighbors=3
Running KNN with n_neighbors=4
Running KNN with n_neighbors=5
Running KNN with n_neighbors=6
Running KNN with n_neighbors=7
Running KNN with n_neighbors=8
Running KNN with n_neighbors=9
Running KNN with n_neighbors=10
Running KNN with n_neighbors=11
Running KNN with n_neighbors=12
Running KNN with n_neighbors=13
Running KNN with n_neighbors=14
Running KNN with n_neighbors=15


In [10]:
%%time
import numpy as np
from joblib import Parallel, delayed
from ucimlrepo import fetch_ucirepo

# Assuming GeneralizedSocialDistanceKNN is defined elsewhere

# Fetch dataset
ecoli = fetch_ucirepo(id=39)

# Data (as pandas dataframes)
X = ecoli.data.features
y = np.squeeze(ecoli.data.targets)



# Function to train the model and save accuracy
def run_knn(n_neighbors):
    print(f"Running KNN with n_neighbors={n_neighbors}")
    model = GeneralizedSocialDistanceKNN(n_neighbors=n_neighbors, X=X, y=y, file_name="ecoli.json")
    model.train_knn()
    model.save_accuracy_to_json()

# Use Parallel for parallel processing
results = Parallel(n_jobs=-1)(delayed(run_knn)(k) for k in range(1, 16))


CPU times: user 244 ms, sys: 272 ms, total: 516 ms
Wall time: 2min 14s


In [15]:
%%time
import numpy as np
import pandas as pd
from joblib import Parallel, delayed
from ucimlrepo import fetch_ucirepo

# Fetch dataset
breast_cancer = fetch_ucirepo(id=14)

# Data (as pandas dataframes)
X = breast_cancer.data.features
y = np.squeeze(breast_cancer.data.targets)

# Convert to DataFrame if it's not already one
if not isinstance(X, pd.DataFrame):
    X = pd.DataFrame(X)

# Print data types to identify any non-numeric columns (optional)
# print(X.dtypes)

# Convert categorical columns to numerical using one-hot encoding
for column in X.columns:
    if X[column].dtype == 'object':  # Check if the column is categorical
        X = pd.get_dummies(X, columns=[column], drop_first=True)

# Time the execution of the KNN models


# Function to train the model and save accuracy
def run_knn(n_neighbors):
    print(f"Running KNN with n_neighbors={n_neighbors}")
    model = GeneralizedSocialDistanceKNN(n_neighbors=n_neighbors, X=X, y=y, file_name="breast_cancer.json")
    model.train_knn()
    model.save_accuracy_to_json()

# Use Parallel for parallel processing
results = Parallel(n_jobs=-1)(delayed(run_knn)(k) for k in range(1, 16))


CPU times: user 208 ms, sys: 240 ms, total: 448 ms
Wall time: 1min 4s


In [None]:
%%time
from ucimlrepo import fetch_ucirepo 
  
# fetch dataset 
blood_transfusion_service_center = fetch_ucirepo(id=176) 
  
# data (as pandas dataframes) 
X = blood_transfusion_service_center.data.features 
y = np.squeeze(blood_transfusion_service_center.data.targets)


for k in range(1, 16):
    print(f"Running KNN with n_neighbors={k}")
    model = GeneralizedSocialDistanceKNN(n_neighbors=k,  X=X, y=y,file_name="blood_transfusion.json")
    model.train_knn()
    model.save_accuracy_to_json()

Running KNN with n_neighbors=1
Running KNN with n_neighbors=2


KeyboardInterrupt: 

In [None]:
%%time
from joblib import Parallel, delayed
import numpy as np
import pandas as pd
from ucimlrepo import fetch_ucirepo

# Fetch dataset
blood_transfusion_service_center = fetch_ucirepo(id=176)

# Data (as pandas dataframes)
X = blood_transfusion_service_center.data.features
y = np.squeeze(blood_transfusion_service_center.data.targets)

# Convert to DataFrame if it's not already one
if not isinstance(X, pd.DataFrame):
    X = pd.DataFrame(X)

# Convert categorical columns to numerical using one-hot encoding
for column in X.columns:
    if X[column].dtype == 'object':  # Check if the column is categorical
        X = pd.get_dummies(X, columns=[column], drop_first=True)

# Function to train the model and save accuracy
def run_knn(n_neighbors):
    print(f"Running KNN with n_neighbors={n_neighbors}")
    model = GeneralizedSocialDistanceKNN(n_neighbors=n_neighbors, X=X, y=y, file_name="blood_transfusion.json")
    model.train_knn()
    model.save_accuracy_to_json()

# Use Parallel for parallel processing
results = Parallel(n_jobs=-1)(delayed(run_knn)(k) for k in range(1, 16))


Running KNN with n_neighbors=7
Running KNN with n_neighbors=6
Running KNN with n_neighbors=10
Running KNN with n_neighbors=11
Running KNN with n_neighbors=8


Mixed data set

In [None]:
from ucimlrepo import fetch_ucirepo 
  
# fetch dataset 
acute_inflammations = fetch_ucirepo(id=184) 
  
# data (as pandas dataframes) 
X = acute_inflammations.data.features 
y = acute_inflammations.data.targets 
  

for k in range(1, 16):
    print(f"Running KNN with n_neighbors={k}")
    model = GeneralizedSocialDistanceKNN(n_neighbors=k,  X=X, y=y,file_name="acute_inflammation.json")
    model.train_knn()
    model.save_accuracy_to_json()

In [None]:
%%time
from ucimlrepo import fetch_ucirepo 
  
# fetch dataset 
contraceptive_method_choice = fetch_ucirepo(id=30) 
  
# data (as pandas dataframes) 
X = contraceptive_method_choice.data.features 
y = contraceptive_method_choice.data.targets 



for k in range(1, 16):
    print(f"Running KNN with n_neighbors={k}")
    model = GeneralizedSocialDistanceKNN(n_neighbors=k,  X=X, y=y,file_name="contraceptive_method_choice.json")
    model.train_knn()
    model.save_accuracy_to_json()

In [None]:
%%time
from ucimlrepo import fetch_ucirepo 
  
# fetch dataset 
liver_disorders = fetch_ucirepo(id=60) 
  
# data (as pandas dataframes) 
X = liver_disorders.data.features 
y = liver_disorders.data.targets 
