In [1]:
import inspect
import json
import os
import sys

import numpy as np
from sklearn.metrics import (
    balanced_accuracy_score,
    cohen_kappa_score,
    f1_score,
    make_scorer,
    roc_auc_score,
)
from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV
from sklearn.neighbors import KNeighborsClassifier

In [2]:
# Make the directory two levels above the current working directory importable,
# so the shared `metrics` helper module that lives there can be found.
grandparent_dir = os.path.abspath(os.path.join(os.getcwd(), os.pardir, os.pardir))
# Keep the entry at the front of sys.path (highest import priority) without
# stacking a duplicate every time this cell is re-run.
if sys.path[:1] != [grandparent_dir]:
    sys.path.insert(0, grandparent_dir)

# NOTE(review): the star import hides which helpers later cells take from
# `metrics`; an explicit name list would be safer once the full set is known.
try:
    from metrics import *
    print("Module 'metrics' imported successfully.")
except ModuleNotFoundError as e:
    print(f"Error: {e}")
    print("Ensure that 'metrics.py' is located in the grandparent directory.")
    # Stop here: later cells depend on names from `metrics`, and continuing
    # would only surface as a NameError far away from the real cause.
    raise

Module 'metrics' imported successfully.


In [2]:
# Read the feature array and the label array stored two directories up.
X, y = (np.load(array_path) for array_path in ('../../features.npy', '../../labels.npy'))

# Cell output: the shapes of both arrays, as a quick sanity check.
(X.shape, y.shape)

((8724, 2048), (8724,))

In [3]:
# Tally how many samples carry each distinct label value.
label_values, label_counts = np.unique(y, return_counts=True)
print('Number of samples in each class:')
print((label_values, label_counts))

# Display names used for per-class metric keys and confusion-matrix plots.
# NOTE(review): presumably position i names label value i - confirm the encoding.
class_names = ["Few", "Many", "None"]
print('Class names:')
print(class_names)

Number of samples in each class:
(array([0, 1, 2]), array([6232,  256, 2236]))
Class names:
['Few', 'Many', 'None']


In [4]:
# Ensure the output folders for logs and figures exist; existing ones are kept.
for _output_dir in ('logs', 'figures'):
    os.makedirs(_output_dir, exist_ok=True)

In [5]:
def custom_scorer(y_true, y_pred):
    """Blend four classification metrics into a single model-selection score.

    Parameters
    ----------
    y_true : array-like
        Ground-truth class labels.
    y_pred : numpy.ndarray
        Either a 2-D array with one column of scores per class (what the
        probability-based scorer below passes for a multiclass problem), or a
        1-D array that is used unchanged as the predicted labels.

    Returns
    -------
    float
        0.3 * macro one-vs-one ROC AUC + 0.3 * macro F1
        + 0.2 * Cohen's kappa + 0.2 * balanced accuracy.

    Notes
    -----
    The AUC term is always computed from ``y_pred`` as given, so a 1-D input
    only works where ``roc_auc_score`` accepts 1-D scores (binary targets).
    """
    # The arg-max column index is used directly as the predicted label, which
    # assumes the classes are encoded as 0..k-1 in column order.
    if y_pred.ndim == 2:
        y_pred_class = np.argmax(y_pred, axis=1)
    else:
        y_pred_class = y_pred

    # AUC needs the raw scores; the other three metrics need hard labels.
    auc = roc_auc_score(y_true, y_pred, average='macro', multi_class='ovo')
    f1 = f1_score(y_true, y_pred_class, average='macro')
    kappa = cohen_kappa_score(y_true, y_pred_class)
    balanced_acc = balanced_accuracy_score(y_true, y_pred_class)

    return 0.3 * auc + 0.3 * f1 + 0.2 * kappa + 0.2 * balanced_acc

# Newer scikit-learn releases replace `needs_proba` with `response_method`.
# Once `needs_proba` is gone it would be forwarded to `custom_scorer` as an
# unexpected keyword, so choose the spelling the installed version supports.
if 'response_method' in inspect.signature(make_scorer).parameters:
    custom_score = make_scorer(custom_scorer, greater_is_better=True, response_method='predict_proba')
else:
    custom_score = make_scorer(custom_scorer, greater_is_better=True, needs_proba=True)



In [6]:
# Candidate KNN hyperparameter values the randomized search samples from.
param_grid = dict(
    n_neighbors=list(range(3, 16, 2)),    # 3, 5, ..., 15
    weights=['uniform', 'distance'],
    metric=['euclidean', 'manhattan', 'minkowski'],
    p=[1, 2],                             # Only relevant for Minkowski metric
    leaf_size=list(range(10, 51, 10)),    # 10, 20, ..., 50
    algorithm=['auto', 'ball_tree', 'kd_tree', 'brute'],
)

# Stratified 5-fold splitter with a fixed seed; reused for the final evaluation.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Base estimator whose hyperparameters are being tuned.
knn = KNeighborsClassifier()

# Randomized search: 100 sampled candidates, each scored with the blended
# custom metric on every fold, using all available cores.
random_search = RandomizedSearchCV(
    knn,
    param_grid,
    n_iter=100,
    scoring=custom_score,
    cv=cv,
    n_jobs=-1,
    random_state=42,
    verbose=2,
)

In [7]:
# Run the randomized hyperparameter search on the loaded features and labels;
# each sampled candidate is cross-validated with the `cv` splitter configured above.
random_search.fit(X, y)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


[CV] END algorithm=brute, leaf_size=50, metric=euclidean, n_neighbors=5, p=2, weights=distance; total time=   2.0s
[CV] END algorithm=brute, leaf_size=50, metric=euclidean, n_neighbors=5, p=2, weights=distance; total time=   2.2s
[CV] END algorithm=brute, leaf_size=50, metric=euclidean, n_neighbors=5, p=2, weights=distance; total time=   2.6s
[CV] END algorithm=brute, leaf_size=50, metric=euclidean, n_neighbors=5, p=2, weights=distance; total time=   2.9s
[CV] END algorithm=brute, leaf_size=50, metric=euclidean, n_neighbors=5, p=2, weights=distance; total time=   4.3s
[CV] END algorithm=ball_tree, leaf_size=10, metric=minkowski, n_neighbors=5, p=2, weights=uniform; total time=  22.2s
[CV] END algorithm=ball_tree, leaf_size=10, metric=minkowski, n_neighbors=5, p=2, weights=uniform; total time=  23.4s
[CV] END algorithm=ball_tree, leaf_size=10, metric=minkowski, n_neighbors=5, p=2, weights=uniform; total time=  23.7s
[CV] END algorithm=ball_tree, leaf_size=10, metric=minkowski, n_neighbo

In [8]:
# Persist the best hyperparameter combination found by the search as JSON
# (only the parameter dict is written, not the fitted estimator).
with open('best_model.json', 'w') as params_file:
    params_file.write(json.dumps(random_search.best_params_))

In [9]:
# Get the best parameters
best_params = random_search.best_params_
print("Best parameters:", best_params)

# # Load the best parameters from the json file written earlier in this notebook
# with open('best_model.json', 'r') as f:
#     best_params = json.load(f)


def _f1_from_precision_sensitivity(precision, sensitivity):
    """Return the harmonic mean of precision and sensitivity, or 0.0 when both are zero."""
    denominator = precision + sensitivity
    # A zero denominator means the class had no true positives at all; report
    # F1 = 0 instead of dividing by zero. (A NaN denominator still yields NaN.)
    return 2 * precision * sensitivity / denominator if denominator else 0.0


# Now, let's perform a final 5-fold cross-validation using the best model
best_knn = KNeighborsClassifier(**best_params)

fold_results = []

# Reuses the same seeded splitter as the search, with folds numbered from 1.
for fold, (train_index, val_index) in enumerate(cv.split(X, y), 1):
    print(f"Fold {fold}")
    X_train, X_val = X[train_index], X[val_index]
    y_train, y_val = y[train_index], y[val_index]

    # Refit on this fold's training part, then get hard labels and class probabilities.
    best_knn.fit(X_train, y_train)
    y_pred = best_knn.predict(X_val)
    y_prob = best_knn.predict_proba(X_val)

    # Calculate metrics (helper from the `metrics` module; `class_metrics` is
    # indexed below as one dict per class with precision/sensitivity/specificity).
    accuracy, class_metrics, auc, f1, cm, avg_sensitivity, avg_specificity = calculate_metrics(y_val, y_pred, y_prob)

    # Check for NaN values in y_prob before calculating AUC
    if np.isnan(y_prob).any():
        print(f"NaN values found in y_prob for fold {fold}, skipping AUC calculation")
        # Emit the same keys as the normal path so every fold record has an
        # identical schema and the cross-fold averaging covers both AUCs.
        double_dichotomy_auc = {'auc_normal_vs_abnormal': np.nan, 'auc_few_vs_many': np.nan}
    else:
        double_dichotomy_auc = calculate_double_dichotomy_auc(y_val, y_prob)

    # Log metrics for this fold
    metrics = {
        'fold': fold,
        'val_accuracy': accuracy,
        'val_auc': auc,
        'val_f1': f1,
        'avg_sensitivity': avg_sensitivity,
        'avg_specificity': avg_specificity,
        **{f'class_{class_names[i]}_sensitivity': per_class["sensitivity"] for i, per_class in enumerate(class_metrics)},
        **{f'class_{class_names[i]}_specificity': per_class["specificity"] for i, per_class in enumerate(class_metrics)},
        **{f'class_{class_names[i]}_f1': _f1_from_precision_sensitivity(per_class["precision"], per_class["sensitivity"]) for i, per_class in enumerate(class_metrics)},
        **double_dichotomy_auc
    }

    custom_log(metrics, model_name=f'knn_optimized_{fold}', log_dir='logs')
    print("Metrics for this fold:")
    print(metrics)

    # Plot confusion matrix for this fold
    plot_confusion_matrix(cm, class_names=class_names, epoch_num=0, model_name='knn_optimized', fold_num=fold)

    fold_results.append(metrics)

# Calculate and print average results across all folds (the fold index itself is not averaged)
avg_results = {key: np.mean([result[key] for result in fold_results if key in result])
               for key in fold_results[0].keys() if key != 'fold'}

print("Average results across all folds:")
for key, value in avg_results.items():
    print(f"{key}: {value}")

# Log average results
custom_log(avg_results, model_name='knn_optimized_average', log_dir='logs')

Best parameters: {'weights': 'uniform', 'p': 2, 'n_neighbors': 11, 'metric': 'euclidean', 'leaf_size': 10, 'algorithm': 'ball_tree'}
Fold 1
Metrics for this fold:
{'fold': 1, 'val_accuracy': 0.9879656160458453, 'val_auc': 0.9986820079998584, 'val_f1': 0.9879498714537954, 'avg_sensitivity': 0.9886456802814451, 'avg_specificity': 0.9897166558294893, 'class_Few_sensitivity': 0.9927826784282278, 'class_Many_sensitivity': 1.0, 'class_None_sensitivity': 0.9731543624161074, 'class_Few_specificity': 0.9759036144578314, 'class_Many_specificity': 0.999409681227863, 'class_None_specificity': 0.9938366718027735, 'class_Few_f1': 0.9842707878347672, 'class_Many_f1': 0.9997047534691467, 'class_None_f1': 0.9833867830292835, 'auc_normal_vs_abnormal': 0.9986754704363621, 'auc_few_vs_many': 1.0}
Fold 2
Metrics for this fold:
{'fold': 2, 'val_accuracy': 0.9845272206303725, 'val_auc': 0.9951999031088391, 'val_f1': 0.984547645560286, 'avg_sensitivity': 0.9875202349920196, 'avg_specificity': 0.98858836248146