In [1]:
import os
import json
from metrics import *
from sklearn.metrics import make_scorer, cohen_kappa_score, balanced_accuracy_score
import numpy as np
from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV
from sklearn.svm import SVC

In [2]:
# Load features and labels
X = np.load('../../features.npy')
y = np.load('../../labels.npy')

X.shape, y.shape

((8724, 2048), (8724,))

In [3]:
# Count the number of samples in each class
print('Number of samples in each class:')
print(np.unique(y, return_counts=True))

# Define class names
class_names = ["Few", "Many", "None"]
print('Class names:')
print(class_names)

Number of samples in each class:
(array([0, 1, 2]), array([6232,  256, 2236]))
Class names:
['Few', 'Many', 'None']


In [9]:
os.makedirs('logs', exist_ok=True)
os.makedirs('figures', exist_ok=True)

In [4]:
def custom_scorer(y_true, y_pred):
    # If y_pred is probabilities, convert to class predictions
    if y_pred.ndim == 2:
        y_pred_class = np.argmax(y_pred, axis=1)
    else:
        y_pred_class = y_pred

    # Calculate metrics
    auc = roc_auc_score(y_true, y_pred, average='macro', multi_class='ovo')
    f1 = f1_score(y_true, y_pred_class, average='macro')
    kappa = cohen_kappa_score(y_true, y_pred_class)
    balanced_acc = balanced_accuracy_score(y_true, y_pred_class)
    
    return 0.3 * auc + 0.3 * f1 + 0.2 * kappa + 0.2 * balanced_acc

custom_score = make_scorer(custom_scorer, greater_is_better=True, needs_proba=True)



In [5]:
from sklearn.svm import SVC
from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV

# Define the parameter grid for SVM
param_grid = {
    'C': [0.1, 1, 10, 100],
    'gamma': ['scale', 'auto', 0.001, 0.01, 0.1, 1],
    'kernel': ['linear', 'rbf', 'poly', 'sigmoid'],
    'class_weight': ['balanced', None]
}

# Initialize the SVM classifier
svm = SVC(probability=True, random_state=42)

# Define the cross-validation strategy
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Initialize RandomizedSearchCV
random_search = RandomizedSearchCV(
    estimator=svm, 
    param_distributions=param_grid,
    n_iter=100, 
    cv=cv, 
    verbose=2, 
    random_state=42, 
    n_jobs=-1,
    scoring=custom_score
)

In [6]:
random_search.fit(X, y)

Fitting 5 folds for each of 100 candidates, totalling 500 fits
[CV] END ..C=10, class_weight=None, gamma=0.1, kernel=linear; total time= 1.0min
[CV] END ..C=10, class_weight=None, gamma=0.1, kernel=linear; total time= 1.8min
[CV] END ..C=10, class_weight=None, gamma=0.1, kernel=linear; total time= 1.9min
[CV] END ..C=10, class_weight=None, gamma=0.1, kernel=linear; total time= 1.9min
[CV] END ..C=1, class_weight=None, gamma=auto, kernel=linear; total time= 1.9min
[CV] END ..C=1, class_weight=None, gamma=auto, kernel=linear; total time= 1.9min
[CV] END ..C=10, class_weight=None, gamma=0.1, kernel=linear; total time= 1.9min
[CV] END ..C=1, class_weight=None, gamma=auto, kernel=linear; total time=  58.2s
[CV] END ..C=1, class_weight=None, gamma=auto, kernel=linear; total time= 1.8min
[CV] END ......C=0.1, class_weight=None, gamma=1, kernel=rbf; total time= 3.6min
[CV] END ..C=1, class_weight=None, gamma=auto, kernel=linear; total time= 1.8min
[CV] END ......C=0.1, class_weight=None, gamma

In [8]:
# Save the best model
with open('best_model.json', 'w') as f:
    json.dump(random_search.best_params_, f)

In [10]:
# Get the best parameters
best_params = random_search.best_params_
print("Best parameters:", best_params)

# # Load the best parameters from a json file
# with open('svm_optimized_best_params.json', 'r') as f:
#     best_params = json.load(f)

# Now, let's perform a final 10-fold cross-validation using the best model
best_svm = SVC(**best_params, probability=True, random_state=42)

fold_results = []

for fold, (train_index, val_index) in enumerate(cv.split(X, y), 1):
    print(f"Fold {fold}")
    X_train, X_val = X[train_index], X[val_index]
    y_train, y_val = y[train_index], y[val_index]
    
    best_svm.fit(X_train, y_train)
    y_pred = best_svm.predict(X_val)
    y_prob = best_svm.predict_proba(X_val)
    
    # Calculate metrics
    accuracy, class_metrics, auc, f1, cm, avg_sensitivity, avg_specificity = calculate_metrics(y_val, y_pred, y_prob)
    
    # Calculate AUC for double dichotomy
    double_dichotomy_auc = calculate_double_dichotomy_auc(y_val, y_prob)

    # Log metrics for this fold
    metrics = {
        'fold': fold,
        'val_accuracy': accuracy,
        'val_auc': auc,
        'val_f1': f1,
        'avg_sensitivity': avg_sensitivity,
        'avg_specificity': avg_specificity,
        **{f'class_{class_names[i]}_sensitivity': metrics["sensitivity"] for i, metrics in enumerate(class_metrics)},
        **{f'class_{class_names[i]}_specificity': metrics["specificity"] for i, metrics in enumerate(class_metrics)},
        **{f'class_{class_names[i]}_f1': 2 * metrics["sensitivity"] * metrics["specificity"] / (metrics["sensitivity"] + metrics["specificity"]) for i, metrics in enumerate(class_metrics)},
        **double_dichotomy_auc
    }
    
    custom_log(metrics, model_name=f'svm_optimized_{fold}', log_dir='logs')
    print("Metrics for this fold:")
    print(metrics)

    # Plot confusion matrix for this fold
    plot_confusion_matrix(cm, class_names=class_names, epoch_num=0, model_name='svm_optimized', fold_num=fold)
    
    fold_results.append(metrics)

# Calculate and print average results across all folds
avg_results = {key: np.mean([fold[key] for fold in fold_results if key in fold]) 
               for key in fold_results[0].keys() if key != 'fold'}

print("Average results across all folds:")
for key, value in avg_results.items():
    print(f"{key}: {value}")

# Log average results
custom_log(avg_results, model_name='svm_optimized_average', log_dir='logs')

Best parameters: {'kernel': 'rbf', 'gamma': 0.1, 'class_weight': 'balanced', 'C': 1}
Fold 1
Metrics for this fold:
{'fold': 1, 'val_accuracy': 0.9805157593123209, 'val_auc': 0.9981881258560373, 'val_f1': 0.9806749571747311, 'avg_sensitivity': 0.9885195012399631, 'avg_specificity': 0.9895661193858606, 'class_Few_sensitivity': 0.9767441860465116, 'class_Many_sensitivity': 1.0, 'class_None_sensitivity': 0.9888143176733781, 'class_Few_specificity': 0.9899598393574297, 'class_Many_specificity': 0.9964580873671782, 'class_None_specificity': 0.9822804314329738, 'class_Few_f1': 0.9833076101151609, 'class_Many_f1': 0.9982259018332348, 'class_None_f1': 0.9855365451221144, 'auc_normal_vs_abnormal': 0.9985574089202801, 'auc_few_vs_many': 0.9994811075994151}
Fold 2
Metrics for this fold:
{'fold': 2, 'val_accuracy': 0.9782234957020057, 'val_auc': 0.9986423915735593, 'val_f1': 0.9783972401699138, 'avg_sensitivity': 0.9874502684145154, 'avg_specificity': 0.9883588007671399, 'class_Few_sensitivity': 0.