In [1]:
import os
import json
from metrics import *
from sklearn.metrics import make_scorer, cohen_kappa_score, balanced_accuracy_score
import numpy as np
from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier


In [2]:
# Load features and labels
X = np.load('../../features.npy')
y = np.load('../../labels.npy')

X.shape, y.shape

((8724, 2048), (8724,))

In [3]:
# Count the number of samples in each class
print('Number of samples in each class:')
print(np.unique(y, return_counts=True))

# Define class names
class_names = ["Few", "Many", "None"]
print('Class names:')
print(class_names)

Number of samples in each class:
(array([0, 1, 2]), array([6232,  256, 2236]))
Class names:
['Few', 'Many', 'None']


In [4]:
os.makedirs('logs', exist_ok=True)
os.makedirs('figures', exist_ok=True)

In [4]:
def custom_scorer(y_true, y_pred):
    # If y_pred is probabilities, convert to class predictions
    if y_pred.ndim == 2:
        y_pred_class = np.argmax(y_pred, axis=1)
    else:
        y_pred_class = y_pred

    # Calculate metrics
    auc = roc_auc_score(y_true, y_pred, average='macro', multi_class='ovo')
    f1 = f1_score(y_true, y_pred_class, average='macro')
    kappa = cohen_kappa_score(y_true, y_pred_class)
    balanced_acc = balanced_accuracy_score(y_true, y_pred_class)
    
    return 0.3 * auc + 0.3 * f1 + 0.2 * kappa + 0.2 * balanced_acc

custom_score = make_scorer(custom_scorer, greater_is_better=True, needs_proba=True)




In [7]:
# Define the parameter grid
param_grid = {
    'n_estimators': [100, 200, 500],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'class_weight': ['balanced', 'balanced_subsample', None]
}

# Initialize the Random Forest classifier
rf = RandomForestClassifier(random_state=42)

# Define the cross-validation strategy
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Initialize RandomizedSearchCV
random_search = RandomizedSearchCV(
    estimator=rf, 
    param_distributions=param_grid,
    n_iter=100, 
    cv=cv, 
    verbose=2, 
    random_state=42, 
    n_jobs=-1,
    scoring=custom_score
)

In [7]:
# Fit RandomizedSearchCV
random_search.fit(X, y)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


[CV] END class_weight=balanced_subsample, max_depth=None, min_samples_leaf=4, min_samples_split=10, n_estimators=100; total time=  14.9s
[CV] END class_weight=balanced_subsample, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=  15.5s
[CV] END class_weight=balanced_subsample, max_depth=None, min_samples_leaf=4, min_samples_split=10, n_estimators=100; total time=  19.4s
[CV] END class_weight=balanced_subsample, max_depth=None, min_samples_leaf=4, min_samples_split=10, n_estimators=100; total time=  19.7s
[CV] END class_weight=balanced_subsample, max_depth=None, min_samples_leaf=4, min_samples_split=10, n_estimators=100; total time=  19.7s
[CV] END class_weight=balanced_subsample, max_depth=None, min_samples_leaf=4, min_samples_split=10, n_estimators=100; total time=  19.9s
[CV] END class_weight=balanced_subsample, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=  20.2s
[CV] END class_weight=balanced_subsample, m

  _data = np.array(data, dtype=dtype, copy=copy,


Best parameters: {'n_estimators': 500, 'min_samples_split': 5, 'min_samples_leaf': 2, 'max_depth': 20, 'class_weight': None}


KeyboardInterrupt: 

In [8]:
# # Get the best parameters
# best_params = random_search.best_params_
# print("Best parameters:", best_params)

# Load the best parameters from a json file
with open('random_forest_optimized_best_params.json', 'r') as f:
    best_params = json.load(f)

# Now, let's perform a final 10-fold cross-validation using the best model
best_rf = RandomForestClassifier(**best_params, random_state=42)

fold_results = []

for fold, (train_index, val_index) in enumerate(cv.split(X, y), 1):
    print(f"Fold {fold}")
    X_train, X_val = X[train_index], X[val_index]
    y_train, y_val = y[train_index], y[val_index]
    
    best_rf.fit(X_train, y_train)
    y_pred = best_rf.predict(X_val)
    y_prob = best_rf.predict_proba(X_val)
    
    # Calculate metrics
    accuracy, class_metrics, auc, f1, cm, avg_sensitivity, avg_specificity = calculate_metrics(y_val, y_pred, y_prob)
    
    # Calculate AUC for double dichotomy
    double_dichotomy_auc = calculate_double_dichotomy_auc(y_val, y_prob)

    # Log metrics for this fold
    metrics = {
        'fold': fold,
        'val_accuracy': accuracy,
        'val_auc': auc,
        'val_f1': f1,
        'avg_sensitivity': avg_sensitivity,
        'avg_specificity': avg_specificity,
        **{f'class_{class_names[i]}_sensitivity': metrics["sensitivity"] for i, metrics in enumerate(class_metrics)},
        **{f'class_{class_names[i]}_specificity': metrics["specificity"] for i, metrics in enumerate(class_metrics)},
        **{f'class_{class_names[i]}_f1': 2 * metrics["sensitivity"] * metrics["specificity"] / (metrics["sensitivity"] + metrics["specificity"]) for i, metrics in enumerate(class_metrics)},
        **double_dichotomy_auc
    }
    
    custom_log(metrics, model_name=f'random_forest_optimized_{fold}', log_dir='logs')
    print("Metrics for this fold:")
    print(metrics)

    # Plot confusion matrix for this fold
    plot_confusion_matrix(cm, class_names=class_names, epoch_num=0, model_name='random_forest_optimized', fold_num=fold)
    
    fold_results.append(metrics)

# Calculate and print average results across all folds
avg_results = {key: np.mean([fold[key] for fold in fold_results if key in fold]) 
               for key in fold_results[0].keys() if key != 'fold'}

print("Average results across all folds:")
for key, value in avg_results.items():
    print(f"{key}: {value}")

# Log average results
custom_log(avg_results, model_name='random_forest_optimized_average', log_dir='logs')

Fold 1
Metrics for this fold:
{'fold': 1, 'val_accuracy': 0.9879656160458453, 'val_auc': 0.9988192149439342, 'val_f1': 0.9879417164496996, 'avg_sensitivity': 0.9881672763326989, 'avg_specificity': 0.9893041171282024, 'class_Few_sensitivity': 0.9935846030473136, 'class_Many_sensitivity': 1.0, 'class_None_sensitivity': 0.970917225950783, 'class_Few_specificity': 0.9738955823293173, 'class_Many_specificity': 0.999409681227863, 'class_None_specificity': 0.9946070878274268, 'class_Few_f1': 0.9836415764390248, 'class_Many_f1': 0.9997047534691467, 'class_None_f1': 0.9826193935684477, 'auc_normal_vs_abnormal': 0.9988090436844844, 'auc_few_vs_many': 1.0}
Fold 2
Metrics for this fold:
{'fold': 2, 'val_accuracy': 0.9828080229226361, 'val_auc': 0.9986154802163035, 'val_f1': 0.9828477000936321, 'avg_sensitivity': 0.9867183103729338, 'avg_specificity': 0.9878179464568162, 'class_Few_sensitivity': 0.9847634322373697, 'class_Many_sensitivity': 1.0, 'class_None_sensitivity': 0.9753914988814317, 'class_