In [18]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV
from sklearn.metrics import make_scorer, accuracy_score, roc_auc_score, f1_score, confusion_matrix
import json
import os
from metrics import *


In [19]:
# Load features and labels
X = np.load('../../features.npy')
y = np.load('../../labels.npy')

X.shape, y.shape

((8724, 2048), (8724,))

In [20]:
# Count the number of samples in each class
print('Number of samples in each class:')
print(np.unique(y, return_counts=True))

# Define class names
class_names = ["Few", "Many", "None"]
print('Class names:')
print(class_names)

Number of samples in each class:
(array([0, 1, 2]), array([6232,  256, 2236]))
Class names:
['Few', 'Many', 'None']


In [21]:
# Divide the data into 2 classes only (Normal and Abnormal)
y_binary = np.zeros(y.shape)
y_binary[y != 2] = 0 # Few and Many
y_binary[y == 2] = 1 # None

# Check the number of samples in each class
print('Number of samples in each class:')
print(np.unique(y_binary, return_counts=True))

# Define the new class names
binary_class_names = ["Normal", "Abnormal"]
print('Binary class names:')
print(binary_class_names)

Number of samples in each class:
(array([0., 1.]), array([6488, 2236]))
Binary class names:
['Normal', 'Abnormal']


In [22]:
# Few vs Many
y_few_many = np.zeros(y.shape)
y_few_many[y == 0] = 0 # Few
y_few_many[y == 1] = 1 # Many

# Check the number of samples in each class
print('Number of samples in each class:')
print(np.unique(y_few_many, return_counts=True))

# Define the new class names
few_many_class_names = ["Few", "Many"]
print('Few vs Many class names:')
print(few_many_class_names)

Number of samples in each class:
(array([0., 1.]), array([8468,  256]))
Few vs Many class names:
['Few', 'Many']


In [23]:
os.makedirs('logs_dd/nor_vs_abn', exist_ok=True)
os.makedirs('figures_dd/nor_vs_abn', exist_ok=True)
os.makedirs('logs_dd/few_vs_many', exist_ok=True)
os.makedirs('figures_dd/few_vs_many', exist_ok=True)

In [24]:
# Define the parameter grid
param_grid = {
    'n_estimators': [100, 200, 500],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'class_weight': ['balanced', 'balanced_subsample', None]
}

custom_score = make_scorer(custom_scorer, greater_is_better=True, 
                           response_method='predict_proba'
                           )

In [25]:
# Initialize the Random Forest classifier
rf = RandomForestClassifier(random_state=42)

# Define the cross-validation strategy
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Initialize RandomizedSearchCV
random_search = RandomizedSearchCV(
    estimator=rf, 
    param_distributions=param_grid,
    n_iter=100, 
    cv=cv, 
    verbose=2, 
    random_state=42, 
    n_jobs=-1,
    scoring=custom_score
)


In [26]:
# Fit the RandomizedSearchCV
random_search.fit(X, y_binary)

Fitting 5 folds for each of 100 candidates, totalling 500 fits
[CV] END class_weight=balanced_subsample, max_depth=None, min_samples_leaf=4, min_samples_split=10, n_estimators=100; total time=  15.1s
[CV] END class_weight=balanced_subsample, max_depth=None, min_samples_leaf=4, min_samples_split=10, n_estimators=100; total time=  15.5s
[CV] END class_weight=balanced_subsample, max_depth=None, min_samples_leaf=4, min_samples_split=10, n_estimators=100; total time=  15.6s
[CV] END class_weight=balanced_subsample, max_depth=None, min_samples_leaf=4, min_samples_split=10, n_estimators=100; total time=  15.8s
[CV] END class_weight=balanced_subsample, max_depth=None, min_samples_leaf=4, min_samples_split=10, n_estimators=100; total time=  15.9s
[CV] END class_weight=balanced_subsample, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=  16.0s
[CV] END class_weight=balanced_subsample, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100;

  _data = np.array(data, dtype=dtype, copy=copy,


In [27]:
# Save best parameters
with open('dd_normal_vs_abnormal_best_params.json', 'w') as f:
    json.dump(random_search.best_params_, f, indent=4)

In [33]:
# # Get the best parameters
# best_params = random_search.best_params_
# print("Best parameters:", best_params)

# Load the best parameters from a json file
with open('dd_normal_vs_abnormal_best_params.json', 'r') as f:
    best_params = json.load(f)

# Now, let's perform a final 10-fold cross-validation using the best model
best_rf = RandomForestClassifier(**best_params, random_state=42)

fold_results = []

for fold, (train_index, val_index) in enumerate(cv.split(X, y_binary), 1):
    print(f"Fold {fold}")
    X_train, X_val = X[train_index], X[val_index]
    y_train, y_val = y_binary[train_index], y_binary[val_index]
    
    best_rf.fit(X_train, y_train)
    y_pred = best_rf.predict(X_val)
    y_prob = best_rf.predict_proba(X_val)
    
    # Calculate metrics
    accuracy, class_metrics, auc, f1, cm, avg_sensitivity, avg_specificity = calculate_metrics(y_val, y_pred, y_prob, num_classes=2)

    # Log metrics for this fold
    metrics = {
        'fold': fold,
        'val_accuracy': accuracy,
        'val_auc': auc,
        'val_f1': f1,
        'avg_sensitivity': avg_sensitivity,
        'avg_specificity': avg_specificity,
        **{f'class_{binary_class_names[i]}_sensitivity': metrics["sensitivity"] for i, metrics in enumerate(class_metrics)},
        **{f'class_{binary_class_names[i]}_specificity': metrics["specificity"] for i, metrics in enumerate(class_metrics)},
        **{f'class_{binary_class_names[i]}_f1': 2 * metrics["sensitivity"] * metrics["specificity"] / (metrics["sensitivity"] + metrics["specificity"]) for i, metrics in enumerate(class_metrics)},
    }
    
    custom_log(metrics, model_name=f'dd_random_forest_{fold}', log_dir='logs_dd/nor_vs_abn/')
    print("Metrics for this fold:")
    print(metrics)

    # Plot confusion matrix for this fold
    plot_confusion_matrix(cm, class_names=binary_class_names, epoch_num=0, model_name='dd_random_forest', fold_num=fold, save_dir='figures_dd/nor_vs_abn/')
    
    fold_results.append(metrics)

# Calculate and print average results across all folds
avg_results = {key: np.mean([fold[key] for fold in fold_results if key in fold]) 
               for key in fold_results[0].keys() if key != 'fold'}

print("Average results across all folds:")
for key, value in avg_results.items(): 
    print(f"{key}: {value}")

# Log average results
custom_log(avg_results, model_name='dd_random_forest_average', log_dir='logs_dd/nor_vs_abn/')

Fold 1
Metrics for this fold:
{'fold': 1, 'val_accuracy': 0.9759312320916905, 'val_auc': 0.9916176323581625, 'val_f1': 0.9518348623853211, 'avg_sensitivity': 0.9603537364315433, 'avg_specificity': 0.9603537364315433, 'class_Normal_sensitivity': 0.9922958397534669, 'class_Abnormal_sensitivity': 0.9284116331096197, 'class_Normal_specificity': 0.9284116331096197, 'class_Abnormal_specificity': 0.9922958397534669, 'class_Normal_f1': 0.9592913175270055, 'class_Abnormal_f1': 0.9592913175270055}
Fold 2
Metrics for this fold:
{'fold': 2, 'val_accuracy': 0.9736389684813753, 'val_auc': 0.9949078430764245, 'val_f1': 0.9484304932735426, 'avg_sensitivity': 0.964679786144921, 'avg_specificity': 0.964679786144921, 'class_Normal_sensitivity': 0.9830508474576272, 'class_Abnormal_sensitivity': 0.9463087248322147, 'class_Normal_specificity': 0.9463087248322147, 'class_Abnormal_specificity': 0.9830508474576272, 'class_Normal_f1': 0.9643299333765698, 'class_Abnormal_f1': 0.9643299333765698}
Fold 3
Metrics f