In [1]:
import os
import json
import xgboost as xgb
from metrics import *
from sklearn.metrics import make_scorer, cohen_kappa_score, balanced_accuracy_score, roc_auc_score, f1_score
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold
from sklearn.metrics import make_scorer
import numpy as np

In [2]:
# Load features and labels
X = np.load('../../features.npy')
y = np.load('../../labels.npy')

X.shape, y.shape

((8724, 2048), (8724,))

In [3]:
# Count the number of samples in each class
print('Number of samples in each class:')
print(np.unique(y, return_counts=True))

# Define class names
class_names = ["Few", "Many", "None"]
print('Class names:')
print(class_names)

Number of samples in each class:
(array([0, 1, 2]), array([6232,  256, 2236]))
Class names:
['Few', 'Many', 'None']


In [4]:
os.makedirs('logs', exist_ok=True)
os.makedirs('figures', exist_ok=True)

In [5]:
def custom_scorer(y_true, y_pred):
    if y_pred.ndim == 2:
        y_pred_class = np.argmax(y_pred, axis=1)
    else:
        y_pred_class = y_pred

    auc = roc_auc_score(y_true, y_pred, average='macro', multi_class='ovo')
    f1 = f1_score(y_true, y_pred_class, average='macro')
    kappa = cohen_kappa_score(y_true, y_pred_class)
    balanced_acc = balanced_accuracy_score(y_true, y_pred_class)
    
    return 0.3 * auc + 0.3 * f1 + 0.2 * kappa + 0.2 * balanced_acc

custom_score = make_scorer(custom_scorer, greater_is_better=True, needs_proba=True)



In [6]:
import cupy as cp

# Convert your data to GPU
X_gpu = cp.array(X)
y_gpu = cp.array(y)


In [7]:
xgb_clf = xgb.XGBClassifier(objective='multi:softprob', device= "cuda", num_class=3, random_state=42)

param_grid = {
    'n_estimators': [100, 200, 500],
    'max_depth': [3, 5, 7, 9],
    'learning_rate': [0.01, 0.1, 0.3],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0],
    'gamma': [0, 0.1, 0.2],
    'min_child_weight': [1, 3, 5]
}

In [8]:
random_search = RandomizedSearchCV(
    estimator=xgb_clf,
    param_distributions=param_grid,
    n_iter=100,
    cv=5,
    verbose=3,
    random_state=42,
    # n_jobs=6,
    scoring=custom_score
)

random_search.fit(X_gpu.get(), y_gpu.get())

Fitting 5 folds for each of 100 candidates, totalling 500 fits


Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.




[CV 1/5] END colsample_bytree=0.8, gamma=0.2, learning_rate=0.3, max_depth=5, min_child_weight=1, n_estimators=500, subsample=0.8;, score=0.924 total time=   8.4s
[CV 2/5] END colsample_bytree=0.8, gamma=0.2, learning_rate=0.3, max_depth=5, min_child_weight=1, n_estimators=500, subsample=0.8;, score=0.935 total time=   8.1s
[CV 3/5] END colsample_bytree=0.8, gamma=0.2, learning_rate=0.3, max_depth=5, min_child_weight=1, n_estimators=500, subsample=0.8;, score=0.970 total time=   7.6s
[CV 4/5] END colsample_bytree=0.8, gamma=0.2, learning_rate=0.3, max_depth=5, min_child_weight=1, n_estimators=500, subsample=0.8;, score=0.969 total time=   8.5s
[CV 5/5] END colsample_bytree=0.8, gamma=0.2, learning_rate=0.3, max_depth=5, min_child_weight=1, n_estimators=500, subsample=0.8;, score=0.972 total time=   7.2s
[CV 1/5] END colsample_bytree=0.8, gamma=0.1, learning_rate=0.1, max_depth=9, min_child_weight=1, n_estimators=500, subsample=0.8;, score=0.929 total time=  15.0s
[CV 2/5] END colsample

In [9]:
# Save the best model
with open('best_model.json', 'w') as f:
    json.dump(random_search.best_params_, f)

In [10]:
random_search.best_params_

{'subsample': 0.8,
 'n_estimators': 200,
 'min_child_weight': 1,
 'max_depth': 3,
 'learning_rate': 0.01,
 'gamma': 0.1,
 'colsample_bytree': 0.8}

In [13]:
# best_xgb = random_search.best_estimator_

# Load the best model
with open('best_model.json', 'r') as f:
    best_params = json.load(f)

best_xgb = xgb.XGBClassifier(objective='multi:softprob', num_class=3, random_state=42, **best_params)

# Perform cross-validation with the best model
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

fold_results = []

for fold, (train_index, val_index) in enumerate(cv.split(X, y), 1):
    print(f"Fold {fold}")
    X_train, X_val = X[train_index], X[val_index]
    y_train, y_val = y[train_index], y[val_index]
    
    best_xgb.fit(X_train, y_train)
    y_pred = best_xgb.predict(X_val)
    y_prob = best_xgb.predict_proba(X_val)
    
    # Calculate metrics
    accuracy, class_metrics, auc, f1, cm, avg_sensitivity, avg_specificity = calculate_metrics(y_val, y_pred, y_prob)
    
    # Calculate AUC for double dichotomy
    double_dichotomy_auc = calculate_double_dichotomy_auc(y_val, y_prob)
    
    # Log metrics for this fold
    metrics = {
        'fold': fold,
        'val_accuracy': accuracy,
        'val_auc': auc,
        'val_f1': f1,
        'avg_sensitivity': avg_sensitivity,
        'avg_specificity': avg_specificity,
        **double_dichotomy_auc,
        **{f'class_{class_names[i]}_sensitivity': metrics["sensitivity"] for i, metrics in enumerate(class_metrics)},
        **{f'class_{class_names[i]}_specificity': metrics["specificity"] for i, metrics in enumerate(class_metrics)},
        **{f'class_{class_names[i]}_f1': 2 * metrics["sensitivity"] * metrics["specificity"] / (metrics["sensitivity"] + metrics["specificity"]) for i, metrics in enumerate(class_metrics)}
    }

    print("Metrics for this fold:")
    print(metrics)
    
    custom_log(metrics, model_name=f'xgboost_optimized_{fold}', log_dir='logs')
    
    # Plot confusion matrix for this fold
    plot_confusion_matrix(cm, class_names=class_names, epoch_num=0, model_name='xgboost_optimized', fold_num=fold)
    
    fold_results.append(metrics)

# Calculate and print average results across all folds
avg_results = {key: np.mean([fold[key] for fold in fold_results if key in fold]) 
               for key in fold_results[0].keys() if key != 'fold'}

print("Average results across all folds:")
for key, value in avg_results.items():
    print(f"{key}: {value}")

# Log average results
custom_log(avg_results, model_name='xgboost_optimized_average', log_dir='logs')

Fold 1
Metrics for this fold:
{'fold': 1, 'val_accuracy': 0.9862464183381089, 'val_auc': 0.998506615811638, 'val_f1': 0.9862170246247185, 'avg_sensitivity': 0.9864085438161206, 'avg_specificity': 0.9877686561184807, 'auc_normal_vs_abnormal': 0.9984557208991289, 'auc_few_vs_many': 1.0, 'class_Few_sensitivity': 0.9927826784282278, 'class_Many_sensitivity': 1.0, 'class_None_sensitivity': 0.9664429530201343, 'class_Few_specificity': 0.9698795180722891, 'class_Many_specificity': 0.9988193624557261, 'class_None_specificity': 0.9946070878274268, 'class_Few_f1': 0.9811974647713987, 'class_Many_f1': 0.9994093325457768, 'class_None_f1': 0.9803227771171538}
Fold 2
Metrics for this fold:
{'fold': 2, 'val_accuracy': 0.9828080229226361, 'val_auc': 0.9988645535658679, 'val_f1': 0.9828590178612763, 'avg_sensitivity': 0.98719671432168, 'avg_specificity': 0.9882304851581032, 'auc_normal_vs_abnormal': 0.9988314495196533, 'auc_few_vs_many': 0.9999842759878612, 'class_Few_sensitivity': 0.9839615076182838, 