In [1]:
import numpy as np, pandas as pd
from pathlib import Path

# Load the feature matrix and the labels from the processed-data folder.
X = np.load("../data/processed/X.npy")
y = np.load("../data/processed/y.npy")

# Validate row alignment before anything consumes the arrays.
assert X.shape[0] == y.shape[0], (
    f"X has {X.shape[0]} rows but y has {y.shape[0]} labels"
)
print("Class counts:", np.bincount(y.astype(int)))

# List the connectome files on disk.
# NOTE(review): `fnames` is not used by any cell visible here; it is kept in
# case a later cell relies on it -- confirm, then drop the glob if it is dead.
connectome_dir = Path("../data/interim/connectomes_cc200")

fnames = sorted(f.name for f in connectome_dir.glob("*.npy"))

# Subject identifiers come from the saved array, not from the file listing.
subjects = np.load("../data/processed/subjects.npy")

print("Loaded subjects: ", len(subjects))
print("Subject 1: ", subjects[0])

# Extract site names: keep the part before the first underscore of each label.
# NOTE(review): allow_pickle=True can execute arbitrary code while loading;
# only use it on files produced by this project's own pipeline.
sites = np.load("../data/processed/sites.npy",
                allow_pickle=True)
sites = [s.split("_")[0] for s in sites]

# The site labels are later turned into boolean masks over X, so they must
# have exactly one entry per row of X.
assert len(sites) == X.shape[0], (
    f"sites has {len(sites)} entries but X has {X.shape[0]} rows"
)

unique_sites = np.unique(sites)

print("Unique sites: ", unique_sites)

Class counts: [476 408]
Loaded subjects:  884
Subject 1:  CMU_a_0050649
Unique sites:  ['CALTECH' 'CMU' 'KKI' 'LEUVEN' 'MAX' 'NYU' 'OHSU' 'OLIN' 'PITT' 'SBL'
 'SDSU' 'STANFORD' 'TRINITY' 'UCLA' 'UM' 'USM' 'YALE']


In [4]:
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score, accuracy_score, classification_report
from collections import defaultdict

# Leave-one-site-out (LOSO) evaluation.
# For every site: train a scaler -> PCA -> logistic-regression pipeline on all
# other sites, then score it on the held-out site. Fitting the whole pipeline
# inside the loop keeps the held-out site out of the scaler and PCA fits.

# Tunable settings for this cell.
MIN_TEST_SAMPLES = 10      # held-out sites smaller than this are skipped
N_PCA_COMPONENTS = 300     # number of principal components kept
RANDOM_STATE = 42          # seed passed to PCA
MAX_ITER = 1000            # iteration cap for the logistic-regression solver

results = defaultdict(list)

# ensure sites is a numpy array so boolean masks are elementwise
sites_arr = np.array(sites)

for test_site in unique_sites:
    print("LEAVING OUT SITE: ", test_site)

    # Boolean masks (elementwise comparison)
    train_mask = sites_arr != test_site
    test_mask = sites_arr == test_site

    # Decide whether to skip before copying any data out of X.
    n_test = int(test_mask.sum())
    if n_test < MIN_TEST_SAMPLES:
        print(f"Skipping site {test_site} with only {n_test} samples")
        continue

    # Boolean-mask indexing returns copies, so the in-place NaN -> 0 fill
    # below never modifies X itself.
    X_train, y_train = X[train_mask], y[train_mask]
    X_train[np.isnan(X_train)] = 0

    X_test, y_test = X[test_mask], y[test_mask]
    X_test[np.isnan(X_test)] = 0

    pipe = Pipeline([
        ('scaler', StandardScaler()),
        ('pca', PCA(n_components=N_PCA_COMPONENTS, random_state=RANDOM_STATE)),
        ('clf', LogisticRegression(max_iter=MAX_ITER, solver='lbfgs'))
    ])

    pipe.fit(X_train, y_train)

    # Probability of the second class, thresholded at 0.5 for hard labels.
    prob = pipe.predict_proba(X_test)[:, 1]
    pred = (prob >= 0.5).astype(int)

    # ROC AUC is undefined when the held-out site contains a single class
    # (scikit-learn raises or warns depending on the version), so record NaN
    # explicitly instead of letting one site abort the whole run.
    if np.unique(y_test).size < 2:
        print(f"Site {test_site} has a single class; AUC is undefined")
        auc = np.nan
    else:
        auc = roc_auc_score(y_test, prob)
    acc = accuracy_score(y_test, pred)

    print("AUC: ", auc)
    print("ACC: ", acc)
    print("\n", classification_report(y_test, pred))

    results['site'].append(test_site)
    results['auc'].append(auc)
    results['acc'].append(acc)

# Naming the columns keeps the summary usable even if every site was skipped
# (an empty frame would otherwise have no 'auc'/'acc' columns).
df_results = pd.DataFrame(results, columns=['site', 'auc', 'acc'])
print("\nSummary\n")
print(df_results)
# Unweighted means across sites; pandas ignores NaN AUC entries by default.
print("Mean AUC: ", df_results['auc'].mean())
print("Mean ACC: ", df_results['acc'].mean())

LEAVING OUT SITE:  CALTECH
AUC:  0.6111111111111112
ACC:  0.6486486486486487

               precision    recall  f1-score   support

           0       0.67      0.56      0.61        18
           1       0.64      0.74      0.68        19

    accuracy                           0.65        37
   macro avg       0.65      0.65      0.64        37
weighted avg       0.65      0.65      0.65        37

LEAVING OUT SITE:  CMU
Skipping site CMU with only 5 samples
LEAVING OUT SITE:  KKI
AUC:  0.7407407407407407
ACC:  0.6410256410256411

               precision    recall  f1-score   support

           0       1.00      0.48      0.65        27
           1       0.46      1.00      0.63        12

    accuracy                           0.64        39
   macro avg       0.73      0.74      0.64        39
weighted avg       0.83      0.64      0.64        39

LEAVING OUT SITE:  LEUVEN
AUC:  0.6350762527233115
ACC:  0.639344262295082

               precision    recall  f1-score   support


In [6]:
import joblib, pathlib

# Make sure the output folder exists, then serialise the estimator into it.
# NOTE(review): `best` is not assigned in any cell visible here -- confirm an
# earlier cell defines it, otherwise this fails on Restart & Run All.
processed_dir = pathlib.Path("../data/processed")
processed_dir.mkdir(parents=True, exist_ok=True)
joblib.dump(best, "../data/processed/best_baseline.joblib")

['../data/processed/best_baseline.joblib']