In [None]:
import numpy as np, pandas as pd
from pathlib import Path

X = np.load("../data/processed/X.npy")
y = np.load("../data/processed/y.npy")

print("Class counts:", np.bincount(y.astype(int)))
assert X.shape[0] == y.shape[0]

# Extract subjects
connectome_dir = Path("../data/interim/connectomes_cc200")

fnames = sorted([f.name for f in connectome_dir.glob("*.npy")])

subjects = np.array([name.split("_rois")[0] for name in fnames])

print("Loaded subjects: ", len(subjects))
print("Subject 1: ", subjects[0])

# Extract site names
sites = np.array([s.split("_")[0] for s in subjects])
unique_sites = np.unique(sites)

print("Unique sites: ", unique_sites)

Class counts: []
Loaded subjects:  884
Subject 1:  CMU_a_0050649
Unique sites:  ['CMU' 'Caltech' 'KKI' 'Leuven' 'MaxMun' 'NYU' 'OHSU' 'Olin' 'Pitt' 'SBL'
 'SDSU' 'Stanford' 'Trinity' 'UCLA' 'UM' 'USM' 'Yale']


In [None]:
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score, accuracy_score, classification_report
from sklearn.pipeline import Pipeline
from collections import defaultdict
print("HELLO")

# LOSO Evaluation
results = defaultdict(list)

for test_site in unique_sites:
    print("LEAVING OUT SITE: ", test_site)

    # Boolean masks
    train_mask = sites != test_site
    test_mask = sites == test_site

    X_train, y_train = X[train_mask], y[train_mask]
    X_test, y_test = X[test_mask], y[test_mask]

    if len(y_test) < 10:
        print(f"Skipping site {test_site} with only {len(y_test)} samples")
        continue
    
    pipe = Pipeline([
        ('scaler', StandardScaler()),
        ('pca', PCA(n_components=300, random_state=42)),
        ('clf', LogisticRegression(max_iter=1000, solver='lbfgs'))
    ])

    pipe.fit(X_train, y_train)

    prob = pipe.predict_proba(X_test)[:, 1]
    pred = (prob >= 0.5).astype(int)

    auc = roc_auc_score(y_test, prob)
    acc = accuracy_score(y_test, pred)

    print("AUC: ", auc)
    print("ACC: ", acc)
    print("\n", classification_report(y_test, pred))

    results['site'].append(test_site)
    results['auc'].append(auc)
    results['acc'].append(acc)

df_results = pd.DataFrame(results)
print("\nSummary\n")
print(df_results)
print("Mean AUC: ", df_results['auc'].mean())
print("Mean ACC: ", df_results['acc'].mean())

In [6]:
import joblib, pathlib

pathlib.Path("../data/processed").mkdir(parents=True, exist_ok=True)
joblib.dump(best, "../data/processed/best_baseline.joblib")

['../data/processed/best_baseline.joblib']