In [8]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, StratifiedKFold, RandomizedSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score
from xgboost import XGBClassifier

# Shared feature scaler; the model-selection loop further down re-fits it
# every time it prepares a feature matrix.
scaler = StandardScaler()

In [11]:
# Read the connectome table and drop the identifier, position and annotation
# columns; every column that remains besides 'dn_fsname' is later used as a
# model input.
data = pd.read_csv('connectome_data.csv').drop(
    columns=[
        'node_id',
        'dn_position_x',
        'dn_position_y',
        'dn_position_z',
        'dn_region',
        'dn_hemisphere',
        'dn_correspondence_id',
        'dn_name',
    ]
)

# Reduce each label to the text that precedes its first underscore.
data['dn_fsname'] = [name.partition('_')[0] for name in data['dn_fsname']]

In [13]:
# Train and evaluate an XGBoost multiclass classifier for each eigenvector
# count in `num_evecs`, collecting the fitted model and its accuracies in
# `best_models` (one dict per count).
#
# Full sweep that can be enabled instead of the single value below:
# num_evecs = [0,2,10,20,50,80,100,120,145]
num_evecs = [50]
best_models = []

# The target does not depend on the feature subset, so encode it once.
y = data['dn_fsname']
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

for i in num_evecs:
    print(f'Including smallest {i} eigenvectors')

    # Keep evec1..evec{i} by dropping evec{i+1}..evec150.
    # NOTE(review): assumes the table has exactly the columns evec1..evec150 — confirm.
    df = data.drop(columns=[f'evec{k + 1}' for k in range(i, 150)])
    X = df.drop(columns=['dn_fsname'])

    X_train, X_test, y_train, y_test = train_test_split(
        X, y_encoded, test_size=0.2, stratify=y_encoded, random_state=22
    )

    # Fit the scaler on the training split only, so that test-set statistics
    # never influence the features the model is trained on; the test split is
    # transformed with the training mean/variance.
    X_train = pd.DataFrame(
        scaler.fit_transform(X_train), columns=X.columns, index=X_train.index
    )
    X_test = pd.DataFrame(
        scaler.transform(X_test), columns=X.columns, index=X_test.index
    )

    model = XGBClassifier(random_state=22)

    param_dist = {
        "n_estimators": [50, 100, 150],
        "max_depth": [10, 12, 14, 16],
        "subsample": [0.5, 0.6, 0.7, 0.8],
    }

    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=22)
    # NOTE(review): n_iter=1 evaluates a single randomly drawn parameter
    # combination, so this is not yet a real search; raise n_iter once the
    # per-fit runtime allows it.
    random_search = RandomizedSearchCV(
        model,
        param_distributions=param_dist,
        scoring='accuracy',
        n_iter=1,
        cv=cv,
        random_state=22,
        n_jobs=-1,
        verbose=3,
    )
    random_search.fit(X_train, y_train)

    # With the default refit=True this estimator has been re-trained on the
    # whole training split using the best parameters found.
    best_model = random_search.best_estimator_

    y_pred = best_model.predict(X_test)

    train_acc = accuracy_score(y_train, best_model.predict(X_train))
    test_acc = accuracy_score(y_test, y_pred)
    print(f"Train Accuracy: {train_acc:.2f}")
    print(f"Test Accuracy: {test_acc:.2f}")

    best_models.append({
        'model': best_model,
        'train_acc': train_acc,
        'test_acc': test_acc,
        'num_evecs': i
    })

Including smallest 50 eigenvectors
Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV 1/5] END max_depth=14, n_estimators=150, subsample=0.5;, score=0.480 total time=29.2min
[CV 4/5] END max_depth=14, n_estimators=150, subsample=0.5;, score=0.478 total time=29.3min
[CV 5/5] END max_depth=14, n_estimators=150, subsample=0.5;, score=0.477 total time=29.3min
[CV 2/5] END max_depth=14, n_estimators=150, subsample=0.5;, score=0.479 total time=29.4min
[CV 3/5] END max_depth=14, n_estimators=150, subsample=0.5;, score=0.479 total time=29.4min
Train Accuracy: 1.00
Test Accuracy: 0.50


In [None]:
from pathlib import Path

from joblib import dump, load

# Persist the list of per-run result dicts (fitted model plus accuracies).
# Make sure the target folder exists first so the save cannot fail on a
# missing directory after a long training run.
Path('./trained_models').mkdir(parents=True, exist_ok=True)

dump(best_models, './trained_models/multiclass_models')

In [None]:
# Pull the per-run accuracies out of the result dicts, in run order, so they
# can be plotted as two series.
train_acc_list = list(map(lambda run: run['train_acc'], best_models))
test_acc_list = list(map(lambda run: run['test_acc'], best_models))

In [None]:
import matplotlib.pyplot as plt

# Compare training and testing accuracy across the eigenvector counts tried.
fig, ax = plt.subplots(figsize=(8, 5))
ax.plot(num_evecs, train_acc_list, marker='o', label='Training Accuracy', color='blue')
# Plot the per-run list here: `test_acc` is only the scalar left over from the
# last loop iteration and does not line up with `num_evecs` when several
# counts are evaluated.
ax.plot(num_evecs, test_acc_list, marker='s', label='Testing Accuracy', color='orange')

ax.set_xlabel('Number of Eigenvectors', fontsize=12)
ax.set_ylabel('Accuracy', fontsize=12)
ax.set_title('Performance vs. Number of Eigenvectors', fontsize=14)
ax.legend()

plt.show()