In [1]:
# allow imports from parent directory
import sys 
sys.path.append("../")

import numpy as np
import pandas as pd

from sklearn.linear_model import RidgeClassifier
from nilearn.connectome import ConnectivityMeasure
from sklearn.model_selection import train_test_split

from src.dataset import _get_filepaths, _load_timeseries

In [2]:
# --- Evaluation protocol ---------------------------------------------------
seed = 12345      # base random seed; each run's split uses seed + run
num_runs = 5      # number of repeated train/test splits per configuration
test_prop = 0.2   # fraction of subjects held out as the test set

# --- Classifier and connectivity settings -----------------------------------
alpha = 1.0                # regularisation strength passed to RidgeClassifier
measure = "correlation"    # connectivity kind; alternatives tried: "partial correlation", "precision"

# --- Sweep grid -------------------------------------------------------------
# Percentage of matrix entries kept after percentile thresholding (100 = keep all).
top_percent = [5, 10, 25, 50, 80, 100]
# Number of leading time points used to estimate each connectivity matrix.
max_timepoint = [50, 100, 200, 600, 1200]

# Sweep over dataset x threshold x time-window, scoring a ridge classifier on
# thresholded functional-connectivity features over several random splits.
# Each entry appended to `results` is [dataset, p, t, run, acc].
results = []

# One estimator is reused for every subject: each fit_transform call is given a
# single subject's series and the returned matrix for that subject is taken.
conn = ConnectivityMeasure(kind=measure, vectorize=False, discard_diagonal=False)

for dataset, fpaths in dict(hcp=_get_filepaths(dataset="hcp", data_dir="../data"),
                            ukb=_get_filepaths(dataset="ukb", data_dir="../data")).items():
    for p in top_percent:
        for t in max_timepoint:
            # Windows longer than 600 time points are not evaluated for UKB
            # (presumably its recordings are shorter -- TODO confirm).
            # `continue` rather than `break` so the skip does not depend on
            # max_timepoint being sorted in ascending order.
            if (dataset == "ukb") and (t > 600):
                continue

            # The feature matrix depends only on (dataset, p, t), so it is
            # built once here instead of once per run; only the train/test
            # split and the model fit vary between runs.
            X, y = [], []
            for fpath in fpaths:
                _X, _y = _load_timeseries(fpath, zscore=True)
                # Keep the first t entries along the second axis, then
                # transpose (assumes _X is (regions, time) -- TODO confirm).
                fcm = conn.fit_transform([_X[:, :t].T])[0]
                np.fill_diagonal(fcm, 0.)
                if p < 100.:
                    # Zero every entry below the (100 - p)th percentile of the
                    # full matrix, i.e. keep only the largest p% of values.
                    threshold = np.percentile(fcm.flatten(), 100. - p)
                    fcm[fcm < threshold] = 0.
                # Vectorise the lower triangle (diagonal included, already zeroed).
                X.append(fcm[np.tril_indices(fcm.shape[-1])])
                y.append(_y)
            X = np.array(X)
            y = np.array(y)

            for run in range(1, num_runs + 1):
                # Fresh classifier per run; the split is made reproducible by
                # offsetting the base seed with the run index.
                # NOTE(review): the split is not stratified -- consider
                # stratify=y if the classes are imbalanced.
                model = RidgeClassifier(alpha=alpha, class_weight="balanced", solver="auto")
                X_train, X_test, y_train, y_test = train_test_split(
                    X, y, test_size=test_prop, random_state=seed + run
                )
                model.fit(X_train, y_train)
                acc = model.score(X_test, y_test)
                results.append([dataset, p, t, run, acc])
                print("dataset: {} | thresh: {:3d} | time: {:4d} | run: {:2d} | acc: {:04.1f}%".format(dataset, p, t, run, acc * 100))

# Tabulate the per-run records: one row per (dataset, thresh, max_time, run).
df = pd.DataFrame(results)
df.columns = [
    "dataset",
    "thresh",
    "max_time",
    "run",
    "acc",
]

# Mean and standard deviation of accuracy across runs for each configuration.
# Left as the cell's final expression so the notebook renders the table.
(
    df
    .groupby(["dataset", "thresh", "max_time"], as_index=False)
    .agg({"acc": ["mean", "std"]})
)

dataset: hcp | thresh:   0 | time:   50 | run:  1 | acc: 61.7%
dataset: hcp | thresh:   0 | time:   50 | run:  2 | acc: 56.7%
dataset: hcp | thresh:   0 | time:   50 | run:  3 | acc: 66.7%
dataset: hcp | thresh:   0 | time:   50 | run:  4 | acc: 61.7%
dataset: hcp | thresh:   0 | time:   50 | run:  5 | acc: 63.3%
dataset: hcp | thresh:   0 | time:  100 | run:  1 | acc: 80.0%
dataset: hcp | thresh:   0 | time:  100 | run:  2 | acc: 66.7%
dataset: hcp | thresh:   0 | time:  100 | run:  3 | acc: 73.3%
dataset: hcp | thresh:   0 | time:  100 | run:  4 | acc: 78.3%
dataset: hcp | thresh:   0 | time:  100 | run:  5 | acc: 70.0%
dataset: hcp | thresh:   0 | time:  200 | run:  1 | acc: 80.0%
dataset: hcp | thresh:   0 | time:  200 | run:  2 | acc: 78.3%
dataset: hcp | thresh:   0 | time:  200 | run:  3 | acc: 76.7%
dataset: hcp | thresh:   0 | time:  200 | run:  4 | acc: 80.0%
dataset: hcp | thresh:   0 | time:  200 | run:  5 | acc: 68.3%
dataset: hcp | thresh:   0 | time:  600 | run:  1 | acc

KeyboardInterrupt: 