# Analysis of the Human Connectome Project (functional connectome)

## Imports

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("whitegrid")
from mlconfound.stats import test_fully_confounded, test_partially_confounded
from mlconfound.plot import plot_graph

from mlxtend.evaluate import permutation_test

import statsmodels.api as sm
from statsmodels.regression.linear_model import OLS
from statsmodels.formula.api import ols as ols_f
from scipy.stats import kurtosis, skew

from sklearn.pipeline import Pipeline
from sklearn.feature_selection import VarianceThreshold, SelectKBest, f_regression
from sklearn.linear_model import Ridge, Lasso
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import quantile_transform

from neurocombat_sklearn import CombatModel

## Load data

In [2]:
# HCP data can be obtained from the ConnectomeDB under a special license;
# the data is not part of this repository.
subjectIDs = pd.read_csv('../data_in/hcp/subjectIDs.txt', header=None)

# Each netmats table gets the subject IDs (first column of subjectIDs.txt)
# attached as an 'ID' column, which then becomes the row index.
netmats_pearson = (
    pd.read_csv('../data_in/hcp/netmats1_correlationZ.txt', sep=' ', header=None)
    .assign(ID=subjectIDs[0])
    .set_index('ID', drop=True)
)

netmats_parcor = (
    pd.read_csv('../data_in/hcp/netmats2_partial-correlation.txt', sep=' ', header=None)
    .assign(ID=subjectIDs[0])
    .set_index('ID', drop=True)
)

behavior = pd.read_csv('../data_in/hcp/hcp1200_behavioral_data.csv').set_index('Subject', drop=True)

# convert age to numeric: '36+' is mapped to 36, a 'low-high' range to its midpoint
age = []
for age_band in behavior['Age']:
    if age_band != '36+':
        bounds = age_band.split(sep='-')
        age.append(np.mean((float(bounds[0]), float(bounds[1]))))
    else:
        age.append(36)

behavior['age'] = age
behavior


FileNotFoundError: [Errno 2] No such file or directory: '../data_in/hcp/subjectIDs.txt'

### Select target variable

In [None]:
##########################################################
# change these
target = 'PMAT24_A_CR' # fluid intelligence
feature_data = netmats_parcor
##########################################################

# distribution of the target before any transformation
sns.histplot(behavior[target], color='gray')
plt.savefig('../data_out/fig/hcp_iq_nonnorm_hist.pdf')

# Merge on the subject index with pandas so that behavioural rows and
# connectivity rows cannot get out of order, then keep complete cases only.
features = feature_data.columns
required_columns = [target] + features.values.tolist()
df = (
    behavior
    .merge(feature_data, left_index=True, right_index=True, how='left')
    .dropna(subset=required_columns)
)
y = df[target].values
X = df[features].values

### Normalize target

In [None]:
# Always start from the untransformed target column, so that re-running this
# cell does not transform an already-transformed y a second time.
y_raw = df[target].values

rng = np.random.default_rng(42)
# Uniform jitter in [-0.5, 0.5) breaks ties between equal scores before the
# rank-based mapping to a normal distribution.
y_jittered = y_raw + rng.uniform(0, 1, len(y_raw)) - 0.5
y_trf = quantile_transform(y_jittered.reshape(-1, 1),
                           output_distribution='normal',
                           # never ask for more quantiles than there are samples
                           n_quantiles=min(1000, len(y_raw))).flatten()

sns.histplot(y_trf, color='gray')
plt.savefig('../data_out/fig/hcp_iq_quanttrf_hist.pdf')

y = y_trf

# Last expression of the cell, so the notebook actually displays it:
# (kurtosis, skewness) of the transformed target.
kurtosis(y_trf), skew(y_trf)


# Machine Learning on raw data

In [None]:
# Pipeline: drop constant features, then ridge regression
pipeline_steps = [
    ('varthr', VarianceThreshold(0)),   # omit zero variance columns (diagonal)
    # optional univariate feature selection, currently switched off:
    # ('fsel', SelectKBest(f_regression)),
    ('model', Ridge(max_iter=100000)),
]
model = Pipeline(pipeline_steps)

# Hyperparameter grid for the pipeline above
p_grid = {
    # 'fsel__k': [500, 1000, 2000],
    'model__alpha': [1e-6, 1e-2, 1e-1, 1, 10, 100, 10**6],
}


In [None]:
# nested cv: the grid search (inner loop) tunes the hyperparameters,
# the outer loop scores the tuned model on held-out folds
outer_cv = KFold(10)
inner_cv = KFold(10)
clf = GridSearchCV(estimator=model, param_grid=p_grid, cv=inner_cv,
                   scoring="neg_mean_squared_error", verbose=True, return_train_score=False,
                   n_jobs=-1)

n_outer_folds = outer_cv.get_n_splits(X)
all_models, best_params = [], []
predicted = np.zeros(len(y))                   # predictions collected from the outer test folds
nested_scores_train = np.zeros(n_outer_folds)  # best inner-cv score per outer fold
nested_scores_test = np.zeros(n_outer_folds)   # score on the held-out outer fold

print("model\tinner_cv mean score\touter vc score")
for i, (train, test) in enumerate(outer_cv.split(X, y)):
    clf.fit(X[train], y[train])
    test_score = clf.score(X[test], y[test])

    print('cv:', i, " ".join(str(value) for value in (clf.best_params_, clf.best_score_, test_score)))

    all_models.append(clf.best_estimator_)
    best_params.append(clf.best_params_)
    predicted[test] = clf.predict(X[test])
    nested_scores_train[i] = clf.best_score_
    nested_scores_test[i] = test_score

### Results (raw data)

In [None]:
# Baseline: score (negative MSE) of a model that predicts the mean of y for everyone
baseline_score = -mean_squared_error(np.repeat(y.mean(), len(y)), y)
outer_score = nested_scores_test.mean()

print("*** Score on mean as model:\t" + str(baseline_score))
print("** Mean score in the inner crossvaludation (inner_cv):\t" + str(nested_scores_train.mean()))
print("** Mean Nested Crossvalidation Score (outer_cv):\t" + str(outer_score))
print("Explained Variance: " + str(1 - outer_score / baseline_score))
print("Correlation: " + str(np.corrcoef(y, predicted)[0,1]))

coolwarm = sns.color_palette("coolwarm", as_cmap=True)

# observed vs. predicted, coloured by age
fig_age, ax_age = plt.subplots(figsize=(5, 2))
sns.regplot(x=y, y=predicted, scatter=False, color='gray', ax=ax_age)
sns.scatterplot(x=y, y=predicted, hue=df.age, palette=coolwarm, alpha=0.4, ax=ax_age)
fig_age.savefig('../data_out/fig/hcp_age_raw_regplot.pdf')
plt.show()

# observed vs. predicted, coloured by acquisition batch code
acquisition_hue = df.Acquisition.astype("category").cat.codes.values
fig_acq, ax_acq = plt.subplots(figsize=(5, 2))
sns.regplot(x=y, y=predicted, scatter=False, color='gray', ax=ax_acq)
sns.scatterplot(x=y, y=predicted, hue=acquisition_hue, palette=coolwarm, alpha=0.4, ax=ax_acq)
fig_acq.savefig('../data_out/fig/hcp_acq_raw_regplot.pdf')


## Confound testing: age groups (raw)

In [None]:
# Partial confounder test on target, predictions and age; the result is handed to plot_graph
age_raw_partial = test_partially_confounded(y, predicted, df['age'], random_state=42)
plot_graph(age_raw_partial, outfile_base='../data_out/fig/hcp_age_raw_partial')

In [None]:
# Full confounder test on target, predictions and age; the result is handed to plot_graph
age_raw_full = test_fully_confounded(y, predicted, df['age'], random_state=42)
plot_graph(age_raw_full, outfile_base='../data_out/fig/hcp_age_raw_full')

## Confound testing: acquisition batch (raw)

In [None]:
# Partial confounder test with the acquisition batch as a categorical confounder (integer codes)
acquisition_codes = pd.Categorical(df['Acquisition'].values).codes
acq_raw_partial = test_partially_confounded(y, predicted, acquisition_codes, cat_c=True,
                                            random_state=42)
plot_graph(acq_raw_partial, outfile_base='../data_out/fig/hcp_acq_raw_partial')

In [None]:
# Full confounder test with the acquisition batch as a categorical confounder (integer codes)
acquisition_codes = pd.Categorical(df['Acquisition'].values).codes
acq_raw_full = test_fully_confounded(y, predicted, acquisition_codes, cat_c=True,
                                     random_state=42)
plot_graph(acq_raw_full, outfile_base='../data_out/fig/hcp_acq_raw_full')

# Regress out confounder from features

In [None]:
# regress-out age from connectivity
# All feature columns share the same design matrix (intercept + age), so a
# single least-squares solve replaces one OLS fit per column; the residuals
# are the same up to floating-point rounding.
# NOTE(review): the regression is fitted on all subjects at once, i.e. before
# the cross-validation split below.
age_design = np.asarray(sm.add_constant(df.age), dtype=float)
age_coef, *_ = np.linalg.lstsq(age_design, X, rcond=None)
X_adj = X - age_design @ age_coef

In [None]:
# nested cv on the adjusted features: the grid search (inner loop) tunes the
# hyperparameters, the outer loop scores the tuned model on held-out folds
outer_cv = KFold(10)
inner_cv = KFold(10)
clf = GridSearchCV(estimator=model, param_grid=p_grid, cv=inner_cv,
                   scoring="neg_mean_squared_error", verbose=True, return_train_score=False,
                   n_jobs=-1)

n_outer_folds = outer_cv.get_n_splits(X_adj)
all_models, best_params = [], []
predicted = np.zeros(len(y))                   # predictions collected from the outer test folds
nested_scores_train = np.zeros(n_outer_folds)  # best inner-cv score per outer fold
nested_scores_test = np.zeros(n_outer_folds)   # score on the held-out outer fold

print("model\tinner_cv mean score\touter vc score")
for i, (train, test) in enumerate(outer_cv.split(X_adj, y)):
    clf.fit(X_adj[train], y[train])
    test_score = clf.score(X_adj[test], y[test])

    print('cv:', i, " ".join(str(value) for value in (clf.best_params_, clf.best_score_, test_score)))

    all_models.append(clf.best_estimator_)
    best_params.append(clf.best_params_)
    predicted[test] = clf.predict(X_adj[test])
    nested_scores_train[i] = clf.best_score_
    nested_scores_test[i] = test_score

## Results (feature regression)

In [None]:
# Baseline: score (negative MSE) of a model that predicts the mean of y for everyone
baseline_score = -mean_squared_error(np.repeat(y.mean(), len(y)), y)
outer_score = nested_scores_test.mean()

print("*** Score on mean as model:\t" + str(baseline_score))
print("** Mean score in the inner crossvaludation (inner_cv):\t" + str(nested_scores_train.mean()))
print("** Mean Nested Crossvalidation Score (outer_cv):\t" + str(outer_score))
print("Explained Variance: " + str(1 - outer_score / baseline_score))
print("Correlation: " + str(np.corrcoef(y, predicted)[0,1]))

# observed vs. predicted, coloured by age
coolwarm = sns.color_palette("coolwarm", as_cmap=True)
fig_age, ax_age = plt.subplots(figsize=(5, 2))
sns.regplot(x=y, y=predicted, scatter=False, color='gray', ax=ax_age)
sns.scatterplot(x=y, y=predicted, hue=df.age, palette=coolwarm, alpha=0.4, ax=ax_age)
fig_age.savefig('../data_out/fig/hcp_age_reg_regplot.pdf')

#### Confound test (feature regression)

In [None]:
# Partial confounder test on target, predictions and age; the result is handed to plot_graph
age_reg_partial = test_partially_confounded(y, predicted, df['age'], random_state=42)
plot_graph(age_reg_partial, outfile_base='../data_out/fig/hcp_age_reg_partial')

In [None]:
# Full confounder test on target, predictions and age; the result is handed to plot_graph
age_reg_full = test_fully_confounded(y, predicted, df['age'], random_state=42)
plot_graph(age_reg_full, outfile_base='../data_out/fig/hcp_age_reg_full')

## Regress out acquisition batch

In [None]:
# regress-out acquisition from connectivity
# All feature columns share the same design matrix (intercept + one dummy per
# acquisition batch except the first), so a single least-squares solve replaces
# one formula-based OLS fit per column; the residuals are the same up to
# floating-point rounding.
# NOTE(review): the regression is fitted on all subjects at once, i.e. before
# the cross-validation split below.
if df.Acquisition.isna().any():
    # a missing batch label cannot be encoded; fail loudly instead of silently
    # assigning those subjects to the reference batch
    raise ValueError("Acquisition contains missing values; cannot regress it out")

acq_dummies = pd.get_dummies(df.Acquisition, drop_first=True).to_numpy(dtype=float)
acq_design = np.column_stack([np.ones(len(df)), acq_dummies])
acq_coef, *_ = np.linalg.lstsq(acq_design, X, rcond=None)
X_adj = X - acq_design @ acq_coef

In [None]:
# nested cv on the adjusted features: the grid search (inner loop) tunes the
# hyperparameters, the outer loop scores the tuned model on held-out folds
outer_cv = KFold(10)
inner_cv = KFold(10)
clf = GridSearchCV(estimator=model, param_grid=p_grid, cv=inner_cv,
                   scoring="neg_mean_squared_error", verbose=True, return_train_score=False,
                   n_jobs=-1)

n_outer_folds = outer_cv.get_n_splits(X_adj)
all_models, best_params = [], []
predicted = np.zeros(len(y))                   # predictions collected from the outer test folds
nested_scores_train = np.zeros(n_outer_folds)  # best inner-cv score per outer fold
nested_scores_test = np.zeros(n_outer_folds)   # score on the held-out outer fold

print("model\tinner_cv mean score\touter vc score")
for i, (train, test) in enumerate(outer_cv.split(X_adj, y)):
    clf.fit(X_adj[train], y[train])
    test_score = clf.score(X_adj[test], y[test])

    print('cv:', i, " ".join(str(value) for value in (clf.best_params_, clf.best_score_, test_score)))

    all_models.append(clf.best_estimator_)
    best_params.append(clf.best_params_)
    predicted[test] = clf.predict(X_adj[test])
    nested_scores_train[i] = clf.best_score_
    nested_scores_test[i] = test_score

## Results (acquisition batch regressed out)

In [None]:
# Baseline: score (negative MSE) of a model that predicts the mean of y for everyone
baseline_score = -mean_squared_error(np.repeat(y.mean(), len(y)), y)
outer_score = nested_scores_test.mean()

print("*** Score on mean as model:\t" + str(baseline_score))
print("** Mean score in the inner crossvaludation (inner_cv):\t" + str(nested_scores_train.mean()))
print("** Mean Nested Crossvalidation Score (outer_cv):\t" + str(outer_score))
print("Explained Variance: " + str(1 - outer_score / baseline_score))
print("Correlation: " + str(np.corrcoef(y, predicted)[0,1]))

# observed vs. predicted, coloured by acquisition batch code
coolwarm = sns.color_palette("coolwarm", as_cmap=True)
acquisition_hue = df.Acquisition.astype("category").cat.codes.values
fig_acq, ax_acq = plt.subplots(figsize=(5, 2))
sns.regplot(x=y, y=predicted, scatter=False, color='gray', ax=ax_acq)
sns.scatterplot(x=y, y=predicted, hue=acquisition_hue, palette=coolwarm, alpha=0.4, ax=ax_acq)
fig_acq.savefig('../data_out/fig/hcp_acq_reg_regplot.pdf')

In [None]:
# Partial confounder test with the acquisition batch as a categorical confounder (integer codes)
acquisition_codes = pd.Categorical(df['Acquisition'].values).codes
acq_reg_partial = test_partially_confounded(y, predicted, acquisition_codes, cat_c=True,
                                            random_state=42)
plot_graph(acq_reg_partial, outfile_base='../data_out/fig/hcp_acq_reg_partial')

In [None]:
# Full confounder test with the acquisition batch as a categorical confounder (integer codes)
acquisition_codes = pd.Categorical(df['Acquisition'].values).codes
acq_reg_full = test_fully_confounded(y, predicted, acquisition_codes, cat_c=True,
                                     random_state=42)
plot_graph(acq_reg_full, outfile_base='../data_out/fig/hcp_acq_reg_full')

# COMBAT age group

In [None]:
# nested cv with ComBat harmonisation (batch = age group) fitted inside each outer fold
outer_cv = KFold(10)
inner_cv = KFold(10)
clf = GridSearchCV(estimator=model, param_grid=p_grid, cv=inner_cv,
                   scoring="neg_mean_squared_error", verbose=True, return_train_score=False,
                   n_jobs=-1)

# Fold-independent inputs, computed once instead of twice per fold:
# feature columns whose sum is non-zero, and the age-group codes as an (n, 1) column.
X_nonzero = X[:, np.sum(X, 0) != 0]
age_batch = df.age.astype("category").cat.codes.values.reshape(-1, 1)

all_models = []
best_params = []
predicted = np.zeros(len(y))                   # predictions collected from the outer test folds
# Sized from X (the data actually split below), not from the X_adj left over
# from the regression cells.
n_outer_folds = outer_cv.get_n_splits(X)
nested_scores_train = np.zeros(n_outer_folds)  # best inner-cv score per outer fold
nested_scores_test = np.zeros(n_outer_folds)   # score on the held-out outer fold

print("model\tinner_cv mean score\touter vc score")
for i, (train, test) in enumerate(outer_cv.split(X, y)):

    # ComBat is fitted on the training fold only and then applied to the test fold
    comb = CombatModel()
    X_train_combat = comb.fit_transform(X_nonzero[train], age_batch[train])

    clf.fit(X_train_combat, y[train])

    X_test_combat = comb.transform(X_nonzero[test], age_batch[test])
    test_score = clf.score(X_test_combat, y[test])  # scored once, reused below

    print('cv:', i, str(clf.best_params_) + " " + str(clf.best_score_) + " " + str(test_score))

    all_models.append(clf.best_estimator_)
    best_params.append(clf.best_params_)

    predicted[test] = clf.predict(X_test_combat)

    nested_scores_train[i] = clf.best_score_
    nested_scores_test[i] = test_score

## Results (combat age)

In [None]:
# Baseline: score (negative MSE) of a model that predicts the mean of y for everyone
baseline_score = -mean_squared_error(np.repeat(y.mean(), len(y)), y)
outer_score = nested_scores_test.mean()

print("*** Score on mean as model:\t" + str(baseline_score))
print("** Mean score in the inner crossvaludation (inner_cv):\t" + str(nested_scores_train.mean()))
print("** Mean Nested Crossvalidation Score (outer_cv):\t" + str(outer_score))
print("Explained Variance: " + str(1 - outer_score / baseline_score))
print("Correlation: " + str(np.corrcoef(y, predicted)[0,1]))

# observed vs. predicted, coloured by age
coolwarm = sns.color_palette("coolwarm", as_cmap=True)
fig_age, ax_age = plt.subplots(figsize=(5, 2))
sns.regplot(x=y, y=predicted, scatter=False, color='gray', ax=ax_age)
sns.scatterplot(x=y, y=predicted, hue=df.age, palette=coolwarm, alpha=0.4, ax=ax_age)
fig_age.savefig('../data_out/fig/hcp_age_comb_regplot.pdf')

In [None]:
# Partial confounder test on target, predictions and age; the result is handed to plot_graph
age_comb_partial = test_partially_confounded(y, predicted, df['age'], random_state=42)
plot_graph(age_comb_partial, outfile_base='../data_out/fig/hcp_age_comb_partial')

In [None]:
# Full confounder test on target, predictions and age; the result is handed to plot_graph
age_comb_full = test_fully_confounded(y, predicted, df['age'], random_state=42)
plot_graph(age_comb_full, outfile_base='../data_out/fig/hcp_age_comb_full')

# COMBAT acquisition batch

In [None]:
# nested cv with ComBat harmonisation (batch = acquisition) fitted inside each outer fold
outer_cv = KFold(10)
inner_cv = KFold(10)
clf = GridSearchCV(estimator=model, param_grid=p_grid, cv=inner_cv,
                   scoring="neg_mean_squared_error", verbose=True, return_train_score=False,
                   n_jobs=-1)

# Fold-independent inputs, computed once instead of twice per fold:
# feature columns whose sum is non-zero, and the acquisition codes as an (n, 1) column.
X_nonzero = X[:, np.sum(X, 0) != 0]
acquisition_batch = df.Acquisition.astype("category").cat.codes.values.reshape(-1, 1)

all_models = []
best_params = []
predicted = np.zeros(len(y))                   # predictions collected from the outer test folds
# Sized from X (the data actually split below), not from the X_adj left over
# from the regression cells.
n_outer_folds = outer_cv.get_n_splits(X)
nested_scores_train = np.zeros(n_outer_folds)  # best inner-cv score per outer fold
nested_scores_test = np.zeros(n_outer_folds)   # score on the held-out outer fold

print("model\tinner_cv mean score\touter vc score")
for i, (train, test) in enumerate(outer_cv.split(X, y)):

    # ComBat is fitted on the training fold only and then applied to the test fold
    comb = CombatModel()
    X_train_combat = comb.fit_transform(X_nonzero[train], acquisition_batch[train])

    clf.fit(X_train_combat, y[train])

    X_test_combat = comb.transform(X_nonzero[test], acquisition_batch[test])
    test_score = clf.score(X_test_combat, y[test])  # scored once, reused below

    print('cv:', i, str(clf.best_params_) + " " + str(clf.best_score_) + " " + str(test_score))

    all_models.append(clf.best_estimator_)
    best_params.append(clf.best_params_)

    predicted[test] = clf.predict(X_test_combat)

    nested_scores_train[i] = clf.best_score_
    nested_scores_test[i] = test_score

## Results (combat acquisition batch)

In [None]:
# Baseline: score (negative MSE) of a model that predicts the mean of y for everyone
baseline_score = -mean_squared_error(np.repeat(y.mean(), len(y)), y)
outer_score = nested_scores_test.mean()

print("*** Score on mean as model:\t" + str(baseline_score))
print("** Mean score in the inner crossvaludation (inner_cv):\t" + str(nested_scores_train.mean()))
print("** Mean Nested Crossvalidation Score (outer_cv):\t" + str(outer_score))
print("Explained Variance: " + str(1 - outer_score / baseline_score))
print("Correlation: " + str(np.corrcoef(y, predicted)[0,1]))

# observed vs. predicted, coloured by acquisition batch code
coolwarm = sns.color_palette("coolwarm", as_cmap=True)
acquisition_hue = df.Acquisition.astype("category").cat.codes.values
fig_acq, ax_acq = plt.subplots(figsize=(5, 2))
sns.regplot(x=y, y=predicted, scatter=False, color='gray', ax=ax_acq)
sns.scatterplot(x=y, y=predicted, hue=acquisition_hue, palette=coolwarm, alpha=0.4, ax=ax_acq)
fig_acq.savefig('../data_out/fig/hcp_acq_comb_regplot.pdf')

In [None]:
# Partial confounder test with the acquisition batch as a categorical confounder (integer codes)
acquisition_codes = pd.Categorical(df['Acquisition'].values).codes
acq_comb_partial = test_partially_confounded(y, predicted, acquisition_codes, cat_c=True,
                                             random_state=42)
plot_graph(acq_comb_partial, outfile_base='../data_out/fig/hcp_acq_comb_partial')

In [None]:
# Full confounder test with the acquisition batch as a categorical confounder (integer codes)
acquisition_codes = pd.Categorical(df['Acquisition'].values).codes
acq_comb_full = test_fully_confounded(y, predicted, acquisition_codes, cat_c=True,
                                      random_state=42)
plot_graph(acq_comb_full, outfile_base='../data_out/fig/hcp_acq_comb_full')

In [None]:
def squared_correlation(a, b):
    """Squared Pearson correlation between two equally long samples."""
    return np.corrcoef(a, b)[1][0]**2


# Permutation p-value for the squared correlation between the target and age.
# NOTE(review): assumes mlxtend's permutation_test resamples in a way that is
# valid for an association statistic between two different variables —
# confirm against the mlxtend documentation (pooled vs. paired resampling).
permutation_test(y, df.age,
                 func=squared_correlation,
                 method='approximate',
                 num_rounds=10000,
                 seed=42)

In [None]:
def workhorse(x, y):
    """Return the R-squared of an OLS fit of ``y`` on ``x`` treated as a categorical factor.

    The two inputs are placed in a two-column frame and the formula
    ``y ~ C(x)`` is fitted with the statsmodels formula interface.
    """
    frame = pd.DataFrame({'x': x, 'y': y})
    return ols_f('y ~ C(x)', data=frame).fit().rsquared

# Permutation p-value for the R-squared of the target explained by acquisition batch.
# NOTE(review): assumes mlxtend's permutation_test resamples in a way that is
# valid for an association statistic between two different variables —
# confirm against the mlxtend documentation (pooled vs. paired resampling).
acquisition_codes = pd.Categorical(df['Acquisition'].values).codes
permutation_test(acquisition_codes, y,
                 func=workhorse,
                 method='approximate',
                 num_rounds=10000,
                 seed=42)

In [None]:
# NOTE(review): bare literal with no computation; the cell only displays this
# value. Presumably a hand-noted result — confirm its origin or remove the cell.
0.001