# Analyzing replicability of functional connectivity-based multivariate BWAS on the Human Connectome Project dataset

## Imports

In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import Ridge
from sklearn.svm import SVR
from sklearn.model_selection import KFold, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from joblib import Parallel, delayed
from mlxtend.evaluate import permutation_test
sns.set(rc={"figure.figsize":(4, 2)})
sns.set_style("whitegrid")

## Load HCP data

We load functional network matrices (netmats) from the HCP1200-release, as published on connectomeDB: https://db.humanconnectome.org/
Due to licensoing issues, data is not supplied with the repository, but can be downloaded from the ConnectomeDB.
See [hcp_data/readme.md](hcp_data/readme.md) for more details.

In [3]:
# HCP data can be obtained from the connectomeDB
# data is not part of this repository
subjectIDs = pd.read_csv('hcp_data/subjectIDs.txt', header=None)

netmats_pearson = pd.read_csv('hcp_data/netmats1_correlationZ.txt',
                             sep=' ',
                             header=None)
netmats_pearson['ID'] = subjectIDs[0]
netmats_pearson.set_index('ID', drop=True, inplace=True)


netmats_parcor = pd.read_csv('hcp_data/netmats2_partial-correlation.txt',
                             sep=' ',
                             header=None)
netmats_parcor['ID'] = subjectIDs[0]
netmats_parcor.set_index('ID', drop=True, inplace=True)

behavior = pd.read_csv('hcp_data/hcp1200_behavioral_data.csv')
behavior = behavior.set_index('Subject', drop=True)

# convert age to numeric
age = []
for s in behavior['Age']:
    if s == '36+':
        age.append(36)
    else:
        split = s.split(sep='-')
        age.append(np.mean((float(split[0]), float(split[1]))))

behavior['age'] = age
behavior

array(['Release', 'Acquisition', 'Gender', 'Age', '3T_Full_MR_Compl',
       'T1_Count', 'T2_Count', '3T_RS-fMRI_Count', '3T_RS-fMRI_PctCompl',
       '3T_Full_Task_fMRI', '3T_tMRI_PctCompl', 'fMRI_WM_PctCompl',
       'fMRI_Gamb_PctCompl', 'fMRI_Mot_PctCompl', 'fMRI_Lang_PctCompl',
       'fMRI_Soc_PctCompl', 'fMRI_Rel_PctCompl', 'fMRI_Emo_PctCompl',
       '3T_dMRI_Compl', '3T_dMRI_PctCompl', 'dMRI_3T_ReconVrs',
       'fMRI_3T_ReconVrs', '7T_Full_MR_Compl', '7T_RS-fMRI_Count',
       '7T_RS-fMRI_PctCompl', '7T_Full_Task_fMRI', '7T_tMRI_PctCompl',
       'fMRI_Movie_Compl', 'fMRI_Movie_PctCompl', 'fMRI_Ret_Compl',
       'fMRI_Ret_PctCompl', '7T_dMRI_Compl', '7T_dMRI_PctCompl',
       '7T_fMRI_Mov_Vrs', 'MEG_AnyData', 'MEG_FullProt_Compl',
       'MEG_HeadModel_Avail', 'MEG_CortRibn_Avail', 'MEG_Anatomy_Avail',
       'MEG_Anatomy_Compl', 'MEG_Noise_Avail', 'MEG_Noise_Compl',
       'MEG_RS_Avail', 'MEG_RS_Compl', 'MEG_WM_Avail', 'MEG_WM_Compl',
       'MEG_StoryMath_Avail', 'MEG_Sto

# Function to prepare target variable


In [4]:
def create_data(target='CogTotalComp_AgeAdj', feature_data=netmats_parcor):
    # it's a good practice to use pandas for merging, messing up subject order can be painful
    features = feature_data.columns
    df = behavior
    df = df.merge(feature_data, left_index=True, right_index=True, how='left')

    df = df.dropna(subset = [target] + features.values.tolist())
    y = df[target].values
    X = df[features].values
    return X, y

# Function implementing a single bootstrap iteration

We define a workhorse function which:
- randomly samples the discovery and the replication datasets,
- creates cross-validated estimates of predictive performance within the discovery sample
- finalizes the model by fitting it to the whole discovery sample (overfits the discovery but not the replication sample)
- use it to predict the replication sample

In [10]:
def bootstrap_workhorse(X, y, sample_size, model, random_state, shuffle_y=False):

    #create discovery and replication samples by random sampling from the whole dataset (without replacement)

    # if shuffle_y is true, a null model is created bz permuting y
    if shuffle_y:
        rng = np.random.default_rng(random_state)
        y = rng.permutation(y)

    # sample the discovery and replication sets *without replacement* (with replacement introduces spurious dependencies)
    X_discovery, X_replication, y_discovery, y_replication = train_test_split(X, y, train_size=sample_size, test_size=sample_size, shuffle=True, random_state=random_state)

    # standard 10-fold cross-validation
    cv = KFold(10)

    # below we obtain cross-validated predictions in the discovery sample
    predicted_discovery_cv = np.zeros_like(y_discovery)  # here we collect the predictions for each fold
    cor_per_fold = np.zeros(cv.n_splits)  # here we collect the predictive performance in each fold
    i = 0  # just a counter
    for train, test in cv.split(X=X_discovery, y=y_discovery):  # loop to leave one fold out
        model.fit(X=X_discovery[train], y=y_discovery[train]) # fit model to the training set
        predicted_discovery_cv[test] = model.predict(X=X_discovery[test]) # use fitted model to predict teh test set
        cor_per_fold[i] = np.corrcoef(y_discovery[test], predicted_discovery_cv[test])[0,1] # calculate performance on tne test set
        i += 1
    # calculate mean test performance across all folds
    r_disc_cv = np.mean(cor_per_fold)
    # 'finalize' model by training it on the full discovery sample (without cross-validation)
    final_model = model.fit(X=X_discovery, y=y_discovery)
    # obtain predictions with the final model on the discovery sample, note that this model actually overfits this sample.
    # we do this only to demonstrate biased estimates
    predicted_discovery_overfit = final_model.predict(X=X_discovery)
    # here we obtain the biased effect size (r) estimates for demonstrational purposes
    r_disc_overfit = np.corrcoef(predicted_discovery_overfit, y_discovery)[0, 1]

    # We use the final model to predict the replication sample
    # This is correct (no overfitting here), the final model did not see this data during training
    predicted_replication = final_model.predict(X=X_replication)
    # we obtain the out-of-sample prediction performance estimates
    r_rep = np.corrcoef(predicted_replication, y_replication)[0, 1]

    # below we calculate permutation-based p-values for all three effect size estimates (in-sample unbiased, in-sample biased, out-of-sample)
    # (one sided tests, testing for positive correlation)
    p_disc_cv = permutation_test(predicted_discovery_cv, y_discovery, method='approximate', num_rounds=1000, func=lambda x, y: np.corrcoef(x, y)[1][0],seed=random_state)
    p_disc_overfit = permutation_test(predicted_discovery_overfit, y_discovery, method='approximate', num_rounds=1000, func=lambda x, y: np.corrcoef(x, y)[1][0],seed=random_state)
    p_rep = permutation_test(predicted_replication, y_replication, method='approximate', num_rounds=1000, func=lambda x, y: np.corrcoef(x, y)[1][0],seed=random_state)
    # return results
    return r_disc_cv, r_disc_overfit, r_rep, p_disc_cv, p_disc_overfit, p_rep

All set, now we start the analysis.

# Replicability with sample sizes n=50, 100, 200, 300 and max
Here we train a few different models on 100 bootstrap samples.

We aggregate the results of our workhorse function in `n_bootstrap`=100 bootstrap cases (run in parallel).

The whole process is repeated for all sample sizes, fetaure_sets and target variables.

## Here we test age and 5 cognitive variables, including 'cognitive ability' (the main target variable in the target paper)
- age: age group of the participants
- CogTotalComp_AgeAdj: total cognitive ability
- PMAT24_A_CR, : Fluid Intelligence (Penn Progressive Matrices)
- CardSort_AgeAdj: Executive Function/Cognitive Flexibility (Dimensional Change Card Sort)
- Flanker_AgeAdj: Executive Function/Inhibition (Flanker Task)
- PicSeq_AgeAdj: Episodic Memory (Picture Sequence Memory)

# Reproducing the PCA+SVR-based model from the target paper
### Like in the target paper:
- Both PCA and SVR are done inside the cross-validation
- PCA reatains the firts k principal components that together explain 50% of the variance
- scikit-learn makes sure that PCA is only fit for the training samples
- both for the test sets (in the cross-validation) and the replication sample PCA is not re-fit, bt features are simply transformed with the already fit PCA

In [11]:
%%time

random_state = 42
n_bootstrap = 100

features = {
    'netmats_parcor': netmats_parcor,
    'netmats_pearson': netmats_pearson
}

models = {
    'PCA_SVR': Pipeline([('pca', PCA(n_components=0.5)),
                         ('svr', SVR())])

}

# We aggregate all results here:
df = pd.DataFrame(columns=['connectivity','model','target','n','r_discovery_cv','r_discovery_overfit','r_replication','p_discovery_cv','p_discovery_overfit','p_replication'])

for feature_set in features:
    for model in models:
        for target_var in ['age', 'CogTotalComp_AgeAdj', 'PMAT24_A_CR', 'Flanker_AgeAdj', 'CardSort_AgeAdj', 'PicSeq_AgeAdj']:
            for sample_size in [50, 100, 200, 300, 'max']:

                print('*****************************************************************')
                print(feature_set, model, target_var, sample_size)

                X, y = create_data(target=target_var, feature_data=features[feature_set])

                if sample_size=='max':
                    sample_size = int(len(y)/2)

                # create random seeds for each bootstrap iteration for reproducibility
                rng = np.random.default_rng(random_state)
                random_sates = rng.integers(np.iinfo(np.int32).max, size=n_bootstrap)

                # run bootstrap iterations in parallel
                r_discovery_cv, r_discovery_overfit, r_replication, p_discovery_cv, p_discovery_overfit, p_replication = zip(
                    *Parallel(n_jobs=-1)(
                    delayed(bootstrap_workhorse)(X, y, sample_size, models[model], seed) for seed in random_sates))

                tmp_data_frame = pd.DataFrame({
                    'connectivity' : feature_set,
                    'model' : model,
                    'target' : target_var,
                    'n' : sample_size,
                    'r_discovery_cv': r_discovery_cv,
                    'r_discovery_overfit': r_discovery_overfit,
                    'r_replication': r_replication,
                    'p_discovery_cv': p_discovery_cv,
                    'p_discovery_overfit': p_discovery_overfit,
                    'p_replication': p_replication
                })
                #sns.scatterplot(x='r_replication', y='r_discovery_cv', data=tmp_data_frame)
                #plt.ylabel('in-sample (r)')
                #plt.xlabel('out-of-sample (r_pred)')
                #plt.show()
                print(tmp_data_frame.r_discovery_cv.mean(), tmp_data_frame.r_replication.mean())

                for alpha in [0.05, 0.01, 0.005, 0.001]:
                    print('Replicability at alpha =', alpha, ':',
                          (tmp_data_frame.loc[tmp_data_frame['p_discovery_cv']<alpha,'p_replication']<alpha).sum() / (tmp_data_frame['p_discovery_cv']<0.05).sum() * 100, '%')

                df = pd.concat((df, tmp_data_frame))
                df.reset_index(drop=True, inplace=True)
                df.to_csv('res/results_PCA_SVR.csv')

df

*****************************************************************
netmats_parcor PCA_SVR age 50
0.1845122123289261 0.18901378266057672
Replicability at alpha = 0.05 : 57.14285714285714 %
Replicability at alpha = 0.01 : 14.285714285714285 %
Replicability at alpha = 0.005 : 0.0 %
Replicability at alpha = 0.001 : 0.0 %
*****************************************************************
netmats_parcor PCA_SVR age 100


  values = values.astype(str)


KeyboardInterrupt: 

# Now we fit a simple Ridge regression
(no feature selection, no hyperparameter optimization)
This can be expected to perform better on low samples than SVR.

In [7]:
%%time

random_state = 42
n_bootstrap = 100

features = {
    'netmats_parcor': netmats_parcor,
    'netmats_pearson': netmats_pearson
}

models = {
    'ridge': Ridge()
}

# We aggregate all results here:
df = pd.DataFrame(columns=['connectivity','model','target','n','r_discovery_cv','r_discovery_overfit','r_replication','p_discovery_cv','p_discovery_overfit','p_replication'])

for feature_set in features:
    for model in models:
        for target_var in ['age', 'CogTotalComp_AgeAdj', 'PMAT24_A_CR', 'Flanker_AgeAdj', 'CardSort_AgeAdj', 'PicSeq_AgeAdj']:
            for sample_size in [50, 100, 200, 300, 'max']:

                print('*****************************************************************')
                print(feature_set, model, target_var, sample_size)

                X, y = create_data(target=target_var, feature_data=features[feature_set])

                if sample_size=='max':
                    sample_size = int(len(y)/2)

                # create random seeds for each bootstrap iteration for reproducibility
                rng = np.random.default_rng(random_state)
                random_sates = rng.integers(np.iinfo(np.int32).max, size=n_bootstrap)

                # run bootstrap iterations in parallel
                r_discovery_cv, r_discovery_overfit, r_replication, p_discovery_cv, p_discovery_overfit, p_replication = zip(
                    *Parallel(n_jobs=-1)(
                    delayed(bootstrap_workhorse)(X, y, sample_size, models[model], seed) for seed in random_sates))

                tmp_data_frame = pd.DataFrame({
                    'connectivity' : feature_set,
                    'model' : model,
                    'target' : target_var,
                    'n' : sample_size,
                    'r_discovery_cv': r_discovery_cv,
                    'r_discovery_overfit': r_discovery_overfit,
                    'r_replication': r_replication,
                    'p_discovery_cv': p_discovery_cv,
                    'p_discovery_overfit': p_discovery_overfit,
                    'p_replication': p_replication
                })
                #sns.scatterplot(x='r_replication', y='r_discovery_cv', data=tmp_data_frame)
                #plt.ylabel('in-sample (r)')
                #plt.xlabel('out-of-sample (r_pred)')
                #plt.show()
                print(tmp_data_frame.r_discovery_cv.mean(), tmp_data_frame.r_replication.mean())

                for alpha in [0.05, 0.01, 0.005, 0.001]:
                    print('Replicability at alpha =', alpha, ':',
                          (tmp_data_frame.loc[tmp_data_frame['p_discovery_cv']<alpha,'p_replication']<alpha).sum() / (tmp_data_frame['p_discovery_cv']<0.05).sum() * 100, '%')

                df = pd.concat((df, tmp_data_frame))
                df.reset_index(drop=True, inplace=True)
                df.to_csv('res/results_Ridge.csv')

df


*****************************************************************
netmats_parcor ridge age 50
0.24233370132686197 0.2609198136325508
Replicability at alpha = 0.05 : 58.536585365853654 %
Replicability at alpha = 0.01 : 14.634146341463413 %
Replicability at alpha = 0.005 : 12.195121951219512 %
Replicability at alpha = 0.001 : 7.317073170731707 %
*****************************************************************
netmats_parcor ridge age 100
0.3323209164524509 0.34287580012326385
Replicability at alpha = 0.05 : 97.75280898876404 %
Replicability at alpha = 0.01 : 71.91011235955057 %
Replicability at alpha = 0.005 : 58.42696629213483 %
Replicability at alpha = 0.001 : 38.20224719101123 %
*****************************************************************
netmats_parcor ridge age 200
0.39528891792691084 0.4213171713707975
Replicability at alpha = 0.05 : 100.0 %
Replicability at alpha = 0.01 : 100.0 %
Replicability at alpha = 0.005 : 99.0 %
Replicability at alpha = 0.001 : 97.0 %
****************

Unnamed: 0,connectivity,model,target,n,r_discovery_cv,r_discovery_overfit,r_replication,p_discovery_cv,p_discovery_overfit,p_replication
0,netmats_parcor,ridge,age,50,,1.0,0.404661,0.629371,0.000999,0.002997
1,netmats_parcor,ridge,age,50,0.082894,1.0,0.281041,0.210789,0.000999,0.026973
2,netmats_parcor,ridge,age,50,,1.0,0.107181,0.227772,0.000999,0.216783
3,netmats_parcor,ridge,age,50,0.356733,1.0,-0.203981,0.047952,0.000999,0.909091
4,netmats_parcor,ridge,age,50,0.455145,1.0,0.385665,0.001998,0.000999,0.004995
...,...,...,...,...,...,...,...,...,...,...
5995,netmats_pearson,ridge,PicSeq_AgeAdj,501,0.197442,1.0,0.217841,0.000999,0.000999,0.000999
5996,netmats_pearson,ridge,PicSeq_AgeAdj,501,0.243984,1.0,0.20198,0.000999,0.000999,0.000999
5997,netmats_pearson,ridge,PicSeq_AgeAdj,501,0.263024,1.0,0.203847,0.000999,0.000999,0.000999
5998,netmats_pearson,ridge,PicSeq_AgeAdj,501,0.167095,1.0,0.195714,0.001998,0.000999,0.000999


# Null scenario with random target
To evaluate false positives with biased estimates

In [14]:
%%time

random_state = 42
n_bootstrap = 100

features = {
    'netmats_parcor': netmats_parcor,
    'netmats_pearson': netmats_pearson
}

models = {
    'PCA_SVR': Pipeline([('pca', PCA(n_components=0.5)),
                         ('svr', SVR())])

}

# We aggregate all results here:
df = pd.DataFrame(columns=['connectivity','model','target','n','r_discovery_cv','r_discovery_overfit','r_replication','p_discovery_cv','p_discovery_overfit','p_replication'])

for feature_set in features:
    for model in models:
        for target_var in ['age', 'CogTotalComp_AgeAdj', 'PMAT24_A_CR', 'Flanker_AgeAdj', 'CardSort_AgeAdj', 'PicSeq_AgeAdj']:
            for sample_size in [50, 100, 200, 300, 'max']:

                print('*****************************************************************')
                print(feature_set, model, target_var, sample_size)

                X, y = create_data(target=target_var, feature_data=features[feature_set]) # gives a random y when target is None

                if sample_size=='max':
                    sample_size = int(len(y)/2)

                # create random seeds for each bootstrap iteration for reproducibility
                rng = np.random.default_rng(random_state)
                random_sates = rng.integers(np.iinfo(np.int32).max, size=n_bootstrap)

                # run bootstrap iterations in parallel, with shuffle_y=True
                r_discovery_cv, r_discovery_overfit, r_replication, p_discovery_cv, p_discovery_overfit, p_replication = zip(
                    *Parallel(n_jobs=-1)(
                    delayed(bootstrap_workhorse)(X, y, sample_size, models[model], seed, shuffle_y=True) for seed in random_sates))

                tmp_data_frame = pd.DataFrame({
                    'connectivity' : feature_set,
                    'model' : model,
                    'target' : target_var,
                    'n' : sample_size,
                    'r_discovery_cv': r_discovery_cv,
                    'r_discovery_overfit': r_discovery_overfit,
                    'r_replication': r_replication,
                    'p_discovery_cv': p_discovery_cv,
                    'p_discovery_overfit': p_discovery_overfit,
                    'p_replication': p_replication
                })

                #sns.scatterplot(x='r_replication', y='r_discovery_cv', data=tmp_data_frame)
                #plt.ylabel('in-sample (r)')
                #plt.xlabel('out-of-sample (r_pred)')
                #plt.show()
                print(tmp_data_frame.r_discovery_cv.mean(), tmp_data_frame.r_replication.mean())

                for alpha in [0.05, 0.01, 0.005, 0.001]:
                    print('Replicability at alpha =', alpha, ':',
                          (tmp_data_frame.loc[tmp_data_frame['p_discovery_cv']<alpha,'p_replication']<alpha).sum() / (tmp_data_frame['p_discovery_cv']<0.05).sum() * 100, '%')

                df = pd.concat((df, tmp_data_frame))
                df.reset_index(drop=True, inplace=True)
                df.to_csv('res/results_null_PCA_SVR.csv')

df

*****************************************************************
netmats_parcor PCA_SVR age 50
0.015718268671406327 -0.009632305254739223
Replicability at alpha = 0.05 : 0.0 %
Replicability at alpha = 0.01 : 0.0 %
Replicability at alpha = 0.005 : 0.0 %
Replicability at alpha = 0.001 : 0.0 %
*****************************************************************
netmats_parcor PCA_SVR age 100
0.02118342357629429 0.0008314530617750957
Replicability at alpha = 0.05 : 0.0 %
Replicability at alpha = 0.01 : 0.0 %
Replicability at alpha = 0.005 : 0.0 %
Replicability at alpha = 0.001 : 0.0 %
*****************************************************************
netmats_parcor PCA_SVR age 200
0.0020095024919472414 -0.0020659580620204566
Replicability at alpha = 0.05 : 12.5 %
Replicability at alpha = 0.01 : 0.0 %
Replicability at alpha = 0.005 : 0.0 %
Replicability at alpha = 0.001 : 0.0 %
*****************************************************************
netmats_parcor PCA_SVR age 300
0.010896406643841883



0.005333043157679098 -0.012089870875584273
Replicability at alpha = 0.05 : 0.0 %
Replicability at alpha = 0.01 : 0.0 %
Replicability at alpha = 0.005 : 0.0 %
Replicability at alpha = 0.001 : 0.0 %
*****************************************************************
netmats_parcor PCA_SVR CogTotalComp_AgeAdj 200
-0.0017982202571123606 -0.012032126906222016
Replicability at alpha = 0.05 : 0.0 %
Replicability at alpha = 0.01 : 0.0 %
Replicability at alpha = 0.005 : 0.0 %
Replicability at alpha = 0.001 : 0.0 %
*****************************************************************
netmats_parcor PCA_SVR CogTotalComp_AgeAdj 300
0.008405711567388275 -0.009813451575864182
Replicability at alpha = 0.05 : 0.0 %
Replicability at alpha = 0.01 : 0.0 %
Replicability at alpha = 0.005 : 0.0 %
Replicability at alpha = 0.001 : 0.0 %
*****************************************************************
netmats_parcor PCA_SVR CogTotalComp_AgeAdj max
-0.007774668229775042 -0.002075140122022788
Replicability at alpha =



0.007738730008135625 -0.020071926806312933
Replicability at alpha = 0.05 : 0.0 %
Replicability at alpha = 0.01 : 0.0 %
Replicability at alpha = 0.005 : 0.0 %
Replicability at alpha = 0.001 : 0.0 %
*****************************************************************
netmats_parcor PCA_SVR PMAT24_A_CR 100
0.013192802109739422 -0.013371222236656783
Replicability at alpha = 0.05 : 0.0 %
Replicability at alpha = 0.01 : 0.0 %
Replicability at alpha = 0.005 : 0.0 %
Replicability at alpha = 0.001 : 0.0 %
*****************************************************************
netmats_parcor PCA_SVR PMAT24_A_CR 200
0.0010659245161563945 0.00011201143802545263
Replicability at alpha = 0.05 : 0.0 %
Replicability at alpha = 0.01 : 0.0 %
Replicability at alpha = 0.005 : 0.0 %
Replicability at alpha = 0.001 : 0.0 %
*****************************************************************
netmats_parcor PCA_SVR PMAT24_A_CR 300
0.01700325765194698 -0.00932689468166226
Replicability at alpha = 0.05 : 0.0 %
Replicability



-0.003850369013039994 -0.00845621748920561
Replicability at alpha = 0.05 : nan %
Replicability at alpha = 0.01 : nan %
Replicability at alpha = 0.005 : nan %
Replicability at alpha = 0.001 : nan %
*****************************************************************
netmats_parcor PCA_SVR Flanker_AgeAdj 200




-0.005590010846450346 0.006889410105047694
Replicability at alpha = 0.05 : 0.0 %
Replicability at alpha = 0.01 : 0.0 %
Replicability at alpha = 0.005 : 0.0 %
Replicability at alpha = 0.001 : 0.0 %
*****************************************************************
netmats_parcor PCA_SVR Flanker_AgeAdj 300
-0.006700452912333197 0.0020941122803921648
Replicability at alpha = 0.05 : 50.0 %
Replicability at alpha = 0.01 : 0.0 %
Replicability at alpha = 0.005 : 0.0 %
Replicability at alpha = 0.001 : 0.0 %
*****************************************************************
netmats_parcor PCA_SVR Flanker_AgeAdj max
-0.001519727347733083 0.00197224488981287
Replicability at alpha = 0.05 : 0.0 %
Replicability at alpha = 0.01 : 0.0 %
Replicability at alpha = 0.005 : 0.0 %
Replicability at alpha = 0.001 : 0.0 %
*****************************************************************
netmats_parcor PCA_SVR CardSort_AgeAdj 50
0.02868347363897688 0.0027858574600459676
Replicability at alpha = 0.05 : nan %
Repl



0.006823962094598066 0.011153896783920153
Replicability at alpha = 0.05 : 0.0 %
Replicability at alpha = 0.01 : 0.0 %
Replicability at alpha = 0.005 : 0.0 %
Replicability at alpha = 0.001 : 0.0 %
*****************************************************************
netmats_parcor PCA_SVR CardSort_AgeAdj 200
-0.011921075972898392 0.004941067063005526
Replicability at alpha = 0.05 : 33.33333333333333 %
Replicability at alpha = 0.01 : 33.33333333333333 %
Replicability at alpha = 0.005 : 0.0 %
Replicability at alpha = 0.001 : 0.0 %
*****************************************************************
netmats_parcor PCA_SVR CardSort_AgeAdj 300
0.0005607365798034183 -0.0035580932834067647
Replicability at alpha = 0.05 : 0.0 %
Replicability at alpha = 0.01 : 0.0 %
Replicability at alpha = 0.005 : 0.0 %
Replicability at alpha = 0.001 : 0.0 %
*****************************************************************
netmats_parcor PCA_SVR CardSort_AgeAdj max
-0.002692517307069988 0.004388907323529969
Replicabil



0.0068980335642952915 0.0019460173807175662
Replicability at alpha = 0.05 : 0.0 %
Replicability at alpha = 0.01 : 0.0 %
Replicability at alpha = 0.005 : 0.0 %
Replicability at alpha = 0.001 : 0.0 %
*****************************************************************
netmats_parcor PCA_SVR PicSeq_AgeAdj 200
-0.015075795730530766 0.002704084204931377
Replicability at alpha = 0.05 : nan %
Replicability at alpha = 0.01 : nan %
Replicability at alpha = 0.005 : nan %
Replicability at alpha = 0.001 : nan %
*****************************************************************
netmats_parcor PCA_SVR PicSeq_AgeAdj 300




0.01217797031561347 -0.005959227430012724
Replicability at alpha = 0.05 : 0.0 %
Replicability at alpha = 0.01 : 0.0 %
Replicability at alpha = 0.005 : 0.0 %
Replicability at alpha = 0.001 : 0.0 %
*****************************************************************
netmats_parcor PCA_SVR PicSeq_AgeAdj max
-0.009451875556111277 0.002954888192621069
Replicability at alpha = 0.05 : 25.0 %
Replicability at alpha = 0.01 : 0.0 %
Replicability at alpha = 0.005 : 0.0 %
Replicability at alpha = 0.001 : 0.0 %
*****************************************************************
netmats_pearson PCA_SVR age 50
0.024571488786007373 -0.014451749838611997
Replicability at alpha = 0.05 : 0.0 %
Replicability at alpha = 0.01 : 0.0 %
Replicability at alpha = 0.005 : 0.0 %
Replicability at alpha = 0.001 : 0.0 %
*****************************************************************
netmats_pearson PCA_SVR age 100
-0.00044926857529493697 0.030405907184989926
Replicability at alpha = 0.05 : 0.0 %
Replicability at alpha =



0.00762089856315215 0.007601195768630712
Replicability at alpha = 0.05 : nan %
Replicability at alpha = 0.01 : nan %
Replicability at alpha = 0.005 : nan %
Replicability at alpha = 0.001 : nan %
*****************************************************************
netmats_pearson PCA_SVR PMAT24_A_CR 300




0.013826396717998823 0.00028625928905271636
Replicability at alpha = 0.05 : 0.0 %
Replicability at alpha = 0.01 : 0.0 %
Replicability at alpha = 0.005 : 0.0 %
Replicability at alpha = 0.001 : 0.0 %
*****************************************************************
netmats_pearson PCA_SVR PMAT24_A_CR max
0.0012442268418254698 0.0006788357811788294
Replicability at alpha = 0.05 : 0.0 %
Replicability at alpha = 0.01 : 0.0 %
Replicability at alpha = 0.005 : 0.0 %
Replicability at alpha = 0.001 : 0.0 %
*****************************************************************
netmats_pearson PCA_SVR Flanker_AgeAdj 50
-0.006951145029068247 0.01310051805653064
Replicability at alpha = 0.05 : 0.0 %
Replicability at alpha = 0.01 : 0.0 %
Replicability at alpha = 0.005 : 0.0 %
Replicability at alpha = 0.001 : 0.0 %
*****************************************************************
netmats_pearson PCA_SVR Flanker_AgeAdj 100
0.0026547027339089086 -0.011287055848290308
Replicability at alpha = 0.05 : 0.0 %
Rep



-0.0066700364064274445 -0.0012550854807044523
Replicability at alpha = 0.05 : 0.0 %
Replicability at alpha = 0.01 : 0.0 %
Replicability at alpha = 0.005 : 0.0 %
Replicability at alpha = 0.001 : 0.0 %
*****************************************************************
netmats_pearson PCA_SVR CardSort_AgeAdj 200
-0.007011765164321644 -0.001359026986178459
Replicability at alpha = 0.05 : 33.33333333333333 %
Replicability at alpha = 0.01 : 0.0 %
Replicability at alpha = 0.005 : 0.0 %
Replicability at alpha = 0.001 : 0.0 %
*****************************************************************
netmats_pearson PCA_SVR CardSort_AgeAdj 300
0.014291142572627707 -0.00043760039646580204
Replicability at alpha = 0.05 : 0.0 %
Replicability at alpha = 0.01 : 0.0 %
Replicability at alpha = 0.005 : 0.0 %
Replicability at alpha = 0.001 : 0.0 %
*****************************************************************
netmats_pearson PCA_SVR CardSort_AgeAdj max
7.973534426850258e-05 0.0009197645946550262
Replicability a



-0.006270139331591028 0.001866435500016127
Replicability at alpha = 0.05 : 0.0 %
Replicability at alpha = 0.01 : 0.0 %
Replicability at alpha = 0.005 : 0.0 %
Replicability at alpha = 0.001 : 0.0 %
*****************************************************************
netmats_pearson PCA_SVR PicSeq_AgeAdj 200
-0.005344849015655386 0.0013732274285362875
Replicability at alpha = 0.05 : 0.0 %
Replicability at alpha = 0.01 : 0.0 %
Replicability at alpha = 0.005 : 0.0 %
Replicability at alpha = 0.001 : 0.0 %
*****************************************************************
netmats_pearson PCA_SVR PicSeq_AgeAdj 300
0.010393117348453423 0.0005262481184145984
Replicability at alpha = 0.05 : 0.0 %
Replicability at alpha = 0.01 : 0.0 %
Replicability at alpha = 0.005 : 0.0 %
Replicability at alpha = 0.001 : 0.0 %
*****************************************************************
netmats_pearson PCA_SVR PicSeq_AgeAdj max
-0.00039147984484469634 0.0016130406528689545
Replicability at alpha = 0.05 : 0.0 %


Unnamed: 0,connectivity,model,target,n,r_discovery_cv,r_discovery_overfit,r_replication,p_discovery_cv,p_discovery_overfit,p_replication
0,netmats_parcor,PCA_SVR,age,50,-0.010709,0.919133,0.140633,0.577423,0.000999,0.157842
1,netmats_parcor,PCA_SVR,age,50,,0.791907,-0.074325,0.911089,0.000999,0.71029
2,netmats_parcor,PCA_SVR,age,50,-0.354673,0.872867,-0.253604,0.999001,0.000999,0.965035
3,netmats_parcor,PCA_SVR,age,50,-0.076261,0.886898,-0.079072,0.99001,0.000999,0.695305
4,netmats_parcor,PCA_SVR,age,50,-0.139239,0.887541,-0.021112,0.947053,0.000999,0.585415
...,...,...,...,...,...,...,...,...,...,...
5995,netmats_pearson,PCA_SVR,PicSeq_AgeAdj,501,0.017149,0.334397,-0.004167,0.796204,0.000999,0.542458
5996,netmats_pearson,PCA_SVR,PicSeq_AgeAdj,501,-0.090964,0.325431,-0.060089,0.99001,0.000999,0.899101
5997,netmats_pearson,PCA_SVR,PicSeq_AgeAdj,501,0.073699,0.307215,-0.083865,0.23976,0.000999,0.973027
5998,netmats_pearson,PCA_SVR,PicSeq_AgeAdj,501,0.039265,0.311807,-0.081005,0.305694,0.000999,0.962038


In [13]:
%%time

random_state = 42
n_bootstrap = 100

features = {
    'netmats_parcor': netmats_parcor,
    'netmats_pearson': netmats_pearson
}

models = {
    'Ridge': Ridge()

}

# We aggregate all results here:
df = pd.DataFrame(columns=['connectivity','model','target','n','r_discovery_cv','r_discovery_overfit','r_replication','p_discovery_cv','p_discovery_overfit','p_replication'])

for feature_set in features:
    for model in models:
        for target_var in ['age', 'CogTotalComp_AgeAdj', 'PMAT24_A_CR', 'Flanker_AgeAdj', 'CardSort_AgeAdj', 'PicSeq_AgeAdj']:
            for sample_size in [50, 100, 200, 300, 'max']:

                print('*****************************************************************')
                print(feature_set, model, target_var, sample_size)

                X, y = create_data(target=target_var, feature_data=features[feature_set]) # gives a random y when target is None

                if sample_size=='max':
                    sample_size = int(len(y)/2)

                # create random seeds for each bootstrap iteration for reproducibility
                rng = np.random.default_rng(random_state)
                random_sates = rng.integers(np.iinfo(np.int32).max, size=n_bootstrap)

                # run bootstrap iterations in parallel, with shuffle_y=True
                r_discovery_cv, r_discovery_overfit, r_replication, p_discovery_cv, p_discovery_overfit, p_replication = zip(
                    *Parallel(n_jobs=-1)(
                    delayed(bootstrap_workhorse)(X, y, sample_size, models[model], seed, shuffle_y=True) for seed in random_sates))

                tmp_data_frame = pd.DataFrame({
                    'connectivity' : feature_set,
                    'model' : model,
                    'target' : target_var,
                    'n' : sample_size,
                    'r_discovery_cv': r_discovery_cv,
                    'r_discovery_overfit': r_discovery_overfit,
                    'r_replication': r_replication,
                    'p_discovery_cv': p_discovery_cv,
                    'p_discovery_overfit': p_discovery_overfit,
                    'p_replication': p_replication
                })

                #sns.scatterplot(x='r_replication', y='r_discovery_cv', data=tmp_data_frame)
                #plt.ylabel('in-sample (r)')
                #plt.xlabel('out-of-sample (r_pred)')
                #plt.show()
                print(tmp_data_frame.r_discovery_cv.mean(), tmp_data_frame.r_replication.mean())

                for alpha in [0.05, 0.01, 0.005, 0.001]:
                    print('Replicability at alpha =', alpha, ':',
                          (tmp_data_frame.loc[tmp_data_frame['p_discovery_cv']<alpha,'p_replication']<alpha).sum() / (tmp_data_frame['p_discovery_cv']<0.05).sum() * 100, '%')

                df = pd.concat((df, tmp_data_frame))
                df.reset_index(drop=True, inplace=True)
                df.to_csv('res/results_null_Ridge.csv')

df

*****************************************************************
netmats_parcor Ridge age 50
-0.014348756240858624 -0.019865509971863777
Replicability at alpha = 0.05 : 0.0 %
Replicability at alpha = 0.01 : 0.0 %
Replicability at alpha = 0.005 : 0.0 %
Replicability at alpha = 0.001 : 0.0 %
*****************************************************************
netmats_parcor Ridge age 100
0.011020054799004317 0.00818911468614934
Replicability at alpha = 0.05 : 0.0 %
Replicability at alpha = 0.01 : 0.0 %
Replicability at alpha = 0.005 : 0.0 %
Replicability at alpha = 0.001 : 0.0 %
*****************************************************************
netmats_parcor Ridge age 200
0.014167298702132609 -0.0043645513332393575
Replicability at alpha = 0.05 : 11.11111111111111 %
Replicability at alpha = 0.01 : 0.0 %
Replicability at alpha = 0.005 : 0.0 %
Replicability at alpha = 0.001 : 0.0 %
*****************************************************************
netmats_parcor Ridge age 300
-0.0021402405014

Unnamed: 0,connectivity,model,target,n,r_discovery_cv,r_discovery_overfit,r_replication,p_discovery_cv,p_discovery_overfit,p_replication
0,netmats_parcor,Ridge,age,50,-0.051893,1.0,0.033311,0.752248,0.000999,0.400599
1,netmats_parcor,Ridge,age,50,,1.0,-0.09151,0.916084,0.000999,0.755245
2,netmats_parcor,Ridge,age,50,0.00365,1.0,0.171662,0.682318,0.000999,0.121878
3,netmats_parcor,Ridge,age,50,-0.417308,1.0,0.002095,0.999001,0.000999,0.481518
4,netmats_parcor,Ridge,age,50,-0.063412,1.0,-0.016329,0.877123,0.000999,0.574426
...,...,...,...,...,...,...,...,...,...,...
5995,netmats_pearson,Ridge,PicSeq_AgeAdj,501,0.028744,1.0,-0.008928,0.328671,0.000999,0.594406
5996,netmats_pearson,Ridge,PicSeq_AgeAdj,501,-0.02452,1.0,0.004109,0.657343,0.000999,0.462537
5997,netmats_pearson,Ridge,PicSeq_AgeAdj,501,-0.116399,1.0,0.009268,0.991009,0.000999,0.435564
5998,netmats_pearson,Ridge,PicSeq_AgeAdj,501,0.074094,1.0,-0.033243,0.040959,0.000999,0.751249


*See the notebook called 'plot_results.ipynb' for the results.*

In [7]:
model = Pipeline([('pca', PCA(n_components=0.5)), ('svr', SVR())])
random_state = 42
cv = KFold(10, shuffle=True, random_state=random_state)

bar_data_svr = []

for target_var in ['age', 'CogTotalComp_AgeAdj', 'PMAT24_A_CR', 'Flanker_AgeAdj', 'CardSort_AgeAdj', 'PicSeq_AgeAdj']:
    print(target_var)
    X, y = create_data(target=target_var, feature_data=netmats_pearson)

    predicted_discovery_cv = np.zeros_like(y)
    cor_per_fold = np.zeros(cv.n_splits)
    i = 0
    for train, test in cv.split(X=X, y=y):
        model.fit(X=X[train], y=y[train])
        predicted_discovery_cv[test] = model.predict(X=X[test])
        cor_per_fold[i] = np.corrcoef(y[test], predicted_discovery_cv[test])[0,1]
        i += 1
    # correlation between the cross-validated predictions and observations in the discovery sample
    # this is the correct, unbiased estimate!
    # calculated as mean test performance across all folds
    r_disc_cv = np.mean(cor_per_fold)
    # finalize model by training it on the full discovery sample (without cross-validation)
    final_model = model.fit(X=X, y=y)
    # obtain predictions with the final model on the discovery sample, note that this model actually overfits this sample.
    # we do this only to demonstrate biased estimates
    predicted_discovery_overfit = final_model.predict(X=X)
    # here we obtain the biased effect size (r) estimates for demonstrational purposes
    r_disc_overfit = np.corrcoef(predicted_discovery_overfit, y)[0, 1]

    # below we calculate permutation-based p-values for all three effect size estimates (in-sample unbiased, in-sample biased, out-of-sample)
    # (one sided tests, testing for positive correlation)
    p_disc_cv = permutation_test(predicted_discovery_cv, y, method='approximate', num_rounds=1000, func=lambda x, y: np.corrcoef(x, y)[1][0],seed=random_state)
    p_disc_overfit = permutation_test(predicted_discovery_overfit, y, method='approximate', num_rounds=1000, func=lambda x, y: np.corrcoef(x, y)[1][0],seed=random_state)

    bar_data_svr.append(r_disc_cv)

    print('r =', np.round(r_disc_cv, 2), '\tp =', np.round(p_disc_cv, 3), '\tR2 =', np.round(r_disc_cv**2 * 100, 1),  '%')

age
r = 0.21 	p = 0.001 	R2 = 4.2 %
CogTotalComp_AgeAdj
r = 0.2 	p = 0.001 	R2 = 3.9 %
PMAT24_A_CR


KeyboardInterrupt: 

In [32]:
model = Ridge()
random_state = 42
cv = KFold(10, shuffle=True, random_state=random_state)

bar_data_ridge = []

for target_var in ['age', 'CogTotalComp_AgeAdj', 'PMAT24_A_CR', 'Flanker_AgeAdj', 'CardSort_AgeAdj', 'PicSeq_AgeAdj']:
    print(target_var)
    X, y = create_data(target=target_var, feature_data=netmats_parcor)

    predicted_discovery_cv = np.zeros_like(y)
    cor_per_fold = np.zeros(cv.n_splits)
    i = 0
    for train, test in cv.split(X=X, y=y):
        model.fit(X=X[train], y=y[train])
        predicted_discovery_cv[test] = model.predict(X=X[test])
        cor_per_fold[i] = np.corrcoef(y[test], predicted_discovery_cv[test])[0,1]
        i += 1
    # correlation between the cross-validated predictions and observations in the discovery sample
    # this is the correct, unbiased estimate!
    # calculated as mean test performance across all folds
    r_disc_cv = np.mean(cor_per_fold)
    # finalize model by training it on the full discovery sample (without cross-validation)
    final_model = model.fit(X=X, y=y)
    # obtain predictions with the final model on the discovery sample, note that this model actually overfits this sample.
    # we do this only to demonstrate biased estimates
    predicted_discovery_overfit = final_model.predict(X=X)
    # here we obtain the biased effect size (r) estimates for demonstrational purposes
    r_disc_overfit = np.corrcoef(predicted_discovery_overfit, y)[0, 1]

    # below we calculate permutation-based p-values for all three effect size estimates (in-sample unbiased, in-sample biased, out-of-sample)
    # (one sided tests, testing for positive correlation)
    p_disc_cv = permutation_test(predicted_discovery_cv, y, method='approximate', num_rounds=1000, func=lambda x, y: np.corrcoef(x, y)[1][0],seed=random_state)
    p_disc_overfit = permutation_test(predicted_discovery_overfit, y, method='approximate', num_rounds=1000, func=lambda x, y: np.corrcoef(x, y)[1][0],seed=random_state)

    bar_data_ridge.append(r_disc_cv)

    print('r =', np.round(r_disc_cv, 2), '\tp =', np.round(p_disc_cv, 3), '\tR2 =', np.round(r_disc_cv**2 * 100, 1),  '%')

age
r = 0.52 	p = 0.001 	R2 = 26.7 %
CogTotalComp_AgeAdj
r = 0.5 	p = 0.001 	R2 = 25.0 %
PMAT24_A_CR
r = 0.28 	p = 0.001 	R2 = 8.1 %
Flanker_AgeAdj
r = 0.15 	p = 0.001 	R2 = 2.1 %
CardSort_AgeAdj
r = 0.24 	p = 0.001 	R2 = 5.8 %
PicSeq_AgeAdj
r = 0.17 	p = 0.001 	R2 = 2.8 %


In [41]:
model = Ridge()
random_state = 42
cv = KFold(10, shuffle=True, random_state=random_state)

for target_var in ['age', 'CogTotalComp_AgeAdj', 'PMAT24_A_CR', 'Flanker_AgeAdj', 'CardSort_AgeAdj', 'PicSeq_AgeAdj']:
    print(target_var)
    X, y = create_data(target=target_var, feature_data=netmats_pearson)

    predicted_discovery_cv = np.zeros_like(y)
    cor_per_fold = np.zeros(cv.n_splits)
    i = 0
    for train, test in cv.split(X=X, y=y):
        model.fit(X=X[train], y=y[train])
        predicted_discovery_cv[test] = model.predict(X=X[test])
        cor_per_fold[i] = np.corrcoef(y[test], predicted_discovery_cv[test])[0,1]
        i += 1
    # correlation between the cross-validated predictions and observations in the discovery sample
    # this is the correct, unbiased estimate!
    # calculated as mean test performance across all folds
    r_disc_cv = np.mean(cor_per_fold)
    # finalize model by training it on the full discovery sample (without cross-validation)
    final_model = model.fit(X=X, y=y)
    # obtain predictions with the final model on the discovery sample, note that this model actually overfits this sample.
    # we do this only to demonstrate biased estimates
    predicted_discovery_overfit = final_model.predict(X=X)
    # here we obtain the biased effect size (r) estimates for demonstrational purposes
    r_disc_overfit = np.corrcoef(predicted_discovery_overfit, y)[0, 1]

    # below we calculate permutation-based p-values for all three effect size estimates (in-sample unbiased, in-sample biased, out-of-sample)
    # (one sided tests, testing for positive correlation)
    p_disc_cv = permutation_test(predicted_discovery_cv, y, method='approximate', num_rounds=1000, func=lambda x, y: np.corrcoef(x, y)[1][0],seed=random_state)
    p_disc_overfit = permutation_test(predicted_discovery_overfit, y, method='approximate', num_rounds=1000, func=lambda x, y: np.corrcoef(x, y)[1][0],seed=random_state)

    print('r =', np.round(r_disc_cv, 2), '\tp =', np.round(p_disc_cv, 3), '\tR2 =', np.round(r_disc_cv**2 * 100, 1),  '%')

age
r = 0.45 	p = 0.001 	R2 = 20.0 %
CogTotalComp_AgeAdj
r = 0.4 	p = 0.001 	R2 = 16.2 %
PMAT24_A_CR
r = 0.25 	p = 0.001 	R2 = 6.3 %
Flanker_AgeAdj
r = 0.16 	p = 0.001 	R2 = 2.6 %
CardSort_AgeAdj
r = 0.17 	p = 0.001 	R2 = 2.8 %
PicSeq_AgeAdj
r = 0.23 	p = 0.001 	R2 = 5.5 %


### *See the notebook called 'plot_results_FC.ipynb' for the results.*