
# Analyzing replicability of functional connectivity-based multivariate BWAS on the Human Connectome Project dataset

This notebook contains the main analyses of the Matters Arising, with 6 example phenotypes.
For a comprehensive analysis of 52 HCP phenotypes, see `multivariate_BWAS_replicability_analysis_FC_extensive.ipynb`.
For an analysis with finer sample size resolution, see `multivariate_BWAS_replicability_analysis_FC_hires.ipynb`.
For an analysis with cortical thickness data, see `multivariate_BWAS_replicability_analysis_CT.ipynb`.

## Imports

In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import Ridge
from sklearn.svm import SVR
from sklearn.model_selection import KFold, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from joblib import Parallel, delayed
from mlxtend.evaluate import permutation_test
sns.set(rc={"figure.figsize":(4, 2)})
sns.set_style("whitegrid")

## Load HCP data

We load functional network matrices (netmats) from the HCP1200-release, as published on connectomeDB: https://db.humanconnectome.org/
Due to licensing issues, data is not supplied with the repository, but can be downloaded from the ConnectomeDB or via `get_data.ipynb` (requires credentials).
See [readme.md](readme.md) for more details.

In [14]:
# HCP data can be obtained from the connectomeDB
# data is not part of this repository
subjectIDs = pd.read_csv('hcp_data/subjectIDs.txt', header=None)

netmats_pearson = pd.read_csv('hcp_data/netmats1_correlationZ.txt',
                             sep=' ',
                             header=None)
netmats_pearson['ID'] = subjectIDs[0]
netmats_pearson.set_index('ID', drop=True, inplace=True)


netmats_parcor = pd.read_csv('hcp_data/netmats2_partial-correlation.txt',
                             sep=' ',
                             header=None)
netmats_parcor['ID'] = subjectIDs[0]
netmats_parcor.set_index('ID', drop=True, inplace=True)

behavior = pd.read_csv('hcp_data/hcp1200_behavioral_data.csv')
behavior = behavior.set_index('Subject', drop=True)

# convert age to numeric
age = []
for s in behavior['Age']:
    if s == '36+':
        age.append(36)
    else:
        split = s.split(sep='-')
        age.append(np.mean((float(split[0]), float(split[1]))))

behavior['age'] = age
behavior.describe()

Unnamed: 0,T1_Count,T2_Count,3T_RS-fMRI_Count,3T_RS-fMRI_PctCompl,3T_tMRI_PctCompl,fMRI_WM_PctCompl,fMRI_Gamb_PctCompl,fMRI_Mot_PctCompl,fMRI_Lang_PctCompl,fMRI_Soc_PctCompl,...,Odor_Unadj,Odor_AgeAdj,PainIntens_RawScore,PainInterf_Tscore,Taste_Unadj,Taste_AgeAdj,Mars_Log_Score,Mars_Errs,Mars_Final,age
count,1206.0,1206.0,1206.0,1206.0,1206.0,1206.0,1206.0,1206.0,1206.0,1206.0,...,1204.0,1204.0,1201.0,1205.0,1200.0,1200.0,1198.0,1195.0,1195.0,1206.0
mean,1.478441,1.400498,3.50995,87.213267,88.219569,89.718076,89.852736,89.631675,87.027114,87.039801,...,110.421321,97.7275,1.449625,45.847718,95.166983,93.998533,1.845467,0.58159,1.822251,28.904229
std,0.635688,0.628216,1.215181,31.027886,29.942161,30.384864,30.163559,30.492092,33.566043,33.570248,...,9.107963,11.273251,1.783069,7.679288,14.583412,14.837851,0.541393,0.973172,0.542893,3.570475
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,82.74,59.86,0.0,38.6,56.35,59.5,1.56,0.0,1.08,23.5
25%,1.0,1.0,4.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,...,101.12,87.11,0.0,38.6,84.07,83.22,1.8,0.0,1.76,28.0
50%,2.0,1.0,4.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,...,108.79,98.04,1.0,45.9,95.36,94.97,1.8,0.0,1.8,28.0
75%,2.0,2.0,4.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,...,122.25,110.45,2.0,52.2,105.57,102.92,1.88,1.0,1.84,33.0
max,2.0,2.0,4.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,...,122.25,111.41,10.0,75.3,134.65,131.38,15.0,17.0,15.0,36.0


# Helper function to prepare target variable


In [5]:
def create_data(target='CogTotalComp_AgeAdj', feature_data=netmats_parcor):
    # it's a good practice to use pandas for merging, messing up subject order can be painful
    features = feature_data.columns
    df = behavior
    df = df.merge(feature_data, left_index=True, right_index=True, how='left')

    df = df.dropna(subset = [target] + features.values.tolist())
    y = df[target].values
    X = df[features].values
    return X, y

# Helper function implementing a single bootstrap iteration

We define a workhorse function which:
- randomly samples the discovery and the replication datasets,
- creates cross-validated estimates of predictive performance within the discovery sample
- finalizes the model by fitting it to the whole discovery sample (overfits the discovery but not the replication sample)
- use it to predict the replication sample

In [6]:
def corr(X, Y):
    # just a small wrapper function (pandas correlation is silent in "unlucky" bootstraps with constant values)
    return pd.Series(X).corr( pd.Series(Y))

def bootstrap_workhorse(X, y, sample_size, model, random_state, shuffle_y=False):

    #create discovery and replication samples by random sampling from the whole dataset (without replacement)

    # if shuffle_y is true, a null model is created bz permuting y
    if shuffle_y:
        rng = np.random.default_rng(random_state)
        y = rng.permutation(y)

    # sample the discovery and replication sets *without replacement* (with replacement introduces spurious dependencies)
    X_discovery, X_replication, y_discovery, y_replication = train_test_split(X, y, train_size=sample_size, test_size=sample_size, shuffle=True, random_state=random_state)

    # standard 10-fold cross-validation
    cv = KFold(10)

    # below we obtain cross-validated predictions in the discovery sample
    predicted_discovery_cv = np.zeros_like(y_discovery)  # here we collect the predictions for each fold
    cor_per_fold = np.zeros(cv.n_splits)  # here we collect the predictive performance in each fold
    i = 0  # just a counter
    for train, test in cv.split(X=X_discovery, y=y_discovery):  # loop to leave one fold out
        model.fit(X=X_discovery[train], y=y_discovery[train]) # fit model to the training set
        predicted_discovery_cv[test] = model.predict(X=X_discovery[test]) # use fitted model to predict teh test set
        cor_per_fold[i] = corr(y_discovery[test], predicted_discovery_cv[test]) # calculate performance on tne test set
        i += 1
    # calculate mean test performance across all folds
    r_disc_cv = np.mean(cor_per_fold)
    # 'finalize' model by training it on the full discovery sample (without cross-validation)
    final_model = model.fit(X=X_discovery, y=y_discovery)
    # obtain predictions with the final model on the discovery sample, note that this model actually overfits this sample.
    # we do this only to demonstrate biased estimates
    predicted_discovery_overfit = final_model.predict(X=X_discovery)
    # here we obtain the biased effect size (r) estimates for demonstrational purposes
    r_disc_overfit = corr(predicted_discovery_overfit, y_discovery)

    # We use the final model to predict the replication sample
    # This is correct (no overfitting here), the final model did not see this data during training
    predicted_replication = final_model.predict(X=X_replication)
    # we obtain the out-of-sample prediction performance estimates
    r_rep = corr(predicted_replication, y_replication)

    # below we calculate permutation-based p-values for all three effect size estimates (in-sample unbiased, in-sample biased, out-of-sample)
    # (one sided tests, testing for positive correlation)
    p_disc_cv = permutation_test(predicted_discovery_cv, y_discovery, method='approximate', num_rounds=1000, func=lambda x, y: corr(x, y),seed=random_state)
    p_disc_overfit = permutation_test(predicted_discovery_overfit, y_discovery, method='approximate', num_rounds=1000, func=lambda x, y: corr(x, y),seed=random_state)
    p_rep = permutation_test(predicted_replication, y_replication, method='approximate', num_rounds=1000, func=lambda x, y: corr(x, y),seed=random_state)
    # return results
    return r_disc_cv, r_disc_overfit, r_rep, p_disc_cv, p_disc_overfit, p_rep

All set, now we start the analysis.

# Replicability with sample sizes n=50, 100, 200, 300 and max
Here we train a few different models on 100 bootstrap samples.

We aggregate the results of our workhorse function in `n_bootstrap`=100 bootstrap cases (run in parallel).

The whole process is repeated for all sample sizes, fetaure_sets and target variables.

## Here we test age and 5 cognitive variables, including 'cognitive ability' (the main target variable in the target paper)
- age: age group of the participants
- CogTotalComp_AgeAdj: total cognitive ability
- PMAT24_A_CR, : Fluid Intelligence (Penn Progressive Matrices)
- CardSort_AgeAdj: Executive Function/Cognitive Flexibility (Dimensional Change Card Sort)
- Flanker_AgeAdj: Executive Function/Inhibition (Flanker Task)
- PicSeq_AgeAdj: Episodic Memory (Picture Sequence Memory)

# Reproducing the PCA+SVR-based model from the target paper
- Both PCA and SVR are done inside the cross-validation
- PCA reatains the firts k principal components that together explain 50% of the variance
- scikit-learn makes sure that PCA is only fit for the training samples
- both for the test sets (in the cross-validation) and the replication sample PCA is not re-fit, bt features are simply transformed with the already fit PCA

In [7]:
%%time

random_state = 42
n_bootstrap = 100

features = {
    'netmats_parcor': netmats_parcor,
    'netmats_pearson': netmats_pearson
}

models = {
    'PCA_SVR': Pipeline([('pca', PCA(n_components=0.5)),
                         ('svr', SVR())])

}

# We aggregate all results here:
df = pd.DataFrame(columns=['connectivity','model','target','n','r_discovery_cv','r_discovery_overfit','r_replication','p_discovery_cv','p_discovery_overfit','p_replication'])

for feature_set in features:
    for model in models:
        for target_var in ['age', 'CogTotalComp_AgeAdj', 'PMAT24_A_CR', 'Flanker_AgeAdj', 'CardSort_AgeAdj', 'PicSeq_AgeAdj']:
            for sample_size in [50, 100, 200, 300, 'max']:

                print('*****************************************************************')
                print(feature_set, model, target_var, sample_size)

                X, y = create_data(target=target_var, feature_data=features[feature_set]);

                if sample_size=='max':
                    sample_size = int(len(y)/2)

                # create random seeds for each bootstrap iteration for reproducibility
                rng = np.random.default_rng(random_state)
                random_sates = rng.integers(np.iinfo(np.int32).max, size=n_bootstrap)

                # run bootstrap iterations in parallel
                r_discovery_cv, r_discovery_overfit, r_replication, p_discovery_cv, p_discovery_overfit, p_replication = zip(
                    *Parallel(n_jobs=-1)(
                    delayed(bootstrap_workhorse)(X, y, sample_size, models[model], seed) for seed in random_sates))

                tmp_data_frame = pd.DataFrame({
                    'connectivity' : feature_set,
                    'model' : model,
                    'target' : target_var,
                    'n' : sample_size,
                    'r_discovery_cv': r_discovery_cv,
                    'r_discovery_overfit': r_discovery_overfit,
                    'r_replication': r_replication,
                    'p_discovery_cv': p_discovery_cv,
                    'p_discovery_overfit': p_discovery_overfit,
                    'p_replication': p_replication
                })
                #sns.scatterplot(x='r_replication', y='r_discovery_cv', data=tmp_data_frame)
                #plt.ylabel('in-sample (r)')
                #plt.xlabel('out-of-sample (r_pred)')
                #plt.show()
                print('r discovery (with cv) :', tmp_data_frame.r_discovery_cv.mean(), 'r replication:', tmp_data_frame.r_replication.mean())

                for alpha in [0.05, 0.01, 0.005, 0.001]:
                    print('Replicability at alpha =', alpha, ':',
                          (tmp_data_frame.loc[tmp_data_frame['p_discovery_cv']<alpha,'p_replication']<alpha).sum() / (tmp_data_frame['p_discovery_cv']<0.05).sum() * 100, '%')

                df = pd.concat((df, tmp_data_frame))
                df.reset_index(drop=True, inplace=True)
                df.to_csv('res/results_PCA_SVR.csv')

df

*****************************************************************
netmats_parcor PCA_SVR age 50
r discovery (with cv) : 0.1845122123289261 r replication: 0.18901378266057672
Replicability at alpha = 0.05 : 57.14285714285714 %
Replicability at alpha = 0.01 : 14.285714285714285 %
Replicability at alpha = 0.005 : 0.0 %
Replicability at alpha = 0.001 : 0.0 %
*****************************************************************
netmats_parcor PCA_SVR age 100
r discovery (with cv) : 0.24729849550421504 r replication: 0.27347377929445277
Replicability at alpha = 0.05 : 89.55223880597015 %
Replicability at alpha = 0.01 : 40.298507462686565 %
Replicability at alpha = 0.005 : 31.343283582089555 %
Replicability at alpha = 0.001 : 11.940298507462686 %
*****************************************************************
netmats_parcor PCA_SVR age 200
r discovery (with cv) : 0.34201730090591836 r replication: 0.3617507913309844
Replicability at alpha = 0.05 : 100.0 %
Replicability at alpha = 0.01 : 98.9795



r discovery (with cv) : 0.1900581122850625 r replication: 0.21262859874092147
Replicability at alpha = 0.05 : 90.9090909090909 %
Replicability at alpha = 0.01 : 45.45454545454545 %
Replicability at alpha = 0.005 : 18.181818181818183 %
Replicability at alpha = 0.001 : 9.090909090909092 %
*****************************************************************
netmats_parcor PCA_SVR CogTotalComp_AgeAdj 200
r discovery (with cv) : 0.2623738461298858 r replication: 0.2713589974517627
Replicability at alpha = 0.05 : 100.0 %
Replicability at alpha = 0.01 : 61.29032258064516 %
Replicability at alpha = 0.005 : 53.2258064516129 %
Replicability at alpha = 0.001 : 27.419354838709676 %
*****************************************************************
netmats_parcor PCA_SVR CogTotalComp_AgeAdj 300
r discovery (with cv) : 0.30318564055635727 r replication: 0.3101363215395622
Replicability at alpha = 0.05 : 100.0 %
Replicability at alpha = 0.01 : 94.73684210526315 %
Replicability at alpha = 0.005 : 94.73684



r discovery (with cv) : 0.038605045902350546 r replication: 0.04446606264674273
Replicability at alpha = 0.05 : 0.0 %
Replicability at alpha = 0.01 : 0.0 %
Replicability at alpha = 0.005 : 0.0 %
Replicability at alpha = 0.001 : 0.0 %
*****************************************************************
netmats_parcor PCA_SVR Flanker_AgeAdj 200
r discovery (with cv) : 0.04725917071923175 r replication: 0.058335337528020045
Replicability at alpha = 0.05 : 33.33333333333333 %
Replicability at alpha = 0.01 : 0.0 %
Replicability at alpha = 0.005 : 0.0 %
Replicability at alpha = 0.001 : 0.0 %
*****************************************************************
netmats_parcor PCA_SVR Flanker_AgeAdj 300
r discovery (with cv) : 0.0731284650997897 r replication: 0.07671340073350894
Replicability at alpha = 0.05 : 26.666666666666668 %
Replicability at alpha = 0.01 : 0.0 %
Replicability at alpha = 0.005 : 0.0 %
Replicability at alpha = 0.001 : 0.0 %
*******************************************************



r discovery (with cv) : 0.077594631428072 r replication: 0.08296991306991347
Replicability at alpha = 0.05 : 33.33333333333333 %
Replicability at alpha = 0.01 : 0.0 %
Replicability at alpha = 0.005 : 0.0 %
Replicability at alpha = 0.001 : 0.0 %
*****************************************************************
netmats_parcor PCA_SVR CardSort_AgeAdj 200
r discovery (with cv) : 0.09773349695774902 r replication: 0.10256141335871113
Replicability at alpha = 0.05 : 35.294117647058826 %
Replicability at alpha = 0.01 : 5.88235294117647 %
Replicability at alpha = 0.005 : 5.88235294117647 %
Replicability at alpha = 0.001 : 0.0 %
*****************************************************************
netmats_parcor PCA_SVR CardSort_AgeAdj 300
r discovery (with cv) : 0.12290527214628583 r replication: 0.12397123226254758
Replicability at alpha = 0.05 : 55.26315789473685 %
Replicability at alpha = 0.01 : 15.789473684210526 %
Replicability at alpha = 0.005 : 5.263157894736842 %
Replicability at alpha = 0



r discovery (with cv) : 0.06510214673673392 r replication: 0.0799216956321564
Replicability at alpha = 0.05 : 10.0 %
Replicability at alpha = 0.01 : 0.0 %
Replicability at alpha = 0.005 : 0.0 %
Replicability at alpha = 0.001 : 0.0 %
*****************************************************************
netmats_pearson PCA_SVR age 200
r discovery (with cv) : 0.1114303048333667 r replication: 0.13986470581995133
Replicability at alpha = 0.05 : 64.70588235294117 %
Replicability at alpha = 0.01 : 26.47058823529412 %
Replicability at alpha = 0.005 : 11.76470588235294 %
Replicability at alpha = 0.001 : 2.941176470588235 %
*****************************************************************
netmats_pearson PCA_SVR age 300
r discovery (with cv) : 0.15601435339697564 r replication: 0.1581085452863742
Replicability at alpha = 0.05 : 89.1891891891892 %
Replicability at alpha = 0.01 : 40.54054054054054 %
Replicability at alpha = 0.005 : 31.08108108108108 %
Replicability at alpha = 0.001 : 12.1621621621621



r discovery (with cv) : 0.042544874768568774 r replication: 0.0634981336102277
Replicability at alpha = 0.05 : 0.0 %
Replicability at alpha = 0.01 : 0.0 %
Replicability at alpha = 0.005 : 0.0 %
Replicability at alpha = 0.001 : 0.0 %
*****************************************************************
netmats_pearson PCA_SVR CogTotalComp_AgeAdj 200
r discovery (with cv) : 0.08013945386049384 r replication: 0.10612941902239692
Replicability at alpha = 0.05 : 25.0 %
Replicability at alpha = 0.01 : 0.0 %
Replicability at alpha = 0.005 : 0.0 %
Replicability at alpha = 0.001 : 0.0 %
*****************************************************************
netmats_pearson PCA_SVR CogTotalComp_AgeAdj 300
r discovery (with cv) : 0.12371681381416566 r replication: 0.13372831832326432
Replicability at alpha = 0.05 : 62.16216216216216 %
Replicability at alpha = 0.01 : 18.91891891891892 %
Replicability at alpha = 0.005 : 10.81081081081081 %
Replicability at alpha = 0.001 : 2.7027027027027026 %
***************



r discovery (with cv) : 0.06624026266001 r replication: 0.055111549981764464
Replicability at alpha = 0.05 : 60.0 %
Replicability at alpha = 0.01 : 0.0 %
Replicability at alpha = 0.005 : 0.0 %
Replicability at alpha = 0.001 : 0.0 %
*****************************************************************
netmats_pearson PCA_SVR CardSort_AgeAdj 200
r discovery (with cv) : 0.07535830274189508 r replication: 0.07325950038420447
Replicability at alpha = 0.05 : 45.0 %
Replicability at alpha = 0.01 : 0.0 %
Replicability at alpha = 0.005 : 0.0 %
Replicability at alpha = 0.001 : 0.0 %
*****************************************************************
netmats_pearson PCA_SVR CardSort_AgeAdj 300
r discovery (with cv) : 0.08376531584402025 r replication: 0.09085835147403377
Replicability at alpha = 0.05 : 38.46153846153847 %
Replicability at alpha = 0.01 : 3.8461538461538463 %
Replicability at alpha = 0.005 : 3.8461538461538463 %
Replicability at alpha = 0.001 : 0.0 %
*************************************



r discovery (with cv) : 0.04775907698381003 r replication: 0.061475454821072
Replicability at alpha = 0.05 : 20.0 %
Replicability at alpha = 0.01 : 0.0 %
Replicability at alpha = 0.005 : 0.0 %
Replicability at alpha = 0.001 : 0.0 %
*****************************************************************
netmats_pearson PCA_SVR PicSeq_AgeAdj 200
r discovery (with cv) : 0.054746573017653455 r replication: 0.06749833856170856
Replicability at alpha = 0.05 : 42.857142857142854 %
Replicability at alpha = 0.01 : 14.285714285714285 %
Replicability at alpha = 0.005 : 0.0 %
Replicability at alpha = 0.001 : 0.0 %
*****************************************************************
netmats_pearson PCA_SVR PicSeq_AgeAdj 300
r discovery (with cv) : 0.0699102995017721 r replication: 0.07458636299063133
Replicability at alpha = 0.05 : 41.66666666666667 %
Replicability at alpha = 0.01 : 8.333333333333332 %
Replicability at alpha = 0.005 : 8.333333333333332 %
Replicability at alpha = 0.001 : 0.0 %
**************

Unnamed: 0,connectivity,model,target,n,r_discovery_cv,r_discovery_overfit,r_replication,p_discovery_cv,p_discovery_overfit,p_replication
0,netmats_parcor,PCA_SVR,age,50,,0.774355,0.204078,0.997003,0.000999,0.091908
1,netmats_parcor,PCA_SVR,age,50,0.153087,0.913355,0.360970,0.275724,0.000999,0.001998
2,netmats_parcor,PCA_SVR,age,50,,0.841646,0.265158,0.962038,0.000999,0.025974
3,netmats_parcor,PCA_SVR,age,50,0.272270,0.899457,-0.206159,0.374625,0.000999,0.923077
4,netmats_parcor,PCA_SVR,age,50,0.349430,0.892774,0.282873,0.196803,0.000999,0.026973
...,...,...,...,...,...,...,...,...,...,...
5995,netmats_pearson,PCA_SVR,PicSeq_AgeAdj,501,0.170381,0.360809,0.062856,0.004995,0.000999,0.085914
5996,netmats_pearson,PCA_SVR,PicSeq_AgeAdj,501,0.095436,0.352693,0.142834,0.085914,0.000999,0.000999
5997,netmats_pearson,PCA_SVR,PicSeq_AgeAdj,501,0.098129,0.373386,0.163908,0.028971,0.000999,0.000999
5998,netmats_pearson,PCA_SVR,PicSeq_AgeAdj,501,0.128420,0.376163,0.109268,0.065934,0.000999,0.012987


# Now we fit a simple Ridge regression
(no feature selection, no hyperparameter optimization)
Based on some previous studies, this can be expected to perform better than SVR (especially with lower sample sizes). See the paper for details.

In [8]:
%%time

random_state = 42
n_bootstrap = 100

features = {
    'netmats_parcor': netmats_parcor,
    'netmats_pearson': netmats_pearson
}

models = {
    'ridge': Ridge()
}

# We aggregate all results here:
df = pd.DataFrame(columns=['connectivity','model','target','n','r_discovery_cv','r_discovery_overfit','r_replication','p_discovery_cv','p_discovery_overfit','p_replication'])

for feature_set in features:
    for model in models:
        for target_var in ['age', 'CogTotalComp_AgeAdj', 'PMAT24_A_CR', 'Flanker_AgeAdj', 'CardSort_AgeAdj', 'PicSeq_AgeAdj']:
            for sample_size in [50, 100, 200, 300, 'max']:

                print('*****************************************************************')
                print(feature_set, model, target_var, sample_size)

                X, y = create_data(target=target_var, feature_data=features[feature_set])

                if sample_size=='max':
                    sample_size = int(len(y)/2)

                # create random seeds for each bootstrap iteration for reproducibility
                rng = np.random.default_rng(random_state)
                random_sates = rng.integers(np.iinfo(np.int32).max, size=n_bootstrap)

                # run bootstrap iterations in parallel
                r_discovery_cv, r_discovery_overfit, r_replication, p_discovery_cv, p_discovery_overfit, p_replication = zip(
                    *Parallel(n_jobs=-1)(
                    delayed(bootstrap_workhorse)(X, y, sample_size, models[model], seed) for seed in random_sates))

                tmp_data_frame = pd.DataFrame({
                    'connectivity' : feature_set,
                    'model' : model,
                    'target' : target_var,
                    'n' : sample_size,
                    'r_discovery_cv': r_discovery_cv,
                    'r_discovery_overfit': r_discovery_overfit,
                    'r_replication': r_replication,
                    'p_discovery_cv': p_discovery_cv,
                    'p_discovery_overfit': p_discovery_overfit,
                    'p_replication': p_replication
                })
                #sns.scatterplot(x='r_replication', y='r_discovery_cv', data=tmp_data_frame)
                #plt.ylabel('in-sample (r)')
                #plt.xlabel('out-of-sample (r_pred)')
                #plt.show()
                print('r discovery (with cv) :', tmp_data_frame.r_discovery_cv.mean(), 'r replication:', tmp_data_frame.r_replication.mean())

                for alpha in [0.05, 0.01, 0.005, 0.001]:
                    print('Replicability at alpha =', alpha, ':',
                          (tmp_data_frame.loc[tmp_data_frame['p_discovery_cv']<alpha,'p_replication']<alpha).sum() / (tmp_data_frame['p_discovery_cv']<0.05).sum() * 100, '%')

                df = pd.concat((df, tmp_data_frame))
                df.reset_index(drop=True, inplace=True)
                df.to_csv('res/results_Ridge.csv')

df


*****************************************************************
netmats_parcor ridge age 50
r discovery (with cv) : 0.24233370132686197 r replication: 0.2609198136325508
Replicability at alpha = 0.05 : 58.536585365853654 %
Replicability at alpha = 0.01 : 14.634146341463413 %
Replicability at alpha = 0.005 : 12.195121951219512 %
Replicability at alpha = 0.001 : 7.317073170731707 %
*****************************************************************
netmats_parcor ridge age 100
r discovery (with cv) : 0.3323209164524508 r replication: 0.34287580012326385
Replicability at alpha = 0.05 : 97.75280898876404 %
Replicability at alpha = 0.01 : 71.91011235955057 %
Replicability at alpha = 0.005 : 58.42696629213483 %
Replicability at alpha = 0.001 : 38.20224719101123 %
*****************************************************************
netmats_parcor ridge age 200
r discovery (with cv) : 0.39528891792691084 r replication: 0.42131717137079755
Replicability at alpha = 0.05 : 100.0 %
Replicability at a

Unnamed: 0,connectivity,model,target,n,r_discovery_cv,r_discovery_overfit,r_replication,p_discovery_cv,p_discovery_overfit,p_replication
0,netmats_parcor,ridge,age,50,,1.0,0.404661,0.629371,0.000999,0.002997
1,netmats_parcor,ridge,age,50,0.082894,1.0,0.281041,0.210789,0.000999,0.026973
2,netmats_parcor,ridge,age,50,,1.0,0.107181,0.227772,0.000999,0.216783
3,netmats_parcor,ridge,age,50,0.356733,1.0,-0.203981,0.047952,0.000999,0.909091
4,netmats_parcor,ridge,age,50,0.455145,1.0,0.385665,0.001998,0.000999,0.004995
...,...,...,...,...,...,...,...,...,...,...
5995,netmats_pearson,ridge,PicSeq_AgeAdj,501,0.197442,1.0,0.217841,0.000999,0.000999,0.000999
5996,netmats_pearson,ridge,PicSeq_AgeAdj,501,0.243984,1.0,0.201980,0.000999,0.000999,0.000999
5997,netmats_pearson,ridge,PicSeq_AgeAdj,501,0.263024,1.0,0.203847,0.000999,0.000999,0.000999
5998,netmats_pearson,ridge,PicSeq_AgeAdj,501,0.167095,1.0,0.195714,0.001998,0.000999,0.000999


# Null scenario with random target
To evaluate false positives with biased estimates (Figure 1d).

First for PCA+SVR.

In [9]:
%%time

random_state = 42
n_bootstrap = 100

features = {
    'netmats_parcor': netmats_parcor,
    'netmats_pearson': netmats_pearson
}

models = {
    'PCA_SVR': Pipeline([('pca', PCA(n_components=0.5)),
                         ('svr', SVR())])

}

# We aggregate all results here:
df = pd.DataFrame(columns=['connectivity','model','target','n','r_discovery_cv','r_discovery_overfit','r_replication','p_discovery_cv','p_discovery_overfit','p_replication'])

for feature_set in features:
    for model in models:
        for target_var in ['age', 'CogTotalComp_AgeAdj', 'PMAT24_A_CR', 'Flanker_AgeAdj', 'CardSort_AgeAdj', 'PicSeq_AgeAdj']:
            for sample_size in [50, 100, 200, 300, 'max']:

                print('*****************************************************************')
                print(feature_set, model, target_var, sample_size)

                X, y = create_data(target=target_var, feature_data=features[feature_set]) # gives a random y when target is None

                if sample_size=='max':
                    sample_size = int(len(y)/2)

                # create random seeds for each bootstrap iteration for reproducibility
                rng = np.random.default_rng(random_state)
                random_sates = rng.integers(np.iinfo(np.int32).max, size=n_bootstrap)

                # run bootstrap iterations in parallel, with shuffle_y=True
                r_discovery_cv, r_discovery_overfit, r_replication, p_discovery_cv, p_discovery_overfit, p_replication = zip(
                    *Parallel(n_jobs=-1)(
                    delayed(bootstrap_workhorse)(X, y, sample_size, models[model], seed, shuffle_y=True) for seed in random_sates))

                tmp_data_frame = pd.DataFrame({
                    'connectivity' : feature_set,
                    'model' : model,
                    'target' : target_var,
                    'n' : sample_size,
                    'r_discovery_cv': r_discovery_cv,
                    'r_discovery_overfit': r_discovery_overfit,
                    'r_replication': r_replication,
                    'p_discovery_cv': p_discovery_cv,
                    'p_discovery_overfit': p_discovery_overfit,
                    'p_replication': p_replication
                })

                #sns.scatterplot(x='r_replication', y='r_discovery_cv', data=tmp_data_frame)
                #plt.ylabel('in-sample (r)')
                #plt.xlabel('out-of-sample (r_pred)')
                #plt.show()
                print('r discovery (with cv) :', tmp_data_frame.r_discovery_cv.mean(), 'r replication:', tmp_data_frame.r_replication.mean())

                for alpha in [0.05, 0.01, 0.005, 0.001]:
                    print('Replicability at alpha =', alpha, ':',
                          (tmp_data_frame.loc[tmp_data_frame['p_discovery_cv']<alpha,'p_replication']<alpha).sum() / (tmp_data_frame['p_discovery_cv']<0.05).sum() * 100, '%')

                df = pd.concat((df, tmp_data_frame))
                df.reset_index(drop=True, inplace=True)
                df.to_csv('res/results_null_PCA_SVR.csv')

df

*****************************************************************
netmats_parcor PCA_SVR age 50
r discovery (with cv) : 0.015718268671406493 r replication: -0.009632305254739317
Replicability at alpha = 0.05 : 0.0 %
Replicability at alpha = 0.01 : 0.0 %
Replicability at alpha = 0.005 : 0.0 %
Replicability at alpha = 0.001 : 0.0 %
*****************************************************************
netmats_parcor PCA_SVR age 100
r discovery (with cv) : 0.021183423576294325 r replication: 0.000831453061775039
Replicability at alpha = 0.05 : 0.0 %
Replicability at alpha = 0.01 : 0.0 %
Replicability at alpha = 0.005 : 0.0 %
Replicability at alpha = 0.001 : 0.0 %
*****************************************************************
netmats_parcor PCA_SVR age 200
r discovery (with cv) : 0.0020095024919469773 r replication: -0.0020659580620204015
Replicability at alpha = 0.05 : 12.5 %
Replicability at alpha = 0.01 : 0.0 %
Replicability at alpha = 0.005 : 0.0 %
Replicability at alpha = 0.001 : 0.0 %




r discovery (with cv) : 0.005333043157679225 r replication: -0.01208987087558425
Replicability at alpha = 0.05 : 0.0 %
Replicability at alpha = 0.01 : 0.0 %
Replicability at alpha = 0.005 : 0.0 %
Replicability at alpha = 0.001 : 0.0 %
*****************************************************************
netmats_parcor PCA_SVR CogTotalComp_AgeAdj 200
r discovery (with cv) : -0.0017982202571123812 r replication: -0.012032126906222112
Replicability at alpha = 0.05 : 0.0 %
Replicability at alpha = 0.01 : 0.0 %
Replicability at alpha = 0.005 : 0.0 %
Replicability at alpha = 0.001 : 0.0 %
*****************************************************************
netmats_parcor PCA_SVR CogTotalComp_AgeAdj 300
r discovery (with cv) : 0.008405711567388308 r replication: -0.009813451575864128
Replicability at alpha = 0.05 : 0.0 %
Replicability at alpha = 0.01 : 0.0 %
Replicability at alpha = 0.005 : 0.0 %
Replicability at alpha = 0.001 : 0.0 %
*****************************************************************



r discovery (with cv) : 0.007738730008136026 r replication: -0.020071926806312746
Replicability at alpha = 0.05 : 0.0 %
Replicability at alpha = 0.01 : 0.0 %
Replicability at alpha = 0.005 : 0.0 %
Replicability at alpha = 0.001 : 0.0 %
*****************************************************************
netmats_parcor PCA_SVR PMAT24_A_CR 100
r discovery (with cv) : 0.013192802109739414 r replication: -0.013371222236656505
Replicability at alpha = 0.05 : 0.0 %
Replicability at alpha = 0.01 : 0.0 %
Replicability at alpha = 0.005 : 0.0 %
Replicability at alpha = 0.001 : 0.0 %
*****************************************************************
netmats_parcor PCA_SVR PMAT24_A_CR 200
r discovery (with cv) : 0.0010659245161564416 r replication: 0.00011201143802571312
Replicability at alpha = 0.05 : 0.0 %
Replicability at alpha = 0.01 : 0.0 %
Replicability at alpha = 0.005 : 0.0 %
Replicability at alpha = 0.001 : 0.0 %
*****************************************************************
netmats_parcor



r discovery (with cv) : -0.003850369013039924 r replication: -0.00845621748920575
Replicability at alpha = 0.05 : nan %
Replicability at alpha = 0.01 : nan %
Replicability at alpha = 0.005 : nan %
Replicability at alpha = 0.001 : nan %
*****************************************************************
netmats_parcor PCA_SVR Flanker_AgeAdj 200




r discovery (with cv) : -0.005590010846450454 r replication: 0.006889410105047658
Replicability at alpha = 0.05 : 0.0 %
Replicability at alpha = 0.01 : 0.0 %
Replicability at alpha = 0.005 : 0.0 %
Replicability at alpha = 0.001 : 0.0 %
*****************************************************************
netmats_parcor PCA_SVR Flanker_AgeAdj 300
r discovery (with cv) : -0.006700452912333158 r replication: 0.002094112280392216
Replicability at alpha = 0.05 : 50.0 %
Replicability at alpha = 0.01 : 0.0 %
Replicability at alpha = 0.005 : 0.0 %
Replicability at alpha = 0.001 : 0.0 %
*****************************************************************
netmats_parcor PCA_SVR Flanker_AgeAdj max
r discovery (with cv) : -0.0015197273477427326 r replication: 0.0019722448898129076
Replicability at alpha = 0.05 : 0.0 %
Replicability at alpha = 0.01 : 0.0 %
Replicability at alpha = 0.005 : 0.0 %
Replicability at alpha = 0.001 : 0.0 %
*****************************************************************
netmats



r discovery (with cv) : 0.0068239620945981395 r replication: 0.011153896783920458
Replicability at alpha = 0.05 : 0.0 %
Replicability at alpha = 0.01 : 0.0 %
Replicability at alpha = 0.005 : 0.0 %
Replicability at alpha = 0.001 : 0.0 %
*****************************************************************
netmats_parcor PCA_SVR CardSort_AgeAdj 200
r discovery (with cv) : -0.011921075972899122 r replication: 0.004941067063005521
Replicability at alpha = 0.05 : 33.33333333333333 %
Replicability at alpha = 0.01 : 33.33333333333333 %
Replicability at alpha = 0.005 : 0.0 %
Replicability at alpha = 0.001 : 0.0 %
*****************************************************************
netmats_parcor PCA_SVR CardSort_AgeAdj 300
r discovery (with cv) : 0.0005607365798034042 r replication: -0.003558093283406913
Replicability at alpha = 0.05 : 0.0 %
Replicability at alpha = 0.01 : 0.0 %
Replicability at alpha = 0.005 : 0.0 %
Replicability at alpha = 0.001 : 0.0 %
*********************************************



r discovery (with cv) : 0.006898033564295529 r replication: 0.0019460173807176915
Replicability at alpha = 0.05 : 0.0 %
Replicability at alpha = 0.01 : 0.0 %
Replicability at alpha = 0.005 : 0.0 %
Replicability at alpha = 0.001 : 0.0 %
*****************************************************************
netmats_parcor PCA_SVR PicSeq_AgeAdj 200
r discovery (with cv) : -0.015075795730530719 r replication: 0.0027040842049313287
Replicability at alpha = 0.05 : nan %
Replicability at alpha = 0.01 : nan %
Replicability at alpha = 0.005 : nan %
Replicability at alpha = 0.001 : nan %
*****************************************************************
netmats_parcor PCA_SVR PicSeq_AgeAdj 300




r discovery (with cv) : 0.012177970315613337 r replication: -0.0059592274300127425
Replicability at alpha = 0.05 : 0.0 %
Replicability at alpha = 0.01 : 0.0 %
Replicability at alpha = 0.005 : 0.0 %
Replicability at alpha = 0.001 : 0.0 %
*****************************************************************
netmats_parcor PCA_SVR PicSeq_AgeAdj max
r discovery (with cv) : -0.009451875556128402 r replication: 0.002954888192621077
Replicability at alpha = 0.05 : 25.0 %
Replicability at alpha = 0.01 : 0.0 %
Replicability at alpha = 0.005 : 0.0 %
Replicability at alpha = 0.001 : 0.0 %
*****************************************************************
netmats_pearson PCA_SVR age 50
r discovery (with cv) : 0.02457148878600703 r replication: -0.014451749838611541
Replicability at alpha = 0.05 : 0.0 %
Replicability at alpha = 0.01 : 0.0 %
Replicability at alpha = 0.005 : 0.0 %
Replicability at alpha = 0.001 : 0.0 %
*****************************************************************
netmats_pearson PCA_S



r discovery (with cv) : 0.007620898563152159 r replication: 0.00760119576863066
Replicability at alpha = 0.05 : nan %
Replicability at alpha = 0.01 : nan %
Replicability at alpha = 0.005 : nan %
Replicability at alpha = 0.001 : nan %
*****************************************************************
netmats_pearson PCA_SVR PMAT24_A_CR 300




r discovery (with cv) : 0.013826396717998821 r replication: 0.00028625928905279366
Replicability at alpha = 0.05 : 0.0 %
Replicability at alpha = 0.01 : 0.0 %
Replicability at alpha = 0.005 : 0.0 %
Replicability at alpha = 0.001 : 0.0 %
*****************************************************************
netmats_pearson PCA_SVR PMAT24_A_CR max
r discovery (with cv) : 0.0012442268418255188 r replication: 0.0006788357811788911
Replicability at alpha = 0.05 : 0.0 %
Replicability at alpha = 0.01 : 0.0 %
Replicability at alpha = 0.005 : 0.0 %
Replicability at alpha = 0.001 : 0.0 %
*****************************************************************
netmats_pearson PCA_SVR Flanker_AgeAdj 50
r discovery (with cv) : -0.006951145029067808 r replication: 0.013100518056530999
Replicability at alpha = 0.05 : 0.0 %
Replicability at alpha = 0.01 : 0.0 %
Replicability at alpha = 0.005 : 0.0 %
Replicability at alpha = 0.001 : 0.0 %
*****************************************************************
netmats_pe



r discovery (with cv) : -0.006670036406427591 r replication: -0.0012550854807045584
Replicability at alpha = 0.05 : 0.0 %
Replicability at alpha = 0.01 : 0.0 %
Replicability at alpha = 0.005 : 0.0 %
Replicability at alpha = 0.001 : 0.0 %
*****************************************************************
netmats_pearson PCA_SVR CardSort_AgeAdj 200
r discovery (with cv) : -0.0070117651643217494 r replication: -0.0013590269861785199
Replicability at alpha = 0.05 : 33.33333333333333 %
Replicability at alpha = 0.01 : 0.0 %
Replicability at alpha = 0.005 : 0.0 %
Replicability at alpha = 0.001 : 0.0 %
*****************************************************************
netmats_pearson PCA_SVR CardSort_AgeAdj 300
r discovery (with cv) : 0.014291142572627688 r replication: -0.0004376003964657562
Replicability at alpha = 0.05 : 0.0 %
Replicability at alpha = 0.01 : 0.0 %
Replicability at alpha = 0.005 : 0.0 %
Replicability at alpha = 0.001 : 0.0 %
****************************************************



r discovery (with cv) : -0.006270139331590857 r replication: 0.0018664355000160403
Replicability at alpha = 0.05 : 0.0 %
Replicability at alpha = 0.01 : 0.0 %
Replicability at alpha = 0.005 : 0.0 %
Replicability at alpha = 0.001 : 0.0 %
*****************************************************************
netmats_pearson PCA_SVR PicSeq_AgeAdj 200
r discovery (with cv) : -0.0053448490156553195 r replication: 0.0013732274285361745
Replicability at alpha = 0.05 : 0.0 %
Replicability at alpha = 0.01 : 0.0 %
Replicability at alpha = 0.005 : 0.0 %
Replicability at alpha = 0.001 : 0.0 %
*****************************************************************
netmats_pearson PCA_SVR PicSeq_AgeAdj 300
r discovery (with cv) : 0.010393117348453398 r replication: 0.0005262481184145888
Replicability at alpha = 0.05 : 0.0 %
Replicability at alpha = 0.01 : 0.0 %
Replicability at alpha = 0.005 : 0.0 %
Replicability at alpha = 0.001 : 0.0 %
*****************************************************************
netmats

Unnamed: 0,connectivity,model,target,n,r_discovery_cv,r_discovery_overfit,r_replication,p_discovery_cv,p_discovery_overfit,p_replication
0,netmats_parcor,PCA_SVR,age,50,-0.010709,0.919133,0.140633,0.577423,0.000999,0.157842
1,netmats_parcor,PCA_SVR,age,50,,0.791907,-0.074325,0.911089,0.000999,0.710290
2,netmats_parcor,PCA_SVR,age,50,-0.354673,0.872867,-0.253604,0.999001,0.000999,0.965035
3,netmats_parcor,PCA_SVR,age,50,-0.076261,0.886898,-0.079072,0.990010,0.000999,0.695305
4,netmats_parcor,PCA_SVR,age,50,-0.139239,0.887541,-0.021112,0.947053,0.000999,0.585415
...,...,...,...,...,...,...,...,...,...,...
5995,netmats_pearson,PCA_SVR,PicSeq_AgeAdj,501,0.017149,0.334397,-0.004167,0.796204,0.000999,0.542458
5996,netmats_pearson,PCA_SVR,PicSeq_AgeAdj,501,-0.090964,0.325431,-0.060089,0.990010,0.000999,0.899101
5997,netmats_pearson,PCA_SVR,PicSeq_AgeAdj,501,0.073699,0.307215,-0.083865,0.239760,0.000999,0.973027
5998,netmats_pearson,PCA_SVR,PicSeq_AgeAdj,501,0.039265,0.311807,-0.081005,0.305694,0.000999,0.962038


Then with Ridge.

In [10]:
%%time

random_state = 42
n_bootstrap = 100

features = {
    'netmats_parcor': netmats_parcor,
    'netmats_pearson': netmats_pearson
}

models = {
    'Ridge': Ridge()

}

# We aggregate all results here:
df = pd.DataFrame(columns=['connectivity','model','target','n','r_discovery_cv','r_discovery_overfit','r_replication','p_discovery_cv','p_discovery_overfit','p_replication'])

for feature_set in features:
    for model in models:
        for target_var in ['age', 'CogTotalComp_AgeAdj', 'PMAT24_A_CR', 'Flanker_AgeAdj', 'CardSort_AgeAdj', 'PicSeq_AgeAdj']:
            for sample_size in [50, 100, 200, 300, 'max']:

                print('*****************************************************************')
                print(feature_set, model, target_var, sample_size)

                X, y = create_data(target=target_var, feature_data=features[feature_set]) # gives a random y when target is None

                if sample_size=='max':
                    sample_size = int(len(y)/2)

                # create random seeds for each bootstrap iteration for reproducibility
                rng = np.random.default_rng(random_state)
                random_sates = rng.integers(np.iinfo(np.int32).max, size=n_bootstrap)

                # run bootstrap iterations in parallel, with shuffle_y=True
                r_discovery_cv, r_discovery_overfit, r_replication, p_discovery_cv, p_discovery_overfit, p_replication = zip(
                    *Parallel(n_jobs=-1)(
                    delayed(bootstrap_workhorse)(X, y, sample_size, models[model], seed, shuffle_y=True) for seed in random_sates))

                tmp_data_frame = pd.DataFrame({
                    'connectivity' : feature_set,
                    'model' : model,
                    'target' : target_var,
                    'n' : sample_size,
                    'r_discovery_cv': r_discovery_cv,
                    'r_discovery_overfit': r_discovery_overfit,
                    'r_replication': r_replication,
                    'p_discovery_cv': p_discovery_cv,
                    'p_discovery_overfit': p_discovery_overfit,
                    'p_replication': p_replication
                })

                #sns.scatterplot(x='r_replication', y='r_discovery_cv', data=tmp_data_frame)
                #plt.ylabel('in-sample (r)')
                #plt.xlabel('out-of-sample (r_pred)')
                #plt.show()
                print('r discovery (with cv) :', tmp_data_frame.r_discovery_cv.mean(), 'r replication:', tmp_data_frame.r_replication.mean())

                for alpha in [0.05, 0.01, 0.005, 0.001]:
                    print('Replicability at alpha =', alpha, ':',
                          (tmp_data_frame.loc[tmp_data_frame['p_discovery_cv']<alpha,'p_replication']<alpha).sum() / (tmp_data_frame['p_discovery_cv']<0.05).sum() * 100, '%')

                df = pd.concat((df, tmp_data_frame))
                df.reset_index(drop=True, inplace=True)
                df.to_csv('res/results_null_Ridge.csv')

df

*****************************************************************
netmats_parcor Ridge age 50
r discovery (with cv) : -0.014348756240858655 r replication: -0.019865509971863794
Replicability at alpha = 0.05 : 0.0 %
Replicability at alpha = 0.01 : 0.0 %
Replicability at alpha = 0.005 : 0.0 %
Replicability at alpha = 0.001 : 0.0 %
*****************************************************************
netmats_parcor Ridge age 100
r discovery (with cv) : 0.011020054799004328 r replication: 0.008189114686149336
Replicability at alpha = 0.05 : 0.0 %
Replicability at alpha = 0.01 : 0.0 %
Replicability at alpha = 0.005 : 0.0 %
Replicability at alpha = 0.001 : 0.0 %
*****************************************************************
netmats_parcor Ridge age 200
r discovery (with cv) : 0.014167298702132599 r replication: -0.004364551333239337
Replicability at alpha = 0.05 : 11.11111111111111 %
Replicability at alpha = 0.01 : 0.0 %
Replicability at alpha = 0.005 : 0.0 %
Replicability at alpha = 0.001 : 

Unnamed: 0,connectivity,model,target,n,r_discovery_cv,r_discovery_overfit,r_replication,p_discovery_cv,p_discovery_overfit,p_replication
0,netmats_parcor,Ridge,age,50,-0.051893,1.0,0.033311,0.752248,0.000999,0.400599
1,netmats_parcor,Ridge,age,50,,1.0,-0.091510,0.916084,0.000999,0.755245
2,netmats_parcor,Ridge,age,50,0.003650,1.0,0.171662,0.682318,0.000999,0.121878
3,netmats_parcor,Ridge,age,50,-0.417308,1.0,0.002095,0.999001,0.000999,0.481518
4,netmats_parcor,Ridge,age,50,-0.063412,1.0,-0.016329,0.877123,0.000999,0.574426
...,...,...,...,...,...,...,...,...,...,...
5995,netmats_pearson,Ridge,PicSeq_AgeAdj,501,0.028744,1.0,-0.008928,0.328671,0.000999,0.594406
5996,netmats_pearson,Ridge,PicSeq_AgeAdj,501,-0.024520,1.0,0.004109,0.657343,0.000999,0.462537
5997,netmats_pearson,Ridge,PicSeq_AgeAdj,501,-0.116399,1.0,0.009268,0.991009,0.000999,0.435564
5998,netmats_pearson,Ridge,PicSeq_AgeAdj,501,0.074094,1.0,-0.033243,0.040959,0.000999,0.751249


In [18]:
df.groupby(['connectivity', 'model', 'target', 'n']).mean()[['r_discovery_cv', 'r_replication']]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,r_discovery_cv,r_replication
connectivity,model,target,n,Unnamed: 4_level_1,Unnamed: 5_level_1
netmats_parcor,Ridge,CardSort_AgeAdj,50,0.031644,-0.000121
netmats_parcor,Ridge,CardSort_AgeAdj,100,0.007479,0.014062
netmats_parcor,Ridge,CardSort_AgeAdj,200,-0.020488,0.010242
netmats_parcor,Ridge,CardSort_AgeAdj,300,-0.002089,-0.001656
netmats_parcor,Ridge,CardSort_AgeAdj,500,-0.000114,-0.00192
netmats_parcor,Ridge,CogTotalComp_AgeAdj,50,0.022939,-0.003553
netmats_parcor,Ridge,CogTotalComp_AgeAdj,100,-0.002252,-0.007803
netmats_parcor,Ridge,CogTotalComp_AgeAdj,200,-0.017407,0.008146
netmats_parcor,Ridge,CogTotalComp_AgeAdj,300,-0.009672,-0.005715
netmats_parcor,Ridge,CogTotalComp_AgeAdj,495,0.00842,-0.001433


*See the notebook called 'plot_results.ipynb' for the results.*

## Now we train on the whole dataset, to obtain effect size estimates on N=1000.

With PCA+SVR:

In [11]:
model = Pipeline([('pca', PCA(n_components=0.5)), ('svr', SVR())])
random_state = 42
cv = KFold(10, shuffle=True, random_state=random_state)

bar_data_svr = []

for target_var in ['age', 'CogTotalComp_AgeAdj', 'PMAT24_A_CR', 'Flanker_AgeAdj', 'CardSort_AgeAdj', 'PicSeq_AgeAdj']:
    print(target_var)
    X, y = create_data(target=target_var, feature_data=netmats_pearson)

    predicted_discovery_cv = np.zeros_like(y)
    cor_per_fold = np.zeros(cv.n_splits)
    i = 0
    for train, test in cv.split(X=X, y=y):
        model.fit(X=X[train], y=y[train])
        predicted_discovery_cv[test] = model.predict(X=X[test])
        cor_per_fold[i] = np.corrcoef(y[test], predicted_discovery_cv[test])[0,1]
        i += 1
    # correlation between the cross-validated predictions and observations in the discovery sample
    # this is the correct, unbiased estimate!
    # calculated as mean test performance across all folds
    r_disc_cv = np.mean(cor_per_fold)
    # finalize model by training it on the full discovery sample (without cross-validation)
    final_model = model.fit(X=X, y=y)
    # obtain predictions with the final model on the discovery sample, note that this model actually overfits this sample.
    # we do this only to demonstrate biased estimates
    predicted_discovery_overfit = final_model.predict(X=X)
    # here we obtain the biased effect size (r) estimates for demonstrational purposes
    r_disc_overfit = np.corrcoef(predicted_discovery_overfit, y)[0, 1]

    # below we calculate permutation-based p-values for all three effect size estimates (in-sample unbiased, in-sample biased, out-of-sample)
    # (one sided tests, testing for positive correlation)
    p_disc_cv = permutation_test(predicted_discovery_cv, y, method='approximate', num_rounds=1000, func=lambda x, y: np.corrcoef(x, y)[1][0],seed=random_state)
    p_disc_overfit = permutation_test(predicted_discovery_overfit, y, method='approximate', num_rounds=1000, func=lambda x, y: np.corrcoef(x, y)[1][0],seed=random_state)

    bar_data_svr.append(r_disc_cv)

    print('r =', np.round(r_disc_cv, 2), '\tp =', np.round(p_disc_cv, 3), '\tR2 =', np.round(r_disc_cv**2 * 100, 1),  '%')

age
r = 0.21 	p = 0.001 	R2 = 4.2 %
CogTotalComp_AgeAdj
r = 0.2 	p = 0.001 	R2 = 3.9 %
PMAT24_A_CR
r = 0.21 	p = 0.001 	R2 = 4.4 %
Flanker_AgeAdj
r = 0.12 	p = 0.001 	R2 = 1.5 %
CardSort_AgeAdj
r = 0.15 	p = 0.001 	R2 = 2.1 %
PicSeq_AgeAdj
r = 0.15 	p = 0.001 	R2 = 2.2 %


With Ridge and Pearson correlation:

In [12]:
model = Ridge()
random_state = 42
cv = KFold(10, shuffle=True, random_state=random_state)

for target_var in ['age', 'CogTotalComp_AgeAdj', 'PMAT24_A_CR', 'Flanker_AgeAdj', 'CardSort_AgeAdj', 'PicSeq_AgeAdj']:
    print(target_var)
    X, y = create_data(target=target_var, feature_data=netmats_pearson)

    predicted_discovery_cv = np.zeros_like(y)
    cor_per_fold = np.zeros(cv.n_splits)
    i = 0
    for train, test in cv.split(X=X, y=y):
        model.fit(X=X[train], y=y[train])
        predicted_discovery_cv[test] = model.predict(X=X[test])
        cor_per_fold[i] = np.corrcoef(y[test], predicted_discovery_cv[test])[0,1]
        i += 1
    # correlation between the cross-validated predictions and observations in the discovery sample
    # this is the correct, unbiased estimate!
    # calculated as mean test performance across all folds
    r_disc_cv = np.mean(cor_per_fold)
    # finalize model by training it on the full discovery sample (without cross-validation)
    final_model = model.fit(X=X, y=y)
    # obtain predictions with the final model on the discovery sample, note that this model actually overfits this sample.
    # we do this only to demonstrate biased estimates
    predicted_discovery_overfit = final_model.predict(X=X)
    # here we obtain the biased effect size (r) estimates for demonstrational purposes
    r_disc_overfit = np.corrcoef(predicted_discovery_overfit, y)[0, 1]

    # below we calculate permutation-based p-values for all three effect size estimates (in-sample unbiased, in-sample biased, out-of-sample)
    # (one sided tests, testing for positive correlation)
    p_disc_cv = permutation_test(predicted_discovery_cv, y, method='approximate', num_rounds=1000, func=lambda x, y: np.corrcoef(x, y)[1][0],seed=random_state)
    p_disc_overfit = permutation_test(predicted_discovery_overfit, y, method='approximate', num_rounds=1000, func=lambda x, y: np.corrcoef(x, y)[1][0],seed=random_state)

    print('r =', np.round(r_disc_cv, 2), '\tp =', np.round(p_disc_cv, 3), '\tR2 =', np.round(r_disc_cv**2 * 100, 1),  '%')

age
r = 0.45 	p = 0.001 	R2 = 20.0 %
CogTotalComp_AgeAdj
r = 0.4 	p = 0.001 	R2 = 16.2 %
PMAT24_A_CR
r = 0.25 	p = 0.001 	R2 = 6.3 %
Flanker_AgeAdj
r = 0.16 	p = 0.001 	R2 = 2.6 %
CardSort_AgeAdj
r = 0.17 	p = 0.001 	R2 = 2.8 %
PicSeq_AgeAdj
r = 0.23 	p = 0.001 	R2 = 5.5 %


With Ridge and partial correlation:

In [13]:
model = Ridge()
random_state = 42
cv = KFold(10, shuffle=True, random_state=random_state)

bar_data_ridge = []

for target_var in ['age', 'CogTotalComp_AgeAdj', 'PMAT24_A_CR', 'Flanker_AgeAdj', 'CardSort_AgeAdj', 'PicSeq_AgeAdj']:
    print(target_var)
    X, y = create_data(target=target_var, feature_data=netmats_parcor)

    predicted_discovery_cv = np.zeros_like(y)
    cor_per_fold = np.zeros(cv.n_splits)
    i = 0
    for train, test in cv.split(X=X, y=y):
        model.fit(X=X[train], y=y[train])
        predicted_discovery_cv[test] = model.predict(X=X[test])
        cor_per_fold[i] = np.corrcoef(y[test], predicted_discovery_cv[test])[0,1]
        i += 1
    # correlation between the cross-validated predictions and observations in the discovery sample
    # this is the correct, unbiased estimate!
    # calculated as mean test performance across all folds
    r_disc_cv = np.mean(cor_per_fold)
    # finalize model by training it on the full discovery sample (without cross-validation)
    final_model = model.fit(X=X, y=y)
    # obtain predictions with the final model on the discovery sample, note that this model actually overfits this sample.
    # we do this only to demonstrate biased estimates
    predicted_discovery_overfit = final_model.predict(X=X)
    # here we obtain the biased effect size (r) estimates for demonstrational purposes
    r_disc_overfit = np.corrcoef(predicted_discovery_overfit, y)[0, 1]

    # below we calculate permutation-based p-values for all three effect size estimates (in-sample unbiased, in-sample biased, out-of-sample)
    # (one sided tests, testing for positive correlation)
    p_disc_cv = permutation_test(predicted_discovery_cv, y, method='approximate', num_rounds=1000, func=lambda x, y: np.corrcoef(x, y)[1][0],seed=random_state)
    p_disc_overfit = permutation_test(predicted_discovery_overfit, y, method='approximate', num_rounds=1000, func=lambda x, y: np.corrcoef(x, y)[1][0],seed=random_state)

    bar_data_ridge.append(r_disc_cv)

    print('r =', np.round(r_disc_cv, 2), '\tp =', np.round(p_disc_cv, 3), '\tR2 =', np.round(r_disc_cv**2 * 100, 1),  '%')

age
r = 0.52 	p = 0.001 	R2 = 26.7 %
CogTotalComp_AgeAdj
r = 0.5 	p = 0.001 	R2 = 25.0 %
PMAT24_A_CR
r = 0.28 	p = 0.001 	R2 = 8.1 %
Flanker_AgeAdj
r = 0.15 	p = 0.001 	R2 = 2.1 %
CardSort_AgeAdj
r = 0.24 	p = 0.001 	R2 = 5.8 %
PicSeq_AgeAdj
r = 0.17 	p = 0.001 	R2 = 2.8 %


### *See the notebook called 'plot_results_FC.ipynb' for figures.*