# Analyzing replicability of connectivity-based multivariate BWAS on the Human Connectome Project dataset

## Imports

In [7]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import Ridge
from sklearn.svm import SVR
from sklearn.model_selection import KFold, train_test_split, cross_val_predict, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from joblib import Parallel, delayed
from mlxtend.evaluate import permutation_test
import fnmatch
sns.set(rc={"figure.figsize":(4, 2)})
sns.set_style("whitegrid")

## Load HCP data

We load functional network matrices (netmats) from the HCP1200-release, as published on connectomeDB: https://db.humanconnectome.org/
Due to licensoing issues, data is not supplied with the repository, but can be downloaded from the ConnectomeDB.
See [hcp_data/readme.md](hcp_data/readme.md) for more details.

In [4]:
# HCP data can be obtained from the connectomeDB
# data is not part of this repository
behavior = pd.read_csv('hcp_data/hcp1200_behavioral_data.csv')
behavior = behavior.set_index('Subject', drop=True)

# convert age to numeric
age = []
for s in behavior['Age']:
    if s == '36+':
        age.append(36)
    else:
        split = s.split(sep='-')
        age.append(np.mean((float(split[0]), float(split[1]))))

behavior['age'] = age
behavior

Unnamed: 0_level_0,Release,Acquisition,Gender,Age,3T_Full_MR_Compl,T1_Count,T2_Count,3T_RS-fMRI_Count,3T_RS-fMRI_PctCompl,3T_Full_Task_fMRI,...,Odor_Unadj,Odor_AgeAdj,PainIntens_RawScore,PainInterf_Tscore,Taste_Unadj,Taste_AgeAdj,Mars_Log_Score,Mars_Errs,Mars_Final,age
Subject,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
100004,S900,Q06,M,22-25,False,0,0,0,0.0,False,...,101.12,86.45,2.0,45.9,107.17,105.31,1.80,0.0,1.80,23.5
100206,S900,Q11,M,26-30,True,1,1,4,100.0,True,...,108.79,97.19,1.0,49.7,72.63,72.03,1.84,0.0,1.84,28.0
100307,Q1,Q01,F,26-30,True,1,1,4,100.0,True,...,101.12,86.45,0.0,38.6,71.69,71.76,1.76,0.0,1.76,28.0
100408,Q3,Q03,M,31-35,True,1,1,4,100.0,True,...,108.79,98.04,2.0,52.6,114.01,113.59,1.76,2.0,1.68,33.0
100610,S900,Q08,M,26-30,True,2,1,4,100.0,True,...,122.25,110.45,0.0,38.6,84.84,85.31,1.92,1.0,1.88,28.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
992774,Q2,Q02,M,31-35,True,2,2,4,100.0,True,...,122.25,111.41,4.0,50.1,107.17,103.55,1.76,0.0,1.76,33.0
993675,S900,Q09,F,26-30,True,2,2,4,100.0,True,...,122.25,110.45,0.0,38.6,84.07,84.25,1.80,1.0,1.76,28.0
994273,S500,Q06,M,26-30,True,1,1,4,100.0,True,...,122.25,111.41,7.0,63.8,110.65,109.73,1.80,1.0,1.76,28.0
995174,S1200,Q13,M,22-25,False,1,1,2,0.0,True,...,88.61,64.58,3.0,50.1,117.16,117.40,1.80,0.0,1.80,23.5


# Function to prepare target variable


In [19]:
def create_data(target='CogTotalComp_AgeAdj', feature_data="FS_*_Thck"):
    # it's a good practice to use pandas for merging, messing up subject order can be painful

    features = fnmatch.filter(behavior.columns.values, feature_data)

    df = behavior.dropna(subset = [target] + features)
    y = df[target].values
    X = df[features].values
    return X, y

# Function implementing a single bootstrap iteration

We define a workhorse function which:
- randomly samples the discovery and the replication datasets,
- creates cross-validated estimates of predictive performance within the discovery sample
- finalizes the model by fitting it to the whole discovery sample (overfits the discovery but not the replication sample)
- use it to predict the replication sample

In [20]:
def bootstrap_workhorse(X, y, sample_size, model, random_state, shuffle_y=False):

    #create discovery and replication samples by random sampling from the whole dataset (without replacement)

    if shuffle_y:
        rng = np.random.default_rng(random_state)
        y = rng.permutation(y)

    X_discovery, X_replication, y_discovery, y_replication = train_test_split(X, y, train_size=sample_size, test_size=sample_size, shuffle=True, random_state=random_state)

    cv = KFold(10)
    # obtain cross-validated predictions in the discovery sample

    predicted_discovery_cv = np.zeros_like(y_discovery)
    cor_per_fold = np.zeros(cv.n_splits)
    i = 0
    for train, test in cv.split(X=X_discovery, y=y_discovery):
        model.fit(X=X_discovery[train], y=y_discovery[train])
        predicted_discovery_cv[test] = model.predict(X=X_discovery[test])
        cor_per_fold[i] = np.corrcoef(y_discovery[test], predicted_discovery_cv[test])[0,1]
        i += 1
    # correlation between the cross-validated predictions and observations in the discovery sample
    # this is the correct, unbiased estimate!
    # calculated as mean test performance across all folds
    r_disc_cv = np.mean(cor_per_fold)
    # finalize model by training it on the full discovery sample (without cross-validation)
    final_model = model.fit(X=X_discovery, y=y_discovery)
    # obtain predictions with the final model on the discovery sample, note that this model actually overfits this sample.
    # we do this only to demonstrate biased estimates
    predicted_discovery_overfit = final_model.predict(X=X_discovery)
    # here we obtain the biased effect size (r) estimates for demonstrational purposes
    r_disc_overfit = np.corrcoef(predicted_discovery_overfit, y_discovery)[0, 1]

    # We use the final model to predict the replication sample
    # This is correct (no overfitting here), the final model did not see this data during training
    predicted_replication = final_model.predict(X=X_replication)
    # we obtain the out-of-sample prediction performance estimates
    r_rep = np.corrcoef(predicted_replication, y_replication)[0, 1]

    # below we calculate permutation-based p-values for all three effect size estimates (in-sample unbiased, in-sample biased, out-of-sample)
    # (one sided tests, testing for positive correlation)
    p_disc_cv = permutation_test(predicted_discovery_cv, y_discovery, method='approximate', num_rounds=1000, func=lambda x, y: np.corrcoef(x, y)[1][0],seed=random_state)
    p_disc_overfit = permutation_test(predicted_discovery_overfit, y_discovery, method='approximate', num_rounds=1000, func=lambda x, y: np.corrcoef(x, y)[1][0],seed=random_state)
    p_rep = permutation_test(predicted_replication, y_replication, method='approximate', num_rounds=1000, func=lambda x, y: np.corrcoef(x, y)[1][0],seed=random_state)
    # return results
    return r_disc_cv, r_disc_overfit, r_rep, p_disc_cv, p_disc_overfit, p_rep

All set, now we start the analysis.

# Replicability with sample sizes n=50, 100, 200, 300 and max
Here we train a few different models on 100 bootstrap samples.

We aggregate the results of our workhorse function in `n_bootstrap`=100 bootstrap cases (run in parallel).

The whole process is repeated for all sample sizes, fetaure_sets and target variables.

## Here we test age and 5 cognitive variables, including 'cognitive ability' (the main target variable in the target paper)
- age: age group of the participants
- CogTotalComp_AgeAdj: total cognitive ability
- PMAT24_A_CR, : Fluid Intelligence (Penn Progressive Matrices)
- CardSort_AgeAdj: Executive Function/Cognitive Flexibility (Dimensional Change Card Sort)
- Flanker_AgeAdj: Executive Function/Inhibition (Flanker Task)
- PicSeq_AgeAdj: Episodic Memory (Picture Sequence Memory)

# Reproducing the PCA+SVR-based model from the target paper
### Like in the target paper:
- Both PCA and SVR are done inside the cross-validation
- PCA reatains the firts k principal components that together explain 50% of the variance
- scikit-learn makes sure that PCA is only fit for the training samples
- both for the test sets (in the cross-validation) and the replication sample PCA is not re-fit, bt features are simply transformed with the already fit PCA

In [23]:
%%time

random_state = 42
n_bootstrap = 100

features = {
    'FS_CT': "FS_*_Thck"
}

models = {
    'PCA_SVR': Pipeline([('pca', PCA(n_components=0.5)),
                         ('svr', SVR())])

}

# We aggregate all results here:
df = pd.DataFrame(columns=['connectivity','model','target','n','r_discovery_cv','r_discovery_overfit','r_replication','p_discovery_cv','p_discovery_overfit','p_replication'])

for feature_set in features:
    for model in models:
        for target_var in ['age', 'CogTotalComp_AgeAdj', 'PMAT24_A_CR', 'Flanker_AgeAdj', 'CardSort_AgeAdj', 'PicSeq_AgeAdj']:
            for sample_size in [50, 100, 200, 300, 'max']:

                print('*****************************************************************')
                print(feature_set, model, target_var, sample_size)

                X, y = create_data(target=target_var, feature_data=features[feature_set])

                if sample_size=='max':
                    sample_size = int(len(y)/2)

                # create random seeds for each bootstrap iteration for reproducibility
                rng = np.random.default_rng(random_state)
                random_sates = rng.integers(np.iinfo(np.int32).max, size=n_bootstrap)

                # run bootstrap iterations in parallel
                r_discovery_cv, r_discovery_overfit, r_replication, p_discovery_cv, p_discovery_overfit, p_replication = zip(
                    *Parallel(n_jobs=-1)(
                    delayed(bootstrap_workhorse)(X, y, sample_size, models[model], seed) for seed in random_sates))

                tmp_data_frame = pd.DataFrame({
                    'connectivity' : feature_set,
                    'model' : model,
                    'target' : target_var,
                    'n' : sample_size,
                    'r_discovery_cv': r_discovery_cv,
                    'r_discovery_overfit': r_discovery_overfit,
                    'r_replication': r_replication,
                    'p_discovery_cv': p_discovery_cv,
                    'p_discovery_overfit': p_discovery_overfit,
                    'p_replication': p_replication
                })
                #sns.scatterplot(x='r_replication', y='r_discovery_cv', data=tmp_data_frame)
                #plt.ylabel('in-sample (r)')
                #plt.xlabel('out-of-sample (r_pred)')
                #plt.show()
                print(tmp_data_frame.r_discovery_cv.mean(), tmp_data_frame.r_replication.mean())

                for alpha in [0.05, 0.01, 0.005, 0.001]:
                    print('Replicability at alpha =', alpha, ':',
                          (tmp_data_frame.loc[tmp_data_frame['p_discovery_cv']<alpha,'p_replication']<alpha).sum() / (tmp_data_frame['p_discovery_cv']<0.05).sum() * 100, '%')

                df = pd.concat((df, tmp_data_frame))
                df.reset_index(drop=True, inplace=True)
                df.to_csv('res/results_thck_PCA_SVR.csv')

df

*****************************************************************
FS_CT PCA_SVR age 50
0.08589268853895356 0.11814430558345144
Replicability at alpha = 0.05 : 30.0 %
Replicability at alpha = 0.01 : 0.0 %
Replicability at alpha = 0.005 : 0.0 %
Replicability at alpha = 0.001 : 0.0 %
*****************************************************************
FS_CT PCA_SVR age 100
0.09976372318981333 0.1321490781638016
Replicability at alpha = 0.05 : 41.17647058823529 %
Replicability at alpha = 0.01 : 5.88235294117647 %
Replicability at alpha = 0.005 : 0.0 %
Replicability at alpha = 0.001 : 0.0 %
*****************************************************************
FS_CT PCA_SVR age 200
0.1319123546390201 0.12993985943621775
Replicability at alpha = 0.05 : 59.45945945945946 %
Replicability at alpha = 0.01 : 24.324324324324326 %
Replicability at alpha = 0.005 : 16.216216216216218 %
Replicability at alpha = 0.001 : 2.7027027027027026 %
*****************************************************************
FS_C



-0.010603773264953613 -0.019130199826790952
Replicability at alpha = 0.05 : 0.0 %
Replicability at alpha = 0.01 : 0.0 %
Replicability at alpha = 0.005 : 0.0 %
Replicability at alpha = 0.001 : 0.0 %
*****************************************************************
FS_CT PCA_SVR CardSort_AgeAdj 50
0.03207685702787072 0.030830317612604786
Replicability at alpha = 0.05 : 0.0 %
Replicability at alpha = 0.01 : 0.0 %
Replicability at alpha = 0.005 : 0.0 %
Replicability at alpha = 0.001 : 0.0 %
*****************************************************************
FS_CT PCA_SVR CardSort_AgeAdj 100
0.04021060189802274 0.03077808783073594
Replicability at alpha = 0.05 : 12.5 %
Replicability at alpha = 0.01 : 0.0 %
Replicability at alpha = 0.005 : 0.0 %
Replicability at alpha = 0.001 : 0.0 %
*****************************************************************
FS_CT PCA_SVR CardSort_AgeAdj 200
0.04137443676857757 0.04091365957382529
Replicability at alpha = 0.05 : 22.22222222222222 %
Replicability at alph



0.011373542001246479 0.004689831017528047
Replicability at alpha = 0.05 : 0.0 %
Replicability at alpha = 0.01 : 0.0 %
Replicability at alpha = 0.005 : 0.0 %
Replicability at alpha = 0.001 : 0.0 %
*****************************************************************
FS_CT PCA_SVR PicSeq_AgeAdj 200
0.02151173146450509 0.022081798461224947
Replicability at alpha = 0.05 : 25.0 %
Replicability at alpha = 0.01 : 0.0 %
Replicability at alpha = 0.005 : 0.0 %
Replicability at alpha = 0.001 : 0.0 %
*****************************************************************
FS_CT PCA_SVR PicSeq_AgeAdj 300
0.01946957724852659 0.030930201944994077
Replicability at alpha = 0.05 : 0.0 %
Replicability at alpha = 0.01 : 0.0 %
Replicability at alpha = 0.005 : 0.0 %
Replicability at alpha = 0.001 : 0.0 %
*****************************************************************
FS_CT PCA_SVR PicSeq_AgeAdj max
0.03010521006039739 0.03567733413187921
Replicability at alpha = 0.05 : 0.0 %
Replicability at alpha = 0.01 : 0.0 %
Rep

Unnamed: 0,connectivity,model,target,n,r_discovery_cv,r_discovery_overfit,r_replication,p_discovery_cv,p_discovery_overfit,p_replication
0,FS_CT,PCA_SVR,age,50,0.089136,0.468271,0.289675,0.612388,0.000999,0.023976
1,FS_CT,PCA_SVR,age,50,,0.551551,0.117138,0.655345,0.000999,0.215784
2,FS_CT,PCA_SVR,age,50,,0.546213,0.155825,0.959041,0.000999,0.14985
3,FS_CT,PCA_SVR,age,50,-0.178271,0.614155,-0.093113,0.982018,0.000999,0.727273
4,FS_CT,PCA_SVR,age,50,-0.032969,0.403477,0.113985,0.691309,0.003996,0.230769
...,...,...,...,...,...,...,...,...,...,...
2995,FS_CT,PCA_SVR,PicSeq_AgeAdj,556,0.012511,0.214361,0.039045,0.706294,0.000999,0.184815
2996,FS_CT,PCA_SVR,PicSeq_AgeAdj,556,0.083369,0.227902,0.041203,0.086913,0.000999,0.160839
2997,FS_CT,PCA_SVR,PicSeq_AgeAdj,556,0.055206,0.225934,0.043133,0.120879,0.000999,0.148851
2998,FS_CT,PCA_SVR,PicSeq_AgeAdj,556,-0.029972,0.153215,0.032032,0.809191,0.000999,0.227772


# Now we fit a simple Ridge regression
(no feature selection, no hyperparameter optimization)
This can be expected to perform better on low samples than SVR.

In [22]:
%%time

random_state = 42
n_bootstrap = 100

features = {
    'FS_CT': "FS_*_Thck"
}

models = {
    'ridge': Ridge()
}

# We aggregate all results here:
df = pd.DataFrame(columns=['connectivity','model','target','n','r_discovery_cv','r_discovery_overfit','r_replication','p_discovery_cv','p_discovery_overfit','p_replication'])

for feature_set in features:
    for model in models:
        for target_var in ['age', 'CogTotalComp_AgeAdj', 'PMAT24_A_CR', 'Flanker_AgeAdj', 'CardSort_AgeAdj', 'PicSeq_AgeAdj']:
            for sample_size in [50, 100, 200, 300, 'max']:

                print('*****************************************************************')
                print(feature_set, model, target_var, sample_size)

                X, y = create_data(target=target_var, feature_data=features[feature_set])

                if sample_size=='max':
                    sample_size = int(len(y)/2)

                # create random seeds for each bootstrap iteration for reproducibility
                rng = np.random.default_rng(random_state)
                random_sates = rng.integers(np.iinfo(np.int32).max, size=n_bootstrap)

                # run bootstrap iterations in parallel
                r_discovery_cv, r_discovery_overfit, r_replication, p_discovery_cv, p_discovery_overfit, p_replication = zip(
                    *Parallel(n_jobs=-1)(
                    delayed(bootstrap_workhorse)(X, y, sample_size, models[model], seed) for seed in random_sates))

                tmp_data_frame = pd.DataFrame({
                    'connectivity' : feature_set,
                    'model' : model,
                    'target' : target_var,
                    'n' : sample_size,
                    'r_discovery_cv': r_discovery_cv,
                    'r_discovery_overfit': r_discovery_overfit,
                    'r_replication': r_replication,
                    'p_discovery_cv': p_discovery_cv,
                    'p_discovery_overfit': p_discovery_overfit,
                    'p_replication': p_replication
                })
                print(tmp_data_frame.r_discovery_cv.mean(), tmp_data_frame.r_replication.mean())

                for alpha in [0.05, 0.01, 0.005, 0.001]:
                    print('Replicability at alpha =', alpha, ':',
                          (tmp_data_frame.loc[tmp_data_frame['p_discovery_cv']<alpha,'p_replication']<alpha).sum() / (tmp_data_frame['p_discovery_cv']<0.05).sum() * 100, '%')

                df = pd.concat((df, tmp_data_frame))
                df.reset_index(drop=True, inplace=True)
                df.to_csv('res/results_thck_Ridge.csv')

df


*****************************************************************
FS_CT ridge age 50
0.15322562774488466 0.16690836840736034
Replicability at alpha = 0.05 : 55.00000000000001 %
Replicability at alpha = 0.01 : 10.0 %
Replicability at alpha = 0.005 : 5.0 %
Replicability at alpha = 0.001 : 0.0 %
*****************************************************************
FS_CT ridge age 100
0.15399594921982113 0.1668875020529473
Replicability at alpha = 0.05 : 58.97435897435898 %
Replicability at alpha = 0.01 : 12.82051282051282 %
Replicability at alpha = 0.005 : 2.564102564102564 %
Replicability at alpha = 0.001 : 2.564102564102564 %
*****************************************************************
FS_CT ridge age 200
0.2227858933204806 0.22609433664584924
Replicability at alpha = 0.05 : 94.31818181818183 %
Replicability at alpha = 0.01 : 65.9090909090909 %
Replicability at alpha = 0.005 : 52.27272727272727 %
Replicability at alpha = 0.001 : 26.136363636363637 %
************************************

Unnamed: 0,connectivity,model,target,n,r_discovery_cv,r_discovery_overfit,r_replication,p_discovery_cv,p_discovery_overfit,p_replication
0,FS_CT,ridge,age,50,0.218019,0.861656,0.176967,0.193806,0.000999,0.125874
1,FS_CT,ridge,age,50,,0.73807,0.016571,0.172827,0.000999,0.492507
2,FS_CT,ridge,age,50,,0.784162,0.073507,0.995005,0.000999,0.318681
3,FS_CT,ridge,age,50,0.03394,0.831648,-0.006591,0.682318,0.000999,0.511489
4,FS_CT,ridge,age,50,0.153336,0.823246,0.126225,0.406593,0.000999,0.1998
...,...,...,...,...,...,...,...,...,...,...
2995,FS_CT,ridge,PicSeq_AgeAdj,556,0.206691,0.440568,0.073023,0.000999,0.000999,0.041958
2996,FS_CT,ridge,PicSeq_AgeAdj,556,0.168531,0.408207,0.088871,0.000999,0.000999,0.021978
2997,FS_CT,ridge,PicSeq_AgeAdj,556,0.038784,0.367894,0.098458,0.212787,0.000999,0.004995
2998,FS_CT,ridge,PicSeq_AgeAdj,556,0.113999,0.397635,0.089869,0.004995,0.000999,0.021978


# Null scenario with random target
To evaluate false positives with biased estimates

In [24]:
%%time

random_state = 42
n_bootstrap = 100

features = {
    'FS_CT': "FS_*_Thck"
}

models = {
    'PCA_SVR': Pipeline([('pca', PCA(n_components=0.5)),
                         ('svr', SVR())])

}

# We aggregate all results here:
df = pd.DataFrame(columns=['connectivity','model','target','n','r_discovery_cv','r_discovery_overfit','r_replication','p_discovery_cv','p_discovery_overfit','p_replication'])

for feature_set in features:
    for model in models:
        for target_var in ['age', 'CogTotalComp_AgeAdj', 'PMAT24_A_CR', 'Flanker_AgeAdj', 'CardSort_AgeAdj', 'PicSeq_AgeAdj']:
            for sample_size in [50, 100, 200, 300, 'max']:

                print('*****************************************************************')
                print(feature_set, model, target_var, sample_size)

                X, y = create_data(target=target_var, feature_data=features[feature_set]) # gives a random y when target is None

                if sample_size=='max':
                    sample_size = int(len(y)/2)

                # create random seeds for each bootstrap iteration for reproducibility
                rng = np.random.default_rng(random_state)
                random_sates = rng.integers(np.iinfo(np.int32).max, size=n_bootstrap)

                # run bootstrap iterations in parallel, with shuffle_y=True
                r_discovery_cv, r_discovery_overfit, r_replication, p_discovery_cv, p_discovery_overfit, p_replication = zip(
                    *Parallel(n_jobs=-1)(
                    delayed(bootstrap_workhorse)(X, y, sample_size, models[model], seed, shuffle_y=True) for seed in random_sates))

                tmp_data_frame = pd.DataFrame({
                    'connectivity' : feature_set,
                    'model' : model,
                    'target' : target_var,
                    'n' : sample_size,
                    'r_discovery_cv': r_discovery_cv,
                    'r_discovery_overfit': r_discovery_overfit,
                    'r_replication': r_replication,
                    'p_discovery_cv': p_discovery_cv,
                    'p_discovery_overfit': p_discovery_overfit,
                    'p_replication': p_replication
                })

                #sns.scatterplot(x='r_replication', y='r_discovery_cv', data=tmp_data_frame)
                #plt.ylabel('in-sample (r)')
                #plt.xlabel('out-of-sample (r_pred)')
                #plt.show()
                print(tmp_data_frame.r_discovery_cv.mean(), tmp_data_frame.r_replication.mean())

                for alpha in [0.05, 0.01, 0.005, 0.001]:
                    print('Replicability at alpha =', alpha, ':',
                          (tmp_data_frame.loc[tmp_data_frame['p_discovery_cv']<alpha,'p_replication']<alpha).sum() / (tmp_data_frame['p_discovery_cv']<0.05).sum() * 100, '%')

                df = pd.concat((df, tmp_data_frame))
                df.reset_index(drop=True, inplace=True)
                df.to_csv('res/results_thck_null_PCA_SVR.csv')

df

*****************************************************************
FS_CT PCA_SVR age 50
-0.006672762525657837 -0.007750463437365226
Replicability at alpha = 0.05 : 0.0 %
Replicability at alpha = 0.01 : 0.0 %
Replicability at alpha = 0.005 : 0.0 %
Replicability at alpha = 0.001 : 0.0 %
*****************************************************************
FS_CT PCA_SVR age 100
-0.0054298235725445355 0.007488203892819826
Replicability at alpha = 0.05 : 0.0 %
Replicability at alpha = 0.01 : 0.0 %
Replicability at alpha = 0.005 : 0.0 %
Replicability at alpha = 0.001 : 0.0 %
*****************************************************************
FS_CT PCA_SVR age 200
0.0006834925110597672 0.003009677488480728
Replicability at alpha = 0.05 : 20.0 %
Replicability at alpha = 0.01 : 0.0 %
Replicability at alpha = 0.005 : 0.0 %
Replicability at alpha = 0.001 : 0.0 %
*****************************************************************
FS_CT PCA_SVR age 300
-0.0021544089614625975 -0.006674668177288535
Replicabil



-0.0019681493786523753 0.01594666888263921
Replicability at alpha = 0.05 : 66.66666666666666 %
Replicability at alpha = 0.01 : 0.0 %
Replicability at alpha = 0.005 : 0.0 %
Replicability at alpha = 0.001 : 0.0 %
*****************************************************************
FS_CT PCA_SVR Flanker_AgeAdj 300
0.0021855114322033033 0.0020920437201013766
Replicability at alpha = 0.05 : 40.0 %
Replicability at alpha = 0.01 : 0.0 %
Replicability at alpha = 0.005 : 0.0 %
Replicability at alpha = 0.001 : 0.0 %
*****************************************************************
FS_CT PCA_SVR Flanker_AgeAdj max
-0.002452461838318572 0.0021759495908249722
Replicability at alpha = 0.05 : 0.0 %
Replicability at alpha = 0.01 : 0.0 %
Replicability at alpha = 0.005 : 0.0 %
Replicability at alpha = 0.001 : 0.0 %
*****************************************************************
FS_CT PCA_SVR CardSort_AgeAdj 50
0.010612277564530275 0.02340110949596856
Replicability at alpha = 0.05 : 0.0 %
Replicability at



0.010951429727356155 -0.007800783898997811
Replicability at alpha = 0.05 : 0.0 %
Replicability at alpha = 0.01 : 0.0 %
Replicability at alpha = 0.005 : 0.0 %
Replicability at alpha = 0.001 : 0.0 %
*****************************************************************
FS_CT PCA_SVR PicSeq_AgeAdj 200
0.003510008843708663 -0.0054599636733647415
Replicability at alpha = 0.05 : 0.0 %
Replicability at alpha = 0.01 : 0.0 %
Replicability at alpha = 0.005 : 0.0 %
Replicability at alpha = 0.001 : 0.0 %
*****************************************************************
FS_CT PCA_SVR PicSeq_AgeAdj 300
0.008344196030507038 0.0011190906096572842
Replicability at alpha = 0.05 : 33.33333333333333 %
Replicability at alpha = 0.01 : 0.0 %
Replicability at alpha = 0.005 : 0.0 %
Replicability at alpha = 0.001 : 0.0 %
*****************************************************************
FS_CT PCA_SVR PicSeq_AgeAdj max
-0.004087055146693218 -0.0036189329291476465
Replicability at alpha = 0.05 : 0.0 %
Replicability at 

Unnamed: 0,connectivity,model,target,n,r_discovery_cv,r_discovery_overfit,r_replication,p_discovery_cv,p_discovery_overfit,p_replication
0,FS_CT,PCA_SVR,age,50,,0.546397,-0.173968,0.925075,0.000999,0.906094
1,FS_CT,PCA_SVR,age,50,0.059573,0.512302,0.148245,0.523477,0.000999,0.146853
2,FS_CT,PCA_SVR,age,50,-0.057247,0.287055,-0.117199,0.976024,0.022977,0.791209
3,FS_CT,PCA_SVR,age,50,-0.425656,0.554909,0.021776,0.999001,0.000999,0.45954
4,FS_CT,PCA_SVR,age,50,0.406026,0.623122,-0.073094,0.064935,0.000999,0.695305
...,...,...,...,...,...,...,...,...,...,...
2995,FS_CT,PCA_SVR,PicSeq_AgeAdj,556,-0.059353,0.174407,-0.014806,0.992008,0.000999,0.631369
2996,FS_CT,PCA_SVR,PicSeq_AgeAdj,556,-0.01275,0.226049,-0.064682,0.921079,0.000999,0.929071
2997,FS_CT,PCA_SVR,PicSeq_AgeAdj,556,0.023795,0.193551,-0.064718,0.436563,0.000999,0.943057
2998,FS_CT,PCA_SVR,PicSeq_AgeAdj,556,-0.056596,0.195581,0.066811,0.962038,0.000999,0.054945


In [25]:
%%time

random_state = 42
n_bootstrap = 100

features = {
    'FS_CT': "FS_*_Thck"
}

models = {
    'Ridge': Ridge()

}

# We aggregate all results here:
df = pd.DataFrame(columns=['connectivity','model','target','n','r_discovery_cv','r_discovery_overfit','r_replication','p_discovery_cv','p_discovery_overfit','p_replication'])

for feature_set in features:
    for model in models:
        for target_var in ['age', 'CogTotalComp_AgeAdj', 'PMAT24_A_CR', 'Flanker_AgeAdj', 'CardSort_AgeAdj', 'PicSeq_AgeAdj']:
            for sample_size in [50, 100, 200, 300, 'max']:

                print('*****************************************************************')
                print(feature_set, model, target_var, sample_size)

                X, y = create_data(target=target_var, feature_data=features[feature_set]) # gives a random y when target is None

                if sample_size=='max':
                    sample_size = int(len(y)/2)

                # create random seeds for each bootstrap iteration for reproducibility
                rng = np.random.default_rng(random_state)
                random_sates = rng.integers(np.iinfo(np.int32).max, size=n_bootstrap)

                # run bootstrap iterations in parallel, with shuffle_y=True
                r_discovery_cv, r_discovery_overfit, r_replication, p_discovery_cv, p_discovery_overfit, p_replication = zip(
                    *Parallel(n_jobs=-1)(
                    delayed(bootstrap_workhorse)(X, y, sample_size, models[model], seed, shuffle_y=True) for seed in random_sates))

                tmp_data_frame = pd.DataFrame({
                    'connectivity' : feature_set,
                    'model' : model,
                    'target' : target_var,
                    'n' : sample_size,
                    'r_discovery_cv': r_discovery_cv,
                    'r_discovery_overfit': r_discovery_overfit,
                    'r_replication': r_replication,
                    'p_discovery_cv': p_discovery_cv,
                    'p_discovery_overfit': p_discovery_overfit,
                    'p_replication': p_replication
                })

                #sns.scatterplot(x='r_replication', y='r_discovery_cv', data=tmp_data_frame)
                #plt.ylabel('in-sample (r)')
                #plt.xlabel('out-of-sample (r_pred)')
                #plt.show()
                print(tmp_data_frame.r_discovery_cv.mean(), tmp_data_frame.r_replication.mean())

                for alpha in [0.05, 0.01, 0.005, 0.001]:
                    print('Replicability at alpha =', alpha, ':',
                          (tmp_data_frame.loc[tmp_data_frame['p_discovery_cv']<alpha,'p_replication']<alpha).sum() / (tmp_data_frame['p_discovery_cv']<0.05).sum() * 100, '%')

                df = pd.concat((df, tmp_data_frame))
                df.reset_index(drop=True, inplace=True)
                df.to_csv('res/results_thck_null_Ridge.csv')

df

*****************************************************************
FS_CT Ridge age 50
0.011551371488371967 0.009634153435139901
Replicability at alpha = 0.05 : 0.0 %
Replicability at alpha = 0.01 : 0.0 %
Replicability at alpha = 0.005 : 0.0 %
Replicability at alpha = 0.001 : 0.0 %
*****************************************************************
FS_CT Ridge age 100
0.016002537969113658 0.003275680371646046
Replicability at alpha = 0.05 : 0.0 %
Replicability at alpha = 0.01 : 0.0 %
Replicability at alpha = 0.005 : 0.0 %
Replicability at alpha = 0.001 : 0.0 %
*****************************************************************
FS_CT Ridge age 200
0.002310138893259383 -0.0017354115125003684
Replicability at alpha = 0.05 : 0.0 %
Replicability at alpha = 0.01 : 0.0 %
Replicability at alpha = 0.005 : 0.0 %
Replicability at alpha = 0.001 : 0.0 %
*****************************************************************
FS_CT Ridge age 300
0.0027130085033840023 0.0011849069308048385
Replicability at alpha 

Unnamed: 0,connectivity,model,target,n,r_discovery_cv,r_discovery_overfit,r_replication,p_discovery_cv,p_discovery_overfit,p_replication
0,FS_CT,Ridge,age,50,,0.806797,0.216026,0.828172,0.000999,0.068931
1,FS_CT,Ridge,age,50,-0.183389,0.780715,0.027967,0.939061,0.000999,0.404595
2,FS_CT,Ridge,age,50,-0.237258,0.753463,0.070626,0.943057,0.000999,0.331668
3,FS_CT,Ridge,age,50,0.143637,0.834714,0.047186,0.472527,0.000999,0.380619
4,FS_CT,Ridge,age,50,0.098473,0.843384,-0.059921,0.626374,0.000999,0.646354
...,...,...,...,...,...,...,...,...,...,...
2995,FS_CT,Ridge,PicSeq_AgeAdj,556,0.06367,0.37769,-0.097926,0.068931,0.000999,0.984016
2996,FS_CT,Ridge,PicSeq_AgeAdj,556,-0.060496,0.318082,-0.003332,0.934066,0.000999,0.548452
2997,FS_CT,Ridge,PicSeq_AgeAdj,556,-0.167463,0.284611,0.038488,1.0,0.000999,0.190809
2998,FS_CT,Ridge,PicSeq_AgeAdj,556,0.018009,0.341041,0.039556,0.396603,0.000999,0.168831


*See the notebook called 'plot_results.ipynb' for the results.*