
# Analyzing replicability of functional connectivity-based multivariate BWAS on the Human Connectome Project dataset

This notebook contains the main analyses of the Matters Arising, with 6 example phenotypes.
For a comprehensive analysis of 52 HCP phenotypes, see `multivariate_BWAS_replicability_analysis_FC_extensive.ipynb`.
For an analysis with finer sample size resolution, see `multivariate_BWAS_replicability_analysis_FC_hires.ipynb`.
For an analysis with cortical thickness data, see `multivariate_BWAS_replicability_analysis_CT.ipynb`.

## Imports

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import Ridge
from sklearn.svm import SVR
from sklearn.model_selection import KFold, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from joblib import Parallel, delayed
from mlxtend.evaluate import permutation_test
sns.set(rc={"figure.figsize":(4, 2)})
sns.set_style("whitegrid")

## Load HCP data

We load functional network matrices (netmats) from the HCP1200-release, as published on connectomeDB: https://db.humanconnectome.org/
Due to licensing issues, data is not supplied with the repository, but can be downloaded from the ConnectomeDB or via `get_data.ipynb` (requires credentials).
See [readme.md](readme.md) for more details.

In [2]:
# HCP data can be obtained from the connectomeDB
# (the data files are not part of this repository)
subjectIDs = pd.read_csv('hcp_data/subjectIDs.txt', header=None)

# full-correlation netmats (netmats1_correlationZ.txt), rows labelled with the subject IDs
netmats_pearson = pd.read_csv('hcp_data/netmats1_correlationZ.txt', sep=' ', header=None)
netmats_pearson = netmats_pearson.assign(ID=subjectIDs[0]).set_index('ID', drop=True)

# partial-correlation netmats (netmats2_partial-correlation.txt), rows labelled the same way
netmats_parcor = pd.read_csv('hcp_data/netmats2_partial-correlation.txt', sep=' ', header=None)
netmats_parcor = netmats_parcor.assign(ID=subjectIDs[0]).set_index('ID', drop=True)

# behavioral table, indexed by subject so that it can be merged with the netmats
behavior = pd.read_csv('hcp_data/hcp1200_behavioral_data.csv').set_index('Subject', drop=True)

# convert the age group labels to numbers:
# '36+' becomes 36, a 'low-high' range becomes the mean of its two bounds
age = []
for age_label in behavior['Age']:
    if age_label != '36+':
        bounds = age_label.split(sep='-')
        age.append(np.mean((float(bounds[0]), float(bounds[1]))))
        continue
    age.append(36)

behavior['age'] = age
behavior.describe()

Unnamed: 0,T1_Count,T2_Count,3T_RS-fMRI_Count,3T_RS-fMRI_PctCompl,3T_tMRI_PctCompl,fMRI_WM_PctCompl,fMRI_Gamb_PctCompl,fMRI_Mot_PctCompl,fMRI_Lang_PctCompl,fMRI_Soc_PctCompl,...,Odor_Unadj,Odor_AgeAdj,PainIntens_RawScore,PainInterf_Tscore,Taste_Unadj,Taste_AgeAdj,Mars_Log_Score,Mars_Errs,Mars_Final,age
count,1206.0,1206.0,1206.0,1206.0,1206.0,1206.0,1206.0,1206.0,1206.0,1206.0,...,1204.0,1204.0,1201.0,1205.0,1200.0,1200.0,1198.0,1195.0,1195.0,1206.0
mean,1.478441,1.400498,3.50995,87.213267,88.219569,89.718076,89.852736,89.631675,87.027114,87.039801,...,110.421321,97.7275,1.449625,45.847718,95.166983,93.998533,1.845467,0.58159,1.822251,28.904229
std,0.635688,0.628216,1.215181,31.027886,29.942161,30.384864,30.163559,30.492092,33.566043,33.570248,...,9.107963,11.273251,1.783069,7.679288,14.583412,14.837851,0.541393,0.973172,0.542893,3.570475
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,82.74,59.86,0.0,38.6,56.35,59.5,1.56,0.0,1.08,23.5
25%,1.0,1.0,4.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,...,101.12,87.11,0.0,38.6,84.07,83.22,1.8,0.0,1.76,28.0
50%,2.0,1.0,4.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,...,108.79,98.04,1.0,45.9,95.36,94.97,1.8,0.0,1.8,28.0
75%,2.0,2.0,4.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,...,122.25,110.45,2.0,52.2,105.57,102.92,1.88,1.0,1.84,33.0
max,2.0,2.0,4.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,...,122.25,111.41,10.0,75.3,134.65,131.38,15.0,17.0,15.0,36.0


# Helper function to prepare the features and the target variable


In [3]:
def create_data(target='CogTotalComp_AgeAdj', feature_data=netmats_parcor):
    """Build the feature matrix and target vector for one phenotype.

    The behavioral table (`behavior`) is left-joined with `feature_data` on the
    index, then every row with a missing value in the target column or in any
    feature column is dropped.

    Parameters
    ----------
    target : str
        Name of the column of `behavior` to be used as target variable.
    feature_data : pandas.DataFrame
        Feature table; its columns are used as features and its index is
        matched against the index of `behavior`.

    Returns
    -------
    X : numpy.ndarray
        Feature values of the retained rows.
    y : numpy.ndarray
        Target values of the same rows, in the same order.
    """
    # merging with pandas keeps features and target aligned by subject
    feature_columns = feature_data.columns
    merged = behavior.merge(feature_data, left_index=True, right_index=True, how='left')

    # keep complete cases only
    required_columns = [target] + feature_columns.values.tolist()
    complete_cases = merged.dropna(subset=required_columns)

    return complete_cases[feature_columns].values, complete_cases[target].values

# Helper function implementing a single bootstrap iteration

We define a workhorse function which:
- randomly samples the discovery and the replication datasets,
- creates cross-validated estimates of predictive performance within the discovery sample
- finalizes the model by fitting it to the whole discovery sample (overfits the discovery but not the replication sample)
- uses the finalized model to predict the replication sample
- computes permutation-based p-values for the resulting effect size estimates

In [4]:
def corr(X, Y):
    """Return the correlation between X and Y, computed with pandas.

    Small wrapper: both inputs are converted to pandas Series first
    (pandas correlation is silent in "unlucky" bootstraps with constant values).
    """
    series_x = pd.Series(X)
    series_y = pd.Series(Y)
    return series_x.corr(series_y)

def bootstrap_workhorse(X, y, sample_size, model, random_state, shuffle_y=False):
    """Run a single discovery/replication iteration.

    Parameters
    ----------
    X : numpy.ndarray
        Feature matrix (one row per participant).
    y : numpy.ndarray
        Target values, aligned with the rows of X.
    sample_size : int
        Size of the discovery sample and of the replication sample
        (passed to `train_test_split` as both `train_size` and `test_size`).
    model : object
        Estimator providing `fit(X=..., y=...)` and `predict(X=...)`.
        Note that it is (re-)fitted in place.
    random_state : int
        Seed used for the optional permutation of y, for the
        discovery/replication split and for the permutation tests.
    shuffle_y : bool, default False
        If True, y is permuted before splitting, which creates a null scenario.

    Returns
    -------
    tuple
        (r_disc_cv, r_disc_overfit, r_rep, p_disc_cv, p_disc_overfit, p_rep):
        the mean per-fold cross-validated correlation in the discovery sample,
        the biased in-sample correlation of the finalized model, the
        out-of-sample correlation in the replication sample, and the
        permutation-based p-value belonging to each of the three.
    """
    # if shuffle_y is true, a null model is created by permuting y
    if shuffle_y:
        rng = np.random.default_rng(random_state)
        y = rng.permutation(y)

    # sample the discovery and replication sets *without replacement*
    # (sampling with replacement introduces spurious dependencies)
    X_discovery, X_replication, y_discovery, y_replication = train_test_split(
        X, y, train_size=sample_size, test_size=sample_size, shuffle=True, random_state=random_state)

    # standard 10-fold cross-validation
    cv = KFold(10)

    # below we obtain cross-validated predictions in the discovery sample
    # the buffer is explicitly float: inheriting an integer dtype from y would silently truncate the predictions
    predicted_discovery_cv = np.zeros_like(y_discovery, dtype=float)  # here we collect the predictions for each fold
    cor_per_fold = np.zeros(cv.n_splits)  # here we collect the predictive performance in each fold
    for fold, (train, test) in enumerate(cv.split(X=X_discovery, y=y_discovery)):  # loop to leave one fold out
        model.fit(X=X_discovery[train], y=y_discovery[train])  # fit model to the training set
        predicted_discovery_cv[test] = model.predict(X=X_discovery[test])  # use fitted model to predict the test set
        cor_per_fold[fold] = corr(y_discovery[test], predicted_discovery_cv[test])  # calculate performance on the test set
    # calculate mean test performance across all folds
    r_disc_cv = np.mean(cor_per_fold)

    # 'finalize' model by training it on the full discovery sample (without cross-validation)
    final_model = model.fit(X=X_discovery, y=y_discovery)
    # obtain predictions with the final model on the discovery sample, note that this model actually overfits this sample.
    # we do this only to demonstrate biased estimates
    predicted_discovery_overfit = final_model.predict(X=X_discovery)
    # here we obtain the biased effect size (r) estimates for demonstrational purposes
    r_disc_overfit = corr(predicted_discovery_overfit, y_discovery)

    # We use the final model to predict the replication sample
    # This is correct (no overfitting here), the final model did not see this data during training
    predicted_replication = final_model.predict(X=X_replication)
    # we obtain the out-of-sample prediction performance estimates
    r_rep = corr(predicted_replication, y_replication)

    # below we calculate permutation-based p-values for all three effect size estimates
    # (in-sample unbiased, in-sample biased, out-of-sample)
    # (one sided tests, testing for positive correlation)
    p_disc_cv = permutation_test(predicted_discovery_cv, y_discovery,
                                 method='approximate', num_rounds=1000, func=corr, seed=random_state)
    p_disc_overfit = permutation_test(predicted_discovery_overfit, y_discovery,
                                      method='approximate', num_rounds=1000, func=corr, seed=random_state)
    p_rep = permutation_test(predicted_replication, y_replication,
                             method='approximate', num_rounds=1000, func=corr, seed=random_state)
    # return results
    return r_disc_cv, r_disc_overfit, r_rep, p_disc_cv, p_disc_overfit, p_rep

All set, now we start the analysis.

# Replicability with sample sizes n=50, 100, 200, 300 and max
Here we train a few different models on 100 bootstrap samples.

We aggregate the results of our workhorse function in `n_bootstrap`=100 bootstrap cases (run in parallel).

The whole process is repeated for all sample sizes, feature sets and target variables.

## Here we test age and 5 cognitive variables, including 'cognitive ability' (the main target variable in the target paper)
- age: age group of the participants
- CogTotalComp_AgeAdj: total cognitive ability
- PMAT24_A_CR: Fluid Intelligence (Penn Progressive Matrices)
- CardSort_AgeAdj: Executive Function/Cognitive Flexibility (Dimensional Change Card Sort)
- Flanker_AgeAdj: Executive Function/Inhibition (Flanker Task)
- PicSeq_AgeAdj: Episodic Memory (Picture Sequence Memory)

# Reproducing the PCA+SVR-based model from the target paper
- Both PCA and SVR are done inside the cross-validation
- PCA retains the first k principal components that together explain 50% of the variance
- scikit-learn makes sure that PCA is only fit on the training samples
- both for the test sets (in the cross-validation) and for the replication sample, PCA is not re-fit; the features are simply transformed with the already fitted PCA

In [5]:
%%time
%%capture

random_state = 42
n_bootstrap = 100

features = {
    'netmats_parcor': netmats_parcor,
    'netmats_pearson': netmats_pearson
}

models = {
    'PCA_SVR': Pipeline([('pca', PCA(n_components=0.5)),
                         ('svr', SVR())])

}

# create random seeds for each bootstrap iteration for reproducibility
# (they depend only on random_state and n_bootstrap, so every configuration below uses the same seeds)
rng = np.random.default_rng(random_state)
random_sates = rng.integers(np.iinfo(np.int32).max, size=n_bootstrap)

# We aggregate the per-configuration results here; `df` always holds everything computed so far
result_frames = []

for feature_set in features:
    for model in models:
        for target_var in ['age', 'CogTotalComp_AgeAdj', 'PMAT24_A_CR', 'Flanker_AgeAdj', 'CardSort_AgeAdj', 'PicSeq_AgeAdj']:

            # the data depends only on the feature set and the target, not on the sample size
            X, y = create_data(target=target_var, feature_data=features[feature_set])

            for sample_size in [50, 100, 200, 300, 'max']:

                print('*****************************************************************')
                print(feature_set, model, target_var, sample_size)

                # 'max' means two non-overlapping halves of the available data
                if sample_size == 'max':
                    sample_size = int(len(y)/2)

                # run bootstrap iterations in parallel
                r_discovery_cv, r_discovery_overfit, r_replication, p_discovery_cv, p_discovery_overfit, p_replication = zip(
                    *Parallel(n_jobs=-1)(
                    delayed(bootstrap_workhorse)(X, y, sample_size, models[model], seed) for seed in random_sates))

                tmp_data_frame = pd.DataFrame({
                    'connectivity' : feature_set,
                    'model' : model,
                    'target' : target_var,
                    'n' : sample_size,
                    'r_discovery_cv': r_discovery_cv,
                    'r_discovery_overfit': r_discovery_overfit,
                    'r_replication': r_replication,
                    'p_discovery_cv': p_discovery_cv,
                    'p_discovery_overfit': p_discovery_overfit,
                    'p_replication': p_replication
                })
                print('r discovery (with cv) :', tmp_data_frame.r_discovery_cv.mean(), 'r replication:', tmp_data_frame.r_replication.mean())

                # replicability: among the iterations significant in the discovery sample at a given alpha,
                # the percentage that is also significant in the replication sample at the same alpha
                for alpha in [0.05, 0.01, 0.005, 0.001]:
                    discovered = tmp_data_frame['p_discovery_cv'] < alpha
                    n_discovered = discovered.sum()
                    n_replicated = (tmp_data_frame.loc[discovered, 'p_replication'] < alpha).sum()
                    # undefined (NaN) when nothing was significant in the discovery sample
                    replicability = n_replicated / n_discovered * 100 if n_discovered > 0 else np.nan
                    print('Replicability at alpha =', alpha, ':', replicability, '%')

                # save after every configuration, so that partial results survive an interrupted run
                result_frames.append(tmp_data_frame)
                df = pd.concat(result_frames, ignore_index=True)
                df.to_csv('res/results_PCA_SVR.csv')

CPU times: user 46.5 s, sys: 6.18 s, total: 52.7 s
Wall time: 3h 21min 54s


In [6]:
# mean in-sample (cross-validated) and out-of-sample effect size per configuration
df.groupby(['connectivity', 'model', 'target', 'n'])[['r_discovery_cv', 'r_replication']].mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,r_discovery_cv,r_replication
connectivity,model,target,n,Unnamed: 4_level_1,Unnamed: 5_level_1
netmats_parcor,PCA_SVR,CardSort_AgeAdj,50,0.021165,0.060743
netmats_parcor,PCA_SVR,CardSort_AgeAdj,100,0.077595,0.08297
netmats_parcor,PCA_SVR,CardSort_AgeAdj,200,0.097733,0.102561
netmats_parcor,PCA_SVR,CardSort_AgeAdj,300,0.122905,0.123971
netmats_parcor,PCA_SVR,CardSort_AgeAdj,500,0.139694,0.14955
netmats_parcor,PCA_SVR,CogTotalComp_AgeAdj,50,0.128129,0.140322
netmats_parcor,PCA_SVR,CogTotalComp_AgeAdj,100,0.190058,0.212629
netmats_parcor,PCA_SVR,CogTotalComp_AgeAdj,200,0.262374,0.271359
netmats_parcor,PCA_SVR,CogTotalComp_AgeAdj,300,0.303186,0.310136
netmats_parcor,PCA_SVR,CogTotalComp_AgeAdj,495,0.337404,0.346112


# Now we fit a simple Ridge regression
(no feature selection, no hyperparameter optimization)
Based on some previous studies, this can be expected to perform better than SVR (especially with lower sample sizes). See the paper for details.

In [7]:
%%time
%%capture

random_state = 42
n_bootstrap = 100

features = {
    'netmats_parcor': netmats_parcor,
    'netmats_pearson': netmats_pearson
}

models = {
    'ridge': Ridge()
}

# create random seeds for each bootstrap iteration for reproducibility
# (they depend only on random_state and n_bootstrap, so every configuration below uses the same seeds)
rng = np.random.default_rng(random_state)
random_sates = rng.integers(np.iinfo(np.int32).max, size=n_bootstrap)

# We aggregate the per-configuration results here; `df` always holds everything computed so far
result_frames = []

for feature_set in features:
    for model in models:
        for target_var in ['age', 'CogTotalComp_AgeAdj', 'PMAT24_A_CR', 'Flanker_AgeAdj', 'CardSort_AgeAdj', 'PicSeq_AgeAdj']:

            # the data depends only on the feature set and the target, not on the sample size
            X, y = create_data(target=target_var, feature_data=features[feature_set])

            for sample_size in [50, 100, 200, 300, 'max']:

                print('*****************************************************************')
                print(feature_set, model, target_var, sample_size)

                # 'max' means two non-overlapping halves of the available data
                if sample_size == 'max':
                    sample_size = int(len(y)/2)

                # run bootstrap iterations in parallel
                r_discovery_cv, r_discovery_overfit, r_replication, p_discovery_cv, p_discovery_overfit, p_replication = zip(
                    *Parallel(n_jobs=-1)(
                    delayed(bootstrap_workhorse)(X, y, sample_size, models[model], seed) for seed in random_sates))

                tmp_data_frame = pd.DataFrame({
                    'connectivity' : feature_set,
                    'model' : model,
                    'target' : target_var,
                    'n' : sample_size,
                    'r_discovery_cv': r_discovery_cv,
                    'r_discovery_overfit': r_discovery_overfit,
                    'r_replication': r_replication,
                    'p_discovery_cv': p_discovery_cv,
                    'p_discovery_overfit': p_discovery_overfit,
                    'p_replication': p_replication
                })
                print('r discovery (with cv) :', tmp_data_frame.r_discovery_cv.mean(), 'r replication:', tmp_data_frame.r_replication.mean())

                # replicability: among the iterations significant in the discovery sample at a given alpha,
                # the percentage that is also significant in the replication sample at the same alpha
                for alpha in [0.05, 0.01, 0.005, 0.001]:
                    discovered = tmp_data_frame['p_discovery_cv'] < alpha
                    n_discovered = discovered.sum()
                    n_replicated = (tmp_data_frame.loc[discovered, 'p_replication'] < alpha).sum()
                    # undefined (NaN) when nothing was significant in the discovery sample
                    replicability = n_replicated / n_discovered * 100 if n_discovered > 0 else np.nan
                    print('Replicability at alpha =', alpha, ':', replicability, '%')

                # save after every configuration, so that partial results survive an interrupted run
                result_frames.append(tmp_data_frame)
                df = pd.concat(result_frames, ignore_index=True)
                df.to_csv('res/results_Ridge.csv')

CPU times: user 40.8 s, sys: 5.54 s, total: 46.3 s
Wall time: 27min 10s


In [8]:
# mean in-sample (cross-validated) and out-of-sample effect size per configuration
df.groupby(['connectivity', 'model', 'target', 'n'])[['r_discovery_cv', 'r_replication']].mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,r_discovery_cv,r_replication
connectivity,model,target,n,Unnamed: 4_level_1,Unnamed: 5_level_1
netmats_parcor,ridge,CardSort_AgeAdj,50,0.033223,0.090787
netmats_parcor,ridge,CardSort_AgeAdj,100,0.073555,0.101448
netmats_parcor,ridge,CardSort_AgeAdj,200,0.119109,0.142017
netmats_parcor,ridge,CardSort_AgeAdj,300,0.164847,0.167188
netmats_parcor,ridge,CardSort_AgeAdj,500,0.178966,0.189577
netmats_parcor,ridge,CogTotalComp_AgeAdj,50,0.212834,0.256673
netmats_parcor,ridge,CogTotalComp_AgeAdj,100,0.308142,0.324413
netmats_parcor,ridge,CogTotalComp_AgeAdj,200,0.404524,0.406206
netmats_parcor,ridge,CogTotalComp_AgeAdj,300,0.431054,0.44389
netmats_parcor,ridge,CogTotalComp_AgeAdj,495,0.472652,0.479039


# Null scenario with random target
To evaluate false positives with biased estimates (Figure 1d).

First for PCA+SVR.

In [9]:
%%time
%%capture

random_state = 42
n_bootstrap = 100

features = {
    'netmats_parcor': netmats_parcor,
    'netmats_pearson': netmats_pearson
}

models = {
    'PCA_SVR': Pipeline([('pca', PCA(n_components=0.5)),
                         ('svr', SVR())])

}

# create random seeds for each bootstrap iteration for reproducibility
# (they depend only on random_state and n_bootstrap, so every configuration below uses the same seeds)
rng = np.random.default_rng(random_state)
random_sates = rng.integers(np.iinfo(np.int32).max, size=n_bootstrap)

# We aggregate the per-configuration results here; `df` always holds everything computed so far
result_frames = []

for feature_set in features:
    for model in models:
        for target_var in ['age', 'CogTotalComp_AgeAdj', 'PMAT24_A_CR', 'Flanker_AgeAdj', 'CardSort_AgeAdj', 'PicSeq_AgeAdj']:

            # the real target is loaded here; the null scenario is created inside
            # bootstrap_workhorse, which permutes y when shuffle_y=True
            X, y = create_data(target=target_var, feature_data=features[feature_set])

            for sample_size in [50, 100, 200, 300, 'max']:

                print('*****************************************************************')
                print(feature_set, model, target_var, sample_size)

                # 'max' means two non-overlapping halves of the available data
                if sample_size == 'max':
                    sample_size = int(len(y)/2)

                # run bootstrap iterations in parallel, with shuffle_y=True
                r_discovery_cv, r_discovery_overfit, r_replication, p_discovery_cv, p_discovery_overfit, p_replication = zip(
                    *Parallel(n_jobs=-1)(
                    delayed(bootstrap_workhorse)(X, y, sample_size, models[model], seed, shuffle_y=True) for seed in random_sates))

                tmp_data_frame = pd.DataFrame({
                    'connectivity' : feature_set,
                    'model' : model,
                    'target' : target_var,
                    'n' : sample_size,
                    'r_discovery_cv': r_discovery_cv,
                    'r_discovery_overfit': r_discovery_overfit,
                    'r_replication': r_replication,
                    'p_discovery_cv': p_discovery_cv,
                    'p_discovery_overfit': p_discovery_overfit,
                    'p_replication': p_replication
                })
                print('r discovery (with cv) :', tmp_data_frame.r_discovery_cv.mean(), 'r replication:', tmp_data_frame.r_replication.mean())

                # replicability: among the iterations significant in the discovery sample at a given alpha,
                # the percentage that is also significant in the replication sample at the same alpha
                for alpha in [0.05, 0.01, 0.005, 0.001]:
                    discovered = tmp_data_frame['p_discovery_cv'] < alpha
                    n_discovered = discovered.sum()
                    n_replicated = (tmp_data_frame.loc[discovered, 'p_replication'] < alpha).sum()
                    # undefined (NaN) when nothing was significant in the discovery sample
                    replicability = n_replicated / n_discovered * 100 if n_discovered > 0 else np.nan
                    print('Replicability at alpha =', alpha, ':', replicability, '%')

                # save after every configuration, so that partial results survive an interrupted run
                result_frames.append(tmp_data_frame)
                df = pd.concat(result_frames, ignore_index=True)
                df.to_csv('res/results_null_PCA_SVR.csv')

df

CPU times: user 46.5 s, sys: 6 s, total: 52.5 s
Wall time: 3h 16min 32s


In [10]:
# mean in-sample (cross-validated) and out-of-sample effect size per configuration
df.groupby(['connectivity', 'model', 'target', 'n'])[['r_discovery_cv', 'r_replication']].mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,r_discovery_cv,r_replication
connectivity,model,target,n,Unnamed: 4_level_1,Unnamed: 5_level_1
netmats_parcor,PCA_SVR,CardSort_AgeAdj,50,0.028683,0.002786
netmats_parcor,PCA_SVR,CardSort_AgeAdj,100,0.006824,0.011154
netmats_parcor,PCA_SVR,CardSort_AgeAdj,200,-0.011921,0.004941
netmats_parcor,PCA_SVR,CardSort_AgeAdj,300,0.000561,-0.003558
netmats_parcor,PCA_SVR,CardSort_AgeAdj,500,-0.002693,0.004389
netmats_parcor,PCA_SVR,CogTotalComp_AgeAdj,50,-0.001601,0.004007
netmats_parcor,PCA_SVR,CogTotalComp_AgeAdj,100,0.005333,-0.01209
netmats_parcor,PCA_SVR,CogTotalComp_AgeAdj,200,-0.001798,-0.012032
netmats_parcor,PCA_SVR,CogTotalComp_AgeAdj,300,0.008406,-0.009813
netmats_parcor,PCA_SVR,CogTotalComp_AgeAdj,495,-0.007775,-0.002075


Then with Ridge.

In [11]:
%%time
%%capture

random_state = 42
n_bootstrap = 100

features = {
    'netmats_parcor': netmats_parcor,
    'netmats_pearson': netmats_pearson
}

models = {
    'Ridge': Ridge()

}

# create random seeds for each bootstrap iteration for reproducibility
# (they depend only on random_state and n_bootstrap, so every configuration below uses the same seeds)
rng = np.random.default_rng(random_state)
random_sates = rng.integers(np.iinfo(np.int32).max, size=n_bootstrap)

# We aggregate the per-configuration results here; `df` always holds everything computed so far
result_frames = []

for feature_set in features:
    for model in models:
        for target_var in ['age', 'CogTotalComp_AgeAdj', 'PMAT24_A_CR', 'Flanker_AgeAdj', 'CardSort_AgeAdj', 'PicSeq_AgeAdj']:

            # the real target is loaded here; the null scenario is created inside
            # bootstrap_workhorse, which permutes y when shuffle_y=True
            X, y = create_data(target=target_var, feature_data=features[feature_set])

            for sample_size in [50, 100, 200, 300, 'max']:

                print('*****************************************************************')
                print(feature_set, model, target_var, sample_size)

                # 'max' means two non-overlapping halves of the available data
                if sample_size == 'max':
                    sample_size = int(len(y)/2)

                # run bootstrap iterations in parallel, with shuffle_y=True
                r_discovery_cv, r_discovery_overfit, r_replication, p_discovery_cv, p_discovery_overfit, p_replication = zip(
                    *Parallel(n_jobs=-1)(
                    delayed(bootstrap_workhorse)(X, y, sample_size, models[model], seed, shuffle_y=True) for seed in random_sates))

                tmp_data_frame = pd.DataFrame({
                    'connectivity' : feature_set,
                    'model' : model,
                    'target' : target_var,
                    'n' : sample_size,
                    'r_discovery_cv': r_discovery_cv,
                    'r_discovery_overfit': r_discovery_overfit,
                    'r_replication': r_replication,
                    'p_discovery_cv': p_discovery_cv,
                    'p_discovery_overfit': p_discovery_overfit,
                    'p_replication': p_replication
                })
                print('r discovery (with cv) :', tmp_data_frame.r_discovery_cv.mean(), 'r replication:', tmp_data_frame.r_replication.mean())

                # replicability: among the iterations significant in the discovery sample at a given alpha,
                # the percentage that is also significant in the replication sample at the same alpha
                for alpha in [0.05, 0.01, 0.005, 0.001]:
                    discovered = tmp_data_frame['p_discovery_cv'] < alpha
                    n_discovered = discovered.sum()
                    n_replicated = (tmp_data_frame.loc[discovered, 'p_replication'] < alpha).sum()
                    # undefined (NaN) when nothing was significant in the discovery sample
                    replicability = n_replicated / n_discovered * 100 if n_discovered > 0 else np.nan
                    print('Replicability at alpha =', alpha, ':', replicability, '%')

                # save after every configuration, so that partial results survive an interrupted run
                result_frames.append(tmp_data_frame)
                df = pd.concat(result_frames, ignore_index=True)
                df.to_csv('res/results_null_Ridge.csv')

df

CPU times: user 41.3 s, sys: 5.51 s, total: 46.8 s
Wall time: 26min 40s


In [12]:
# mean in-sample (cross-validated) and out-of-sample effect size per configuration
df.groupby(['connectivity', 'model', 'target', 'n'])[['r_discovery_cv', 'r_replication']].mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,r_discovery_cv,r_replication
connectivity,model,target,n,Unnamed: 4_level_1,Unnamed: 5_level_1
netmats_parcor,Ridge,CardSort_AgeAdj,50,0.031644,-0.000121
netmats_parcor,Ridge,CardSort_AgeAdj,100,0.007479,0.014062
netmats_parcor,Ridge,CardSort_AgeAdj,200,-0.020488,0.010242
netmats_parcor,Ridge,CardSort_AgeAdj,300,-0.002089,-0.001656
netmats_parcor,Ridge,CardSort_AgeAdj,500,-0.000114,-0.00192
netmats_parcor,Ridge,CogTotalComp_AgeAdj,50,0.022939,-0.003553
netmats_parcor,Ridge,CogTotalComp_AgeAdj,100,-0.002252,-0.007803
netmats_parcor,Ridge,CogTotalComp_AgeAdj,200,-0.017407,0.008146
netmats_parcor,Ridge,CogTotalComp_AgeAdj,300,-0.009672,-0.005715
netmats_parcor,Ridge,CogTotalComp_AgeAdj,495,0.00842,-0.001433


*See the notebook called 'plot_results.ipynb' for the results.*

## Now we train on the whole dataset, to obtain effect size estimates on N=1000.

With PCA+SVR:

In [13]:
model = Pipeline([('pca', PCA(n_components=0.5)), ('svr', SVR())])
random_state = 42
cv = KFold(10, shuffle=True, random_state=random_state)

# cross-validated effect size (r) per target variable, in the order of the loop below
bar_data_svr = []

for target_var in ['age', 'CogTotalComp_AgeAdj', 'PMAT24_A_CR', 'Flanker_AgeAdj', 'CardSort_AgeAdj', 'PicSeq_AgeAdj']:
    print(target_var)
    X, y = create_data(target=target_var, feature_data=netmats_pearson)

    # the buffer is explicitly float: inheriting an integer dtype from y would silently truncate the predictions
    predicted_discovery_cv = np.zeros_like(y, dtype=float)
    cor_per_fold = np.zeros(cv.n_splits)
    for i, (train, test) in enumerate(cv.split(X=X, y=y)):
        model.fit(X=X[train], y=y[train])
        predicted_discovery_cv[test] = model.predict(X=X[test])
        cor_per_fold[i] = np.corrcoef(y[test], predicted_discovery_cv[test])[0,1]
    # correlation between the cross-validated predictions and observations in the whole sample
    # this is the correct, unbiased estimate!
    # calculated as mean test performance across all folds
    r_disc_cv = np.mean(cor_per_fold)
    # finalize model by training it on the full sample (without cross-validation)
    final_model = model.fit(X=X, y=y)
    # obtain predictions with the final model on the same sample, note that this model actually overfits this sample.
    # we do this only to demonstrate biased estimates
    predicted_discovery_overfit = final_model.predict(X=X)
    # here we obtain the biased effect size (r) estimates for demonstrational purposes
    r_disc_overfit = np.corrcoef(predicted_discovery_overfit, y)[0, 1]

    # below we calculate permutation-based p-values for both in-sample effect size estimates
    # (unbiased and biased; there is no replication sample here)
    # (one sided tests, testing for positive correlation)
    p_disc_cv = permutation_test(predicted_discovery_cv, y, method='approximate', num_rounds=1000, func=lambda x, y: np.corrcoef(x, y)[1][0],seed=random_state)
    p_disc_overfit = permutation_test(predicted_discovery_overfit, y, method='approximate', num_rounds=1000, func=lambda x, y: np.corrcoef(x, y)[1][0],seed=random_state)

    bar_data_svr.append(r_disc_cv)

    # the reported R2 is the square of the mean per-fold correlation
    print('r =', np.round(r_disc_cv, 2), '\tp =', np.round(p_disc_cv, 3), '\tR2 =', np.round(r_disc_cv**2 * 100, 1),  '%')

age
r = 0.21 	p = 0.001 	R2 = 4.2 %
CogTotalComp_AgeAdj
r = 0.2 	p = 0.001 	R2 = 3.9 %
PMAT24_A_CR
r = 0.21 	p = 0.001 	R2 = 4.4 %
Flanker_AgeAdj
r = 0.12 	p = 0.001 	R2 = 1.5 %
CardSort_AgeAdj
r = 0.15 	p = 0.001 	R2 = 2.1 %
PicSeq_AgeAdj
r = 0.15 	p = 0.001 	R2 = 2.2 %


With Ridge and Pearson correlation:

In [14]:
model = Ridge()
random_state = 42
cv = KFold(10, shuffle=True, random_state=random_state)

for target_var in ['age', 'CogTotalComp_AgeAdj', 'PMAT24_A_CR', 'Flanker_AgeAdj', 'CardSort_AgeAdj', 'PicSeq_AgeAdj']:
    print(target_var)
    X, y = create_data(target=target_var, feature_data=netmats_pearson)

    # the buffer is explicitly float: inheriting an integer dtype from y would silently truncate the predictions
    predicted_discovery_cv = np.zeros_like(y, dtype=float)
    cor_per_fold = np.zeros(cv.n_splits)
    for i, (train, test) in enumerate(cv.split(X=X, y=y)):
        model.fit(X=X[train], y=y[train])
        predicted_discovery_cv[test] = model.predict(X=X[test])
        cor_per_fold[i] = np.corrcoef(y[test], predicted_discovery_cv[test])[0,1]
    # correlation between the cross-validated predictions and observations in the whole sample
    # this is the correct, unbiased estimate!
    # calculated as mean test performance across all folds
    r_disc_cv = np.mean(cor_per_fold)
    # finalize model by training it on the full sample (without cross-validation)
    final_model = model.fit(X=X, y=y)
    # obtain predictions with the final model on the same sample, note that this model actually overfits this sample.
    # we do this only to demonstrate biased estimates
    predicted_discovery_overfit = final_model.predict(X=X)
    # here we obtain the biased effect size (r) estimates for demonstrational purposes
    r_disc_overfit = np.corrcoef(predicted_discovery_overfit, y)[0, 1]

    # below we calculate permutation-based p-values for both in-sample effect size estimates
    # (unbiased and biased; there is no replication sample here)
    # (one sided tests, testing for positive correlation)
    p_disc_cv = permutation_test(predicted_discovery_cv, y, method='approximate', num_rounds=1000, func=lambda x, y: np.corrcoef(x, y)[1][0],seed=random_state)
    p_disc_overfit = permutation_test(predicted_discovery_overfit, y, method='approximate', num_rounds=1000, func=lambda x, y: np.corrcoef(x, y)[1][0],seed=random_state)

    # the reported R2 is the square of the mean per-fold correlation
    print('r =', np.round(r_disc_cv, 2), '\tp =', np.round(p_disc_cv, 3), '\tR2 =', np.round(r_disc_cv**2 * 100, 1),  '%')

age
r = 0.45 	p = 0.001 	R2 = 20.0 %
CogTotalComp_AgeAdj
r = 0.4 	p = 0.001 	R2 = 16.2 %
PMAT24_A_CR
r = 0.25 	p = 0.001 	R2 = 6.3 %
Flanker_AgeAdj
r = 0.16 	p = 0.001 	R2 = 2.6 %
CardSort_AgeAdj
r = 0.17 	p = 0.001 	R2 = 2.8 %
PicSeq_AgeAdj
r = 0.23 	p = 0.001 	R2 = 5.5 %


With Ridge and partial correlation:

In [15]:
model = Ridge()
random_state = 42
cv = KFold(10, shuffle=True, random_state=random_state)

# cross-validated effect size (r) per target variable, in the order of the loop below
bar_data_ridge = []

for target_var in ['age', 'CogTotalComp_AgeAdj', 'PMAT24_A_CR', 'Flanker_AgeAdj', 'CardSort_AgeAdj', 'PicSeq_AgeAdj']:
    print(target_var)
    X, y = create_data(target=target_var, feature_data=netmats_parcor)

    # the buffer is explicitly float: inheriting an integer dtype from y would silently truncate the predictions
    predicted_discovery_cv = np.zeros_like(y, dtype=float)
    cor_per_fold = np.zeros(cv.n_splits)
    for i, (train, test) in enumerate(cv.split(X=X, y=y)):
        model.fit(X=X[train], y=y[train])
        predicted_discovery_cv[test] = model.predict(X=X[test])
        cor_per_fold[i] = np.corrcoef(y[test], predicted_discovery_cv[test])[0,1]
    # correlation between the cross-validated predictions and observations in the whole sample
    # this is the correct, unbiased estimate!
    # calculated as mean test performance across all folds
    r_disc_cv = np.mean(cor_per_fold)
    # finalize model by training it on the full sample (without cross-validation)
    final_model = model.fit(X=X, y=y)
    # obtain predictions with the final model on the same sample, note that this model actually overfits this sample.
    # we do this only to demonstrate biased estimates
    predicted_discovery_overfit = final_model.predict(X=X)
    # here we obtain the biased effect size (r) estimates for demonstrational purposes
    r_disc_overfit = np.corrcoef(predicted_discovery_overfit, y)[0, 1]

    # below we calculate permutation-based p-values for both in-sample effect size estimates
    # (unbiased and biased; there is no replication sample here)
    # (one sided tests, testing for positive correlation)
    p_disc_cv = permutation_test(predicted_discovery_cv, y, method='approximate', num_rounds=1000, func=lambda x, y: np.corrcoef(x, y)[1][0],seed=random_state)
    p_disc_overfit = permutation_test(predicted_discovery_overfit, y, method='approximate', num_rounds=1000, func=lambda x, y: np.corrcoef(x, y)[1][0],seed=random_state)

    bar_data_ridge.append(r_disc_cv)

    # the reported R2 is the square of the mean per-fold correlation
    print('r =', np.round(r_disc_cv, 2), '\tp =', np.round(p_disc_cv, 3), '\tR2 =', np.round(r_disc_cv**2 * 100, 1),  '%')

age
r = 0.52 	p = 0.001 	R2 = 26.7 %
CogTotalComp_AgeAdj
r = 0.5 	p = 0.001 	R2 = 25.0 %
PMAT24_A_CR
r = 0.28 	p = 0.001 	R2 = 8.1 %
Flanker_AgeAdj
r = 0.15 	p = 0.001 	R2 = 2.1 %
CardSort_AgeAdj
r = 0.24 	p = 0.001 	R2 = 5.8 %
PicSeq_AgeAdj
r = 0.17 	p = 0.001 	R2 = 2.8 %


### *See the notebook called 'plot_results_FC.ipynb' for figures.*