In [2]:
import os
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from scipy import stats, linalg
from sklearn import preprocessing, decomposition, linear_model, metrics 
from sklearn.utils import shuffle
import warnings
# Blanket filter: with no category/module arguments this suppresses every warning
# for the rest of the session, deprecation notices from the libraries imported
# above included.
warnings.filterwarnings("ignore")

In [320]:
# Yeo functional networks
# Read the three saved arrays in z -> resid -> g order; later cells slice them
# into the `*_z`, `*_raw` and `*_phen` train/test sets respectively.
hcp_yeo_z, hcp_yeo_resid, hcp_yeo_g = (
    np.load(npy_path)
    for npy_path in (
        'data/hcp_yeo_z.npy',
        'data/hcp_yeo_resid.npy',
        'data/hcp_yeo_g.npy',
    )
)

In [321]:
# generate train/test splits
# 90% of the rows are assigned to training; int() truncates any fractional row.
n_train = int(len(hcp_yeo_z) * 0.9)
# Seed the legacy global NumPy RNG so the np.random.choice draw that builds the
# split is repeatable.
np.random.seed(42)

In [322]:
# Draw n_train distinct row indices for training (uses the global NumPy RNG state).
train_idxs = np.random.choice(range(hcp_yeo_z.shape[0]), size=n_train, replace=False)
# Test rows are every remaining index, in ascending order. np.setdiff1d replaces the
# per-element `x not in train_idxs` scan (linear search per row, quadratic overall)
# and always yields an integer array.
test_idxs = np.setdiff1d(np.arange(hcp_yeo_z.shape[0]), train_idxs)

In [323]:
# Row-select each feature array with the split indices; `[idx, :]` keeps all columns.
train_data_z, test_data_z = hcp_yeo_z[train_idxs, :], hcp_yeo_z[test_idxs, :]
train_data_raw, test_data_raw = hcp_yeo_resid[train_idxs, :], hcp_yeo_resid[test_idxs, :]

# The target array is indexed along its first axis only.
train_phen, test_phen = hcp_yeo_g[train_idxs], hcp_yeo_g[test_idxs]

In [None]:
# Cell overview: reduce each feature set (z and raw) to 15 principal components fit on
# the training rows only, regress the target on those component scores with OLS, then
# compare train/test R^2 and MSE between the two feature sets.

# mean center train/test data (using train means)
# NOTE: these centered copies are not read by the PCA/OLS steps in this cell -- PCA
# centers on the training means internally (see the documentation quote below).
train_mu_centered_z = (train_data_z - train_data_z.mean(axis=0))
test_mu_centered_z = (test_data_z - train_data_z.mean(axis=0))

train_mu_centered_raw = (train_data_raw - train_data_raw.mean(axis=0))
test_mu_centered_raw = (test_data_raw - train_data_raw.mean(axis=0))

# from pca documentation, "the input data is centered but not scaled for each feature before applying the SVD"
pca_model_z = decomposition.PCA(n_components=15).fit(train_data_z)
pca_model_raw = decomposition.PCA(n_components=15).fit(train_data_raw)

# Project both splits with the models fit on the training rows.
train_transformed_z = pca_model_z.transform(train_data_z)
test_transformed_z = pca_model_z.transform(test_data_z)
train_transformed_raw = pca_model_raw.transform(train_data_raw)
test_transformed_raw = pca_model_raw.transform(test_data_raw)

# OLS using sklearn (z features)
# The `normalize` keyword is intentionally not passed: it was removed in
# scikit-learn 1.2 (passing it raises TypeError) and its former default was False,
# so the fitted model is unchanged.
lr_model_z = linear_model.LinearRegression(fit_intercept=True)
lr_model_z.fit(train_transformed_z, train_phen)
train_pred_phen_lr_model_z = lr_model_z.predict(train_transformed_z)
test_pred_phen_lr_model_z = lr_model_z.predict(test_transformed_z)

# OLS using sklearn (raw features)
lr_model_raw = linear_model.LinearRegression(fit_intercept=True)
lr_model_raw.fit(train_transformed_raw, train_phen)
train_pred_phen_lr_model_raw = lr_model_raw.predict(train_transformed_raw)
test_pred_phen_lr_model_raw = lr_model_raw.predict(test_transformed_raw)

# HCP Accuracy of Predictions (deviations)
train_r2_z = metrics.r2_score(train_phen, train_pred_phen_lr_model_z)
test_r2_z = metrics.r2_score(test_phen, test_pred_phen_lr_model_z)
train_mse_z = metrics.mean_squared_error(train_phen, train_pred_phen_lr_model_z)
test_mse_z = metrics.mean_squared_error(test_phen, test_pred_phen_lr_model_z)

# HCP Accuracy of Predictions (raw)
train_r2_raw = metrics.r2_score(train_phen, train_pred_phen_lr_model_raw)
test_r2_raw = metrics.r2_score(test_phen, test_pred_phen_lr_model_raw)
train_mse_raw = metrics.mean_squared_error(train_phen, train_pred_phen_lr_model_raw)
test_mse_raw = metrics.mean_squared_error(test_phen, test_pred_phen_lr_model_raw)

# Difference between deviation and raw (test split only).
# diff_test_r is a difference of R^2 values: positive means the z features scored higher.
# diff_test_mse: negative means the z features had the lower test error.
diff_test_r = test_r2_z - test_r2_raw
diff_test_mse = test_mse_z - test_mse_raw

In [None]:
# Cortical Thickness
# Read the three saved arrays in z -> resid -> g order; later cells slice them
# into the `*_z`, `*_raw` and `*_phen` train/test sets respectively.
hcp_ct_z, hcp_ct_resid, hcp_ct_g = (
    np.load(npy_path)
    for npy_path in (
        'data/hcp_ct_z.npy',
        'data/hcp_ct_resid.npy',
        'data/hcp_ct_g.npy',
    )
)

In [None]:
# generate train/test splits
# 90% of the rows are assigned to training; int() truncates any fractional row.
n_train = int(len(hcp_ct_z) * 0.9)
# Re-seed the legacy global NumPy RNG so the np.random.choice draw that builds the
# split is repeatable.
np.random.seed(42)

In [None]:
# Draw n_train distinct row indices for training (uses the global NumPy RNG state).
train_idxs = np.random.choice(range(hcp_ct_z.shape[0]), size=n_train, replace=False)
# Test rows are every remaining index, in ascending order. np.setdiff1d replaces the
# per-element `x not in train_idxs` scan (linear search per row, quadratic overall)
# and always yields an integer array.
test_idxs = np.setdiff1d(np.arange(hcp_ct_z.shape[0]), train_idxs)

In [None]:
# Row-select each feature array with the split indices; `[idx, :]` keeps all columns.
train_data_z, test_data_z = hcp_ct_z[train_idxs, :], hcp_ct_z[test_idxs, :]
train_data_raw, test_data_raw = hcp_ct_resid[train_idxs, :], hcp_ct_resid[test_idxs, :]

# The target array is indexed along its first axis only.
train_phen, test_phen = hcp_ct_g[train_idxs], hcp_ct_g[test_idxs]

In [None]:
# Cell overview: reduce each feature set (z and raw) to 15 principal components fit on
# the training rows only, regress the target on those component scores with OLS, then
# compare train/test R^2 and MSE between the two feature sets.

# mean center train/test data (using train means)
# NOTE: these centered copies are not read by the PCA/OLS steps in this cell -- PCA
# centers on the training means internally (see the documentation quote below).
train_mu_centered_z = (train_data_z - train_data_z.mean(axis=0))
test_mu_centered_z = (test_data_z - train_data_z.mean(axis=0))

train_mu_centered_raw = (train_data_raw - train_data_raw.mean(axis=0))
test_mu_centered_raw = (test_data_raw - train_data_raw.mean(axis=0))

# from pca documentation, "the input data is centered but not scaled for each feature before applying the SVD"
pca_model_z = decomposition.PCA(n_components=15).fit(train_data_z)
pca_model_raw = decomposition.PCA(n_components=15).fit(train_data_raw)

# Project both splits with the models fit on the training rows.
train_transformed_z = pca_model_z.transform(train_data_z)
test_transformed_z = pca_model_z.transform(test_data_z)
train_transformed_raw = pca_model_raw.transform(train_data_raw)
test_transformed_raw = pca_model_raw.transform(test_data_raw)

# OLS using sklearn (z features)
# The `normalize` keyword is intentionally not passed: it was removed in
# scikit-learn 1.2 (passing it raises TypeError) and its former default was False,
# so the fitted model is unchanged.
lr_model_z = linear_model.LinearRegression(fit_intercept=True)
lr_model_z.fit(train_transformed_z, train_phen)
train_pred_phen_lr_model_z = lr_model_z.predict(train_transformed_z)
test_pred_phen_lr_model_z = lr_model_z.predict(test_transformed_z)

# OLS using sklearn (raw features)
lr_model_raw = linear_model.LinearRegression(fit_intercept=True)
lr_model_raw.fit(train_transformed_raw, train_phen)
train_pred_phen_lr_model_raw = lr_model_raw.predict(train_transformed_raw)
test_pred_phen_lr_model_raw = lr_model_raw.predict(test_transformed_raw)

# HCP Accuracy of Predictions (deviations)
train_r2_z = metrics.r2_score(train_phen, train_pred_phen_lr_model_z)
test_r2_z = metrics.r2_score(test_phen, test_pred_phen_lr_model_z)
train_mse_z = metrics.mean_squared_error(train_phen, train_pred_phen_lr_model_z)
test_mse_z = metrics.mean_squared_error(test_phen, test_pred_phen_lr_model_z)

# HCP Accuracy of Predictions (raw)
train_r2_raw = metrics.r2_score(train_phen, train_pred_phen_lr_model_raw)
test_r2_raw = metrics.r2_score(test_phen, test_pred_phen_lr_model_raw)
train_mse_raw = metrics.mean_squared_error(train_phen, train_pred_phen_lr_model_raw)
test_mse_raw = metrics.mean_squared_error(test_phen, test_pred_phen_lr_model_raw)

# Difference between deviation and raw (test split only).
# diff_test_r is a difference of R^2 values: positive means the z features scored higher.
# diff_test_mse: negative means the z features had the lower test error.
diff_test_r = test_r2_z - test_r2_raw
diff_test_mse = test_mse_z - test_mse_raw