# [Predictive Clinical Neuroscience Toolkit](https://github.com/amarquand/PCNtoolkit) 
# Normative Modeling Tutorial Using Multi-Site Cortical Thickness Data
# Part 2: Run the normative models

### Created by [Saige Rutherford](https://twitter.com/being_saige) 

<div>
<img src="data/NormModelSetup.png" width="500"/>
</div>

In [None]:
# Download the tutorial's requirements file (only needed if you did not run the Part 1 notebook).
# raw.githubusercontent.com URLs have the form <owner>/<repo>/<branch>/<path>; the "blob"
# segment only belongs in github.com page URLs, so it is not used here.
! wget -nc https://raw.githubusercontent.com/saigerutherford/PCNToolkit-demo/main/requirements.txt

In [None]:
# Install the packages listed in requirements.txt
# (only needed if you did not already do this in the Part 1 notebook)
! pip install -r requirements.txt

In [None]:
# Download the covariate text files used in this notebook.
# NOTE(review): wget does not normally expand `*` wildcards in HTTP(S) URLs, and
# raw.githubusercontent.com URLs normally have no `blob/` segment -- confirm that this
# command actually fetches the files, or list the file names explicitly.
! wget -nc https://raw.githubusercontent.com/saigerutherford/PCNToolkit-demo/blob/main/data/covariate_files/*.txt

In [None]:
# Download the response text files used in this notebook.
# NOTE(review): wget does not normally expand `*` wildcards in HTTP(S) URLs, and
# raw.githubusercontent.com URLs normally have no `blob/` segment -- confirm that this
# command actually fetches the files, or list the file names explicitly.
! wget -nc https://raw.githubusercontent.com/saigerutherford/PCNToolkit-demo/blob/main/data/response_files/*.txt

# Step 5: Run normative model

In [54]:
# Set this path to wherever your ROI_models folder is located
# (the folder you copied all of the covariate & response text files to in Part 1).
# The model-fitting loop below looks for one sub-folder per ROI inside this directory.
data_dir = '/path/to/ROI_models/folder'

In [55]:
# Create a cubic B-spline basis (used for regression).
# B is applied to one covariate value at a time in the model-fitting loop below.
xmin = 16  # xmin & xmax are the boundaries for ages of participants in the dataset
xmax = 90
B = create_bspline_basis(xmin, xmax)

[Reference for why we use B-spline in our design matrix](https://www.sciencedirect.com/science/article/abs/pii/S1053811910000832?via%3Dihub)

[Reference for what a B-spline is](https://cran.r-project.org/web/packages/crs/vignettes/spline_primer.pdf)

In [56]:
# ROIs to fit a normative model for; each name is also used as the name of that
# ROI's sub-folder inside data_dir. Add or remove entries to change what gets run.
roi_ids = [
    'lh_bankssts_thickness',
    'lh_caudalanteriorcingulate_thickness',
]

When we split the data into train and test sets, we did not reset the index, so the row labels in the train/test matrices are still those of the original, unsplit data. To evaluate per-site performance metrics we need to know which rows of the test set belong to which site, so we first reset the index of each train/test matrix so that its rows are numbered consecutively from 0.

In [None]:
# Load the train/test covariate (X) and response (y) tables as tab-separated text with no header row.
# NOTE(review): y_train / y_test are read from files named cov_tr.txt / cov_te.txt inside
# response_files -- this looks like a copy-paste of the covariate file names; confirm the
# actual response file names.
# NOTE(review): with header=None the columns get integer labels, but a later cell selects
# X_test['site_cam'] etc. by name -- confirm whether these files carry a header row.
X_train = pd.read_csv('data/covariate_files/cov_tr.txt', sep='\t', header=None)
X_test = pd.read_csv('data/covariate_files/cov_te.txt', sep='\t', header=None)
y_train = pd.read_csv('data/response_files/cov_tr.txt', sep='\t', header=None)
y_test = pd.read_csv('data/response_files/cov_te.txt', sep='\t', header=None)

In [57]:
# Renumber the rows of every train/test table from 0, in place.
# drop=True discards the old index instead of keeping it as an extra column.
for split_table in (X_train, X_test, y_train, y_test):
    split_table.reset_index(drop=True, inplace=True)

In [58]:
# Row labels of the test-set subjects in each site, so that the test set metrics
# can be evaluated per site. A subject belongs to a site when its site_* column equals 1.
cam_idx = X_test.index[X_test['site_cam'].eq(1)].to_list()
hcp_idx = X_test.index[X_test['site_hcp'].eq(1)].to_list()
ixi_idx = X_test.index[X_test['site_ixi'].eq(1)].to_list()

In [59]:
# Collect the per-site index lists into a single list.
# The order (cam, hcp, ixi) must match the order of the site names used for per-site metrics.
sites = [cam_idx, hcp_idx, ixi_idx]

In [60]:
# Site names used to label the rows of the per-site metrics table.
# Keep this in the same order as the list of per-site index lists.
site_names = ['cam', 'hcp', 'ixi']

In [61]:
# Empty result tables, filled one row at a time by the model-fitting loop:
#   blr_metrics      - one row per ROI (whole test set)
#   blr_site_metrics - one row per ROI and site
metric_columns = ['MSLL', 'EV', 'SMSE', 'RMSE', 'Rho']
site_summary_columns = ['y_mean', 'y_var', 'yhat_mean', 'yhat_var']

blr_metrics = pd.DataFrame(columns=['ROI'] + metric_columns)
blr_site_metrics = pd.DataFrame(columns=['ROI', 'site'] + site_summary_columns + metric_columns)

In [None]:
# Fit and evaluate one normative model per ROI with Bayesian Linear Regression (BLR).
# For every ROI in roi_ids (set a few cells above) this cell:
#   1. builds the design matrices (covariates + intercept, then + B-spline basis) and saves them,
#   2. fits the model with estimate() and gets predictions for the test set,
#   3. appends whole-test-set metrics to blr_metrics and per-site metrics to blr_site_metrics.
#
# NOTE(review): cov_type and warp must already be defined when this cell runs; they are not
# set in the cells shown in this notebook. Judging by the files saved below, cov_type is
# expected to be 'int' or 'bspline' -- confirm, and confirm the intended value of warp.

for roi in roi_ids:
    print('Running ROI:', roi)
    roi_dir = os.path.join(data_dir, roi)
    # presumably estimate() writes its outputs to the working directory; the predictions
    # are read back from roi_dir further down -- TODO confirm
    os.chdir(roi_dir)

    # set output dir
    out_name = 'blr'
    os.makedirs(os.path.join(roi_dir, out_name), exist_ok=True)

    # load train & test covariate data matrices
    X_tr = np.loadtxt(os.path.join(roi_dir, 'cov_tr.txt'))
    X_te = np.loadtxt(os.path.join(roi_dir, 'cov_te.txt'))

    # add intercept column (a column of ones appended on the right)
    X_tr = np.concatenate((X_tr, np.ones((X_tr.shape[0], 1))), axis=1)
    X_te = np.concatenate((X_te, np.ones((X_te.shape[0], 1))), axis=1)
    # save
    np.savetxt(os.path.join(roi_dir, 'cov_int_tr.txt'), X_tr)
    np.savetxt(os.path.join(roi_dir, 'cov_int_te.txt'), X_te)

    # create Bspline basis set by expanding column 2 of the design matrix
    # (assumes column 2 holds age, the range the basis B was built over -- TODO confirm)
    Phi = np.array([B(i) for i in X_tr[:, 2]])
    Phis = np.array([B(i) for i in X_te[:, 2]])
    X_tr = np.concatenate((X_tr, Phi), axis=1)
    X_te = np.concatenate((X_te, Phis), axis=1)
    # save
    np.savetxt(os.path.join(roi_dir, 'cov_bspline_tr.txt'), X_tr)
    np.savetxt(os.path.join(roi_dir, 'cov_bspline_te.txt'), X_te)

    # configure the covariates to use: cov_type picks which of the saved design matrices is fed to the model
    cov_file_tr = os.path.join(roi_dir, f'cov_{cov_type}_tr.txt')
    cov_file_te = os.path.join(roi_dir, f'cov_{cov_type}_te.txt')

    # paths of the train & test response files
    resp_file_tr = os.path.join(roi_dir, 'resp_tr.txt')
    resp_file_te = os.path.join(roi_dir, 'resp_te.txt')

    # run a basic model
    estimate(cov_file_tr, resp_file_tr, testresp=resp_file_te, testcov=cov_file_te, alg='blr', optimizer='powell', savemodel=False, standardize=False, warp=warp)

    # load training data as a column vector
    # NOTE(review): y_tr is not used below -- the MSLL baseline is computed from the test
    # responses. Confirm whether the training mean/variance was intended instead.
    y_tr = np.loadtxt(resp_file_tr)
    y_tr = y_tr[:, np.newaxis]

    # load test data and predictions as column vectors
    X_te = np.loadtxt(cov_file_te)
    y_te = np.loadtxt(resp_file_te)
    y_te = y_te[:, np.newaxis]
    yhat_te = np.loadtxt(os.path.join(roi_dir, 'yhat_estimate.txt'))
    s2_te = np.loadtxt(os.path.join(roi_dir, 'ys2_estimate.txt'))
    yhat_te = yhat_te[:, np.newaxis]
    s2_te = s2_te[:, np.newaxis]

    # compute metrics on whole test set, save to pandas df
    metrics_te = evaluate(y_te, yhat_te, s2_te)
    y_mean_te = np.array([[np.mean(y_te)]])
    y_var_te = np.array([[np.var(y_te)]])
    MSLL_te = compute_MSLL(y_te, yhat_te, s2_te, y_mean_te, y_var_te)
    blr_metrics.loc[len(blr_metrics)] = [
        roi,
        MSLL_te[0],
        metrics_te['EXPV'][0],
        metrics_te['SMSE'][0],
        metrics_te['RMSE'][0],
        metrics_te['Rho'][0],
    ]

    # compute metrics per site in test set, save to pandas df
    # (site_names and sites are parallel lists; each entry of sites holds the row numbers of
    # one site's subjects -- assumes the rows of this ROI's test files follow the same order
    # as the table those row numbers came from -- TODO confirm)
    for site_name, site in zip(site_names, sites):
        yhat_te_site = yhat_te[site]
        y_te_site = y_te[site]
        s2_te_site = s2_te[site]
        metrics_te_site = evaluate(y_te_site, yhat_te_site, s2_te_site)
        y_mean_te_site = np.array([[np.mean(y_te_site)]])
        y_var_te_site = np.array([[np.var(y_te_site)]])
        yhat_mean_te_site = np.array([[np.mean(yhat_te_site)]])
        yhat_var_te_site = np.array([[np.var(yhat_te_site)]])
        MSLL_te_site = compute_MSLL(y_te_site, yhat_te_site, s2_te_site, y_mean_te_site, y_var_te_site)
        blr_site_metrics.loc[len(blr_site_metrics)] = [
            roi,
            site_name,
            y_mean_te_site[0],
            y_var_te_site[0],
            yhat_mean_te_site[0],
            yhat_var_te_site[0],
            MSLL_te_site[0],
            metrics_te_site['EXPV'][0],
            metrics_te_site['SMSE'][0],
            metrics_te_site['RMSE'][0],
            metrics_te_site['Rho'][0],
        ]

In [63]:
# Go back to the top-level ROI_models folder (the loop above leaves the working directory
# inside the last ROI's folder), so the CSV files written below land in data_dir.
os.chdir(data_dir)

/Users/saigerutherford/Desktop/lifespan/tutorial/ROI_models


In [64]:
# Write the per-site test set metrics to blr_site_metrics.csv in the current
# working directory, without the row index.
blr_site_metrics.to_csv('blr_site_metrics.csv', index=False)

In [65]:
# Write the overall (whole test set) metrics to blr_metrics.csv in the current
# working directory, without the row index.
blr_metrics.to_csv('blr_metrics.csv', index=False)

# Step 6: Interpreting model performance

Output evaluation metrics definitions: 
* yhat - predictive mean
* ys2 - predictive variance
* nm - normative model
* Z - deviance scores
* Rho - Pearson correlation between true and predicted responses
* pRho - parametric p-value for this correlation
* RMSE - root mean squared error between true/predicted responses
* SMSE - standardised mean squared error
* EV - explained variance
* MSLL - mean standardized log loss
    * (Page 23) http://www.gaussianprocess.org/gpml/chapters/RW2.pdf