In [3]:
import sys
import copy
import os
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.datasets import fetch_california_housing
from scipy.linalg import block_diag, inv
from sklearn.model_selection import GroupKFold
from sklearn.linear_model import SGDRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import GroupKFold
from scipy.stats import t

In [4]:
# input info
ROI_name_list = ['L_precentral_thickavg', 'R_precentral_thickavg']
covar_dynamic_name_list = ['Age', 'Sex'] 
# recode sex info!
covar_stable_name_list = ['APOE4'] 
meausure_time_list = ['sc', '12mo', '24mo']
#n_measure_point = len(meausure_time_list)
#n_timepoint = 3
#slice_length = 3

#genetic information
grm_path = None

# Input files
# pycaret package ~
path_name = '/Users/aliceyang/ADNI1plus2_Longitidinal_Cortical_Subcortical/Cortical_Thickness_SurfArea_Final'
pheno_file_name = '/ADNI1plus2_cortical_CN_final_sort.csv'
#pheno_file_name = '/ADNI1plus2_cortical_MCI_final_sort.csv'
#pheno_file_name = '/ADNI1plus2_cortical_Dementia_final_sort.csv'
#pheno_file_name = '/ADNI1plus2_cortical_MCI2Dementia_final_sort.csv'
#pheno_file_name = '/ADNI1plus2_cortical_MCIDementia2_final_sort.csv'
#pheno_file_name = '/ADNI1plus2_cortical_Converter_final_sort.csv'
pheno_merge_name = path_name + pheno_file_name
#diagnosis_name = 'sMCI' # diagnosis name may be extracted from file name

# output path name
output_path_name = '/Users/aliceyang/result'
output_file_name = 'res'
file_type_name = '.csv'

# the time point/spatial location ifo should not be coded in the subject ID!
# cohort info
subject_ID_name = 'SubjID'
subject_ID_slice_length = 3
stable_parameter = 0.8

# Read files and load column information to the corresponding functions

In [5]:
def load_info_from_file(filename, ROI_name_list, covar_dynamic_name_list, meausure_time_list):
    """
    
    read input csv file containing all phenotype/genotype/covariates such as age and sex.
    
    Parameter(s)
    ----------
    filename: str, csv filename.
    
    Return(s): 
    ----------
    df: dataframe of the csv file.
    
    n: total number of measurements for all subjects.
    
    """
    #print(filename)
    #print(ROI_name_list)
    #print(covar_dynamic_name_list)
    #print(meausure_time_list)
    
    if os.path.exists(filename):
        df = pd.read_csv(filename)
    else:
        print('file not found!')
        sys.exit()
    # n = df.shape[0]
    
    dict_roi_name = {}
    for roi in ROI_name_list:
        dict_roi_name[roi] = []
        for tp in meausure_time_list:
            col_name = roi + '_' + tp
            if col_name not in df.columns:
                print(col_name + 'does not exist!')
                sys.exit()
            dict_roi_name[roi].append(col_name)
    #print(dict_roi_name)
    
    dict_dynamic_cov_name = {}
    for cov in covar_dynamic_name_list:
        dict_dynamic_cov_name[cov] = []
        for tp in meausure_time_list:
            col_name = cov + '_' + tp
            if col_name not in df.columns:
                print(col_name + 'does not exist!')
                sys.exit()
            dict_dynamic_cov_name[cov].append(col_name)
    #print(dict_dynamic_cov_name)
    return df, dict_roi_name, dict_dynamic_cov_name

In [6]:
def mapping_matrix_T(measurement):
    """
    
    get mapping matrix to map cross sectional varibles (covariates/ranfom effect correlation) to longitudinal information.
    
    Parameter(s)
    ----------    
    measurement: list, each entry represents the number of measurements for each subject.
    
    Return(s): 
    ----------
    T: numpy array, the mapping matrix T, T=blkdiag{1_n1 , ···,1_nm}. 
    
    """
    T = np.expand_dims(np.repeat(1, measurement[0]), axis=1)
    for i in range(1, np.shape(measurement)[0]): 
        T_pre = T
        T = block_diag(T_pre, np.expand_dims(np.repeat(1, measurement[i]), axis = 1))
    return T

In [7]:
def cross_sectional_longit_mapping_function(matrix_T, correlation_matrix):
    """
    
    map function for cross-sectional covariate information or random effect correlation information to longitudinal.
    T=blkdiag{1_n1 , ···,1_nm} 
    
    Parameter(s)
    ----------  
    correlation_matrix: numpy array N x N, the correlation matrix specified for the normal distribution. N number of subjects
    
    matrix_T: numpy array ? x ?, the function to map cross sectional to longitudinal.
    

    
    Return(s): 
    ----------
    longit_correlation: longitudinal correlation matrix mapped from cross sectional 
    
    """
    # use @ 
    T_correlation = np.matmul(matrix_T, correlation_matrix)
    T_correlation_T_transpose = np.matmul(T_correlation, np.transpose(matrix_T))
    N_total_measurement = np.shape(T_correlation_T_transpose)[0]
    longit_correlation = T_correlation_T_transpose
    #longit_correlation_1d = T_correlation_T_transpose.flatten('F')
    return longit_correlation

In [8]:
def load_grm_info_from_file(n_subject, filename):
    """
       ! optional 
    
    """
    if filename == None:
        grm = np.identity(n_subject)
    else:
        grm = pd.read_csv(filename, header = None)
    return grm

In [9]:
def load_basic_subject_measurement_info_from_file(n_subject, filename, ROI_name_list, ROI_index = None):
    # how to caluculate the number of measurements for each subject?
    if filename == None:
        measurement = df.columns[df.columns.str.startswith(ROI_name_list[ROI_index])]
    else:
        measurement = pd.read_csv(filename, header = None)

# Generate random effect correlation matrix information and extract covariance information

In [10]:
def load_random_effect_info_from_file(grm, measurement, subject_level, T):

    """
    
    get the random effect covariance matrix for each of the ranfom effect (genetic relationship/temporal or spatial effect/site effect/measurement error)
    
    Parameter(s)
    ----------   
    grm: numpy array, input Genetic Relationship Matrix (GRM) for subject-subject correlation.
    
    measurement: list, each entry represents the number of measurements for any subject.
    
    subject_level: numpy array, 
    
    T: numpy array, the mapping matrix T, T=blkdiag{1_n1 , ···,1_nm}.
    
    Return(s)
    ----------  
    longit_grm: numpy array, one-dimensional array for longitudinal Genetic Relationship Matrix (GRM).
    
    longit_C1: numpy array,  one-dimensional array for longitudinal correlation matrix corresponding to time intervals between two closest measurements. 
    
    longit_C2: numpy array,  one-dimensional array for longitudinal correlation matrix corresponding to time intervals between two closest measurements.
    
    longit_block_diag_matrix: numpy array,  one-dimensional array for longitudinal block diagnoal matrix 
    
    longit_error: numpy array, one-dimensional array for indepedent measurement erors.
    
    
    """
    
# Create longitudinal GRM 1d info
    grm_arr = np.array(grm)
    longit_grm = cross_sectional_longit_mapping_function(T, grm_arr)
    
# Create covariate matrix input for temporal or spatialy correlated random effect using time intervals or distance functions
# hypothetically  
    if measurement is not None:# Create T always 3 time points
        longit_C1 = np.zeros((measurement[0], measurement[0]))
        longit_C2 = np.zeros((measurement[0], measurement[0]))
        for i in range(measurement[0]):
            for j in range(measurement[0]):
                    if abs(i - j) == 1:
                        longit_C1[i, j] = 1 # rho**abs(i - j) 
                    if abs(i - j) == 2:
                        longit_C2[i, j] = 1 # rho**abs(i - j)
    
        for s in range(1, len(measurement)):
            longit_C1_temp = np.zeros((measurement[s], measurement[s]))
            longit_C2_temp = np.zeros((measurement[s], measurement[s]))
            for i in range(measurement[s]):
                for j in range(measurement[s]):
                    if abs(i - j) == 1:
                        longit_C1_temp[i, j] = 1 # rho**abs(i - j) 
                    if abs(i - j) == 2:
                        longit_C2_temp[i, j] = 1 # rho**abs(i - j)
            # C1 definition
            longit_C1_pre = longit_C1
            longit_C1 = block_diag(longit_C1_pre, longit_C1_temp)
            # C2 definition
            longit_C2_pre = longit_C2
            longit_C2 = block_diag(longit_C2_pre, longit_C2_temp)
        
# Create longitudinal site 1d info using subject level
    subject_level_dummy = pd.get_dummies(subject_level)
    subject_level_cnt = subject_level_dummy.sum(axis=0)
    block_diag_matrix = np.identity(subject_level_cnt[0])
    for i in range(1, np.shape(subject_level_cnt)[0]):
            block_diag_matrix_prev = block_diag_matrix
            block_diag_matrix = block_diag(block_diag_matrix_prev, np.identity(subject_level_cnt[i]))
    longit_block_diag_matrix = cross_sectional_longit_mapping_function(T, block_diag_matrix)
    
# create measurement error 1d info
    longit_error = np.identity(np.sum(measurement))
    return longit_grm, longit_C1, longit_C2, longit_block_diag_matrix, longit_error

In [11]:
def load_covar_info_from_file(df, covar_stable_name_list, dict_dynamic_cov_name, measurement, T):
    
    intercept = np.expand_dims(np.ones(np.sum(measurement)), axis=1)
    if covar_stable_name_list is not None:
        covar_stable_matrix = np.zeros((len(measurement),len(covar_stable_name_list)))
        i = 0
        for cov in covar_stable_name_list:
            cov_value = df[cov]
            print(cov)
            covar_stable_matrix[:, i] = cov_value
            i = i + 1
        covar_stable_matrix_longit = np.matmul(T, covar_stable_matrix)
        if dict_dynamic_cov_name is None:
            covar_stable_matrix_longit = np.concatenate((intercept,covar_stable_matrix_longit), axis = 1) 
            return covar_stable_matrix_longit
    
    if dict_dynamic_cov_name is not None:
        covar_dynamic_matrix_longit = np.zeros((sum(measurement), len(dict_dynamic_cov_name)))
        j = 0
        for cov in dict_dynamic_cov_name.keys():
            cov_longit_value = []
            for cov_longit in dict_dynamic_cov_name[cov]:
                cov_longit_value.append(df[cov_longit])
            cov_longit_value = np.array(cov_longit_value).flatten('F')
            covar_dynamic_matrix_longit[:, j] = cov_longit_value
            j = j + 1
        if covar_stable_name_list is None:
            covar_dynamic_matrix_longit = np.concatenate((intercept,covar_dynamic_matrix_longit), axis = 1)
            return covar_dynamic_matrix_longit
        
        covar_matrix_longit = np.concatenate((intercept, covar_dynamic_matrix_longit, covar_stable_matrix_longit), axis = 1)
        return covar_matrix_longit

# Autoregressive Mixed Model parameter estimation: 2-step method
step 1: project out covariance matrix X including age, sex, SNP encoding.

step 2: run support vector regression to estimate random effect components. 

In [12]:
# 1st step - Create projection matrix to regress out covariance matrix X

def project_covariate_func(X):
    """
    
    get the covariates matrix when the users provide the name of coavariates needed.
    
    Parameter(s)
    ----------   
    X: np.array, longittudinal covariate matrix inluding intercept.
    

    Return(s)
    ----------   
    P: Projection matrix, so that X is projected to 0 matrix through P.
    
    """
    
    XX = np.dot(np.transpose(X), X)
    Z = np.dot(X, inv(XX))
    P = np.diag(np.ones(X.shape[0])) - np.dot(Z, np.transpose(X))
    return P

# Autoregressive Mixed Model (converter group) 

In [13]:
def ARLMM_late_converter(Y, X, grm, measurement, subject_level, T, stable_parameter):
    # notice for late converter group the correlation matrix used are the same as stable group 
    # but stable parameter is known
    
    """
    
    get the estimated parameters (betas/random effect variances) as the 2nd step of ARLMM.
    
    Parameter(s)
    ----------   
    Y: numpy array, one-dimensional longtidinal measures ordered by the time or spatial points.

    X: numpy array, longitudinal covariate matrix inluding intercept.
    
    grm: numpy array, cross sectional genetic relationship matrix.
    
    measurement: list, each entry represents the number of measurements for any subject.
    
    subject_level: list, ...
    
    T: numpy array, the mapping matrix T, T=blkdiag{1_n1 , ···,1_nm}.
    
    stable_parameter: numeric, 
    
    

    Return(s)
    ----------   
    rho_est: numeric value, correlation parameter for temporal/spatial correlation matrix (converter).
    
    var_g_est: numeric value, variance of genetic effect.
    
    var_t_est: numeric value, variance of imaging site effect.
    
    var_c_est: numeric value, variance of temporal/spatial correlated effect.
    
    var_e_est: numeric value, variance of meausurement error.
    
    h2_est: numeric value, proportion of genetic effect over total effect.
    
    beta_est: numpy array, beta values for covariance.
    
    """
    
    P = project_covariate_func(X)
    
    longit_grm, longit_C1, longit_C2, longit_block_diag_matrix, longit_error = load_random_effect_info_from_file(grm, measurement, subject_level, T)
    
    project_longit_grm_1d = np.reshape(cross_sectional_longit_mapping_function(P, longit_grm),[-1, 1], order = 'F')
    project_longit_C1_1d = np.reshape(cross_sectional_longit_mapping_function(P, longit_C1), [-1, 1], order = 'F')
    project_longit_C2_1d = np.reshape(cross_sectional_longit_mapping_function(P, longit_C2), [-1, 1], order = 'F')
    project_longit_block_diag_matrix_1d = np.reshape(cross_sectional_longit_mapping_function(P, longit_block_diag_matrix), [-1, 1], order = 'F')
    project_longit_error_1d = np.reshape(cross_sectional_longit_mapping_function(P, longit_error).flatten('F'), [-1, 1], order = 'F')
    Y_Y_transpose_1d = np.reshape(np.dot(Y, np.transpose(Y)), [-1, 1], order = 'F')
    
    project_longit_X_1d = np.concatenate((project_longit_grm_1d, project_longit_C1_1d, project_longit_C2_1d, project_longit_block_diag_matrix_1d, project_longit_error_1d), axis = 1)

    clf = SGDRegressor(tol = 1e-3, penalty = 'l2',loss = 'squared_epsilon_insensitive', fit_intercept= False)
    clf.fit(project_longit_X_1d, Y_Y_transpose_1d.ravel())
    
    sigma = clf.coef_
    rho_est = max(0.00000000001, min(sigma[2]/sigma[1], 1)) 
    var_t_est = max(0.5*sigma[1]/stable_parameter + 0.5*sigma[2]/(rho_est*stable_parameter), 0.00000000001) 
    var_g_est = max(sigma[0], 0.00000000001) 
    var_e_est = max(sigma[3] - var_t_est, 0.00000000001)
    var_c_est = max(sigma[4], 0.00000000001) 
    h2_est = var_g_est/(var_t_est + var_g_est + var_c_est + var_e_est)
    
    t_error_cov = var_t_est*(np.diag(np.ones(T.shape[0])) + stable_parameter*longit_C1 + rho_est*stable_parameter*longit_C2)
    error_cov = var_e_est*np.diag(np.ones(T.shape[0]))
    genetic_cov = var_g_est*longit_grm
    correlation_cov = var_c_est*longit_block_diag_matrix
    total_V = t_error_cov + error_cov + genetic_cov + correlation_cov
    
    Y_new = np.dot(inv(total_V), Y)
    repsonse_new = np.dot(np.transpose(X), Y_new)
    X_new = np.dot(inv(total_V), X)
    predictor_new = np.dot(np.transpose(X), X_new)
    beta_est = np.dot(inv(predictor_new), repsonse_new)
    
    return rho_est, var_g_est, var_t_est, var_c_est, var_e_est, h2_est, beta_est

In [17]:
def ARLMM_total_converter(Y, X, grm, measurement, subject_level, T, beta_late_convert):
    """
    
    get the estimated parameters (betas/random effect variances) as the 2nd step of ARLMM.
    
    Parameter(s)
    ----------   
    Y: numpy array, ...

    X: numpy array, ...
    
    grm: numpy array, ...
    
    measurement: list, ...
    
    subject_level: list, ...
    
    T: numpy array, the mapping matrix T, T=blkdiag{1_n1 , ···,1_nm}.
    
    DX_label: new label of diagnosis, for example 0 = 'early converter' and 1 = 'late converter'
    
    """
    N = np.shape(grm)[0]
        
    longit_grm, longit_C1, longit_C2, longit_block_diag_matrix, longit_error = load_random_effect_info_from_file(grm, measurement, subject_level, T)
    
    # Accurate Method 
    # t_error_cor = np.diag(np.ones(T.shape[0])) + stable_parameter0*longit_C1 + rho_est*stable_parameter0*longit_C2
    # t_error_cor = np.diag(np.ones(T.shape[0])) + rho_est*longit_C1 + rho_est*stable_parameter1*longit_C2            
    early_convert_index = np.where(DX_label == 0)[0]
    late_convert_index = np.where(DX_label == 1)[0]

    early_convert_X, early_convert_Y, early_convert_measurement = X[early_convert_index], Y[early_convert_index], measurement[early_convert_index]
    early_convert_grm = grm[early_convert_index, :]
    early_convert_grm = early_convert_grm[:, early_convert_index]
    early_convert_T = np.zeros((np.shape(early_convert_index)[0], np.shape(early_convert_index)[0]))
    early_convert_T = mapping_matrix_T(early_convert_measurement)
    longit_grm_early_convert, longit_C1_early_convert, longit_C2_early_convert, longit_block_diag_matrix_early_convert, longit_error_early_convert = load_random_effect_info_from_file(grm_early_convert, measurement_early_convert, subject_level_early_convert, T_early_convert)
    
    late_convert_X, late_convert_Y, late_convert_measurement = X[late_convert_index], Y[late_convert_index], measurement[late_convert_index]
    late_convert_grm = grm[late_convert_index, :]
    late_convert_grm = late_convert_grm[:, late_convert_index]
    late_convert_T = np.zeros((np.shape(late_convert_index)[0], np.shape(late_convert_index)[0]))
    late_convert_T = mapping_matrix_T(late_convert_measurement) 
    longit_grm_late_convert, longit_C1_late_convert, longit_C2_late_convert, longit_block_diag_matrix_late_convert, longit_error_late_convert = load_random_effect_info_from_file(grm_early_convert, measurement_early_convert, subject_level_early_convert, T_early_convert)
    
    # concate matrices from two groups, we have X and Y concatenated
    longit_grm = block_diag(early_convert_grm, late_convert_grm)
    T = block_diag(early_convert_T, late_convert_T)
    longit_error = block_diag(longit_error_early_convert, longit_error_late_convert)
    longit_block_diag_matrix_error = block_diag(longit_block_diag_matrix_early_convert, longit_block_diag_matrix_late_convert)
    
    late_convert_C = np.ones(late_convert_T.shape[0]) + beta_late_convert*longit_C1_late_convert + beta_late_convert*theta_est_late_convert*longit_C2_late_convert
    early_convert_C = np.ones(early_convert_T.shape[0]) + beta_late_convert*longit_C1_late_convert + beta_late_convert*theta_est_late_convert*longit_C2_late_convert
    C = block_diag(early_convert_C, late_convert_C)
    
    # projection matrix P
    P = project_covariate_func(X)
    
    # project out covariates X
    project_longit_grm_1d = np.reshape(cross_sectional_longit_mapping_function(P, longit_grm),[-1, 1], order = 'F')
    project_longit_C_1d = np.reshape(cross_sectional_longit_mapping_function(P, longit_C), [-1, 1], order = 'F')
    project_longit_block_diag_matrix_1d = np.reshape(cross_sectional_longit_mapping_function(P, longit_block_diag_matrix), [-1, 1], order = 'F')
    project_longit_error_1d = np.reshape(cross_sectional_longit_mapping_function(P, longit_error).flatten('F'), [-1, 1], order = 'F')
    Y_Y_transpose_1d = np.reshape(np.dot(Y, np.transpose(Y)), [-1, 1], order = 'F')
    
    project_longit_X_1d = np.concatenate((project_longit_grm_1d, project_longit_C_1d, project_longit_block_diag_matrix_1d, project_longit_error_1d), axis = 1)

    clf = SGDRegressor(tol = 1e-3, penalty = 'l2',loss = 'squared_epsilon_insensitive', fit_intercept= False)
    clf.fit(project_longit_X_1d, Y_Y_transpose_1d.ravel())

    sigma = clf.coef_
    var_t_est = max(sigma[2], 0.00000000001) 
    var_g_est = max(sigma[0], 0.00000000001) 
    var_e_est = max(sigma[3], 0.00000000001)
    var_c_est = max(sigma[1], 0.00000000001) 
    h2_est = var_g_est/(var_t_est + var_g_est + var_c_est + var_e_est)
    
    t_error_cov = var_t_est*longit_block_diag_matrix
    error_cov = var_e_est*longit_error
    genetic_cov = var_g_est*longit_grm
    correlation_cov = var_c_est*longit_block_diag_matrix
    total_V = t_error_cov + error_cov + genetic_cov + correlation_cov
    
    Y_new = np.dot(inv(total_V), Y)
    repsonse_new = np.dot(np.transpose(X), Y_new)
    X_new = np.dot(inv(total_V), X)
    predictor_new = np.dot(np.transpose(X), X_new)
    beta_est = np.dot(inv(predictor_new), repsonse_new)
    return var_g_est, var_t_est, var_c_est, var_e_est, h2_est, beta_est
    # get estimates of all variance components

In [18]:
pheno_file_name_late_convert = '/ADNI1plus2_cortical_MCI2Dementia_final_sort.csv'
pheno_merge_late_convert_name = path_name + pheno_file_name_late_convert

# brain measures (left entorhinal thickavg):
df_pheno_merge_late_convert, dict_roi_name, dict_dynamic_cov_name = load_info_from_file(pheno_merge_late_convert_name, ROI_name_list, covar_dynamic_name_list, meausure_time_list)

# Calcuate the number of participants
n_subject_late_convert = len(df_pheno_merge_late_convert)
grm_late_convert = load_grm_info_from_file(n_subject_late_convert, filename = None)

# load phenotype information and get average measures as the longitudinal measure to train/test
total_measure_late_convert = []
longit_measure_late_convert = []
for roi, longit_roi in dict_roi_name.items():
    total_measure_late_convert.append(df_pheno_merge_late_convert[longit_roi])

longit_measure_late_convert = np.mean(total_measure_late_convert, axis = 0)
longit_measure_late_convert = np.reshape(longit_measure_late_convert, [-1, 1], order = 'C')
#print(longit_measure_late_convert)

In [19]:
# the number of measurements per each subject
#subset_df = df[dict_roi_name[0]]
count_measure_late_convert = []
for roi, col_name in dict_roi_name.items():
    # left and right
    count_late_convert = df_pheno_merge_late_convert[col_name].notna().sum(axis = 1)
    count_measure_late_convert.append(count_late_convert)
measurement_late_convert = np.min(count_measure_late_convert, axis = 0)

# get mapping matrix T to map from cross sectional to longitudinal
T_late_convert = mapping_matrix_T(measurement_late_convert)
print(len(T_late_convert))

210


In [20]:
# do we need group info for cross validation from meausrement? genetic correlated?
k = 0
groups_late_convert = []
for cnt_meas in measurement_late_convert:
    for i in range(cnt_meas):
        groups_late_convert.append(k)
    k = k + 1
groups_late_convert = np.array(groups_late_convert)

# create subject level to represent scannner/location specific effects
SubjID_late_convert = df_pheno_merge_late_convert[subject_ID_name]
subject_level_late_convert = [ID[:subject_ID_slice_length] for ID in SubjID_late_convert]

# load ranfom effect information
longit_grm_late_convert, longit_C1_late_convert, longit_C2_late_convert, longit_block_diag_matrix_late_convert, longit_error_late_convert = load_random_effect_info_from_file(grm_late_convert, measurement_late_convert, subject_level_late_convert, T_late_convert)
#print(longit_C1)
#print(longit_C2)
#print(longit_block_diag_matrix)
longit_X_late_convert = load_covar_info_from_file(df_pheno_merge_late_convert, covar_stable_name_list, dict_dynamic_cov_name, measurement_late_convert, T_late_convert)

APOE4


In [21]:
rho_early_est, var_g_early_est, var_t_early_est, var_c_early_est, var_e_early_est, h2_early_est, beta_early_est = ARLMM_late_converter(longit_measure_late_convert, longit_X_late_convert, grm_late_convert, measurement_late_convert, subject_level_late_convert, T_late_convert, stable_parameter)
rho_early_est

1e-11

In [19]:
pheno_convert_file = '/ADNI1plus2_cortical_Converter_final_sort.csv'
pheno_merge_convert_name = path_name + pheno_convert_file
print(pheno_merge_convert_name)
df_pheno_merge_convert, dict_roi_name, dict_dynamic_cov_name = load_info_from_file(pheno_merge_convert_name, ROI_name_list, covar_dynamic_name_list, meausure_time_list)

/Users/aliceyang/ADNI1plus2_Longitidinal_Cortical_Subcortical/Cortical_Thickness_SurfArea_Final/ADNI1plus2_cortical_Converter_final_sort.csv


In [20]:
# Calcuate the number of participants
n_subject_convert = len(df_pheno_merge_convert)
grm_convert = load_grm_info_from_file(n_subject_convert, filename = None)

# load phenotype information and get average measures as the longitudinal measure to train/test
total_measure_convert = []
longit_measure_convert = []
for roi, longit_roi in dict_roi_name.items():
    total_measure_convert.append(df_pheno_merge_convert[longit_roi])

longit_measure_convert = np.mean(total_measure_convert, axis = 0)
longit_measure_convert = np.reshape(longit_measure_convert, [-1, 1], order = 'C')

In [22]:
np.shape(longit_measure_convert)

(366, 1)

In [23]:
# the number of measurements per each subject
#subset_df = df[dict_roi_name[0]]
count_measure_convert = []
for roi, col_name in dict_roi_name.items():
    # left and right
    count_convert = df_pheno_merge_convert[col_name].notna().sum(axis = 1)
    count_measure_convert.append(count_convert)
measurement_convert = np.min(count_measure_convert, axis = 0)

# get mapping matrix T to map from cross sectional to longitudinal
T_convert = mapping_matrix_T(measurement_convert)
print(len(T_convert))

366


In [24]:
# do we need group info for cross validation from meausrement? genetic correlated?
k = 0
groups_convert = []
for cnt_meas in measurement_convert:
    for i in range(cnt_meas):
        groups_convert.append(k)
    k = k + 1
groups_convert = np.array(groups_convert)

# create subject level to represent scannner/location specific effects
SubjID_convert = df_pheno_merge_convert[subject_ID_name]
subject_level_convert = [ID[:subject_ID_slice_length] for ID in SubjID_convert]

# load ranfom effect information
longit_grm_convert, longit_C1_convert, longit_C2_convert, longit_block_diag_matrix_convert, longit_error_convert = load_random_effect_info_from_file(grm_convert, measurement_convert, subject_level_convert, T_convert)
#print(longit_C1)
#print(longit_C2)
#print(longit_block_diag_matrix)
longit_X_convert = load_covar_info_from_file(df_pheno_merge_convert, covar_stable_name_list, dict_dynamic_cov_name, measurement_convert, T_convert)

APOE4


In [28]:
print(longit_X_convert)
np.mean(longit_X_convert[:, 1])
np.std(longit_X_convert[:, 1])

[[ 1.  65.3  0.   1. ]
 [ 1.  66.4  0.   1. ]
 [ 1.  67.5  0.   1. ]
 ...
 [ 1.  74.4  1.   2. ]
 [ 1.  75.5  1.   2. ]
 [ 1.  76.5  1.   2. ]]


7.462564264618788

In [35]:
(longit_X_convert[:, 3] == 0).sum()/3

41.0

In [None]:
41/56/24

# Linear Regression/MMHE results 

In [24]:
import statsmodels.api as sm

In [25]:
model = sm.OLS(longit_measure_convert, longit_X_convert)

In [26]:
results = model.fit()

In [27]:
results.params

array([ 3.16793109, -0.0133062 , -0.05034552, -0.01498589])

In [100]:
results.summary2().tables[1]['P>|t|']

const    1.122631e-95
x1       3.422980e-18
x2       2.275139e-02
x3       3.088727e-01
Name: P>|t|, dtype: float64

In [28]:
from sklearn.model_selection import KFold

In [29]:
# Define the number of folds for cross-validation
n_splits = 5
kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

In [30]:
# Initialize lists to store evaluation metrics
mse_train_scores = []
mae_train_scores = []
r2_train_scores = []

# Initialize lists to store evaluation metrics
mse_test_scores = []
mae_test_scores = []
r2_test_scores = []

In [32]:
for train_index, test_index in kf.split(longit_X_convert):
    X_train, X_test = longit_X_convert[train_index], longit_X_convert[test_index]
    y_train, y_test = longit_measure_convert[train_index], longit_measure_convert[test_index]
    
    # Fit the model
    model = sm.OLS(y_train, X_train)
    results = model.fit()
    
    # Predict on the test set
    y_pred = results.predict(X_test)
    y_train_pred = results.predict(X_train)
    # Calculate evaluation metrics
    mse_test_scores.append(mean_squared_error(y_test, y_pred))
    mae_test_scores.append(mean_absolute_error(y_test, y_pred))
    r2_test_scores.append(r2_score(y_test, y_pred))
    
    mse_train_scores.append(mean_squared_error(y_train, y_train_pred))
    mae_train_scores.append(mean_absolute_error(y_train, y_train_pred))
    r2_train_scores.append(r2_score(y_train, y_train_pred))

In [33]:
print(np.sqrt(np.mean(mse_train_scores)))
print(np.mean(mae_train_scores))
print(np.mean(r2_train_scores))

print(np.sqrt(np.mean(mse_test_scores)))
print(np.mean(mae_test_scores))
print(np.mean(r2_test_scores))

0.2011042562507692
0.16121302929603792
0.21720106998598201
0.20234420981601342
0.16181837675590705
0.19737593057135275
