In [1]:
# load global and regional structural data
import pandas as pd

# my data file path
data = pd.read_csv('/data/sliu/updated_ukbb2/raw_data/matched_ukbb.csv')

# the items of global structural measures
total_items = pd.read_csv('/data/sliu/updated_ukbb2/raw_data/total_items.csv')
s1 = ['karin_IDs']
for i in range(total_items.shape[0]):
    s1.append(str(total_items.iloc[i,0])+'-2.0')
    
# the items of regional structural measures
regional_items = pd.read_csv('/data/sliu/updated_ukbb2/raw_data/regional_items.csv')
s2 = ['karin_IDs']
for i in range(regional_items.shape[0]):
    s2.append(str(regional_items.iloc[i,0])+'-2.0')

In [2]:
# total brain measures (total CSA, average CT and ICV)
T_data = data[s1]
T_data.dropna(axis=0,how='any',inplace=True)

# regional brain measures (CSA, CT based on DKT atlas)
R_data = data[s2]
R_data.dropna(axis=0,how='any',inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


In [3]:
# load covariates (age, sex, head positions, the top 25 principal components)
temp = pd.read_csv('/data/sliu/updated_ukbb/added_used_variables.csv')
s = ['eid','25756-2.0','25757-2.0','25758-2.0']
QC1 = temp[s]
temp2 = pd.read_csv('/data/sliu/muti_PRSs/ukb30545.sample_QC.csv')
s2 = ['eid','age_at_reqruitment','genetic_sex']
QC2 = temp2[s2]
PCs = pd.read_table('/data/sliu/muti_PRSs/UKB.HM3.EUR.100PCs.txt','\t')

# load Polygenic scores for 14 mental health and cognitive traits
PRS_data = pd.read_csv('/data/sliu/updated_ukbb/ukb_PRSs.csv')

  


In [4]:
# matching subject IDs
T_data.set_index('karin_IDs', inplace=True)
QC1.set_index('eid',inplace=True)
QC2.set_index('eid',inplace=True)
PCs.set_index('IID',inplace=True)
PRS_data.set_index('FID',inplace=True)
l = list(set(T_data.index) & set(QC1.index) & set(QC2.index) & set(PCs.index) & set(PRS_data.index))
final_Ts = T_data.loc[l]
final_QC1 = QC1.loc[l]
final_QC2 = QC2.loc[l]
final_PCs = PCs.loc[l]
final_PRSs = PRS_data.loc[l]

final_Ts.reset_index(inplace=True)
final_QC1.reset_index(inplace=True)
final_QC2.reset_index(inplace=True)
final_PCs.reset_index(inplace=True)
final_PRSs.reset_index(inplace=True)

In [5]:
# merge covariates together 
import numpy as np
PCs_25 = final_PCs.iloc[:,2:27].values
age = final_QC2.iloc[:,1:2].values
sex = final_QC2.iloc[:,2:3].values
postions = final_QC1.iloc[:,1:4].values
sex = sex + 1
co = np.hstack((PCs_25,age,sex,age*age,age*sex,age*age*sex,postions))

In [7]:
X = final_PRSs.iloc[:,1:].values

In [8]:
TA = (final_Ts.iloc[:,1].values + final_Ts.iloc[:,2].values)/2
AT = (final_Ts.iloc[:,3].values + final_Ts.iloc[:,4].values)/2
ICV = final_Ts.iloc[:,5].values

In [17]:
# function used for regressing out the effects of covariates 
from sklearn.preprocessing import StandardScaler

def regression_covariant(covariant_matrix, y, standard_scale=False):
    a = np.hstack((covariant_matrix,np.ones((covariant_matrix.shape[0], 1))))
    w = np.linalg.lstsq(a,y,rcond=None)[0]

    residual = y - covariant_matrix.dot(w[:-1])
    residual = residual.astype('float64')

    if standard_scale:
        residual = StandardScaler().fit_transform(residual.reshape(-1,1)).flatten()

    return residual, w

In [28]:
# Pearson's correlation analysis between PRSs and global structural measures
from scipy.stats import pearsonr
size1 = X.shape[1]
R = np.empty([size1,3])
P = np.empty([size1,3])
[r_TA,w] = regression_covariant(co,TA,standard_scale=True)
[r_AT,w] = regression_covariant(co,AT,standard_scale=True)
[r_ICV,w] = regression_covariant(co,ICV,standard_scale=True)
for i in range(size1):
    x = X[:,i]
    [rx,w1] = regression_covariant(co,x,standard_scale=True)
    r1,p1 = pearsonr(rx, r_TA)
    r2,p2 = pearsonr(rx, r_AT)
    r3,p3 = pearsonr(rx, r_ICV)
    R[i,0] = r1
    R[i,1] = r2
    R[i,2] = r3
    P[i,0] = p1
    P[i,1] = p2
    P[i,2] = p3
    if i == 7 or i == 8 or i == 11:
        R[i,:] = -R[i,:]

In [33]:
# FDR multiple comparison correction
from statsmodels.stats import multitest
size = P.shape
temp_p = P.flatten()
Ps = multitest.multipletests(temp_p,alpha=0.05,method='fdr_bh')
P_corrected = Ps[1].reshape(size)

In [35]:
# output results
re_R = pd.DataFrame(data=R)
re_R.to_csv('/data/sliu/updated_ukbb2/total_results/re_R.csv',index=False)
re_P = pd.DataFrame(data=P)
re_P.to_csv('/data/sliu/updated_ukbb2/total_results/re_P.csv',index=False)
re_P_corrected = pd.DataFrame(data=P_corrected)
re_P_corrected.to_csv('/data/sliu/updated_ukbb2/total_results/re_P_corrected.csv',index=False)