In [1]:
import os
import h5py
import numpy as np
import pandas as pd
from scipy.io import loadmat
from scipy.stats import pearsonr

### get the HCP demographics

In [2]:
def _load_hcp_table(csv_path):
    """Read one HCP csv, using its first column as the index.

    The index values are converted to ``str`` so that rows can be looked up
    with the string subject ids used in the rest of the notebook.
    """
    table = pd.read_csv(csv_path, index_col=0)
    table.index = table.index.map(str)
    return table


# Unrestricted table (source of 'Gender') and restricted table (source of
# 'Age_in_Yrs'), both keyed by string subject id.
HCP_u = _load_hcp_table('../data/HCP_solar/HCP_s1200_unrestricted.csv')
HCP_r = _load_hcp_table('../data/HCP_solar/HCP_s1200_restricted.csv')


In [3]:
# Row counts of the unrestricted and restricted tables, shown side by side.
HCP_u.shape[0], HCP_r.shape[0]

(1206, 1206)

### combine S900 gradients with HCP demographics

In [5]:
# get HCP - S900 subject list
# The text file is read as one subject id per line.
subjlist = '../data/subjectListS900_QC_gr.txt'
with open(subjlist) as f:  # context manager closes the handle even if read() raises
    mylist = f.read().split("\n")

# Keep every non-empty entry instead of slicing off the last element: the
# final id survives when the file has no trailing newline, and blank or
# whitespace-only lines never become subject ids.
subjlist = joinedlist = [entry.strip() for entry in mylist if entry.strip()]

len(subjlist)

709

### G1, LSUB

In [6]:
# Independent copy of the subject ids; they become the row index below.
mysubjects = list(subjlist)
print(len(mysubjects))

# One column label per point: node_1 ... node_1024.
tot_node_num_lsub = 1024 ## because subiculum has 1024 points
node_str = ['node_' + str(k) for k in range(1, tot_node_num_lsub + 1)]
print(len(node_str))

# Empty (all-NaN) subjects x [age, sex, nodes] table, to be filled later.
mycols = ['age', 'sex', *node_str]
df = pd.DataFrame(columns=mycols, index=mysubjects)
df.index.name = 'id'

709
1024


In [7]:
gdir = '../data/tout_hippoc_grad_flipped_msm50'

# Fill one row per subject: age (restricted table), sex (unrestricted table)
# and the G1 values read from that subject's LSUB HDF5 file.
for subjID in mysubjects:

    # Single-step label-based assignment. The chained form
    # df.iloc[i]['col'] = v assigns into an intermediate row object and is
    # not guaranteed to reach df (it never does under pandas Copy-on-Write).
    df.loc[subjID, 'age'] = HCP_r.loc[subjID, 'Age_in_Yrs']
    df.loc[subjID, 'sex'] = HCP_u.loc[subjID, 'Gender']

    # The dataset is looked up under the subject id; the context manager
    # closes the file even if the read raises.
    with h5py.File(os.path.join(gdir, 'HCP_' + subjID + '_G1_LSUB.h5'), 'r') as gfile_LSUB:
        g1_LSUB = np.array(gfile_LSUB[subjID])

    # ravel() so a (1, n) or (n, 1) dataset still maps onto the n node
    # columns (the stored shape is not visible from this notebook).
    df.loc[subjID, node_str] = np.ravel(g1_LSUB)


In [11]:
#df

In [9]:
# Keep only subjects whose row has no missing value, then show how many remain.
df = df.dropna(axis=0, how='any')
df.shape[0]

709

In [10]:
# Write the table to CSV; the index is written as the first column.
out_csv = '../solar/msm50_G1_LSUB/G1_LSUB.csv'
df.to_csv(out_csv)

### G1, LCA

In [11]:
# Independent copy of the subject ids; they become the row index below.
mysubjects = list(subjlist)
print(len(mysubjects))

# One column label per point: node_1 ... node_2048.
tot_node_num_lca = 2048 ## because ca has 2048 points
node_str = ['node_' + str(k) for k in range(1, tot_node_num_lca + 1)]
print(len(node_str))

# Empty (all-NaN) subjects x [age, sex, nodes] table, to be filled later.
mycols = ['age', 'sex', *node_str]
df = pd.DataFrame(columns=mycols, index=mysubjects)
df.index.name = 'id'

709
2048


In [12]:
gdir = '../data/tout_hippoc_grad_flipped_msm50'

# Fill one row per subject: age (restricted table), sex (unrestricted table)
# and the G1 values read from that subject's LCA HDF5 file.
for subjID in mysubjects:

    # Single-step label-based assignment. The chained form
    # df.iloc[i]['col'] = v assigns into an intermediate row object and is
    # not guaranteed to reach df (it never does under pandas Copy-on-Write).
    df.loc[subjID, 'age'] = HCP_r.loc[subjID, 'Age_in_Yrs']
    df.loc[subjID, 'sex'] = HCP_u.loc[subjID, 'Gender']

    # The dataset is looked up under the subject id; the context manager
    # closes the file even if the read raises.
    with h5py.File(os.path.join(gdir, 'HCP_' + subjID + '_G1_LCA.h5'), 'r') as gfile_LCA:
        g1_LCA = np.array(gfile_LCA[subjID])

    # ravel() so a (1, n) or (n, 1) dataset still maps onto the n node
    # columns (the stored shape is not visible from this notebook).
    df.loc[subjID, node_str] = np.ravel(g1_LCA)


In [13]:
# Keep only subjects whose row has no missing value, then show how many remain.
df = df.dropna(axis=0, how='any')
df.shape[0]

709

In [14]:
# Write the table to CSV; the index is written as the first column.
out_csv = '../solar/msm50_G1_LCA/G1_LCA.csv'
df.to_csv(out_csv)

### G1, LDG

In [15]:
# Independent copy of the subject ids; they become the row index below.
mysubjects = list(subjlist)
print(len(mysubjects))

# One column label per point: node_1 ... node_1024.
tot_node_num_ldg = 1024 ## because dg has 1024 points
node_str = ['node_' + str(k) for k in range(1, tot_node_num_ldg + 1)]
print(len(node_str))

# Empty (all-NaN) subjects x [age, sex, nodes] table, to be filled later.
mycols = ['age', 'sex', *node_str]
df = pd.DataFrame(columns=mycols, index=mysubjects)
df.index.name = 'id'

709
1024


In [16]:
gdir = '../data/tout_hippoc_grad_flipped_msm50'

# Fill one row per subject: age (restricted table), sex (unrestricted table)
# and the G1 values read from that subject's LDG HDF5 file.
for subjID in mysubjects:

    # Single-step label-based assignment. The chained form
    # df.iloc[i]['col'] = v assigns into an intermediate row object and is
    # not guaranteed to reach df (it never does under pandas Copy-on-Write).
    df.loc[subjID, 'age'] = HCP_r.loc[subjID, 'Age_in_Yrs']
    df.loc[subjID, 'sex'] = HCP_u.loc[subjID, 'Gender']

    # The dataset is looked up under the subject id; the context manager
    # closes the file even if the read raises.
    with h5py.File(os.path.join(gdir, 'HCP_' + subjID + '_G1_LDG.h5'), 'r') as gfile_LDG:
        g1_LDG = np.array(gfile_LDG[subjID])

    # ravel() so a (1, n) or (n, 1) dataset still maps onto the n node
    # columns (the stored shape is not visible from this notebook).
    df.loc[subjID, node_str] = np.ravel(g1_LDG)

    


In [17]:
# Keep only subjects whose row has no missing value, then show how many remain.
df = df.dropna(axis=0, how='any')
df.shape[0]

709

In [18]:
# Write the table to CSV; the index is written as the first column.
out_csv = '../solar/msm50_G1_LDG/G1_LDG.csv'
df.to_csv(out_csv)

### G1, RSUB

In [19]:
# Independent copy of the subject ids; they become the row index below.
mysubjects = list(subjlist)
print(len(mysubjects))

# One column label per point: node_1 ... node_1024.
# NOTE(review): the constant keeps its 'lsub' suffix although this cell sits
# under the RSUB header; the name is left unchanged.
tot_node_num_lsub = 1024 ## because subiculum has 1024 points
node_str = ['node_' + str(k) for k in range(1, tot_node_num_lsub + 1)]
print(len(node_str))

# Empty (all-NaN) subjects x [age, sex, nodes] table, to be filled later.
mycols = ['age', 'sex', *node_str]
df = pd.DataFrame(columns=mycols, index=mysubjects)
df.index.name = 'id'

709
1024


In [20]:
gdir = '../data/tout_hippoc_grad_flipped_msm50'

# Fill one row per subject: age (restricted table), sex (unrestricted table)
# and the G1 values read from that subject's RSUB HDF5 file.
for subjID in mysubjects:

    # Single-step label-based assignment. The chained form
    # df.iloc[i]['col'] = v assigns into an intermediate row object and is
    # not guaranteed to reach df (it never does under pandas Copy-on-Write).
    df.loc[subjID, 'age'] = HCP_r.loc[subjID, 'Age_in_Yrs']
    df.loc[subjID, 'sex'] = HCP_u.loc[subjID, 'Gender']

    # The dataset is looked up under the subject id; the context manager
    # closes the file even if the read raises.
    with h5py.File(os.path.join(gdir, 'HCP_' + subjID + '_G1_RSUB.h5'), 'r') as gfile_RSUB:
        g1_RSUB = np.array(gfile_RSUB[subjID])

    # ravel() so a (1, n) or (n, 1) dataset still maps onto the n node
    # columns (the stored shape is not visible from this notebook).
    df.loc[subjID, node_str] = np.ravel(g1_RSUB)


In [21]:
# Keep only subjects whose row has no missing value, then show how many remain.
df = df.dropna(axis=0, how='any')
df.shape[0]

709

In [22]:
# Write the table to CSV; the index is written as the first column.
out_csv = '../solar/msm50_G1_RSUB/G1_RSUB.csv'
df.to_csv(out_csv)

### G1, RCA

In [23]:
# Independent copy of the subject ids; they become the row index below.
mysubjects = list(subjlist)
print(len(mysubjects))

# One column label per point: node_1 ... node_2048.
# NOTE(review): the constant keeps its 'lca' suffix although this cell sits
# under the RCA header; the name is left unchanged.
tot_node_num_lca = 2048 ## because ca has 2048 points
node_str = ['node_' + str(k) for k in range(1, tot_node_num_lca + 1)]
print(len(node_str))

# Empty (all-NaN) subjects x [age, sex, nodes] table, to be filled later.
mycols = ['age', 'sex', *node_str]
df = pd.DataFrame(columns=mycols, index=mysubjects)
df.index.name = 'id'

709
2048


In [24]:
gdir = '../data/tout_hippoc_grad_flipped_msm50'

# Fill one row per subject: age (restricted table), sex (unrestricted table)
# and the G1 values read from that subject's RCA HDF5 file.
for subjID in mysubjects:

    # Single-step label-based assignment. The chained form
    # df.iloc[i]['col'] = v assigns into an intermediate row object and is
    # not guaranteed to reach df (it never does under pandas Copy-on-Write).
    df.loc[subjID, 'age'] = HCP_r.loc[subjID, 'Age_in_Yrs']
    df.loc[subjID, 'sex'] = HCP_u.loc[subjID, 'Gender']

    # The dataset is looked up under the subject id; the context manager
    # closes the file even if the read raises.
    with h5py.File(os.path.join(gdir, 'HCP_' + subjID + '_G1_RCA.h5'), 'r') as gfile_RCA:
        g1_RCA = np.array(gfile_RCA[subjID])

    # ravel() so a (1, n) or (n, 1) dataset still maps onto the n node
    # columns (the stored shape is not visible from this notebook).
    df.loc[subjID, node_str] = np.ravel(g1_RCA)


In [25]:
# Keep only subjects whose row has no missing value, then show how many remain.
df = df.dropna(axis=0, how='any')
df.shape[0]

709

In [26]:
# Write the table to CSV; the index is written as the first column.
out_csv = '../solar/msm50_G1_RCA/G1_RCA.csv'
df.to_csv(out_csv)

### G1, RDG

In [27]:
# Independent copy of the subject ids; they become the row index below.
mysubjects = list(subjlist)
print(len(mysubjects))

# One column label per point: node_1 ... node_1024.
# NOTE(review): the constant keeps its 'ldg' suffix although this cell sits
# under the RDG header; the name is left unchanged.
tot_node_num_ldg = 1024 ## because dg has 1024 points
node_str = ['node_' + str(k) for k in range(1, tot_node_num_ldg + 1)]
print(len(node_str))

# Empty (all-NaN) subjects x [age, sex, nodes] table, to be filled later.
mycols = ['age', 'sex', *node_str]
df = pd.DataFrame(columns=mycols, index=mysubjects)
df.index.name = 'id'

709
1024


In [28]:
gdir = '../data/tout_hippoc_grad_flipped_msm50'

# Fill one row per subject: age (restricted table), sex (unrestricted table)
# and the G1 values read from that subject's RDG HDF5 file.
for subjID in mysubjects:

    # Single-step label-based assignment. The chained form
    # df.iloc[i]['col'] = v assigns into an intermediate row object and is
    # not guaranteed to reach df (it never does under pandas Copy-on-Write).
    df.loc[subjID, 'age'] = HCP_r.loc[subjID, 'Age_in_Yrs']
    df.loc[subjID, 'sex'] = HCP_u.loc[subjID, 'Gender']

    # The dataset is looked up under the subject id; the context manager
    # closes the file even if the read raises.
    with h5py.File(os.path.join(gdir, 'HCP_' + subjID + '_G1_RDG.h5'), 'r') as gfile_RDG:
        g1_RDG = np.array(gfile_RDG[subjID])

    # ravel() so a (1, n) or (n, 1) dataset still maps onto the n node
    # columns (the stored shape is not visible from this notebook).
    df.loc[subjID, node_str] = np.ravel(g1_RDG)



In [29]:
# Keep only subjects whose row has no missing value, then show how many remain.
df = df.dropna(axis=0, how='any')
df.shape[0]

709

In [30]:
# Write the table to CSV; the index is written as the first column.
out_csv = '../solar/msm50_G1_RDG/G1_RDG.csv'
df.to_csv(out_csv)

### G2, LSUB

In [31]:
# Independent copy of the subject ids; they become the row index below.
mysubjects = list(subjlist)
print(len(mysubjects))

# One column label per point: node_1 ... node_1024.
tot_node_num_lsub = 1024 ## because subiculum has 1024 points
node_str = ['node_' + str(k) for k in range(1, tot_node_num_lsub + 1)]
print(len(node_str))

# Empty (all-NaN) subjects x [age, sex, nodes] table, to be filled later.
mycols = ['age', 'sex', *node_str]
df = pd.DataFrame(columns=mycols, index=mysubjects)
df.index.name = 'id'

709
1024


In [32]:
gdir = '../data/tout_hippoc_grad_flipped_msm50'

# Fill one row per subject: age (restricted table), sex (unrestricted table)
# and the G2 values read from that subject's LSUB HDF5 file.
for subjID in mysubjects:

    # Single-step label-based assignment. The chained form
    # df.iloc[i]['col'] = v assigns into an intermediate row object and is
    # not guaranteed to reach df (it never does under pandas Copy-on-Write).
    df.loc[subjID, 'age'] = HCP_r.loc[subjID, 'Age_in_Yrs']
    df.loc[subjID, 'sex'] = HCP_u.loc[subjID, 'Gender']

    # The dataset is looked up under the subject id; the context manager
    # closes the file even if the read raises.
    with h5py.File(os.path.join(gdir, 'HCP_' + subjID + '_G2_LSUB.h5'), 'r') as gfile_LSUB:
        g2_LSUB = np.array(gfile_LSUB[subjID])

    # ravel() so a (1, n) or (n, 1) dataset still maps onto the n node
    # columns (the stored shape is not visible from this notebook).
    df.loc[subjID, node_str] = np.ravel(g2_LSUB)


In [33]:
# Keep only subjects whose row has no missing value, then show how many remain.
df = df.dropna(axis=0, how='any')
df.shape[0]

709

In [34]:
# Write the table to CSV; the index is written as the first column.
out_csv = '../solar/msm50_G2_LSUB/G2_LSUB.csv'
df.to_csv(out_csv)

### G2, LCA

In [37]:
# Independent copy of the subject ids; they become the row index below.
mysubjects = list(subjlist)
print(len(mysubjects))

# One column label per point: node_1 ... node_2048.
tot_node_num_lca = 2048 ## because ca has 2048 points
node_str = ['node_' + str(k) for k in range(1, tot_node_num_lca + 1)]
print(len(node_str))

# Empty (all-NaN) subjects x [age, sex, nodes] table, to be filled later.
mycols = ['age', 'sex', *node_str]
df = pd.DataFrame(columns=mycols, index=mysubjects)
df.index.name = 'id'

709
2048


In [38]:
gdir = '../data/tout_hippoc_grad_flipped_msm50'

# Fill one row per subject: age (restricted table), sex (unrestricted table)
# and the G2 values read from that subject's LCA HDF5 file.
for subjID in mysubjects:

    # Single-step label-based assignment. The chained form
    # df.iloc[i]['col'] = v assigns into an intermediate row object and is
    # not guaranteed to reach df (it never does under pandas Copy-on-Write).
    df.loc[subjID, 'age'] = HCP_r.loc[subjID, 'Age_in_Yrs']
    df.loc[subjID, 'sex'] = HCP_u.loc[subjID, 'Gender']

    # The dataset is looked up under the subject id; the context manager
    # closes the file even if the read raises.
    with h5py.File(os.path.join(gdir, 'HCP_' + subjID + '_G2_LCA.h5'), 'r') as gfile_LCA:
        g2_LCA = np.array(gfile_LCA[subjID])

    # ravel() so a (1, n) or (n, 1) dataset still maps onto the n node
    # columns (the stored shape is not visible from this notebook).
    df.loc[subjID, node_str] = np.ravel(g2_LCA)


In [39]:
# Keep only subjects whose row has no missing value, then show how many remain.
df = df.dropna(axis=0, how='any')
df.shape[0]

709

In [40]:
# Write the table to CSV; the index is written as the first column.
out_csv = '../solar/msm50_G2_LCA/G2_LCA.csv'
df.to_csv(out_csv)

### G2, LDG

In [41]:
# Independent copy of the subject ids; they become the row index below.
mysubjects = list(subjlist)
print(len(mysubjects))

# One column label per point: node_1 ... node_1024.
tot_node_num_ldg = 1024 ## because dg has 1024 points
node_str = ['node_' + str(k) for k in range(1, tot_node_num_ldg + 1)]
print(len(node_str))

# Empty (all-NaN) subjects x [age, sex, nodes] table, to be filled later.
mycols = ['age', 'sex', *node_str]
df = pd.DataFrame(columns=mycols, index=mysubjects)
df.index.name = 'id'

709
1024


In [42]:
gdir = '../data/tout_hippoc_grad_flipped_msm50'

# Fill one row per subject: age (restricted table), sex (unrestricted table)
# and the G2 values read from that subject's LDG HDF5 file.
for subjID in mysubjects:

    # Single-step label-based assignment. The chained form
    # df.iloc[i]['col'] = v assigns into an intermediate row object and is
    # not guaranteed to reach df (it never does under pandas Copy-on-Write).
    df.loc[subjID, 'age'] = HCP_r.loc[subjID, 'Age_in_Yrs']
    df.loc[subjID, 'sex'] = HCP_u.loc[subjID, 'Gender']

    # The dataset is looked up under the subject id; the context manager
    # closes the file even if the read raises.
    with h5py.File(os.path.join(gdir, 'HCP_' + subjID + '_G2_LDG.h5'), 'r') as gfile_LDG:
        g2_LDG = np.array(gfile_LDG[subjID])

    # ravel() so a (1, n) or (n, 1) dataset still maps onto the n node
    # columns (the stored shape is not visible from this notebook).
    df.loc[subjID, node_str] = np.ravel(g2_LDG)

    


In [43]:
# Keep only subjects whose row has no missing value, then show how many remain.
df = df.dropna(axis=0, how='any')
df.shape[0]

709

In [44]:
# Write the table to CSV; the index is written as the first column.
out_csv = '../solar/msm50_G2_LDG/G2_LDG.csv'
df.to_csv(out_csv)

### G2, RSUB

In [46]:
# Independent copy of the subject ids; they become the row index below.
mysubjects = list(subjlist)
print(len(mysubjects))

# One column label per point: node_1 ... node_1024.
# NOTE(review): the constant keeps its 'lsub' suffix although this cell sits
# under the RSUB header; the name is left unchanged.
tot_node_num_lsub = 1024 ## because subiculum has 1024 points
node_str = ['node_' + str(k) for k in range(1, tot_node_num_lsub + 1)]
print(len(node_str))

# Empty (all-NaN) subjects x [age, sex, nodes] table, to be filled later.
mycols = ['age', 'sex', *node_str]
df = pd.DataFrame(columns=mycols, index=mysubjects)
df.index.name = 'id'

709
1024


In [47]:
gdir = '../data/tout_hippoc_grad_flipped_msm50'

# Fill one row per subject: age (restricted table), sex (unrestricted table)
# and the G2 values read from that subject's RSUB HDF5 file.
for subjID in mysubjects:

    # Single-step label-based assignment. The chained form
    # df.iloc[i]['col'] = v assigns into an intermediate row object and is
    # not guaranteed to reach df (it never does under pandas Copy-on-Write).
    df.loc[subjID, 'age'] = HCP_r.loc[subjID, 'Age_in_Yrs']
    df.loc[subjID, 'sex'] = HCP_u.loc[subjID, 'Gender']

    # The dataset is looked up under the subject id; the context manager
    # closes the file even if the read raises.
    with h5py.File(os.path.join(gdir, 'HCP_' + subjID + '_G2_RSUB.h5'), 'r') as gfile_RSUB:
        g2_RSUB = np.array(gfile_RSUB[subjID])

    # ravel() so a (1, n) or (n, 1) dataset still maps onto the n node
    # columns (the stored shape is not visible from this notebook).
    df.loc[subjID, node_str] = np.ravel(g2_RSUB)


In [48]:
# Keep only subjects whose row has no missing value, then show how many remain.
df = df.dropna(axis=0, how='any')
df.shape[0]

709

In [49]:
# Write the table to CSV; the index is written as the first column.
out_csv = '../solar/msm50_G2_RSUB/G2_RSUB.csv'
df.to_csv(out_csv)

### G2, RCA

In [50]:
# Independent copy of the subject ids; they become the row index below.
mysubjects = list(subjlist)
print(len(mysubjects))

# One column label per point: node_1 ... node_2048.
# NOTE(review): the constant keeps its 'lca' suffix although this cell sits
# under the RCA header; the name is left unchanged.
tot_node_num_lca = 2048 ## because ca has 2048 points
node_str = ['node_' + str(k) for k in range(1, tot_node_num_lca + 1)]
print(len(node_str))

# Empty (all-NaN) subjects x [age, sex, nodes] table, to be filled later.
mycols = ['age', 'sex', *node_str]
df = pd.DataFrame(columns=mycols, index=mysubjects)
df.index.name = 'id'

709
2048


In [51]:
# Bind the gradient directory to `gdir`, the name the loop below actually
# reads, so this cell does not depend on a value left over from another cell.
gdir = '../data/tout_hippoc_grad_flipped_msm50'

# Fill one row per subject: age (restricted table), sex (unrestricted table)
# and the G2 values read from that subject's RCA HDF5 file.
for subjID in mysubjects:

    # Single-step label-based assignment. The chained form
    # df.iloc[i]['col'] = v assigns into an intermediate row object and is
    # not guaranteed to reach df (it never does under pandas Copy-on-Write).
    df.loc[subjID, 'age'] = HCP_r.loc[subjID, 'Age_in_Yrs']
    df.loc[subjID, 'sex'] = HCP_u.loc[subjID, 'Gender']

    # The dataset is looked up under the subject id; the context manager
    # closes the file even if the read raises.
    with h5py.File(os.path.join(gdir, 'HCP_' + subjID + '_G2_RCA.h5'), 'r') as gfile_RCA:
        g2_RCA = np.array(gfile_RCA[subjID])

    # ravel() so a (1, n) or (n, 1) dataset still maps onto the n node
    # columns (the stored shape is not visible from this notebook).
    df.loc[subjID, node_str] = np.ravel(g2_RCA)


In [52]:
# Keep only subjects whose row has no missing value, then show how many remain.
df = df.dropna(axis=0, how='any')
df.shape[0]

709

In [53]:
# Write the table to CSV; the index is written as the first column.
out_csv = '../solar/msm50_G2_RCA/G2_RCA.csv'
df.to_csv(out_csv)

### G2, RDG

In [54]:
# Independent copy of the subject ids; they become the row index below.
mysubjects = list(subjlist)
print(len(mysubjects))

# One column label per point: node_1 ... node_1024.
# NOTE(review): the constant keeps its 'ldg' suffix although this cell sits
# under the RDG header; the name is left unchanged.
tot_node_num_ldg = 1024 ## because dg has 1024 points
node_str = ['node_' + str(k) for k in range(1, tot_node_num_ldg + 1)]
print(len(node_str))

# Empty (all-NaN) subjects x [age, sex, nodes] table, to be filled later.
mycols = ['age', 'sex', *node_str]
df = pd.DataFrame(columns=mycols, index=mysubjects)
df.index.name = 'id'

709
1024


In [55]:
gdir = '../data/tout_hippoc_grad_flipped_msm50'

# Fill one row per subject: age (restricted table), sex (unrestricted table)
# and the G2 values read from that subject's RDG HDF5 file.
for subjID in mysubjects:

    # Single-step label-based assignment. The chained form
    # df.iloc[i]['col'] = v assigns into an intermediate row object and is
    # not guaranteed to reach df (it never does under pandas Copy-on-Write).
    df.loc[subjID, 'age'] = HCP_r.loc[subjID, 'Age_in_Yrs']
    df.loc[subjID, 'sex'] = HCP_u.loc[subjID, 'Gender']

    # The dataset is looked up under the subject id; the context manager
    # closes the file even if the read raises.
    with h5py.File(os.path.join(gdir, 'HCP_' + subjID + '_G2_RDG.h5'), 'r') as gfile_RDG:
        g2_RDG = np.array(gfile_RDG[subjID])

    # ravel() so a (1, n) or (n, 1) dataset still maps onto the n node
    # columns (the stored shape is not visible from this notebook).
    df.loc[subjID, node_str] = np.ravel(g2_RDG)


In [56]:
# Keep only subjects whose row has no missing value, then show how many remain.
df = df.dropna(axis=0, how='any')
df.shape[0]

709

In [57]:
# Write the table to CSV; the index is written as the first column.
out_csv = '../solar/msm50_G2_RDG/G2_RDG.csv'
df.to_csv(out_csv)