In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import pyreadstat
import seaborn as sns
from scipy import stats

cmap = plt.cm.rainbow
import h5py

In [2]:
def format_df(file_path, cats):
    """Load a CSV and return the selected columns encoded as floats.

    Parameters
    ----------
    file_path : str or path-like
        Comma-separated file to read.
    cats : list of str
        Names of the columns to keep.

    Returns
    -------
    df_original : pandas.DataFrame
        The full table exactly as read from ``file_path``.
    df_vars : pandas.DataFrame
        Copy of the ``cats`` columns cast to float64. String/categorical
        columns are replaced by the integer position of each value in the
        column's (sorted) category list; missing values stay NaN.
    idx : list
        Row index labels of ``df_vars``.
    """
    df_original = pd.read_csv(file_path, sep=",")
    df_vars = df_original[cats].copy()  # only take selected variables

    for cat_i in cats:
        column = df_vars[cat_i]

        # Text columns arrive as object dtype (or the dedicated string dtype
        # in newer pandas); turn them into categoricals first.
        if pd.api.types.is_object_dtype(column) or pd.api.types.is_string_dtype(column):
            column = column.astype('category')

        if isinstance(column.dtype, pd.CategoricalDtype):
            # Assign the codes back explicitly. The previous
            # `df_vars[cat_i].replace(..., inplace=True)` mutated a temporary
            # selection, which is a silent no-op under pandas Copy-on-Write.
            codes = column.cat.codes.astype('float64')
            # cat.codes marks missing values with -1; keep them as NaN.
            df_vars[cat_i] = codes.where(codes >= 0)

    idx = list(df_vars.index.values)  # the index values per subject

    # return original frame, new dataframe (in float format), subject indices
    return df_original, df_vars.astype('float64'), idx

In [3]:
def create_h5(filename, df_vars, idx, confs=False): #if confs is not given, it defaults to False
    """Write features, optional confounds and the 'Response' label to an HDF5 file.

    Parameters
    ----------
    filename : str or path-like
        Output file; it is opened in "w" mode, so an existing file is overwritten.
    df_vars : pandas.DataFrame
        Table holding a 'Response' column, the confound columns (if any) and
        the feature columns.
    idx : list
        Row index labels of ``df_vars`` to store; written as dataset 'i'.
    confs : list of str or False, optional
        Names of the confound columns. Each one is written as its own dataset
        and removed from 'X'. The column 'Alter' is stored under the dataset
        name 'age'; every other confound keeps its column name.

    File layout
    -----------
    Datasets: 'X' (features), 'Response', 'i', plus one per confound.
    Attributes: 'labels' (always), 'confs' (with confounds) or
    'X_col_names' (without confounds).
    """
    df = df_vars.copy()

    if confs:
        if isinstance(confs, str):
            confs = [confs]
        confs = list(confs)
        # Exact-name mapping. Previously `confs_de` was only assigned when
        # 'Alter' was among the confounds, so any other confound list raised
        # a NameError further down.
        confs_de = ['age' if conf == 'Alter' else conf for conf in confs]
        # features = everything except the confounds and the label
        df_noconfs = df.drop(columns=confs + ['Response'])
    else:
        confs_de = False

    # The context manager closes the file even when a write fails.
    with h5py.File(filename, "w") as f:

        if confs:
            # Rows are selected with idx so X lines up with 'Response', 'i'
            # and the confound datasets.
            f.create_dataset('X', data=df_noconfs.loc[idx])
            # X_col_names is deliberately not written in this branch (it was
            # disabled in the original code as well).

            # NOTE(review): the names are wrapped in an extra list, so the
            # attribute is stored with shape (1, n_confs). Kept as-is because
            # readers of existing files may rely on that shape.
            f.attrs['confs'] = [confs_de]
            for column, dataset_name in zip(confs, confs_de):
                f.create_dataset(dataset_name, data=df.loc[idx, column])
        else:
            df_noX = df.drop(columns=['Response'])
            f.create_dataset('X', data=df_noX.loc[idx])
            # Names of the columns actually stored in X (previously this list
            # also contained 'Response', which X does not include).
            f.attrs['X_col_names'] = list(df_noX.columns)

        f.create_dataset('Response', data=df.loc[idx, 'Response'])
        f.create_dataset('i', data=idx)
        f.attrs['labels'] = ['Response']

        # Summary of what was written (must run while the file is still open).
        print(f.keys(), f.attrs.keys(), "confs:", confs_de)

### Load age, sex and response variables for the HDF5 file

In [4]:
# Input CSV and the columns to keep from it: 'Alter' (age), 'Geschlecht' (sex) and 'Response'.
# NOTE(review): absolute, user-specific path — this cell only runs where that path exists.
file_path = '/home/marijatochadse/1_data/EPOC/EPOC_fMRI_pat.csv'
cats = ['Alter', 'Geschlecht', 'Response']

In [5]:
# Read the CSV: df_vars holds the selected columns as float64, idx the row index labels.
df_original, df_vars, idx = format_df(file_path, cats)

In [6]:
# Display the selected variables after conversion to floats.
df_vars

Unnamed: 0,Alter,Geschlecht,Response
0,55.0,0.0,1.0
1,48.0,0.0,1.0
2,24.0,1.0,0.0
3,33.0,0.0,1.0
4,35.0,1.0,1.0
...,...,...,...
75,23.0,1.0,1.0
76,31.0,0.0,1.0
77,20.0,1.0,1.0
78,36.0,0.0,1.0


### Load the rs-fMRI features and merge them with age, sex and response

In [107]:
# Load the rs-fMRI feature table.
# NOTE(review): absolute, user-specific path — this cell only runs where that path exists.
fMRI_pat = pd.read_csv('/home/marijatochadse/3_output/EPOC/freesurfer_dem_pat/rsfMRI_80pat.csv', sep=",")

In [108]:
# Preview the first rows of the feature table.
fMRI_pat.head()

Unnamed: 0,subject,0,1,2,3,4,5,6,7,8,...,30618,30619,30620,30621,30622,30623,30624,30625,30626,30627
0,epoc_p_1004,0.455885,0.416174,0.108076,0.356433,0.279444,0.305667,0.195758,0.226338,0.249908,...,0.719358,0.542248,0.495881,0.520116,0.814865,0.679056,0.762133,0.818526,0.810374,0.733528
1,epoc_p_1005,0.884573,0.775685,0.731658,0.515574,0.475172,0.606112,0.736526,0.563819,0.558976,...,0.699611,0.64235,0.459945,0.413568,0.87267,0.403599,0.410607,0.649702,0.645313,0.829218
2,epoc_p_1006,0.579738,0.796199,0.389679,0.724294,0.580416,0.721528,0.365477,0.485122,0.364495,...,0.699869,0.564032,0.784806,0.638208,0.931941,0.825529,0.795836,0.812432,0.746147,0.782977
3,epoc_p_1008,0.782747,0.542955,0.285548,0.106172,0.309814,0.634742,0.745536,0.28872,0.322136,...,0.669404,0.701875,0.542652,0.539693,0.774744,0.433234,0.705622,0.766249,0.709413,0.521571
4,epoc_p_1012,0.807958,0.125688,0.284293,0.369072,0.223351,0.596993,0.582179,0.485796,0.453636,...,0.752685,0.68437,0.66154,0.670104,0.795169,0.57812,0.804931,0.831305,0.827954,0.693077


In [109]:
# (rows, columns) of the feature table.
fMRI_pat.shape

(80, 30629)

In [110]:
#df_vars_new = pd.concat([df_vars, aseg_dem_pat.iloc[:, 1:]], axis=1, ignore_index=True, sort=False)
# Append every column of fMRI_pat except the first one to df_vars.
# concat with axis=1 aligns rows on the index labels, not on a subject ID.
# NOTE(review): confirm both tables list the subjects in the same row order.
df_vars_new = pd.concat([df_vars, fMRI_pat.iloc[:, 1:]], axis=1)

In [111]:
# (rows, columns) of the merged table.
df_vars_new.shape

(80, 30631)

In [112]:
# Check the column dtypes of the merged table.
df_vars_new.dtypes

Alter         float64
Geschlecht    float64
Response      float64
0             float64
1             float64
               ...   
30623         float64
30624         float64
30625         float64
30626         float64
30627         float64
Length: 30631, dtype: object

In [113]:
# Preview the first rows of the merged table.
df_vars_new.head()

Unnamed: 0,Alter,Geschlecht,Response,0,1,2,3,4,5,6,...,30618,30619,30620,30621,30622,30623,30624,30625,30626,30627
0,55.0,0.0,1.0,0.455885,0.416174,0.108076,0.356433,0.279444,0.305667,0.195758,...,0.719358,0.542248,0.495881,0.520116,0.814865,0.679056,0.762133,0.818526,0.810374,0.733528
1,48.0,0.0,1.0,0.884573,0.775685,0.731658,0.515574,0.475172,0.606112,0.736526,...,0.699611,0.64235,0.459945,0.413568,0.87267,0.403599,0.410607,0.649702,0.645313,0.829218
2,24.0,1.0,0.0,0.579738,0.796199,0.389679,0.724294,0.580416,0.721528,0.365477,...,0.699869,0.564032,0.784806,0.638208,0.931941,0.825529,0.795836,0.812432,0.746147,0.782977
3,33.0,0.0,1.0,0.782747,0.542955,0.285548,0.106172,0.309814,0.634742,0.745536,...,0.669404,0.701875,0.542652,0.539693,0.774744,0.433234,0.705622,0.766249,0.709413,0.521571
4,35.0,1.0,1.0,0.807958,0.125688,0.284293,0.369072,0.223351,0.596993,0.582179,...,0.752685,0.68437,0.66154,0.670104,0.795169,0.57812,0.804931,0.831305,0.827954,0.693077


In [91]:
# Keep only the first 4000 columns of the merged table.
# NOTE(review): df_vars_test is not passed to create_h5 in the cells shown here —
# presumably a leftover for a smaller test run; confirm or remove.
df_vars_test = df_vars_new.iloc[:,0:4000]

In [92]:
# (rows, columns) of the reduced table.
df_vars_test.shape

(80, 4000)

### Create HDF5 file with MRI, and age and sex as confounds

In [119]:
# Columns to store as separate confound datasets instead of inside X.
confounds = ['Geschlecht', 'Alter']

In [122]:
# Write the HDF5 file: X (all columns except confounds and Response), one dataset
# per confound, Response and the row indices. An existing file is overwritten.
create_h5('EPOC_fMRI_conf_age_sex.h5', df_vars_new, idx, confounds)

<KeysViewHDF5 ['Geschlecht', 'Response', 'X', 'age', 'i']> <KeysViewHDF5 ['confs', 'labels']> confs: ['Geschlecht', 'age']


In [123]:
# check created file
# Opened read-only: the cells that use f3 only inspect the file, so write access
# ('r+') is unnecessary and would allow accidental modification.
f3 = h5py.File('EPOC_fMRI_conf_age_sex.h5','r')

In [124]:
# Names of the datasets stored in the file.
f3.keys()

<KeysViewHDF5 ['Geschlecht', 'Response', 'X', 'age', 'i']>

In [125]:
# Names of the attributes stored on the file.
f3.attrs.keys()

<KeysViewHDF5 ['confs', 'labels']>

In [126]:
# create_h5 does not write 'X_col_names' when confounds are given, so this returns None.
f3.attrs.get('X_col_names')

In [127]:
# Confound dataset names as stored by create_h5.
f3.attrs.get('confs')

array([['Geschlecht', 'age']], dtype=object)

In [128]:
# Label dataset names as stored by create_h5.
f3.attrs.get('labels')

array(['Response'], dtype=object)

In [129]:
# Read the X dataset back into a DataFrame to inspect the stored features.
pd.DataFrame(f3['X'])

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,30618,30619,30620,30621,30622,30623,30624,30625,30626,30627
0,0.455885,0.416174,0.108076,0.356433,0.279444,0.305667,0.195758,0.226338,0.249908,0.340056,...,0.719358,0.542248,0.495881,0.520116,0.814865,0.679056,0.762133,0.818526,0.810374,0.733528
1,0.884573,0.775685,0.731658,0.515574,0.475172,0.606112,0.736526,0.563819,0.558976,0.863735,...,0.699611,0.642350,0.459945,0.413568,0.872670,0.403599,0.410607,0.649702,0.645313,0.829218
2,0.579738,0.796199,0.389679,0.724294,0.580416,0.721528,0.365477,0.485122,0.364495,0.900395,...,0.699869,0.564032,0.784806,0.638208,0.931941,0.825529,0.795836,0.812432,0.746147,0.782977
3,0.782747,0.542955,0.285548,0.106172,0.309814,0.634742,0.745536,0.288720,0.322136,0.089465,...,0.669404,0.701875,0.542652,0.539693,0.774744,0.433234,0.705622,0.766249,0.709413,0.521571
4,0.807958,0.125688,0.284293,0.369072,0.223351,0.596993,0.582179,0.485796,0.453636,0.831173,...,0.752685,0.684370,0.661540,0.670104,0.795169,0.578120,0.804931,0.831305,0.827954,0.693077
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
75,0.863422,0.225017,0.107149,0.019014,0.172295,0.755308,0.767687,0.828648,0.753948,-0.024335,...,0.811136,0.791975,0.793850,0.730304,0.844700,0.611390,0.676364,0.817382,0.831792,0.831044
76,0.922696,0.727202,0.764775,0.487774,0.579489,0.872651,0.861962,0.810562,0.818232,0.711275,...,0.870459,0.810386,0.857769,0.733266,0.920049,0.915807,0.840287,0.949767,0.932956,0.888280
77,0.627340,0.361998,0.179150,-0.165640,-0.088773,0.615432,0.585188,0.593212,0.428455,0.523412,...,0.609018,0.355680,0.181513,0.035656,0.633407,0.189717,0.258845,0.723052,0.758558,0.711074
78,0.221087,0.097547,0.128686,0.636401,0.422638,0.086130,0.108548,0.195151,0.112038,0.701835,...,0.764829,0.824695,0.813789,0.691009,0.901955,0.795324,0.783262,0.875481,0.826422,0.778487


In [130]:
# Read the 'age' dataset back (written from the 'Alter' column).
pd.DataFrame(f3['age'])

Unnamed: 0,0
0,55.0
1,48.0
2,24.0
3,33.0
4,35.0
...,...
75,23.0
76,31.0
77,20.0
78,36.0


In [131]:
# Close the file handle opened for inspection.
f3.close()