In [145]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import pyreadstat
import seaborn as sns
from scipy import stats

cmap = plt.cm.rainbow
import h5py

In [146]:
def format_df(file_path, cats):
    
    df_original = pd.read_csv(file_path, sep=",")  
    df_vars = df_original[cats].copy() #only take selected variables
    scale_info = []
    
    for cat_i in cats:
        
        #print(df_vars.dtypes[cat_i])
        if df_vars.dtypes[cat_i] == 'object':
            df_vars[cat_i] = df_vars[cat_i].astype('category')
            scale_info.append('categorical')
    
        if df_vars.dtypes[cat_i] == 'category': #for categorical variables
        
            cat_vals = list(df_vars[cat_i].cat.categories)
            cat_n = len(df_vars[cat_i].cat.categories)
            cat_idx = list(np.arange(cat_n))
            df_vars[cat_i].replace(cat_vals, cat_idx, inplace=True) #replace with integers
            scale_info.append('numerical')
             
    idx = list(df_vars.index.values) #the index values per subject
        
    return df_original, df_vars.astype('float64'), scale_info, idx #return new dataframe (in float format), subject indices

In [211]:
def create_h5(filename, df_vars, scale_info, confs_scale_info, idx, confs=False): #if confs is not given, it defaults to False
    
    df = df_vars.copy()
    
    if confs:
        if 'Alter' in confs: #if 'age' in confs:
            confs_de = list(map(lambda x: x.replace('Alter', 'age'), confs)) #confs_de = list(map(lambda x: x.replace('age', 'Alter'), confs))
            X_drop = confs.copy() #X_drop = confs_de.copy()
            X_drop.append('Response')          
            df_noconfs = df_vars.drop(columns=X_drop)
            #df_noconfs = df_vars.drop(columns=confs_de)
        else:
            X_drop = confs.copy()
            X_drop.append('Response')         
            df_noconfs = df_vars.drop(columns=X_drop) #make dataframe that doesn't include confs
            #df_noconfs = df_vars.drop(columns=confs)
    else:
        confs_de=False
            
    f = h5py.File(filename, "w")    
    
    if confs:
        f.create_dataset('X', data=df_noconfs) #dataset X is df without confs
        f.attrs['X_col_names']= list(df_noconfs.columns) #X_col_names are respective column names
        
        f.attrs['confs']= [confs_de] #confs are the conf names
        for conf in confs_de: #for conf in confs:
            if conf == 'age': #!
                #df.rename(columns={"Alter": "age"}) #!
                f.create_dataset(conf, data=df.loc[idx, "Alter"])   
            else:
                f.create_dataset(conf, data=df.loc[idx, conf])
                
        f.create_dataset('confs_scale_info', data=confs_scale_info)
        
    else:
        df_noX = df.drop(columns = ['Response'])
        f.create_dataset('X', data=df_noX)
        #f.create_dataset('X', data=df)
        f.attrs['X_col_names']= list(df.columns)
        
    f.create_dataset('scale_info', data=scale_info)
        
        
    
    f.create_dataset('Response', data=df_vars.loc[idx, 'Response'])
    f.attrs['labels']= ['Response']
    f.create_dataset('i', data=idx)

            
    print(f.keys(), f.attrs.keys(), "confs:", confs_de)
        
    f.close()

### HDF5 file with age, sex

In [148]:
file_path = '/home/marijatochadse/1_data/EPOC/EPOC_T1_pat.csv'
cats = ['Alter', 'Geschlecht', 'Response']

In [150]:
# format df into input format
df_original, df_vars, scale_info, idx = format_df(file_path, cats)
df_vars

Unnamed: 0,Alter,Geschlecht,Response
0,55.0,0.0,1.0
1,48.0,0.0,1.0
2,24.0,1.0,0.0
3,33.0,0.0,1.0
4,35.0,1.0,1.0
5,45.0,0.0,1.0
6,29.0,0.0,1.0
7,36.0,1.0,0.0
8,41.0,0.0,1.0
9,18.0,1.0,0.0


In [6]:
df_vars.dtypes

Alter         float64
Geschlecht    float64
Response      float64
dtype: object

In [7]:
df_vars['Geschlecht'].value_counts(dropna = False)

1.0    48
0.0    42
Name: Geschlecht, dtype: int64

In [8]:
# create HDF5 file
create_h5('EPOC_T1_age_sex.h5', df_vars, idx)

<KeysViewHDF5 ['Response', 'X', 'i']> <KeysViewHDF5 ['X_col_names', 'labels']> confs: False


In [9]:
# check created file
f1 = h5py.File('EPOC_T1_age_sex.h5','r+')

In [10]:
f1.attrs.keys()

<KeysViewHDF5 ['X_col_names', 'labels']>

In [11]:
f1.attrs.get('X_col_names')

array(['Alter', 'Geschlecht', 'Response'], dtype=object)

In [12]:
f1.attrs.get('labels')

array(['Response'], dtype=object)

In [13]:
pd.DataFrame(f1['X'])

Unnamed: 0,0,1
0,55.0,0.0
1,48.0,0.0
2,24.0,1.0
3,33.0,0.0
4,35.0,1.0
...,...,...
85,31.0,0.0
86,38.0,0.0
87,20.0,1.0
88,36.0,0.0


In [14]:
f1.close()

### HDF5 file with age, sex, and MRI

In [151]:
df_original.head()

Unnamed: 0,Code,Code2,Familiencode,Relation,Status,StatusOCD_binary,Center,Gruppe_Neu,Sample_verlaufBerlin,Finish_vsQuit,...,Cz_400to1000_Av_pos_sweet_GC,Cz_400to1000_Av_pos_affiliation_GC,Cz_400to1000_Av_neg_GC,Cz_400to1000_Av_neg_human_GC,Cz_400to1000_Av_neg_nature_GC,Cz_400to1000_Av_neg_weapons_GC,Cz_400to1000_Av_neutr_GC,Cz_400to1000_Av_neutr_living_GC,Cz_400to1000_Av_neutr_items_GC,Cz_400to1000_Av_neutr_environs_GC
0,epoc_p_1064,EPOC_P_1064,3128.0,Index ohne Verwandte,Index,1.0,Berlin,OCD_Med,1.0,reguläres Therapieende,...,2.407063,3.327511,5.650495,8.410168,2.938624,5.536354,1.214722,2.71101,2.457612,-1.47709
1,epoc_p_1118,EPOC_P_1118,3208.0,Index ohne Verwandte,Index,1.0,Berlin,OCD_Med,1.0,reguläres Therapieende,...,1.433177,0.518778,5.95172,0.995575,2.37625,14.205409,-4.424898,-3.708006,-4.273278,-5.553788
2,epoc_p_1050,EPOC_P_1050,3094.0,Index ohne Verwandte,Index,1.0,Berlin,OCD_Med,1.0,Therapieabbruch,...,2.677901,-2.315895,0.443861,-3.399431,1.61267,3.196841,-2.968737,-4.438069,-0.721836,-3.652792
3,epoc_p_1069,EPOC_P_1069,3133.0,Index ohne Verwandte,Index,1.0,Berlin,OCD_Med,1.0,reguläres Therapieende,...,-0.22183,1.038309,2.689956,0.585481,6.85181,0.681768,-0.506852,1.956024,-1.882689,-1.629634
4,epoc_p_1043,EPOC_P_1043,3077.0,Index ohne Verwandte,Index,1.0,Berlin,OCD_Med,1.0,reguläres Therapieende,...,23.685087,12.619618,12.989751,7.685421,15.390666,16.024837,1.628977,10.361793,-3.753685,-1.953933


In [152]:
df_original.shape

(90, 342)

In [153]:
aseg_dem_pat = pd.read_csv('/home/marijatochadse/3_output/EPOC/freesurfer_dem_pat/FS_all_dem_pat.csv', sep=",")

In [154]:
aseg_dem_pat.head()

Unnamed: 0,subject,Left-Lateral-Ventricle,Left-Inf-Lat-Vent,Left-Cerebellum-White-Matter,Left-Cerebellum-Cortex,Left-Thalamus-Proper,Left-Caudate,Left-Putamen,Left-Pallidum,3rd-Ventricle,...,rh_rLinG_R_volume,rh_vmPOS_R_volume,rh_mOccG_R_volume,rh_MT+_R_volume,rh_OPC_R_volume,rh_iOccG_R_volume,rh_msOccG_R_volume,rh_lsOccG_R_volume,BrainSegVolNotVent.4,eTIV.3
0,epoc_p_1004,5686.7,288.9,12690.1,58991.6,8284.0,3207.3,4425.4,1786.4,972.2,...,3225.0,3149.0,2817.0,2629.0,2824.0,2962.0,2190.0,2715.0,1136482.0,1564502.0
1,epoc_p_1005,4581.4,157.1,14484.8,53596.0,7802.4,3101.7,4596.6,1763.1,718.5,...,1742.0,2070.0,2445.0,2969.0,2469.0,3052.0,1846.0,2053.0,1118617.0,1334034.0
2,epoc_p_1006,7058.9,231.8,14910.9,55451.5,7464.9,3459.2,4485.9,1884.8,1085.3,...,3199.0,3305.0,2081.0,2464.0,2813.0,2680.0,2110.0,2593.0,1003944.0,1152532.0
3,epoc_p_1008,9492.9,160.9,31255.6,43988.5,7728.0,3375.6,5109.4,2127.1,1034.3,...,2353.0,2572.0,2163.0,2784.0,2543.0,2820.0,2629.0,3359.0,1130877.0,1459032.0
4,epoc_p_1012,10666.4,787.6,18939.8,73133.4,9412.4,3716.4,4606.5,2446.6,1805.0,...,3058.0,2807.0,3360.0,3732.0,2984.0,3913.0,2136.0,2489.0,1359211.0,1857095.0


In [155]:
aseg_dem_pat.shape

(90, 497)

In [156]:
#df_vars_new = pd.concat([df_vars, aseg_dem_pat.iloc[:, 1:]], axis=1, ignore_index=True, sort=False)
df_vars_new = pd.concat([df_vars, aseg_dem_pat.iloc[:, 1:]], axis=1)

In [157]:
df_vars_new.shape

(90, 499)

In [158]:
df_vars_new.head()

Unnamed: 0,Alter,Geschlecht,Response,Left-Lateral-Ventricle,Left-Inf-Lat-Vent,Left-Cerebellum-White-Matter,Left-Cerebellum-Cortex,Left-Thalamus-Proper,Left-Caudate,Left-Putamen,...,rh_rLinG_R_volume,rh_vmPOS_R_volume,rh_mOccG_R_volume,rh_MT+_R_volume,rh_OPC_R_volume,rh_iOccG_R_volume,rh_msOccG_R_volume,rh_lsOccG_R_volume,BrainSegVolNotVent.4,eTIV.3
0,55.0,0.0,1.0,5686.7,288.9,12690.1,58991.6,8284.0,3207.3,4425.4,...,3225.0,3149.0,2817.0,2629.0,2824.0,2962.0,2190.0,2715.0,1136482.0,1564502.0
1,48.0,0.0,1.0,4581.4,157.1,14484.8,53596.0,7802.4,3101.7,4596.6,...,1742.0,2070.0,2445.0,2969.0,2469.0,3052.0,1846.0,2053.0,1118617.0,1334034.0
2,24.0,1.0,0.0,7058.9,231.8,14910.9,55451.5,7464.9,3459.2,4485.9,...,3199.0,3305.0,2081.0,2464.0,2813.0,2680.0,2110.0,2593.0,1003944.0,1152532.0
3,33.0,0.0,1.0,9492.9,160.9,31255.6,43988.5,7728.0,3375.6,5109.4,...,2353.0,2572.0,2163.0,2784.0,2543.0,2820.0,2629.0,3359.0,1130877.0,1459032.0
4,35.0,1.0,1.0,10666.4,787.6,18939.8,73133.4,9412.4,3716.4,4606.5,...,3058.0,2807.0,3360.0,3732.0,2984.0,3913.0,2136.0,2489.0,1359211.0,1857095.0


In [159]:
#df_vars_new.rename(columns={"Alter": "age"})

In [160]:
pd.set_option('display.max_rows', 100)

In [161]:
df_vars_new.dtypes

Alter                     float64
Geschlecht                float64
Response                  float64
Left-Lateral-Ventricle    float64
Left-Inf-Lat-Vent         float64
                           ...   
rh_iOccG_R_volume         float64
rh_msOccG_R_volume        float64
rh_lsOccG_R_volume        float64
BrainSegVolNotVent.4      float64
eTIV.3                    float64
Length: 499, dtype: object

In [25]:
# create HDF5 file
create_h5('EPOC_T1_MRI_age_sex.h5', df_vars_new, idx)

<KeysViewHDF5 ['Response', 'X', 'i']> <KeysViewHDF5 ['X_col_names', 'labels']> confs: False


In [26]:
# check created file
f2 = h5py.File('EPOC_T1_MRI_age_sex.h5','r+')

In [27]:
f2.attrs.keys()

<KeysViewHDF5 ['X_col_names', 'labels']>

In [28]:
pd.DataFrame(f2['X'])

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,488,489,490,491,492,493,494,495,496,497
0,55.0,0.0,5686.7,288.9,12690.1,58991.6,8284.0,3207.3,4425.4,1786.4,...,3225.0,3149.0,2817.0,2629.0,2824.0,2962.0,2190.0,2715.0,1136482.0,1564502.0
1,48.0,0.0,4581.4,157.1,14484.8,53596.0,7802.4,3101.7,4596.6,1763.1,...,1742.0,2070.0,2445.0,2969.0,2469.0,3052.0,1846.0,2053.0,1118617.0,1334034.0
2,24.0,1.0,7058.9,231.8,14910.9,55451.5,7464.9,3459.2,4485.9,1884.8,...,3199.0,3305.0,2081.0,2464.0,2813.0,2680.0,2110.0,2593.0,1003944.0,1152532.0
3,33.0,0.0,9492.9,160.9,31255.6,43988.5,7728.0,3375.6,5109.4,2127.1,...,2353.0,2572.0,2163.0,2784.0,2543.0,2820.0,2629.0,3359.0,1130877.0,1459032.0
4,35.0,1.0,10666.4,787.6,18939.8,73133.4,9412.4,3716.4,4606.5,2446.6,...,3058.0,2807.0,3360.0,3732.0,2984.0,3913.0,2136.0,2489.0,1359211.0,1857095.0
5,45.0,0.0,4881.4,164.7,12021.0,55916.3,5850.7,3446.9,4401.3,1723.2,...,3094.0,2849.0,2126.0,2670.0,1878.0,2321.0,1859.0,1987.0,971686.0,1262840.0
6,29.0,0.0,4058.9,319.7,12959.0,56329.5,7890.3,3762.8,4739.5,1991.3,...,2903.0,2735.0,2402.0,3681.0,2457.0,2749.0,2730.0,3701.0,1107003.0,1122072.0
7,36.0,1.0,10188.1,240.3,13629.7,53313.4,8306.9,4020.2,5158.7,2269.1,...,3463.0,2545.0,2692.0,3670.0,3098.0,3276.0,2192.0,1818.0,1128710.0,1272059.0
8,41.0,0.0,7110.0,574.9,14963.4,55852.4,8119.3,3621.0,4751.6,2087.9,...,2883.0,3086.0,4244.0,3290.0,3206.0,3255.0,3105.0,3207.0,1236897.0,1468960.0
9,18.0,1.0,7354.4,268.2,13392.3,59795.8,7151.6,3694.4,4382.3,1982.9,...,2604.0,3258.0,2399.0,2303.0,3000.0,2273.0,2148.0,2851.0,1057807.0,1225981.0


In [29]:
f2.attrs.get('X_col_names')

array(['Alter', 'Geschlecht', 'Response', 'Left-Lateral-Ventricle',
       'Left-Inf-Lat-Vent', 'Left-Cerebellum-White-Matter',
       'Left-Cerebellum-Cortex', 'Left-Thalamus-Proper', 'Left-Caudate',
       'Left-Putamen', 'Left-Pallidum', '3rd-Ventricle', '4th-Ventricle',
       'Brain-Stem', 'Left-Hippocampus', 'Left-Amygdala', 'CSF',
       'Left-Accumbens-area', 'Left-VentralDC', 'Left-vessel',
       'Left-choroid-plexus', 'Right-Lateral-Ventricle',
       'Right-Inf-Lat-Vent', 'Right-Cerebellum-White-Matter',
       'Right-Cerebellum-Cortex', 'Right-Thalamus-Proper',
       'Right-Caudate', 'Right-Putamen', 'Right-Pallidum',
       'Right-Hippocampus', 'Right-Amygdala', 'Right-Accumbens-area',
       'Right-VentralDC', 'Right-vessel', 'Right-choroid-plexus',
       '5th-Ventricle', 'WM-hypointensities', 'Left-WM-hypointensities',
       'Right-WM-hypointensities', 'non-WM-hypointensities',
       'Left-non-WM-hypointensities', 'Right-non-WM-hypointensities',
       'Optic-Chiasm

In [30]:
f2.close()

### Create HDF5 file with MRI, and age and sex as confounds

In [171]:
cats = df_original.columns.values.tolist()
df_original, df_vars, scale_info, idx = format_df(file_path, cats)

In [172]:
scale_info

['categorical',
 'numerical',
 'categorical',
 'numerical',
 'categorical',
 'numerical',
 'categorical',
 'numerical',
 'categorical',
 'numerical',
 'categorical',
 'numerical',
 'categorical',
 'numerical',
 'categorical',
 'numerical',
 'categorical',
 'numerical',
 'categorical',
 'numerical',
 'categorical',
 'numerical',
 'categorical',
 'numerical',
 'categorical',
 'numerical',
 'categorical',
 'numerical',
 'categorical',
 'numerical',
 'categorical',
 'numerical',
 'categorical',
 'numerical',
 'categorical',
 'numerical',
 'categorical',
 'numerical',
 'categorical',
 'numerical',
 'categorical',
 'numerical',
 'categorical',
 'numerical',
 'categorical',
 'numerical',
 'categorical',
 'numerical',
 'categorical',
 'numerical',
 'categorical',
 'numerical',
 'categorical',
 'numerical',
 'categorical',
 'numerical',
 'categorical',
 'numerical',
 'categorical',
 'numerical',
 'categorical',
 'numerical',
 'categorical',
 'numerical',
 'categorical',
 'numerical',
 'categori

In [199]:
confounds = ['Geschlecht', 'Alter']

In [200]:
confs_scale_info = []

for conf_i in confounds:
    if df_original.dtypes[conf_i] == 'object':
        confs_scale_info.append('categorical')
    
    if df_original.dtypes[conf_i] == 'float64':
        confs_scale_info.append('numerical')

In [201]:
confs_scale_info

['categorical', 'numerical']

In [32]:
# THIS WILL NOT WORK ANYMORE
create_h5('EPOC_T1_MRI_conf_age_sex.h5', df_vars_new, idx, confounds)

<KeysViewHDF5 ['Geschlecht', 'Response', 'X', 'age', 'i']> <KeysViewHDF5 ['X_col_names', 'confs', 'labels']> confs: ['Geschlecht', 'age']


In [4]:
# check created file
f3 = h5py.File('EPOC_T1_MRI_conf_age_sex.h5','r+')

In [5]:
f3.keys()

<KeysViewHDF5 ['Geschlecht', 'Response', 'X', 'age', 'i']>

In [6]:
f3.attrs.keys()

<KeysViewHDF5 ['X_col_names', 'confs', 'labels']>

In [20]:
f3.attrs.get('labels')

array(['Response'], dtype=object)

In [23]:
#test =f3.attrs["labels"].tolist()
#labels = pd.DataFrame({label :np.array(f3[label]) for label in test})

In [36]:
f3.attrs.get('X_col_names')

array(['Left-Lateral-Ventricle', 'Left-Inf-Lat-Vent',
       'Left-Cerebellum-White-Matter', 'Left-Cerebellum-Cortex',
       'Left-Thalamus-Proper', 'Left-Caudate', 'Left-Putamen',
       'Left-Pallidum', '3rd-Ventricle', '4th-Ventricle', 'Brain-Stem',
       'Left-Hippocampus', 'Left-Amygdala', 'CSF', 'Left-Accumbens-area',
       'Left-VentralDC', 'Left-vessel', 'Left-choroid-plexus',
       'Right-Lateral-Ventricle', 'Right-Inf-Lat-Vent',
       'Right-Cerebellum-White-Matter', 'Right-Cerebellum-Cortex',
       'Right-Thalamus-Proper', 'Right-Caudate', 'Right-Putamen',
       'Right-Pallidum', 'Right-Hippocampus', 'Right-Amygdala',
       'Right-Accumbens-area', 'Right-VentralDC', 'Right-vessel',
       'Right-choroid-plexus', '5th-Ventricle', 'WM-hypointensities',
       'Left-WM-hypointensities', 'Right-WM-hypointensities',
       'non-WM-hypointensities', 'Left-non-WM-hypointensities',
       'Right-non-WM-hypointensities', 'Optic-Chiasm', 'CC_Posterior',
       'CC_Mid_Posterio

In [37]:
f3.attrs.get('confs')

array([['Geschlecht', 'age']], dtype=object)

In [None]:
pd.DataFrame(f3['X'])

In [41]:
f3.close()

### Delete some values for missing values file

In [173]:
df_vars_new_miss = df_vars_new

In [174]:
df_vars_new_miss.iloc[3,3] = np.nan
df_vars_new_miss.iloc[12,3] = np.nan
df_vars_new_miss.iloc[60,3] = np.nan


df_vars_new_miss.iloc[2,4] = np.nan
df_vars_new_miss.iloc[11,4] = np.nan
df_vars_new_miss.iloc[30,4] = np.nan

In [175]:
df_vars_new_miss

Unnamed: 0,Alter,Geschlecht,Response,Left-Lateral-Ventricle,Left-Inf-Lat-Vent,Left-Cerebellum-White-Matter,Left-Cerebellum-Cortex,Left-Thalamus-Proper,Left-Caudate,Left-Putamen,...,rh_rLinG_R_volume,rh_vmPOS_R_volume,rh_mOccG_R_volume,rh_MT+_R_volume,rh_OPC_R_volume,rh_iOccG_R_volume,rh_msOccG_R_volume,rh_lsOccG_R_volume,BrainSegVolNotVent.4,eTIV.3
0,55.0,0.0,1.0,5686.7,288.9,12690.1,58991.6,8284.0,3207.3,4425.4,...,3225.0,3149.0,2817.0,2629.0,2824.0,2962.0,2190.0,2715.0,1136482.0,1564502.0
1,48.0,0.0,1.0,4581.4,157.1,14484.8,53596.0,7802.4,3101.7,4596.6,...,1742.0,2070.0,2445.0,2969.0,2469.0,3052.0,1846.0,2053.0,1118617.0,1334034.0
2,24.0,1.0,0.0,7058.9,,14910.9,55451.5,7464.9,3459.2,4485.9,...,3199.0,3305.0,2081.0,2464.0,2813.0,2680.0,2110.0,2593.0,1003944.0,1152532.0
3,33.0,0.0,1.0,,160.9,31255.6,43988.5,7728.0,3375.6,5109.4,...,2353.0,2572.0,2163.0,2784.0,2543.0,2820.0,2629.0,3359.0,1130877.0,1459032.0
4,35.0,1.0,1.0,10666.4,787.6,18939.8,73133.4,9412.4,3716.4,4606.5,...,3058.0,2807.0,3360.0,3732.0,2984.0,3913.0,2136.0,2489.0,1359211.0,1857095.0
5,45.0,0.0,1.0,4881.4,164.7,12021.0,55916.3,5850.7,3446.9,4401.3,...,3094.0,2849.0,2126.0,2670.0,1878.0,2321.0,1859.0,1987.0,971686.0,1262840.0
6,29.0,0.0,1.0,4058.9,319.7,12959.0,56329.5,7890.3,3762.8,4739.5,...,2903.0,2735.0,2402.0,3681.0,2457.0,2749.0,2730.0,3701.0,1107003.0,1122072.0
7,36.0,1.0,0.0,10188.1,240.3,13629.7,53313.4,8306.9,4020.2,5158.7,...,3463.0,2545.0,2692.0,3670.0,3098.0,3276.0,2192.0,1818.0,1128710.0,1272059.0
8,41.0,0.0,1.0,7110.0,574.9,14963.4,55852.4,8119.3,3621.0,4751.6,...,2883.0,3086.0,4244.0,3290.0,3206.0,3255.0,3105.0,3207.0,1236897.0,1468960.0
9,18.0,1.0,0.0,7354.4,268.2,13392.3,59795.8,7151.6,3694.4,4382.3,...,2604.0,3258.0,2399.0,2303.0,3000.0,2273.0,2148.0,2851.0,1057807.0,1225981.0


In [177]:
# THIS WILL NOT WORK ANYMORE
create_h5('EPOC_T1_MRI_conf_age_sex_missval.h5', df_vars_new_miss, scale_info, idx)

<KeysViewHDF5 ['Response', 'X', 'i', 'scale_info']> <KeysViewHDF5 ['X_col_names', 'labels']> confs: False


In [178]:
df = h5py.File('EPOC_T1_MRI_conf_age_sex_missval.h5', "r")
X = np.array(df["X"])
y = np.array(df["Response"])
inf = list(df["scale_info"])

In [182]:
X[:,3]

array([288.9, 157.1,   nan, 160.9, 787.6, 164.7, 319.7, 240.3, 574.9,
       268.2, 471.6,   nan, 108.1, 318.5, 287.2, 216.7, 428.9, 382.1,
       486.4, 467.4, 185.4, 177.4, 378.4, 405.3, 380.5, 750.4, 206.6,
       451.3, 221.8, 440.6,   nan, 179.6, 210.9, 286. , 216.5, 191.4,
        95.5, 292.3, 406.8, 288.7, 289.2, 180.9, 191.6, 555.4, 418.9,
       260.7, 405.7, 304. , 355.8, 222.1, 139.9, 407.2, 354. , 280.3,
       373.8, 201.8, 623.9, 388.7, 465.8, 248.3, 277.8, 397. , 196.4,
       354.7, 304.7, 121.7, 264.8, 550.3, 364.7, 272.7, 318.6, 492.6,
       182.8, 686.6, 293.5, 332.9, 315.6, 385.6, 704.9, 151.7, 382.7,
       202.5, 402. ,  86.1, 151.2, 629.4, 603.1, 306. , 294.2, 305.4])

In [None]:
df.close()

### Make h5 file with missing data

In [100]:
file_path = '/home/marijatochadse/1_data/EPOC/EPOC_T1_pat.csv'
df_original = pd.read_csv(file_path, sep=",")  
df_vars = df_original
cats = df_original.columns.values.tolist()

In [101]:
scale_info = []

for cat_i in cats:

    if df_vars.dtypes[cat_i] == 'object':
        df_vars[cat_i] = df_vars[cat_i].astype('category')
        scale_info.append('categorical')

    if df_vars.dtypes[cat_i] == 'category': #for categorical variables

        cat_vals = list(df_vars[cat_i].cat.categories)
        
        
        cat_n = len(df_vars[cat_i].cat.categories)
        cat_idx = list(np.arange(cat_n))
        df_vars[cat_i].replace(cat_vals, cat_idx, inplace=True) #replace with integers
        scale_info.append('numerical')

idx = list(df_vars.index.values) #the index values per subject

In [None]:
df_vars_new_miss.iloc[7,0] = np.nan
df_vars_new_miss.iloc[14,0] = np.nan
df_vars_new_miss.iloc[70,0] = np.nan


df_vars_new_miss.iloc[6,1] = np.nan
df_vars_new_miss.iloc[18,1] = np.nan
df_vars_new_miss.iloc[5,1] = np.nan

In [83]:
create_h5('EPOC_demo_missing_test.h5', df_vars, idx)

<KeysViewHDF5 ['Response', 'X', 'i']> <KeysViewHDF5 ['X_col_names', 'labels']> confs: False


In [102]:
create_h5('EPOC_demo_missing_scale_test.h5', df_vars, scale_info, idx)

<KeysViewHDF5 ['Response', 'X', 'i', 'scale_info']> <KeysViewHDF5 ['X_col_names', 'labels']> confs: False


In [141]:
df = h5py.File("EPOC_demo_missing_scale_test.h5", "r")
X = np.array(df["X"])
y = np.array(df["Response"])
inf = list(df["scale_info"])

In [109]:
X

array([[ 4.800000e+01,  4.800000e+01,  3.128000e+03, ...,  2.711010e+00,
         2.457612e+00, -1.477090e+00],
       [ 7.500000e+01,  7.500000e+01,  3.208000e+03, ..., -3.708006e+00,
        -4.273278e+00, -5.553788e+00],
       [ 3.700000e+01,  3.700000e+01,  3.094000e+03, ..., -4.438069e+00,
        -7.218360e-01, -3.652792e+00],
       ...,
       [ 7.000000e+01,  7.000000e+01,  3.190000e+03, ...,           nan,
                  nan,           nan],
       [ 8.000000e+00,  8.000000e+00,  3.036000e+03, ..., -5.279170e-01,
        -2.170540e+00,  3.001500e+00],
       [ 2.000000e+00,  2.000000e+00,  3.018000e+03, ...,  8.872280e-01,
         4.176166e+00, -1.803790e+00]])

In [131]:
inf 

[b'categorical',
 b'numerical',
 b'categorical',
 b'numerical',
 b'categorical',
 b'numerical',
 b'categorical',
 b'numerical',
 b'categorical',
 b'numerical',
 b'categorical',
 b'numerical',
 b'categorical',
 b'numerical',
 b'categorical',
 b'numerical',
 b'categorical',
 b'numerical',
 b'categorical',
 b'numerical',
 b'categorical',
 b'numerical',
 b'categorical',
 b'numerical',
 b'categorical',
 b'numerical',
 b'categorical',
 b'numerical',
 b'categorical',
 b'numerical',
 b'categorical',
 b'numerical',
 b'categorical',
 b'numerical',
 b'categorical',
 b'numerical',
 b'categorical',
 b'numerical',
 b'categorical',
 b'numerical',
 b'categorical',
 b'numerical',
 b'categorical',
 b'numerical',
 b'categorical',
 b'numerical',
 b'categorical',
 b'numerical',
 b'categorical',
 b'numerical',
 b'categorical',
 b'numerical',
 b'categorical',
 b'numerical',
 b'categorical',
 b'numerical',
 b'categorical',
 b'numerical',
 b'categorical',
 b'numerical',
 b'categorical',
 b'numerical',
 b'categ

### Make h5 file with confound missing data

In [215]:
create_h5('EPOC_T1_MRI_conf_age_sex_missing.h5', df_vars_new_miss, scale_info, confs_scale_info, idx, confounds)

<KeysViewHDF5 ['Geschlecht', 'Response', 'X', 'age', 'confs_scale_info', 'i', 'scale_info']> <KeysViewHDF5 ['X_col_names', 'confs', 'labels']> confs: ['Geschlecht', 'age']


In [221]:
df = h5py.File("EPOC_T1_MRI_conf_age_sex_missing.h5", "r")
X = np.array(df["X"])
y = np.array(df["Response"])
inf = list(df["scale_info"])
cinf = list(df["confs_scale_info"])

In [220]:
df.close()