# Data matrix creation

## Read individual files

In [1]:
import os
import pandas as pd


vitals_column_names = ['patientunitstayid', 'observationday', 'systemic_mean', 'systemic_diastolic', 'systemic_systolic', 'respiration', 'heartrate', 'sao2']
labs_column_names = ['person_id', 'visit_occurrence_id', 'measurement_date', 'Sodium level', 'Blood urea nitrogen', 'Creatinine level', 'Potassium level', 'Chloride', 'Hematocrit', 'Haemoglobin estimation', 'Platelet count', 'Red blood cell count', 'Calcium level', 'MCV - Mean corpuscular volume', 'MCHC - Mean corpuscular haemoglobin concentration', 'MCH - Mean corpuscular haemoglobin', 'White blood cell count', 'Red blood cell distribution width', 'Glucose level', 'Bicarbonate level', 'Anion gap']
vitals_avg =  pd.read_csv(os.environ['EICU_EHR_PIPELINE_BASE'] + '''/data/cleaned/vitals_avg_episode_date.csv''')[vitals_column_names]
vitals_min =  pd.read_csv(os.environ['EICU_EHR_PIPELINE_BASE'] + '''/data/cleaned/vitals_min_episode_date.csv''')[vitals_column_names]
vitals_max =  pd.read_csv(os.environ['EICU_EHR_PIPELINE_BASE'] + '''/data/cleaned/vitals_max_episode_date.csv''')[vitals_column_names]
vitals_first =  pd.read_csv(os.environ['EICU_EHR_PIPELINE_BASE'] + '''/data/cleaned/vitals_first_episode_date.csv''')[vitals_column_names]
vitals_last =  pd.read_csv(os.environ['EICU_EHR_PIPELINE_BASE'] + '''/data/cleaned/vitals_last_episode_date.csv''')[vitals_column_names]
labs_avg =  pd.read_csv(os.environ['EICU_EHR_PIPELINE_BASE'] + '''/data/cleaned/labs_avg_episode_date.csv''')[labs_column_names]
labs_min =  pd.read_csv(os.environ['EICU_EHR_PIPELINE_BASE'] + '''/data/cleaned/labs_min_episode_date.csv''')[labs_column_names]
labs_max =  pd.read_csv(os.environ['EICU_EHR_PIPELINE_BASE'] + '''/data/cleaned/labs_max_episode_date.csv''')[labs_column_names]
labs_first =  pd.read_csv(os.environ['EICU_EHR_PIPELINE_BASE'] + '''/data/cleaned/labs_first_episode_date.csv''')[labs_column_names]
labs_last =  pd.read_csv(os.environ['EICU_EHR_PIPELINE_BASE'] + '''/data/cleaned/labs_last_episode_date.csv''')[labs_column_names]
admissions = pd.read_csv(os.environ['EICU_EHR_PIPELINE_BASE'] + '''/data/extracts/admissions.csv''')

In [2]:
vitals_min.shape, vitals_max.shape, vitals_avg.shape, vitals_first.shape, vitals_last.shape

((450728, 8), (450728, 8), (450728, 8), (450728, 8), (450744, 8))

In [3]:
labs_min.shape, labs_max.shape, labs_avg.shape, labs_first.shape, labs_last.shape

((92933, 21), (92933, 21), (92933, 21), (92933, 21), (92933, 21))

In [4]:
admissions.shape

(12694, 4)

In [5]:
admissions["visit_start_date"] = pd.to_datetime(admissions["visit_start_date"])

In [6]:
def formatVitalsData(vitals_df):

    from datetime import timedelta

    vitalsDf = pd.merge(vitals_df, admissions, how='inner', left_on='patientunitstayid', right_on='visit_occurrence_id')
    vitalsDf['measurement_date'] = vitalsDf.apply(lambda x: x[10] + timedelta(days=x[1]), axis=1)
    vitalsDf = vitalsDf[['person_id', 'visit_occurrence_id', 'measurement_date', 'systemic_mean', 'systemic_diastolic', 'systemic_systolic', 'respiration', 'heartrate', 'sao2']]
    return vitalsDf

In [7]:
def formatLabsData(labs_df):

    labsDf = pd.merge(labs_df, admissions, how='inner', left_on=['person_id', 'visit_occurrence_id'], right_on=['person_id', 'visit_occurrence_id'])
    labsDf['measurement_date'] = pd.to_datetime(labsDf["measurement_date"])
    labsDf = labsDf[labs_column_names]
    return labsDf

In [8]:
vitals_avg = formatVitalsData(vitals_avg)
vitals_min = formatVitalsData(vitals_min)
vitals_max = formatVitalsData(vitals_max)
vitals_first = formatVitalsData(vitals_first)
vitals_last = formatVitalsData(vitals_last)

In [9]:
labs_avg = formatLabsData(labs_avg)
labs_min = formatLabsData(labs_min)
labs_max = formatLabsData(labs_max)
labs_first = formatLabsData(labs_first)
labs_last = formatLabsData(labs_last)

In [10]:
vitals_min.shape, vitals_max.shape, vitals_avg.shape, vitals_first.shape, vitals_last.shape

((49310, 9), (52873, 9), (49486, 9), (50610, 9), (50353, 9))

In [11]:
labs_min.shape, labs_max.shape, labs_avg.shape, labs_first.shape, labs_last.shape

((92933, 21), (92933, 21), (92933, 21), (92933, 21), (92933, 21))

In [12]:
admissions.shape

(12694, 4)

In [13]:
admissions

Unnamed: 0,person_id,visit_occurrence_id,visit_start_date,death
0,210009,224606,2014-07-13,0
1,210052,151900,2014-11-02,0
2,210079,151179,2014-05-10,1
3,210300,145917,2015-05-04,0
4,210300,210208,2015-06-04,0
...,...,...,...,...
12689,358850,3336321,2015-10-24,0
12690,358879,3338553,2015-02-28,0
12691,359296,3337594,2015-04-16,0
12692,359430,3351293,2015-04-24,1


In [14]:
vitals_avg

Unnamed: 0,person_id,visit_occurrence_id,measurement_date,systemic_mean,systemic_diastolic,systemic_systolic,respiration,heartrate,sao2
0,248364,141515,2014-04-04,65.344086,47.924731,105.265233,25.763066,89.777003,96.218638
1,248364,141515,2014-04-05,68.147368,51.291228,99.049123,23.763889,101.937500,98.752613
2,248364,141515,2014-04-06,78.180556,57.565972,111.343750,23.731707,109.117021,97.958333
3,248364,141515,2014-04-08,75.000000,53.103306,114.359504,30.335664,81.479021,99.752613
4,248364,141515,2014-04-09,94.001586,70.794606,142.012036,23.260417,83.906250,99.187500
...,...,...,...,...,...,...,...,...,...
49481,3521842,3352884,2014-01-29,89.946929,66.934130,137.592436,19.909722,100.312500,97.333333
49482,3521842,3352884,2014-01-30,73.849266,50.696327,127.106241,21.030612,82.459184,98.234694
49483,358073,3352922,2015-06-01,72.429688,54.640625,105.199219,14.142857,76.299652,99.982456
49484,358073,3352922,2015-06-02,78.000000,64.023256,97.302326,17.929821,77.197917,99.989583


In [15]:
labs_avg

Unnamed: 0,person_id,visit_occurrence_id,measurement_date,Sodium level,Blood urea nitrogen,Creatinine level,Potassium level,Chloride,Hematocrit,Haemoglobin estimation,...,Red blood cell count,Calcium level,MCV - Mean corpuscular volume,MCHC - Mean corpuscular haemoglobin concentration,MCH - Mean corpuscular haemoglobin,White blood cell count,Red blood cell distribution width,Glucose level,Bicarbonate level,Anion gap
0,248364,141515,2014-04-05,136.0,72.0,1.88,3.30,104.0,30.3,10.4,...,3.29,7.6,92.1,34.3,31.6,6.5,16.7,105.5,21.0,14.0
1,248364,141515,2014-04-06,141.0,60.0,1.44,4.40,110.0,30.6,10.3,...,3.34,7.9,91.6,33.7,30.8,12.9,17.3,105.0,22.0,13.0
2,248364,141515,2014-04-07,143.0,51.0,1.13,3.70,113.0,28.7,9.7,...,3.14,8.2,91.4,33.8,30.9,11.7,17.0,123.0,22.0,12.0
3,248364,141515,2014-04-08,145.0,38.0,0.96,3.60,113.0,27.2,9.0,...,2.86,8.0,95.1,33.1,31.5,9.9,17.2,116.0,22.0,14.0
4,248364,141515,2014-04-09,148.0,31.0,0.97,4.00,118.0,27.3,9.0,...,2.92,7.8,93.5,33.0,30.8,12.2,17.4,87.0,22.0,12.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
92928,358073,3352922,2015-06-07,149.0,13.0,1.36,3.55,110.0,34.0,11.4,...,3.88,8.9,88.0,33.0,29.0,4.8,16.7,123.0,31.0,8.0
92929,358073,3352922,2015-06-08,150.0,12.0,1.35,3.50,110.0,35.0,11.3,...,3.98,8.9,88.0,32.0,28.0,5.3,17.5,119.0,32.0,8.0
92930,358073,3352922,2015-06-09,149.0,11.0,1.29,3.45,111.0,37.0,12.0,...,4.16,8.8,89.0,33.0,29.0,4.4,17.8,172.0,31.0,7.0
92931,358073,3352922,2015-06-10,149.0,11.0,1.30,3.55,113.0,37.0,12.0,...,4.15,8.8,89.0,33.0,29.0,5.8,18.0,151.0,28.0,8.0


## Merge all the files

In [16]:
import pandas as pd


mergedDf = pd.concat([vitals_avg, vitals_min, vitals_max, vitals_first, vitals_last, labs_avg, labs_min, labs_max, labs_first, labs_last], ignore_index=True)
mergedDf = mergedDf.drop_duplicates(subset=['person_id', 'visit_occurrence_id', 'measurement_date'])
mergedDf = mergedDf[['person_id', 'visit_occurrence_id', 'measurement_date']]

mergedDf = pd.merge(mergedDf, admissions.add_suffix('_adm'), how='inner', left_on=['person_id', 'visit_occurrence_id'], right_on=['person_id_adm', 'visit_occurrence_id_adm'])
mergedDf = mergedDf.drop(['person_id_adm', 'visit_occurrence_id_adm'], axis = 1)
mergedDf = pd.merge(mergedDf, vitals_avg.add_suffix('_avg'), how='left', left_on=['person_id', 'visit_occurrence_id', 'measurement_date'], right_on=['person_id_avg', 'visit_occurrence_id_avg', 'measurement_date_avg'])
mergedDf = mergedDf.drop(['person_id_avg', 'visit_occurrence_id_avg', 'measurement_date_avg'], axis = 1)
mergedDf = pd.merge(mergedDf, vitals_min.add_suffix('_min'), how='left', left_on=['person_id', 'visit_occurrence_id', 'measurement_date'], right_on=['person_id_min', 'visit_occurrence_id_min', 'measurement_date_min'])
mergedDf = mergedDf.drop(['person_id_min', 'visit_occurrence_id_min', 'measurement_date_min'], axis = 1)
mergedDf = pd.merge(mergedDf, vitals_max.add_suffix('_max'), how='left', left_on=['person_id', 'visit_occurrence_id', 'measurement_date'], right_on=['person_id_max', 'visit_occurrence_id_max', 'measurement_date_max'])
mergedDf = mergedDf.drop(['person_id_max', 'visit_occurrence_id_max', 'measurement_date_max'], axis = 1)
mergedDf = pd.merge(mergedDf, vitals_first.add_suffix('_first'), how='left', left_on=['person_id', 'visit_occurrence_id', 'measurement_date'], right_on=['person_id_first', 'visit_occurrence_id_first', 'measurement_date_first'])
mergedDf = mergedDf.drop(['person_id_first', 'visit_occurrence_id_first', 'measurement_date_first'], axis = 1)
mergedDf = pd.merge(mergedDf, vitals_last.add_suffix('_last'), how='left', left_on=['person_id', 'visit_occurrence_id', 'measurement_date'], right_on=['person_id_last', 'visit_occurrence_id_last', 'measurement_date_last'])
mergedDf = mergedDf.drop(['person_id_last', 'visit_occurrence_id_last', 'measurement_date_last'], axis = 1)
mergedDf = pd.merge(mergedDf, labs_avg.add_suffix('_avg'), how='left', left_on=['person_id', 'visit_occurrence_id', 'measurement_date'], right_on=['person_id_avg', 'visit_occurrence_id_avg', 'measurement_date_avg'])
mergedDf = mergedDf.drop(['person_id_avg', 'visit_occurrence_id_avg', 'measurement_date_avg'], axis = 1)
mergedDf = pd.merge(mergedDf, labs_min.add_suffix('_min'), how='left', left_on=['person_id', 'visit_occurrence_id', 'measurement_date'], right_on=['person_id_min', 'visit_occurrence_id_min', 'measurement_date_min'])
mergedDf = mergedDf.drop(['person_id_min', 'visit_occurrence_id_min', 'measurement_date_min'], axis = 1)
mergedDf = pd.merge(mergedDf, labs_max.add_suffix('_max'), how='left', left_on=['person_id', 'visit_occurrence_id', 'measurement_date'], right_on=['person_id_max', 'visit_occurrence_id_max', 'measurement_date_max'])
mergedDf = mergedDf.drop(['person_id_max', 'visit_occurrence_id_max', 'measurement_date_max'], axis = 1)
mergedDf = pd.merge(mergedDf, labs_max.add_suffix('_first'), how='left', left_on=['person_id', 'visit_occurrence_id', 'measurement_date'], right_on=['person_id_first', 'visit_occurrence_id_first', 'measurement_date_first'])
mergedDf = mergedDf.drop(['person_id_first', 'visit_occurrence_id_first', 'measurement_date_first'], axis = 1)
mergedDf = pd.merge(mergedDf, labs_max.add_suffix('_last'), how='left', left_on=['person_id', 'visit_occurrence_id', 'measurement_date'], right_on=['person_id_last', 'visit_occurrence_id_last', 'measurement_date_last'])
mergedDf = mergedDf.drop(['person_id_last', 'visit_occurrence_id_last', 'measurement_date_last'], axis = 1)
mergedDf = mergedDf.dropna()
mergedDf

Unnamed: 0,person_id,visit_occurrence_id,measurement_date,visit_start_date_adm,death_adm,systemic_mean_avg,systemic_diastolic_avg,systemic_systolic_avg,respiration_avg,heartrate_avg,...,Red blood cell count_last,Calcium level_last,MCV - Mean corpuscular volume_last,MCHC - Mean corpuscular haemoglobin concentration_last,MCH - Mean corpuscular haemoglobin_last,White blood cell count_last,Red blood cell distribution width_last,Glucose level_last,Bicarbonate level_last,Anion gap_last
3,248364,141515,2014-04-08,2014-04-04,1,75.000000,53.103306,114.359504,30.335664,81.479021,...,2.86,8.0,95.1,33.1,31.5,9.9,17.2,116.0,22.0,14.0
8,275863,141751,2014-11-29,2014-11-28,0,82.732639,60.638889,120.218750,21.090278,103.961806,...,3.68,7.7,80.2,36.6,29.3,13.4,12.7,216.0,19.0,17.0
10,275863,141751,2014-12-01,2014-11-28,0,88.877193,67.184211,116.228070,20.728070,87.194118,...,3.31,8.1,81.3,35.7,29.0,13.5,13.8,115.0,23.0,10.0
15,238284,141959,2014-01-24,2014-01-24,0,74.113636,47.795455,118.534091,19.254985,67.384615,...,3.63,7.4,88.2,31.6,27.8,12.4,14.6,233.0,27.0,14.0
16,238284,141959,2014-01-25,2014-01-24,0,80.972028,53.451049,126.541958,17.677260,79.479021,...,3.77,6.6,88.1,31.9,28.1,13.8,15.0,122.0,22.0,14.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
103941,3516078,3352827,2015-07-22,2015-07-15,1,87.024007,63.609013,132.066452,15.968750,84.565972,...,2.62,9.0,93.0,32.0,30.0,12.2,15.0,206.0,33.0,8.0
103942,3516078,3352827,2015-07-23,2015-07-15,1,78.591413,59.196659,118.586644,17.927083,92.722222,...,2.73,9.0,93.0,32.0,30.0,11.2,15.2,165.0,33.0,8.0
103948,3516078,3352827,2015-07-31,2015-07-15,1,71.217135,49.538315,121.737472,20.833333,93.670139,...,2.69,9.1,92.0,33.0,30.0,11.8,17.5,167.0,27.0,7.0
103957,3521842,3352884,2014-01-30,2014-01-28,0,73.849266,50.696327,127.106241,21.030612,82.459184,...,3.32,7.8,92.0,35.0,33.0,11.4,14.1,98.0,21.0,3.0


In [17]:
len(mergedDf.visit_occurrence_id.unique())

6801

## Save the merged data to a file

In [18]:
mergedDf.to_csv(os.environ['EICU_EHR_PIPELINE_BASE'] + '/data/final/data_matrix.csv', index=False)