# Find common attributes

## Read data files

In [1]:
import os
import pandas as pd


mimicDataDf = pd.read_csv(os.environ['MIMIC_EHR_PIPELINE_BASE'] + '/data/sepsis_icd_cohort/final/data_matrix_v1.csv')
eicuDataDf = pd.read_csv(os.environ['EICU_EHR_PIPELINE_BASE'] + '/data/final/data_matrix.csv')

mimicDataDf.shape, eicuDataDf.shape

((31646, 278), (56237, 125))

## Vitals

In [14]:
set(['_'.join(column.split('_')[1:-1]) for column in mimicDataDf.columns if column.startswith('vitals_')])

{'Body temperature',
 'Body weight',
 'Diastolic blood pressure',
 'Friction and shear Braden scale',
 'Glasgow coma score eye opening',
 'Glasgow coma score motor',
 'Glasgow coma score verbal',
 'Heart rate',
 'Mean blood pressure',
 'Moisture exposure Braden scale',
 'Nutrition intake pattern Braden scale',
 'Oxygen saturation in Arterial blood by Pulse oximetry',
 'Physical activity Braden scale',
 'Physical mobility Braden scale',
 'Respiratory rate',
 'Sensory perception Braden scale',
 'Systolic blood pressure'}

In [13]:
set(['_'.join(column.split('_')[1:-1]) for column in eicuDataDf.columns if column.startswith('vitals_')])

{'heartrate',
 'respiration',
 'sao2',
 'systemic_diastolic',
 'systemic_mean',
 'systemic_systolic'}

## Labs

In [16]:
set(['_'.join(column.split('_')[1:-1]) for column in mimicDataDf.columns if column.startswith('labs_')])

{'Alanine aminotransferase in Serum or Plasma',
 'Albumin in Serum or Plasma',
 'Alkaline phosphatase in Serum or Plasma',
 'Anion gap 4 in Serum or Plasma',
 'Aspartate aminotransferase in Serum or Plasma',
 'Basophils in Blood by Automated count',
 'Bicarbonate in Serum or Plasma',
 'Bilirubin.total in Serum or Plasma',
 'Calcium in Serum or Plasma',
 'Chloride in Serum or Plasma',
 'Creatinine in Serum or Plasma',
 'Eosinophils in Blood by Automated count',
 'Erythrocyte distribution width by Automated count',
 'Erythrocytes in Blood by Automated count',
 'Glucose in Serum or Plasma',
 'Hematocrit of Blood by Automated count',
 'Hemoglobin in Blood',
 'INR in Platelet poor plasma by Coagulation assay',
 'Leukocytes in Blood by Manual count',
 'Lymphocytes in Blood by Automated count',
 'MCH by Automated count',
 'MCHC by Automated count',
 'MCV by Automated count',
 'Magnesium in Serum or Plasma',
 'Monocytes in Blood by Automated count',
 'Neutrophils in Urine by Automated count',


In [17]:
set(['_'.join(column.split('_')[1:-1]) for column in eicuDataDf.columns if column.startswith('labs_')])

{'Anion gap',
 'Bicarbonate level',
 'Blood urea nitrogen',
 'Calcium level',
 'Chloride',
 'Creatinine level',
 'Glucose level',
 'Haemoglobin estimation',
 'Hematocrit',
 'MCH - Mean corpuscular haemoglobin',
 'MCHC - Mean corpuscular haemoglobin concentration',
 'MCV - Mean corpuscular volume',
 'Platelet count',
 'Potassium level',
 'Red blood cell count',
 'Red blood cell distribution width',
 'Sodium level',
 'White blood cell count'}

## Create a mapper

In [3]:
import pandas as pd


mappingData = [
    ['vitals', 'systemic_diastolic', 'systemic_diastolic', 'Diastolic blood pressure'],
    ['vitals', 'systemic_systolic', 'systemic_systolic', 'Glasgow coma score eye opening'],
    ['vitals', 'sao2', 'sao2', 'Oxygen saturation in Arterial blood by Pulse oximetry'],
    ['vitals', 'respiration', 'respiration', 'Respiratory rate'],
    ['vitals', 'systemic_mean', 'systemic_mean', 'Systolic blood pressure'],
    ['labs', 'anion_gap', 'Anion gap', 'Anion gap 4 in Serum or Plasma'],
    ['labs', 'bicarbonate_level', 'Bicarbonate level', 'Bicarbonate in Serum or Plasma'],
    ['labs', 'blood_urea_nitrogen', 'Blood urea nitrogen', 'Urea nitrogen in Serum or Plasma'],
    ['labs', 'calcium_level', 'Calcium level', 'Calcium in Serum or Plasma'],
    ['labs', 'chloride', 'Chloride', 'Chloride in Serum or Plasma'],
    ['labs', 'creatinine_level', 'Creatinine level', 'Creatinine in Serum or Plasma'],
    ['labs', 'glucose_level', 'Glucose level', 'Glucose in Serum or Plasma'],
    ['labs', 'haemoglobin_estimation', 'Haemoglobin estimation', 'Hemoglobin in Blood'],
    ['labs', 'hematocrit', 'Hematocrit', 'Hematocrit of Blood by Automated count'],
    ['labs', 'MCH', 'MCH - Mean corpuscular haemoglobin', 'MCH by Automated count'],
    ['labs', 'MCHC', 'MCHC - Mean corpuscular haemoglobin concentration', 'MCHC by Automated count'],
    ['labs', 'MCV', 'MCV - Mean corpuscular volume', 'MCV by Automated count'],
    ['labs', 'platelet_count', 'Platelet count', 'Platelets in Blood by Automated count'],
    ['labs', 'potassium_level', 'Potassium level', 'Potassium in Serum or Plasma'],
    ['labs', 'sodium_level', 'Sodium level', 'Sodium in Serum or Plasma'],
]

mappingDf = pd.DataFrame(mappingData, columns=['column_type', 'common_column_name', 'eicu_column_name', 'mimic_column_name'])
mappingDf

Unnamed: 0,column_type,common_column_name,eicu_column_name,mimic_column_name
0,vitals,systemic_diastolic,systemic_diastolic,Diastolic blood pressure
1,vitals,systemic_systolic,systemic_systolic,Glasgow coma score eye opening
2,vitals,sao2,sao2,Oxygen saturation in Arterial blood by Pulse o...
3,vitals,respiration,respiration,Respiratory rate
4,vitals,systemic_mean,systemic_mean,Systolic blood pressure
5,labs,anion_gap,Anion gap,Anion gap 4 in Serum or Plasma
6,labs,bicarbonate_level,Bicarbonate level,Bicarbonate in Serum or Plasma
7,labs,blood_urea_nitrogen,Blood urea nitrogen,Urea nitrogen in Serum or Plasma
8,labs,calcium_level,Calcium level,Calcium in Serum or Plasma
9,labs,chloride,Chloride,Chloride in Serum or Plasma


## Save mapper

In [4]:
from pathlib import Path


commonDirPath = Path(os.environ['EICU_EHR_PIPELINE_BASE'] + '''/data/experiments/inter_source_validation''')
commonDirPath.mkdir(parents=True, exist_ok=True)

mapperFilePath = Path(commonDirPath, 'common_columns_mapper.csv')

mappingDf.to_csv(mapperFilePath, index=False)

## Read mapper

In [5]:
import pandas as pd

from pathlib import Path


mappingDf = pd.read_csv(Path(os.environ['EICU_EHR_PIPELINE_BASE'] + '''/data/experiments/inter_source_validation''', '''common_columns_mapper.csv'''))
mappingDf

Unnamed: 0,column_type,common_column_name,eicu_column_name,mimic_column_name
0,vitals,systemic_diastolic,systemic_diastolic,Diastolic blood pressure
1,vitals,systemic_systolic,systemic_systolic,Glasgow coma score eye opening
2,vitals,sao2,sao2,Oxygen saturation in Arterial blood by Pulse o...
3,vitals,respiration,respiration,Respiratory rate
4,vitals,systemic_mean,systemic_mean,Systolic blood pressure
5,labs,anion_gap,Anion gap,Anion gap 4 in Serum or Plasma
6,labs,bicarbonate_level,Bicarbonate level,Bicarbonate in Serum or Plasma
7,labs,blood_urea_nitrogen,Blood urea nitrogen,Urea nitrogen in Serum or Plasma
8,labs,calcium_level,Calcium level,Calcium in Serum or Plasma
9,labs,chloride,Chloride,Chloride in Serum or Plasma


## Save data matrix with common column

### A function to get column mappings

In [6]:
def getColumnMappings(searchCol, commonCol, df):
    commonColumnMapper = {}
    for dfCol in df.columns:
        if searchCol in dfCol:
            commonColumnMapper[dfCol] = dfCol.split('_')[0] + '_' + commonCol + '_' + dfCol.split('_')[-1]
    return commonColumnMapper

### Save MIMIC data matrix

In [7]:
mimicToCommonColumnMapperList = [{
    'person_id': 'person_id',
    'visit_occurrence_id': 'visit_occurrence_id',
    'measurement_date': 'measurement_date',
    'visit_start_datetime_adm': 'visit_start_datetime_adm',
    'death_0_30_adm': 'death_0_30_adm'
}]
mimicToCommonColumnMapperList.extend([getColumnMappings(row.mimic_column_name, row.common_column_name, mimicDataDf) for i, row in mappingDf.iterrows()])
MimicToCommonColumnMapper = {mimicColumn: commonColumn for partialMimicToCommonColumnMapper in mimicToCommonColumnMapperList for mimicColumn, commonColumn in partialMimicToCommonColumnMapper.items()}
MimicToCommonColumnMapper

{'person_id': 'person_id',
 'visit_occurrence_id': 'visit_occurrence_id',
 'measurement_date': 'measurement_date',
 'visit_start_datetime_adm': 'visit_start_datetime_adm',
 'death_0_30_adm': 'death_0_30_adm',
 'vitals_Diastolic blood pressure_avg': 'vitals_systemic_diastolic_avg',
 'vitals_Diastolic blood pressure_min': 'vitals_systemic_diastolic_min',
 'vitals_Diastolic blood pressure_max': 'vitals_systemic_diastolic_max',
 'vitals_Diastolic blood pressure_first': 'vitals_systemic_diastolic_first',
 'vitals_Diastolic blood pressure_last': 'vitals_systemic_diastolic_last',
 'vitals_Glasgow coma score eye opening_avg': 'vitals_systemic_systolic_avg',
 'vitals_Glasgow coma score eye opening_min': 'vitals_systemic_systolic_min',
 'vitals_Glasgow coma score eye opening_max': 'vitals_systemic_systolic_max',
 'vitals_Glasgow coma score eye opening_first': 'vitals_systemic_systolic_first',
 'vitals_Glasgow coma score eye opening_last': 'vitals_systemic_systolic_last',
 'vitals_Oxygen saturati

In [8]:
mimicCommonDataDf = mimicDataDf[MimicToCommonColumnMapper.keys()]
mimicCommonDataDf.visit_start_datetime_adm = mimicCommonDataDf.visit_start_datetime_adm.str[:10]
mimicCommonDataDf = mimicCommonDataDf.rename(columns=MimicToCommonColumnMapper)
mimicCommonDataDf

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mimicCommonDataDf.visit_start_datetime_adm = mimicCommonDataDf.visit_start_datetime_adm.str[:10]


Unnamed: 0,person_id,visit_occurrence_id,measurement_date,visit_start_datetime_adm,death_0_30_adm,vitals_systemic_diastolic_avg,vitals_systemic_diastolic_min,vitals_systemic_diastolic_max,vitals_systemic_diastolic_first,vitals_systemic_diastolic_last,...,labs_potassium_level_avg,labs_potassium_level_min,labs_potassium_level_max,labs_potassium_level_first,labs_potassium_level_last,labs_sodium_level_avg,labs_sodium_level_min,labs_sodium_level_max,labs_sodium_level_first,labs_sodium_level_last
0,2118937575,-2146830423,2153-08-26,2153-08-25,0,55.083333,55.083333,55.083333,64.0,47.0,...,4.35,4.35,4.35,4.35,4.35,145.000000,145.000000,145.000000,145.000000,145.000000
1,2118937575,-2146830423,2153-08-27,2153-08-25,0,51.541667,51.541667,51.541667,90.0,49.0,...,4.40,4.40,4.40,4.40,4.40,145.000000,145.000000,145.000000,145.000000,145.000000
2,2118937575,-2146830423,2153-08-29,2153-08-25,0,65.428571,65.428571,65.428571,65.0,74.0,...,4.10,4.10,4.10,4.10,4.10,148.666667,148.666667,148.666667,148.666667,148.666667
3,1032588988,-2146621210,2195-02-09,2195-02-02,0,64.800000,64.800000,64.800000,54.0,67.0,...,3.90,3.90,3.90,3.90,3.90,137.000000,137.000000,137.000000,137.000000,137.000000
4,1032588988,-2146621210,2195-02-11,2195-02-02,0,63.250000,63.250000,63.250000,66.0,53.0,...,4.00,4.00,4.00,4.00,4.00,139.000000,139.000000,139.000000,139.000000,139.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31641,949412965,2147302236,2143-10-17,2143-10-01,0,58.700000,58.700000,58.700000,67.0,53.0,...,3.40,3.40,3.40,3.40,3.40,136.000000,136.000000,136.000000,136.000000,136.000000
31642,949412965,2147302236,2143-10-18,2143-10-01,0,65.941176,65.941176,65.941176,52.0,84.0,...,3.90,3.90,3.90,3.90,3.90,134.000000,134.000000,134.000000,134.000000,134.000000
31643,949412965,2147302236,2143-10-19,2143-10-01,0,66.153846,66.153846,66.153846,70.0,61.0,...,3.70,3.70,3.70,3.70,3.70,133.000000,133.000000,133.000000,133.000000,133.000000
31644,949412965,2147302236,2143-10-20,2143-10-01,0,59.000000,59.000000,59.000000,63.0,54.0,...,4.10,4.10,4.10,4.10,4.10,135.000000,135.000000,135.000000,135.000000,135.000000


In [9]:
mimicCommonDataDf.death_0_30_adm.value_counts()

death_0_30_adm
0    26522
1     5124
Name: count, dtype: int64

In [17]:
mimicCommonDataDf.to_csv(Path(os.environ['EICU_EHR_PIPELINE_BASE'] + '''/data/experiments/inter_source_validation/data''', '''data_matrix_mimic.csv'''), index=False)

### Save eICU data matrix

In [11]:
eicuToCommonColumnMapperList = [{
    'person_id': 'person_id',
    'visit_occurrence_id': 'visit_occurrence_id',
    'measurement_date': 'measurement_date',
    'visit_start_date_adm': 'visit_start_date_adm',
    'death_adm': 'death_adm'
}]
eicuToCommonColumnMapperList.extend([getColumnMappings(row.eicu_column_name, row.common_column_name, eicuDataDf) for i, row in mappingDf.iterrows()])
eicuToCommonColumnMapper = {eicuColumn: commonColumn for partialEicuToCommonColumnMapper in eicuToCommonColumnMapperList for eicuColumn, commonColumn in partialEicuToCommonColumnMapper.items()}
eicuToCommonColumnMapper

{'person_id': 'person_id',
 'visit_occurrence_id': 'visit_occurrence_id',
 'measurement_date': 'measurement_date',
 'visit_start_date_adm': 'visit_start_date_adm',
 'death_adm': 'death_adm',
 'vitals_systemic_diastolic_avg': 'vitals_systemic_diastolic_avg',
 'vitals_systemic_diastolic_min': 'vitals_systemic_diastolic_min',
 'vitals_systemic_diastolic_max': 'vitals_systemic_diastolic_max',
 'vitals_systemic_diastolic_first': 'vitals_systemic_diastolic_first',
 'vitals_systemic_diastolic_last': 'vitals_systemic_diastolic_last',
 'vitals_systemic_systolic_avg': 'vitals_systemic_systolic_avg',
 'vitals_systemic_systolic_min': 'vitals_systemic_systolic_min',
 'vitals_systemic_systolic_max': 'vitals_systemic_systolic_max',
 'vitals_systemic_systolic_first': 'vitals_systemic_systolic_first',
 'vitals_systemic_systolic_last': 'vitals_systemic_systolic_last',
 'vitals_sao2_avg': 'vitals_sao2_avg',
 'vitals_sao2_min': 'vitals_sao2_min',
 'vitals_sao2_max': 'vitals_sao2_max',
 'vitals_sao2_first'

In [12]:
eicuCommonDataDf = eicuDataDf[eicuToCommonColumnMapper.keys()]
eicuCommonDataDf = eicuCommonDataDf.rename(columns=eicuToCommonColumnMapper)
eicuCommonDataDf

Unnamed: 0,person_id,visit_occurrence_id,measurement_date,visit_start_date_adm,death_adm,vitals_systemic_diastolic_avg,vitals_systemic_diastolic_min,vitals_systemic_diastolic_max,vitals_systemic_diastolic_first,vitals_systemic_diastolic_last,...,labs_potassium_level_avg,labs_potassium_level_min,labs_potassium_level_max,labs_potassium_level_first,labs_potassium_level_last,labs_sodium_level_avg,labs_sodium_level_min,labs_sodium_level_max,labs_sodium_level_first,labs_sodium_level_last
0,248364,141515,2014-04-04,2014-04-04,0,47.924731,40.00,74.000,42.00,52.00,...,3.74,3.6,4.0,4.0,4.0,132.666667,131.0,135.0,135.0,135.0
1,248364,141515,2014-04-05,2014-04-04,0,51.291228,38.00,68.000,52.00,50.00,...,3.30,3.3,3.3,3.3,3.3,136.000000,136.0,136.0,136.0,136.0
2,248364,141515,2014-04-06,2014-04-04,0,57.565972,46.00,80.000,54.00,58.00,...,4.40,4.4,4.4,4.4,4.4,141.000000,141.0,141.0,141.0,141.0
3,248364,141515,2014-04-07,2014-04-04,0,57.696864,50.00,86.000,58.00,66.00,...,3.70,3.7,3.7,3.7,3.7,143.000000,143.0,143.0,143.0,143.0
4,248364,141515,2014-04-08,2014-04-04,0,53.103306,40.00,72.000,64.00,54.00,...,3.60,3.6,3.6,3.6,3.6,145.000000,145.0,145.0,145.0,145.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
56232,3521842,3352884,2014-01-29,2014-01-28,0,66.934130,68.29,31.000,59.61,79.04,...,4.10,4.1,4.1,4.1,4.1,141.000000,141.0,141.0,141.0,141.0
56233,3521842,3352884,2014-01-30,2014-01-28,0,50.696327,36.34,72.515,64.47,65.50,...,3.90,3.9,3.9,3.9,3.9,139.000000,139.0,139.0,139.0,139.0
56234,358073,3352922,2015-06-02,2015-06-01,0,64.023256,46.00,76.000,61.00,63.00,...,3.55,3.3,3.8,3.8,3.8,141.000000,141.0,141.0,141.0,141.0
56235,358073,3352922,2015-06-03,2015-06-01,0,78.054524,47.93,88.500,72.48,83.62,...,3.40,3.2,3.6,3.6,3.6,143.000000,142.0,144.0,144.0,144.0


In [27]:
import os
import pandas as pd


patientDf = pd.read_csv(Path(os.environ['EICU_EHR_PIPELINE_BASE'] + '''/data/eICU''', '''patient.csv'''))
patientDf['person_id'] = patientDf.uniquepid.str.replace('-', '').apply(lambda x : int(x))
patientDf

Unnamed: 0,patientunitstayid,patienthealthsystemstayid,gender,age,ethnicity,hospitalid,wardid,apacheadmissiondx,admissionheight,hospitaladmittime24,...,unitvisitnumber,unitstaytype,admissionweight,dischargeweight,unitdischargetime24,unitdischargeoffset,unitdischargelocation,unitdischargestatus,uniquepid,person_id
0,141168,128919,Female,70,Caucasian,59,91,"Rhythm disturbance (atrial, supraventricular)",152.4,15:54:00,...,1,admit,84.3,85.8,03:50:00,3596,Death,Expired,002-34851,234851
1,141178,128927,Female,52,Caucasian,60,83,,162.6,08:56:00,...,1,admit,54.4,54.4,09:18:00,8,Step-Down Unit (SDU),Alive,002-33870,233870
2,141179,128927,Female,52,Caucasian,60,83,,162.6,08:56:00,...,2,stepdown/other,,60.4,19:20:00,2042,Home,Alive,002-33870,233870
3,141194,128941,Male,68,Caucasian,73,92,"Sepsis, renal/UTI (including bladder)",180.3,18:18:40,...,1,admit,73.9,76.7,15:31:00,4813,Floor,Alive,002-5276,25276
4,141196,128943,Male,71,Caucasian,67,109,,162.6,20:21:00,...,2,stepdown/other,,63.2,22:23:00,1463,Floor,Alive,002-37665,237665
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
200854,3353235,2743084,Male,50,Caucasian,458,1109,"CHF, congestive heart failure",175.3,04:55:00,...,1,admit,90.0,99.2,23:18:00,1069,Telemetry,Alive,035-16382,3516382
200855,3353237,2743086,Female,79,Caucasian,458,1106,"Embolus, pulmonary",162.6,01:45:00,...,1,admit,78.4,81.4,23:08:00,1269,Step-Down Unit (SDU),Alive,035-751,35751
200856,3353251,2743099,Male,73,African American,458,1104,Cardiac arrest (with or without respiratory ar...,177.8,12:51:00,...,1,admit,102.0,96.2,23:16:00,16259,Telemetry,Alive,035-5166,355166
200857,3353254,2743102,Male,81,Caucasian,459,1108,"Bleeding, lower GI",185.4,07:43:00,...,1,admit,83.9,92.9,19:25:00,431,Step-Down Unit (SDU),Alive,035-19511,3519511


In [37]:
patientDf[(patientDf.unitdischargestatus == 'Expired') & (patientDf.unitdischargeoffset < (30 * 24 * 60))]

Unnamed: 0,patientunitstayid,patienthealthsystemstayid,gender,age,ethnicity,hospitalid,wardid,apacheadmissiondx,admissionheight,hospitaladmittime24,...,unitvisitnumber,unitstaytype,admissionweight,dischargeweight,unitdischargetime24,unitdischargeoffset,unitdischargelocation,unitdischargestatus,uniquepid,person_id
0,141168,128919,Female,70,Caucasian,59,91,"Rhythm disturbance (atrial, supraventricular)",152.4,15:54:00,...,1,admit,84.30,85.8,03:50:00,3596,Death,Expired,002-34851,234851
22,141297,129026,Male,63,Caucasian,73,85,"Sepsis, pulmonary",162.6,04:18:00,...,2,readmit,,74.4,03:41:00,1869,Death,Expired,002-30269,230269
25,141314,129039,Male,45,Caucasian,73,85,"Aneurysm, abdominal aortic; with rupture",170.2,00:41:00,...,2,transfer,,102.6,18:39:00,632,Death,Expired,002-70742,270742
50,141556,129238,Female,83,Caucasian,56,82,,165.1,22:54:00,...,2,stepdown/other,,,14:19:00,2459,Death,Expired,002-35104,235104
143,142154,129683,Male,74,Caucasian,73,97,,167.6,17:29:00,...,2,stepdown/other,,,19:32:00,1081,Death,Expired,002-7333,27333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
200825,3353087,2742956,Female,66,Caucasian,459,1108,"CHF, congestive heart failure",160.0,21:36:00,...,1,admit,65.30,77.2,03:43:00,3946,Death,Expired,035-12399,3512399
200833,3353117,2742981,Male,85,Caucasian,458,1111,"GI perforation/rupture, surgery for",162.6,17:52:00,...,2,readmit,64.80,77.9,09:55:00,26282,Death,Expired,035-16974,3516974
200835,3353123,2742987,Female,82,Caucasian,458,1104,,,01:57:00,...,1,admit,,56.9,03:03:00,1499,Death,Expired,035-13051,3513051
200844,3353194,2743053,Female,51,Other/Unknown,458,1109,Cardiac arrest (with or without respiratory ar...,170.2,07:17:00,...,1,admit,63.05,65.8,21:57:00,3590,Death,Expired,035-2735,352735


In [48]:
mergedDf = eicuCommonDataDf.merge(
    patientDf[['person_id', 'patientunitstayid', 'unitdischargestatus', 'unitdischargeoffset']],
    how='inner',
    left_on=['person_id', 'visit_occurrence_id'],
    right_on=['person_id', 'patientunitstayid']
)

mergedDf['death_0_30_adm'] = (mergedDf.unitdischargestatus == 'Expired') & (mergedDf.unitdischargeoffset < (30 * 24 * 60))
mergedDf = mergedDf.drop(columns=['patientunitstayid', 'unitdischargestatus', 'unitdischargeoffset'])
mergedDf

Unnamed: 0,person_id,visit_occurrence_id,measurement_date,visit_start_date_adm,death_adm,vitals_systemic_diastolic_avg,vitals_systemic_diastolic_min,vitals_systemic_diastolic_max,vitals_systemic_diastolic_first,vitals_systemic_diastolic_last,...,labs_potassium_level_min,labs_potassium_level_max,labs_potassium_level_first,labs_potassium_level_last,labs_sodium_level_avg,labs_sodium_level_min,labs_sodium_level_max,labs_sodium_level_first,labs_sodium_level_last,death_0_30_adm
0,248364,141515,2014-04-04,2014-04-04,0,47.924731,40.00,74.000,42.00,52.00,...,3.6,4.0,4.0,4.0,132.666667,131.0,135.0,135.0,135.0,False
1,248364,141515,2014-04-05,2014-04-04,0,51.291228,38.00,68.000,52.00,50.00,...,3.3,3.3,3.3,3.3,136.000000,136.0,136.0,136.0,136.0,False
2,248364,141515,2014-04-06,2014-04-04,0,57.565972,46.00,80.000,54.00,58.00,...,4.4,4.4,4.4,4.4,141.000000,141.0,141.0,141.0,141.0,False
3,248364,141515,2014-04-07,2014-04-04,0,57.696864,50.00,86.000,58.00,66.00,...,3.7,3.7,3.7,3.7,143.000000,143.0,143.0,143.0,143.0,False
4,248364,141515,2014-04-08,2014-04-04,0,53.103306,40.00,72.000,64.00,54.00,...,3.6,3.6,3.6,3.6,145.000000,145.0,145.0,145.0,145.0,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
56232,3521842,3352884,2014-01-29,2014-01-28,0,66.934130,68.29,31.000,59.61,79.04,...,4.1,4.1,4.1,4.1,141.000000,141.0,141.0,141.0,141.0,False
56233,3521842,3352884,2014-01-30,2014-01-28,0,50.696327,36.34,72.515,64.47,65.50,...,3.9,3.9,3.9,3.9,139.000000,139.0,139.0,139.0,139.0,False
56234,358073,3352922,2015-06-02,2015-06-01,0,64.023256,46.00,76.000,61.00,63.00,...,3.3,3.8,3.8,3.8,141.000000,141.0,141.0,141.0,141.0,False
56235,358073,3352922,2015-06-03,2015-06-01,0,78.054524,47.93,88.500,72.48,83.62,...,3.2,3.6,3.6,3.6,143.000000,142.0,144.0,144.0,144.0,False


In [49]:
mergedDf.death_0_30_adm.value_counts()

death_0_30_adm
False    49500
True      6737
Name: count, dtype: int64

In [50]:
mergedDf.to_csv(Path(os.environ['EICU_EHR_PIPELINE_BASE'] + '''/data/experiments/inter_source_validation/data''', '''data_matrix_eicu.csv'''), index=False)