# Analysis

## Set up data for the experiments

In [3]:
import os
import pandas as pd


# Read data_matrix.csv from the data/final directory under the directory named
# by the EICU_EHR_PIPELINE_BASE environment variable.
dataMatrixPath = os.environ['EICU_EHR_PIPELINE_BASE'] + '''/data/final/data_matrix.csv'''
datamatrixDf = pd.read_csv(dataMatrixPath)

# Bare trailing expression so the notebook renders the frame as the cell output.
datamatrixDf

Unnamed: 0,person_id,visit_occurrence_id,measurement_date,visit_start_date_adm,death_adm,vitals_systemic_mean_avg,vitals_systemic_diastolic_avg,vitals_systemic_systolic_avg,vitals_respiration_avg,vitals_heartrate_avg,...,labs_Red blood cell count_last,labs_Calcium level_last,labs_MCV - Mean corpuscular volume_last,labs_MCHC - Mean corpuscular haemoglobin concentration_last,labs_MCH - Mean corpuscular haemoglobin_last,labs_White blood cell count_last,labs_Red blood cell distribution width_last,labs_Glucose level_last,labs_Bicarbonate level_last,labs_Anion gap_last
0,248364,141515,2014-04-04,2014-04-04,0,65.344086,47.924731,105.265233,25.763066,89.777003,...,4.37,9.5,90.4,34.9,31.6,3.8,15.5,97.0,21.0,19.0
1,248364,141515,2014-04-05,2014-04-04,0,68.147368,51.291228,99.049123,23.763889,101.937500,...,3.29,7.6,92.1,34.3,31.6,10.0,16.7,154.0,21.0,14.0
2,248364,141515,2014-04-06,2014-04-04,0,78.180556,57.565972,111.343750,23.731707,109.117021,...,3.34,7.9,91.6,33.7,30.8,12.9,17.3,105.0,22.0,13.0
3,248364,141515,2014-04-07,2014-04-04,0,82.724739,57.696864,123.606272,33.772727,79.000000,...,3.14,8.2,91.4,33.8,30.9,11.7,17.0,123.0,22.0,12.0
4,248364,141515,2014-04-08,2014-04-04,0,75.000000,53.103306,114.359504,30.335664,81.479021,...,2.86,8.0,95.1,33.1,31.5,9.9,17.2,116.0,22.0,14.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
56232,3521842,3352884,2014-01-29,2014-01-28,0,89.946929,66.934130,137.592436,19.909722,100.312500,...,3.37,8.1,92.0,35.0,32.0,14.4,13.7,62.0,19.0,6.0
56233,3521842,3352884,2014-01-30,2014-01-28,0,73.849266,50.696327,127.106241,21.030612,82.459184,...,3.32,7.8,92.0,35.0,33.0,11.4,14.1,98.0,21.0,3.0
56234,358073,3352922,2015-06-02,2015-06-01,0,78.000000,64.023256,97.302326,17.929821,77.197917,...,4.10,8.0,88.0,33.0,29.0,8.7,17.5,143.0,25.0,7.0
56235,358073,3352922,2015-06-03,2015-06-01,0,106.010242,78.054524,160.937190,19.117647,97.704861,...,3.80,8.5,87.0,33.0,29.0,6.9,17.4,159.0,25.0,7.0


## Create and save the random splits

In [12]:
import os
import sys
from pathlib import Path

sys.path.append(os.environ['EICU_EHR_PIPELINE_BASE'] + "/../EHR-ML")

from ehrml.utils import Split


# Every random-split CSV goes into one directory under the pipeline root
# (EICU_EHR_PIPELINE_BASE); create it, with parents, if it does not exist yet.
randomSplitDir = os.environ['EICU_EHR_PIPELINE_BASE'] + '/data/experiments/06_timeseries_split/datamatrix'
Path(randomSplitDir).mkdir(parents=True, exist_ok=True)

# Call Split.split_random five times and save each test/train pair; the files
# are numbered from 1.
for i in range(5):
    # NOTE(review): the (test, train) unpacking order is kept as written --
    # confirm it matches the return order of Split.split_random.
    testData, trainData = Split.split_random(
        datamatrixDf,
        idColumns=['person_id', 'visit_occurrence_id'],
        anchorDateColumn='visit_start_date_adm',
        measurementDateColumn='measurement_date',
        anchorDateColumnFormat='%Y-%m-%d',
        measurementDateColumnFormat='%Y-%m-%d',
    )
    splitPrefix = randomSplitDir + '/random_split_' + str(i + 1)
    testData.to_csv(splitPrefix + '_test.csv', index=False)
    trainData.to_csv(splitPrefix + '_train.csv', index=False)

## Create and save the time-series splits

In [45]:
import os
import sys
from pathlib import Path

sys.path.append(os.environ['EICU_EHR_PIPELINE_BASE'] + "/../EHR-ML")

from ehrml.utils import Split


# Every time-series split CSV goes into one directory under the pipeline root
# (EICU_EHR_PIPELINE_BASE); create it, with parents, if it does not exist yet.
timeseriesSplitDir = os.environ['EICU_EHR_PIPELINE_BASE'] + '/data/experiments/06_timeseries_split/datamatrix'
Path(timeseriesSplitDir).mkdir(parents=True, exist_ok=True)

# Order the rows by `visit_start_date_adm` before splitting. The sorted frame
# gets its own name so that `datamatrixDf` is not rebound: re-running an
# earlier cell after this one still sees the frame in its loaded row order.
datamatrixSortedDf = datamatrixDf.sort_values('visit_start_date_adm')

data_splits = Split.split_timeseries(datamatrixSortedDf, n_splits=5)

# Files are numbered from 1, one train/test pair per fold.
for i, fold_data in enumerate(data_splits, start=1):
    foldTrain = fold_data['train']
    foldTest = fold_data['test']

    # True for test rows whose person_id also occurs in this fold's train set.
    # Computed once and reused for both sides of the partition.
    seenInTrain = foldTest['person_id'].isin(foldTrain['person_id'])

    # Rows of persons already present in train are moved from test into train,
    # so that no person_id appears on both sides of a fold.
    trainDf = pd.concat([foldTrain, foldTest[seenInTrain]], ignore_index=True)
    testDf = foldTest[~seenInTrain]

    splitPrefix = timeseriesSplitDir + '/timeseries_split_' + str(i)
    trainDf.to_csv(splitPrefix + '_train.csv', index=False)
    testDf.to_csv(splitPrefix + '_test.csv', index=False)

## Build models for random splits

In [1]:
import os
from pathlib import Path


import subprocess

# Experiment directory under the pipeline root (EICU_EHR_PIPELINE_BASE); the
# trained models are saved in its `models` sub-directory.
experimentDir = os.environ['EICU_EHR_PIPELINE_BASE'] + '/data/experiments/06_timeseries_split'
Path(experimentDir + '/models').mkdir(parents=True, exist_ok=True)

# The build runs from the EHR-ML directory with that directory's virtualenv
# interpreter. The directory is made absolute first so the interpreter path
# does not depend on how a relative executable is looked up once `cwd` is set.
ehrMlBase = Path(os.environ['EHR_ML_BASE']).resolve()
ehrMlPython = str(ehrMlBase / '.venv' / 'bin' / 'python')

WINDOW_BEFORE = 30  # value passed to the builder's -wb option
WINDOW_AFTER = 3    # value passed to the builder's -wa option

# Build one model per random split; files are numbered from 1.
for i in range(5):
    splitName = 'random_split_' + str(i + 1)
    # The command is an argument list executed without a shell, so paths that
    # contain spaces or shell metacharacters are passed through intact.
    # `cwd` replaces the former `cd ...;` prefix (which still ran the build
    # when the `cd` failed), and `check=True` raises CalledProcessError on a
    # non-zero exit instead of silently moving on to the next split.
    subprocess.run(
        [
            ehrMlPython, '-m', 'ehrml.ensemble.Build',
            experimentDir + '/datamatrix/' + splitName + '_train.csv',
            '-tc', 'death_adm',
            '-ic', 'person_id', 'visit_occurrence_id',
            '-mdc', 'measurement_date',
            '-adc', 'visit_start_date_adm',
            '-wb', str(WINDOW_BEFORE),
            '-wa', str(WINDOW_AFTER),
            '-sp', experimentDir + '/models/' + splitName + '.pkl',
        ],
        cwd=str(ehrMlBase),
        check=True,
    )


2024-05-31 12:43:33,740 - EHR-ML - INFO - Parsing command line arguments
2024-05-31 12:43:33,741 - EHR-ML - INFO - args.data_file: /home/yram0006/phd/chapter_2/workspace/eICU-EHR-Pipeline/data/experiments/06_timeseries_split/datamatrix/random_split_1_train.csv
2024-05-31 12:43:33,741 - EHR-ML - INFO - args.id_columns: ['person_id', 'visit_occurrence_id']
2024-05-31 12:43:33,741 - EHR-ML - INFO - args.target_column: death_adm
2024-05-31 12:43:33,741 - EHR-ML - INFO - args.measurement_date_column: measurement_date
2024-05-31 12:43:33,741 - EHR-ML - INFO - args.anchor_date_column: visit_start_date_adm
2024-05-31 12:43:33,741 - EHR-ML - INFO - args.window_before: 30
2024-05-31 12:43:33,741 - EHR-ML - INFO - args.window_after: 3
2024-05-31 12:43:33,741 - EHR-ML - INFO - args.save_path: /home/yram0006/phd/chapter_2/workspace/eICU-EHR-Pipeline/data/experiments/06_timeseries_split/models/random_split_1.pkl
2024-05-31 12:43:33,990 - EHR-ML - INFO - Split data to test and train sets
2024-05-31 1

## Build models for time-series splits

In [None]:
import os
from pathlib import Path


import subprocess

# Experiment directory under the pipeline root (EICU_EHR_PIPELINE_BASE); the
# trained models are saved in its `models` sub-directory.
experimentDir = os.environ['EICU_EHR_PIPELINE_BASE'] + '/data/experiments/06_timeseries_split'
Path(experimentDir + '/models').mkdir(parents=True, exist_ok=True)

# The build runs from the EHR-ML directory with that directory's virtualenv
# interpreter. The directory is made absolute first so the interpreter path
# does not depend on how a relative executable is looked up once `cwd` is set.
ehrMlBase = Path(os.environ['EHR_ML_BASE']).resolve()
ehrMlPython = str(ehrMlBase / '.venv' / 'bin' / 'python')

WINDOW_BEFORE = 30  # value passed to the builder's -wb option
WINDOW_AFTER = 3    # value passed to the builder's -wa option

# Build one model per time-series split; files are numbered from 1.
for i in range(5):
    splitName = 'timeseries_split_' + str(i + 1)
    # The command is an argument list executed without a shell, so paths that
    # contain spaces or shell metacharacters are passed through intact.
    # `cwd` replaces the former `cd ...;` prefix (which still ran the build
    # when the `cd` failed), and `check=True` raises CalledProcessError on a
    # non-zero exit instead of silently moving on to the next split.
    subprocess.run(
        [
            ehrMlPython, '-m', 'ehrml.ensemble.Build',
            experimentDir + '/datamatrix/' + splitName + '_train.csv',
            '-tc', 'death_adm',
            '-ic', 'person_id', 'visit_occurrence_id',
            '-mdc', 'measurement_date',
            '-adc', 'visit_start_date_adm',
            '-wb', str(WINDOW_BEFORE),
            '-wa', str(WINDOW_AFTER),
            '-sp', experimentDir + '/models/' + splitName + '.pkl',
        ],
        cwd=str(ehrMlBase),
        check=True,
    )
