# Changes Summary by team
- No changes made to code
- The only changes are to newlines, spacing, cell locations, comments, etc.
- Some errors were found, and some variables are calculated but never used; none of this affects the results

In [1]:
import pandas as pd
import os
import numpy as np

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Experiment configuration.
GAP_TIME = 6      # In hours
WINDOW_SIZE = 24  # In hours
SEED = 10
ID_COLS = ['subject_id', 'hadm_id', 'icustay_id']
GPU = '2'

# Seed NumPy's global random number generator and set the CUDA device
# visibility environment variable for this process.
np.random.seed(SEED)
os.environ['CUDA_VISIBLE_DEVICES'] = GPU

In [3]:
# Location of the HDF5 file that holds every table read below.
MIMIC_EXTRACT_DATA = "data/all_hourly_data.h5"

# Two independent reads of the same "vitals_labs" table.
# NOTE(review): both frames come from the same key of the same file, so they
# start out identical - confirm whether `data_full_raw` was meant to come
# from a different source.
data_full_lvl2, data_full_raw = (
    pd.read_hdf(MIMIC_EXTRACT_DATA, key="vitals_labs") for _ in range(2)
)

# Per-stay table; later cells read `max_hours`, `mort_hosp`, `mort_icu`
# and `los_icu` from it.
statics = pd.read_hdf(MIMIC_EXTRACT_DATA, key='patients')

In [4]:
def simple_imputer(df):
    """
    Impute the missing data in the 'mean' columns of `df` and add, for every
    variable, a 'mask' column and a 'time_since_measured' column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose row index contains the levels named in ID_COLS and whose
        columns are a MultiIndex with an 'Aggregation Function' level holding
        (at least) 'mean' and 'count'. If the columns have more than two
        levels, the 'label', 'LEVEL1' and 'LEVEL2' levels are dropped.
        The input frame itself is not modified (a copy is taken).
        Assumes the rows of each stay are in chronological order - TODO confirm.

    Returns
    -------
    pandas.DataFrame
        A frame with the same rows as `df` and, per variable, the columns:
        - 'mean': forward-filled within each ID_COLS group, then filled with
          `icustay_means`, then any remaining gaps set to 0.
        - 'mask': 1.0 where the original 'count' was > 0, 0.0 otherwise.
        - 'time_since_measured': number of rows since the variable was last
          measured within the same ID_COLS group (0 on a measured row), or
          100 if it has not been measured yet in that group.
        Columns are sorted before returning.
    """

    # Create an object (idx) to more easily perform multi-index slicing.
    idx = pd.IndexSlice
    df = df.copy()

    # Removes unwanted column name levels
    if len(df.columns.names) > 2:
        df.columns = df.columns.droplevel(('label', 'LEVEL1', 'LEVEL2'))

    # Only keep those columns with mean and count values.
    # Explicit copy so that the assignments below write to an independent frame.
    df_out = df.loc[:, idx[:, ['mean', 'count']]].copy()

    # Group the data by subject, admission and icustay id and find the mean of the mean columns
    icustay_means = df_out.loc[:, idx[:, 'mean']].groupby(ID_COLS).mean()

    # Overwrites the values in the mean columns:
    # forward fill within each stay -> fill with icustay_means -> fill with 0
    df_out.loc[:, idx[:, 'mean']] = df_out.loc[:, idx[:, 'mean']].groupby(ID_COLS).fillna(
        method='ffill'
    ).groupby(ID_COLS).fillna(icustay_means).fillna(0)

    # Sets the count columns to 1.0 if value > 0, 0.0 otherwise (as type float)
    # - Renames the column name to mask as it is no longer a count
    df_out.loc[:, idx[:, 'count']] = (df.loc[:, idx[:, 'count']] > 0).astype(float)
    df_out.rename(columns={'count': 'mask'}, level='Aggregation Function', inplace=True)

    # Determines how long it has been since each variable was measured, based on 'mask'.
    # The running count of absent rows and the forward fill are both done per
    # ID_COLS group, so one stay's history never carries over into the next stay.
    is_absent = 1 - df_out.loc[:, idx[:, 'mask']]
    hours_of_absence = is_absent.groupby(ID_COLS).cumsum()
    # Absence count at the most recent measured row of the same stay
    # (NaN until the variable has been measured for the first time).
    absence_at_last_measurement = hours_of_absence.where(is_absent == 0).groupby(ID_COLS).ffill()
    time_since_measured = hours_of_absence - absence_at_last_measurement
    time_since_measured.rename(
        columns={'mask': 'time_since_measured'}, level='Aggregation Function', inplace=True
    )

    # Adds the 'time_since_measured' information to the df
    # - Sets any missing 'time_since_measured' data (not measured yet in this stay) to 100
    df_out = pd.concat((df_out, time_since_measured), axis=1)
    df_out.loc[:, idx[:, 'time_since_measured']] = df_out.loc[:, idx[:, 'time_since_measured']].fillna(100)

    df_out.sort_index(axis=1, inplace=True)

    return df_out

In [5]:
# Builds the outcome table `Ys` with the 4 desired outputs
# ('mort_hosp', 'mort_icu', 'los_3', 'los_7').
# - Keeps only rows of `statics` whose `max_hours` exceeds WINDOW_SIZE + GAP_TIME
# - Adds two flags for whether `los_icu` is greater than 3 and greater than 7,
#   then removes the original `los_icu` column
#   (NOTE(review): the unit is whatever `los_icu` is stored in - presumably
#   days rather than hours; confirm)
Ys = (
    statics.loc[
        statics.max_hours > WINDOW_SIZE + GAP_TIME,
        ['mort_hosp', 'mort_icu', 'los_icu'],
    ]
    .assign(
        los_3=lambda d: d['los_icu'] > 3,
        los_7=lambda d: d['los_icu'] > 7,
    )
    .drop(columns=['los_icu'])
)

# Display only: the float-converted frame is not assigned back, so `Ys` keeps
# its original dtypes.
Ys.astype(float)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,mort_hosp,mort_icu,los_3,los_7
subject_id,hadm_id,icustay_id,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
3,145834,211552,0.0,0.0,1.0,0.0
4,185777,294638,0.0,0.0,0.0,0.0
6,107064,228232,0.0,0.0,1.0,0.0
9,150750,220597,1.0,1.0,1.0,0.0
11,194540,229441,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...
99973,150202,275083,0.0,0.0,0.0,0.0
99982,151454,221194,0.0,0.0,1.0,1.0
99991,151118,226241,0.0,0.0,1.0,0.0
99992,197084,242052,0.0,0.0,0.0,0.0


In [6]:
# Restrict both hourly frames to the stays and hours of interest
# - a row is kept only if its 'icustay_id' also appears in the index of `Ys`
# - and only if its 'hours_in' is below WINDOW_SIZE (the first 24 hours)
kept_icustays = set(Ys.index.get_level_values('icustay_id'))
lvl2, raw = [
    df[
        df.index.get_level_values('icustay_id').isin(kept_icustays)
        & (df.index.get_level_values('hours_in') < WINDOW_SIZE)
    ]
    for df in (data_full_lvl2, data_full_raw)
]

# Removes the 'LEVEL2' level from the column headings of raw
raw.columns = raw.columns.droplevel('LEVEL2')


In [7]:
# Pulls the 'subject_id' index level out of both data frames and the output frame
lvl2_subj_idx = lvl2.index.get_level_values('subject_id')
raw_subj_idx = raw.index.get_level_values('subject_id')
Ys_subj_idx = Ys.index.get_level_values('subject_id')

# Sanity check: all three frames must cover exactly the same set of subjects
lvl2_subjects = set(lvl2_subj_idx)
assert set(Ys_subj_idx) == lvl2_subjects, "Subject ID pools differ!"
assert set(raw_subj_idx) == lvl2_subjects, "Subject ID pools differ!"

In [8]:
# Fractions of subjects assigned to the train / development / test sets
train_frac, dev_frac, test_frac = 0.7, 0.1, 0.2

# Shuffle the subject ids reproducibly (re-seeding right before the shuffle)
np.random.seed(SEED)
N = len(lvl2_subjects)
subjects = np.random.permutation(list(lvl2_subjects))

N_train = int(train_frac * N)
N_dev = int(dev_frac * N)
N_test = int(test_frac * N)  # computed for reference; the test set takes the remainder

# Cut the shuffled ids into three consecutive pieces:
# the first N_train, the next N_dev, and everything that is left
train_subj, dev_subj, test_subj = np.split(subjects, [N_train, N_train + N_dev])

In [9]:
# Splits each frame into train / dev / test by keeping the rows whose
# 'subject_id' belongs to the corresponding subject selection
lvl2_train, lvl2_dev, lvl2_test = [
    lvl2[lvl2.index.get_level_values('subject_id').isin(s)]
    for s in (train_subj, dev_subj, test_subj)
]
raw_train, raw_dev, raw_test = [
    raw[raw.index.get_level_values('subject_id').isin(s)]
    for s in (train_subj, dev_subj, test_subj)
]
Ys_train, Ys_dev, Ys_test = [
    Ys[Ys.index.get_level_values('subject_id').isin(s)]
    for s in (train_subj, dev_subj, test_subj)
]

In [10]:
# Standardises the 'mean' columns of the three lvl2 splits
# - `idx` makes multi-index column slicing easier to write
# - the per-column mean and std are taken from the training split only
# - the same training statistics are then applied to train, dev and test
idx = pd.IndexSlice
mean_cols = idx[:, 'mean']

lvl2_means = lvl2_train.loc[:, mean_cols].mean(axis=0)
lvl2_stds = lvl2_train.loc[:, mean_cols].std(axis=0)

for split in (lvl2_train, lvl2_dev, lvl2_test):
    split.loc[:, mean_cols] = (split.loc[:, mean_cols] - lvl2_means) / lvl2_stds

In [11]:

# Runs simple_imputer on each split, which imputes the missing data in the
# 'mean' columns and adds 'mask' and 'time_since_measured' columns.
# - NOTE (from the team review): the time since last recording was found to be
#   computed incorrectly by simple_imputer as originally written; it is not
#   used again later, so this has no bearing on the results
lvl2_train = simple_imputer(lvl2_train)
lvl2_dev = simple_imputer(lvl2_dev)
lvl2_test = simple_imputer(lvl2_test)

# Pivots each split so that there is one row per 'subject_id', 'hadm_id', 'icustay_id'
# and the 'hours_in' values are spread across the columns, i.e. every variable gets
# one column per hour for each of:
#    - mask
#    - mean
#    - time_since_measured
lvl2_flat_train = lvl2_train.pivot_table(
    index=['subject_id', 'hadm_id', 'icustay_id'], columns=['hours_in']
)
lvl2_flat_dev = lvl2_dev.pivot_table(
    index=['subject_id', 'hadm_id', 'icustay_id'], columns=['hours_in']
)
lvl2_flat_test = lvl2_test.pivot_table(
    index=['subject_id', 'hadm_id', 'icustay_id'], columns=['hours_in']
)

# Checks that the imputed (un-pivoted) splits contain no missing values
for df in (lvl2_train, lvl2_dev, lvl2_test):
    assert not df.isnull().any().any()



In [12]:
# Splits the output frame `Ys` into train, dev and test by 'subject_id'
# - NOTE: the same split of `Ys` is already produced in an earlier cell;
#   it is repeated here because it does no harm
Ys_subject_ids = Ys.index.get_level_values('subject_id')
Ys_train = Ys[Ys_subject_ids.isin(train_subj)]
Ys_dev = Ys[Ys_subject_ids.isin(dev_subj)]
Ys_test = Ys[Ys_subject_ids.isin(test_subj)]

In [13]:
# Writes every required frame to disk as a pickle, in the order listed:
# first the imputed lvl2 splits, then the full output frame and its splits
pickle_targets = (
    ("data/lvl2_imputer_train.pkl", lvl2_train),
    ("data/lvl2_imputer_dev.pkl", lvl2_dev),
    ("data/lvl2_imputer_test.pkl", lvl2_test),
    ("data/Ys.pkl", Ys),
    ("data/Ys_train.pkl", Ys_train),
    ("data/Ys_dev.pkl", Ys_dev),
    ("data/Ys_test.pkl", Ys_test),
)
for pickle_path, frame in pickle_targets:
    pd.to_pickle(frame, pickle_path)