In [None]:
# Notebook setup: optional matplotlib pgf/LaTeX configuration plus all imports.
import matplotlib
# 'init_done' is only assigned at the end of this cell, so on a fresh kernel the
# branch below is skipped; re-running the cell switches matplotlib to the "pgf"
# backend with pdflatex and LaTeX text rendering.
# NOTE(review): presumably meant to render figures normally on the first run and
# to produce LaTeX-ready figures on a re-run - confirm, since this relies on
# kernel state rather than an explicit flag.
if 'init_done' in globals():
    matplotlib.use("pgf")
    matplotlib.rcParams.update({
        "pgf.texsystem": "pdflatex",
        'font.family': 'serif',
        'text.usetex': True,
        'pgf.rcfonts': False,
    })
import matplotlib.pyplot as plt

import psycopg2
from datetime import timedelta
from sqlalchemy import create_engine
import pandas as pd
import numpy as np

# Marks that this cell has run once; checked by the backend switch above.
init_done = True

In [None]:
# Hours; selects the `*_min{MIN_LOS_ICU}h` input tables/files and is part of the
# length-of-stay cut-off below. Presumably the minimum ICU length of stay of the
# cohort - TODO confirm against the SQL that builds those tables.
MIN_LOS_ICU = 24
# Hours of data, counted from ICU admission (`intime`), that form the observation window.
WINDOW_LENGTH = 24

# Load Data

### From SQL

In [None]:
# Load the three input tables from the local MIMIC-III Postgres database.
# NOTE(review): host, port and credentials are hardcoded here; consider reading
# them from environment variables before sharing this notebook.
conn = psycopg2.connect(host='localhost', port=5433, dbname='mimic', user='postgres', password='postgres')
try:
    # Read vital signs
    vitals = pd.read_sql_query(f'SELECT * FROM mimiciii.vital_resampled_min{MIN_LOS_ICU:d}h;', conn)

    # Read in labs values
    labs = pd.read_sql_query(f'SELECT * FROM mimiciii.lab_resampled_min{MIN_LOS_ICU:d}h;', conn)

    # Read demographics
    demographics = pd.read_sql_query(f'SELECT * FROM mimiciii.demographics_min{MIN_LOS_ICU:d}h;', conn)
finally:
    # Close the connection so the server can allocate bandwidth to other requests,
    # even when one of the queries above fails. (The previously opened cursor was
    # never used, so it is no longer created.)
    conn.close()

### From File

In [None]:
# Alternative to the SQL cell: restore the three input tables from local pickle files.
# NOTE: unpickling executes arbitrary code, so only load files you created yourself.
demographics, vitals, labs = (
    pd.read_pickle(f'{table}_min{MIN_LOS_ICU:d}h.pickle')
    for table in ('demographics', 'vitals', 'labs')
)

# Process Data

In [None]:
# Sanity check: distinct ICU stays in each input table, then the number of positive labels.
for description, frame in (
    ("Number of ICU stays: ", demographics),
    ("Number of ICU stays in vitals: ", vitals),
    ("Number of ICU stays in labs: ", labs),
):
    print(description, frame['icustay_id'].nunique())

death_counts = demographics['label_death_icu'].value_counts()
print("Number of ICU deaths: ", death_counts[1])

## Use $\Delta t_{pred}$ of maximum $48h$

In [None]:
# Upper bound on the ICU length of stay, expressed in days: MIN_LOS_ICU hours plus
# 48 further hours. (An earlier, now obsolete variant cut at the 90th percentile of
# `los_icu` instead.)
max_los_icu = (MIN_LOS_ICU + 48) / 24
print(f"Maximum length of stay: {max_los_icu:.0f}d")

# Keep only stays strictly below the bound; copy so the result is independent of `demographics`.
is_short_stay = demographics['los_icu'].lt(max_los_icu)
demographics_cut = demographics.loc[is_short_stay].copy()

n_stays_total = demographics['icustay_id'].nunique()
n_stays_kept = demographics_cut['icustay_id'].nunique()
print(f"Remaining patients: {100. * n_stays_kept / n_stays_total:.1f}%")

In [None]:
# One row per ICU stay that passed the length-of-stay cut.
cut_icustay_ids = pd.DataFrame({'icustay_id': demographics_cut['icustay_id'].unique()})
print("Number of ICU stays: ", cut_icustay_ids['icustay_id'].count())

# Right joins keep every selected stay, including stays without any matching
# measurement rows (those appear as a single row of missing values).
vitals_cut = pd.merge(vitals, cut_icustay_ids, on='icustay_id', how='right')
print("Number of ICU stays in vitals_cut: ", vitals_cut['icustay_id'].nunique())

labs_cut = pd.merge(labs, cut_icustay_ids, on='icustay_id', how='right')
print("Number of ICU stays in labs_cut: ", labs_cut['icustay_id'].nunique())

death_counts_cut = demographics_cut['label_death_icu'].value_counts()
print("Number of ICU deaths: ", death_counts_cut[1])

# Windowing & Labeling

## Windowing
Take the first `WINDOW_LENGTH` hours of data from each patient's ICU stay.

In [None]:
# Length of the observation window as a timedelta.
delta_t_data = timedelta(hours=WINDOW_LENGTH)

# predtime     = end of the observation window (ICU admission + window length)
# delta_t_pred = time remaining between predtime and ICU discharge
demographics_windowed = demographics_cut.assign(
    predtime=lambda frame: frame['intime'] + delta_t_data,
    delta_t_pred=lambda frame: frame['outtime'] - frame['predtime'],
)

preview_columns = ['subject_id', 'icustay_id', 'intime', 'predtime', 'delta_t_pred']
demographics_windowed[preview_columns].head(5)

In [None]:
# Per-stay window information that gets attached to every measurement row.
window_info = demographics_windowed[['icustay_id', 'predtime', 'delta_t_pred']]

# Keep only measurements charted before the prediction time; rows with a missing
# charttime drop out as well because the comparison evaluates to False for them.
vitals_windowed = (
    vitals_cut
    .merge(window_info, on='icustay_id', how='right')
    .loc[lambda merged: merged['charttime'] < merged['predtime']]
)
print("Number of ICU stays in vitals_windowed: ", vitals_windowed['icustay_id'].nunique())

labs_windowed = (
    labs_cut
    .merge(window_info, on='icustay_id', how='right')
    .loc[lambda merged: merged['charttime'] < merged['predtime']]
)
print("Number of ICU stays in labs_windowed: ", labs_windowed['icustay_id'].nunique())

# Restrict the demographics to stays that still have vitals or labs inside the window.
stays_with_data = pd.concat([vitals_windowed['icustay_id'], labs_windowed['icustay_id']]).unique()
windowed_icustay_ids = pd.DataFrame({'icustay_id': stays_with_data})
demographics_windowed = pd.merge(demographics_windowed, windowed_icustay_ids, on='icustay_id', how='right')
print("Number of ICU stays: ", demographics_windowed['icustay_id'].nunique())

death_counts_windowed = demographics_windowed['label_death_icu'].value_counts()
print("Number of ICU deaths: ", death_counts_windowed[1])

In [None]:
# Maximum, mean and minimum prediction horizon, converted from seconds to days.
pred_horizon = demographics_windowed['delta_t_pred']
for caption, statistic in (
    ("Max ∆t_pred: ", pred_horizon.max()),
    ("Mean ∆t_pred: ", pred_horizon.mean()),
    ("Min ∆t_pred: ", pred_horizon.min()),
):
    print(caption, statistic.total_seconds() / 3600 / 24)

## Labeling
Patients who died during their ICU stay were identified by the `deathtime` variable in the `admissions` table of MIMIC-III.

Patients who died during their stay in the ICU were included in the positive group (output = 1), and patients who survived to discharge were included in the negative group (output = 0).

In [None]:
# ICU-mortality label per stay, attached to every windowed measurement row.
stay_labels = demographics_windowed[['icustay_id', 'label_death_icu']]

# Right joins keep every stay listed in `demographics_windowed`.
vitals_labeled = pd.merge(vitals_windowed, stay_labels, on='icustay_id', how='right')
print("Number of ICU stays in vitals_labeled: ", vitals_labeled['icustay_id'].nunique())

labs_labeled = pd.merge(labs_windowed, stay_labels, on='icustay_id', how='right')
print("Number of ICU stays in labs_labeled: ", labs_labeled['icustay_id'].nunique())

n_windowed_stays = demographics_windowed['icustay_id'].nunique()
print("Number of ICU stays: ", n_windowed_stays)

In [None]:
# Class balance of the ICU-mortality label after windowing.
death_labels = demographics_windowed["label_death_icu"]

# Count each class once and reuse the numbers for both the printout and the plot.
n_negative = death_labels[death_labels == 0].count()
n_positive = death_labels[death_labels == 1].count()
print('label = 0:', n_negative)
print('label = 1:', n_positive)

fig, ax = plt.subplots(figsize=(5, 2.7))
ax.bar(x=[0, 1], height=[n_negative, n_positive])
ax.set_xticks([0, 1])
# Labels avoid underscores so they also render when LaTeX text rendering is enabled.
ax.set_xlabel("ICU death label")
ax.set_ylabel("Count")
plt.show()

## Some Statistical Information

Vital Signs

In [None]:
# Distribution of delta_t_pred (ICU discharge time minus prediction time) over the
# vital-sign rows of patients who died in the ICU.
deceased_vitals = vitals_labeled[vitals_labeled["label_death_icu"] == 1]

# Vectorised conversion to seconds. dropna() discards the missing values that the
# right merge producing `vitals_labeled` creates for stays without vital-sign rows
# in the window; they would otherwise turn the means into NaN and break the
# histogram range.
seconds = pd.to_timedelta(deceased_vitals["delta_t_pred"]).dt.total_seconds().dropna().to_numpy()

hours = seconds / 3600
mean_hours = np.mean(hours)
print('hours: ', mean_hours)

days = hours / 24
mean_days = np.mean(days)
print('days: ', mean_days)

fig, ax = plt.subplots(figsize=(5, 2.7))
ax.hist(days, bins=10)
ax.set_xlabel(r"$\Delta t_{pred}$ (days)")
ax.set_ylabel("Count")
plt.show()

In [None]:
# Summary statistics of the labeled vital-sign table (displayed as the cell output).
vitals_labeled.describe()

Lab Measurements

In [None]:
# Distribution of delta_t_pred (ICU discharge time minus prediction time) over the
# lab-measurement rows of patients who died in the ICU.
deceased_labs = labs_labeled[labs_labeled["label_death_icu"] == 1]

# Vectorised conversion to seconds. dropna() discards the missing values that the
# right merge producing `labs_labeled` creates for stays without lab rows in the
# window; they would otherwise turn the means into NaN and break the histogram range.
seconds = pd.to_timedelta(deceased_labs["delta_t_pred"]).dt.total_seconds().dropna().to_numpy()

hours = seconds / 3600
mean_hours = np.mean(hours)
print('hours: ', mean_hours)

days = hours / 24
mean_days = np.mean(days)
print('days: ', mean_days)

fig, ax = plt.subplots(figsize=(5, 2.7))
ax.hist(days, bins=10)
ax.set_xlabel(r"$\Delta t_{pred}$ (days)")
ax.set_ylabel("Count")
plt.show()

In [None]:
# Summary statistics of the labeled lab-measurement table (displayed as the cell output).
labs_labeled.describe()

# Save Data

## Write Final Datasets into Postgres

In [None]:
# Persist the final labeled datasets as Postgres tables, replacing existing ones.
# NOTE(review): the database credentials are hardcoded in the URL; consider
# reading them from environment variables before sharing this notebook.
engine = create_engine('postgresql://postgres:postgres@localhost:5433/mimic')

vitals_labeled.to_sql(f'vitals_labeled_{WINDOW_LENGTH:d}h(min{MIN_LOS_ICU:d}h)', engine, if_exists='replace')
# Write the labeled lab table: the table name promises labels, and the pickle export
# also stores `labs_labeled`. Previously the unlabeled `labs_windowed` was written here.
labs_labeled.to_sql(f'labs_labeled_{WINDOW_LENGTH:d}h(min{MIN_LOS_ICU:d}h)', engine, if_exists='replace')

## Write Final Datasets into Pickle files (alternative to postgres)

In [None]:
# Store both labeled tables as pickle files named after the window length and minimum stay.
for table_name, labeled_frame in (('vitals', vitals_labeled), ('labs', labs_labeled)):
    labeled_frame.to_pickle(f'{table_name}_labeled_{WINDOW_LENGTH:d}h(min{MIN_LOS_ICU:d}h).pickle')