In [None]:
import psycopg2
from datetime import timedelta
from sqlalchemy import create_engine
import pandas as pd
import matplotlib.pyplot as plt 
import numpy as np
import matplotlib

# Toggle for LaTeX-quality figure output; set to False to keep matplotlib's default backend.
latex = True

if latex:
    # Switch to the PGF backend. This is a non-interactive backend, so figures are
    # meant to be written to disk (see the commented-out plt.savefig calls further down).
    matplotlib.use("pgf")
    matplotlib.rcParams.update({
        "pgf.texsystem": "pdflatex",  # TeX engine used to typeset the figures
        'font.family': 'serif',
        'text.usetex': True,          # render all text through LaTeX
        'pgf.rcfonts': False,         # do not set up fonts from matplotlib's rc parameters
    })

In [None]:
# Both constants are interpolated into the table / file names loaded below
# (e.g. `..._{WINDOW_LENGTH}h(min{MIN_LOS_ICU}h)`).
# Presumably the resampling window in hours — TODO confirm against the preprocessing step.
WINDOW_LENGTH = 16
# Presumably the minimum ICU length of stay (hours) used to select the cohort — TODO confirm.
MIN_LOS_ICU = 48

# Load Data

### From SQL

In [None]:
# Connect to db
# NOTE(review): host, port and credentials are hard-coded; consider reading them from
# environment variables before sharing or committing this notebook.
conn = psycopg2.connect(host='localhost', port=5433, dbname='mimic', user='postgres', password='postgres')

# NOTE(review): the relation names below contain parentheses and are not quoted.
# PostgreSQL may parse `name(arg)` as a function call — confirm the actual table
# names and whether they need double quotes.
try:
    # Read vital signs
    vitals = pd.read_sql_query(f'SELECT * FROM mimiciii.vital_resampled_{WINDOW_LENGTH:d}h(min{MIN_LOS_ICU:d}h);', conn)

    # Read in labs values
    labs = pd.read_sql_query(f'SELECT * FROM mimiciii.lab_resampled_{WINDOW_LENGTH:d}h(min{MIN_LOS_ICU:d}h);', conn)

    # Read demographics
    demographics = pd.read_sql_query(f'SELECT * FROM mimiciii.demographics_{WINDOW_LENGTH:d}h(min{MIN_LOS_ICU:d}h);', conn)
finally:
    # Close the connection even if a query fails, so the server can allocate
    # its resources to other requests. (No cursor is opened: pandas manages its own.)
    conn.close()

### From File

In [None]:
# Directory holding the pickled tables for the configured minimum ICU stay.
data_path = f'data/min{MIN_LOS_ICU:d}h/'
# All three files share the same "<window>h(min<los>h).pickle" suffix.
_pickle_suffix = f'_{WINDOW_LENGTH:d}h(min{MIN_LOS_ICU:d}h).pickle'

# NOTE: unpickling executes arbitrary code — only load files produced by a trusted pipeline.
demographics = pd.read_pickle(data_path + 'demographics' + _pickle_suffix)
vitals = pd.read_pickle(data_path + 'vitals' + _pickle_suffix)
labs = pd.read_pickle(data_path + 'labs' + _pickle_suffix)

# Analysis

Basic data description and overview

In [None]:
# Preview the first five rows of the vitals table.
vitals.head(5)

In [None]:
# Summary statistics of the vitals columns.
vitals.describe()

In [None]:
# Preview the first five rows of the labs table.
labs.head(5)

In [None]:
# Summary statistics of the labs columns.
labs.describe()

In [None]:
# Summary statistics of the demographics columns.
demographics.describe()

In [None]:
# Column names, dtypes and non-null counts of the demographics table.
demographics.info()

Check if there is empty data

In [None]:
# Number of missing values per demographics column.
demographics.isnull().sum()

In [None]:
# Total number of missing cells in the labs table (per-column counts summed again).
labs.isnull().sum().sum()

In [None]:
# Total number of missing cells in the vitals table (per-column counts summed again).
vitals.isnull().sum().sum()

### Overview of patients' length of stay by bin

In [None]:
# Work on a copy so the loaded demographics table is left untouched.
demographics_stats = demographics.copy()
# Convert the stay to hours (assumes los_icu is expressed in days — TODO confirm).
demographics_stats['los_icu_hours'] = demographics_stats['los_icu']*24

# Left-closed 8-hour bins [8, 16), [16, 24), ..., [64, 72), [72, inf).
# The edges match the labels and the 8-hour histograms further down, and the open
# upper edge avoids hard-coding the maximum stay of one particular dataset.
los_bin_edges = [8, 16, 24, 32, 40, 48, 56, 64, 72, np.inf]
los_bin_labels = ['8-15', '16-23', '24-31', '32-39', '40-47', '48-55', '56-63', '64-71', '>=72']
demographics_stats['los_icu_bin'] = pd.cut(
    demographics_stats['los_icu_hours'],
    bins=los_bin_edges,
    labels=los_bin_labels,
    right=False,
)

# Count outcomes per bin while the column is still an ordered categorical, so the
# bars follow the bin order instead of the lexical order of the label strings.
# Stays outside every bin (below 8 h) are NaN and are left out of the plot.
icu_length_by_class = (
    demographics_stats
    .groupby(['los_icu_bin', 'label_death_icu'], observed=True)
    .size()
)
icu_length_by_class.unstack(fill_value=0).plot(kind='bar', stacked=True)
# Legend order assumes the label columns sort as 0 (discharge), 1 (death).
plt.legend(labels=["discharge","death"])
plt.show()

# Keep the string dtype the bin column had before, for any downstream use.
demographics_stats['los_icu_bin'] = demographics_stats['los_icu_bin'].astype(str)
data_imputed = demographics_stats.drop(columns=['los_icu_bin'])

## Length of stay in ICU
Focusing on the demographics column "los_icu", we see a large gap between the 90th percentile and the maximum value.

In [None]:
# 90th percentile of the ICU length of stay; used below as the cut-off for long stays.
demo_q = demographics_stats.los_icu.quantile(0.9)
print(demo_q)

In [None]:
# Outcome counts for the stays above the 90% quantile, then for the whole cohort.
_above_q90 = demographics_stats.los_icu > demo_q
_long_stay_outcomes = demographics_stats.loc[_above_q90, 'label_death_icu'].value_counts()
_all_outcomes = demographics['label_death_icu'].value_counts()

print("Patients above 90% quantile: \n", _long_stay_outcomes)
print("Total number of patients: \n", _all_outcomes)

## Data distribution for ICU length of stay

Total data distribution until the maximum length of stay

In [None]:
# Histogram of los_icu over the full range, 100 equal-width bins.
demographics_stats.hist(column='los_icu', bins= 100)

Distribution after excluding stays at or above the 90th percentile

In [None]:
# Keep only the stays strictly below the 90% quantile computed above.
_below_q90 = demographics_stats.los_icu < demo_q
demographics_8_stats_cut = demographics_stats[_below_q90]
demographics_8_stats_cut.describe()

Length-of-stay distribution (in hours), grouped into bins

In [None]:
# Histogram of the stay in hours for the quantile-trimmed cohort, 28 equal-width bins.
demographics_8_stats_cut.hist(column='los_icu_hours', bins= 28)

Patient's ICU length of stay per hour

In [None]:
# One bin per hour, from 0 up to and including the hour that contains the longest stay.
# np.arange(max) alone stops before the maximum, which silently drops the last
# (partial) hour from the histogram.
_max_hours = demographics_8_stats_cut.los_icu_hours.max()
_hourly_edges = np.arange(np.ceil(_max_hours) + 1)

demographics_8_stats_cut.hist(column='los_icu_hours', bins=_hourly_edges)
plt.ylabel('Nº of patients / hour')
plt.xlabel('Length of ICU stay (hours)')
plt.title("(a) Patient’s ICU length of stay per hour")
#plt.savefig("Exploratory analysis_a.pdf", format="pdf", bbox_inches="tight")
plt.show()

Patient's ICU length of stay for 72h in groups of 8 h

In [None]:
# Stays shorter than 72 h, histogrammed in 8-hour groups with edges 8, 16, ..., 72.
_under_72h = demographics_stats.los_icu_hours < 72
demographics_8_stats_cut2 = demographics_stats[_under_72h]

_eight_hour_edges = list(range(8, 73, 8))
demographics_8_stats_cut2.hist(column='los_icu_hours', bins=_eight_hour_edges)
plt.title("(b) Patient’s ICU length of stay for 72h in groups of 8h")
plt.xlabel('Length of ICU stay (hours)')
plt.ylabel('Nº of patients / 8h')
#plt.savefig("Exploratory analysis_b.pdf", format="pdf", bbox_inches="tight")
plt.show()

ICU length of stay of patients who died, per hour

In [None]:

# Patients with a stay below 72 h whose ICU outcome label is death (label_death_icu == 1).
demographics_8_stats_cut_death = demographics_8_stats_cut2[demographics_8_stats_cut2.label_death_icu==1]

# Hourly bin edges 0, 1, ..., 72. np.arange(72) would end at edge 71 and drop every
# stay in the last hour (71-72 h), even though the cohort is filtered to < 72 h.
_hourly_edges_72h = np.arange(73)
demographics_8_stats_cut_death.hist(column='los_icu_hours', bins=_hourly_edges_72h, color='red')
plt.ylabel('Nº death of patients / 1h')
plt.xlabel('Length of ICU stay (hours)')
plt.title("(c) Death patient’s ICU length of stay per 1h")
#plt.savefig("Exploratory analysis_c.pdf", format="pdf", bbox_inches="tight")
plt.show()

ICU length of stay of patients who died, for the first 72 h in groups of 8 h

In [None]:

# Same death cohort as above, grouped into 8-hour bins with edges 8, 16, ..., 72.
_death_edges_8h = list(range(8, 73, 8))
demographics_8_stats_cut_death.hist(column='los_icu_hours', bins=_death_edges_8h, color='red')
plt.title("(d) Death patient’s ICU length of stay for 72h in groups of 8h")
plt.xlabel('Length of ICU stay (hours)')
plt.ylabel('Nº of death patients / 8h')
#plt.savefig("Exploratory analysis_d.pdf", format="pdf", bbox_inches="tight")
plt.show()

Patient counts for a minimum length of stay of 8 h, 16 h, 24 h, 32 h, 40 h, 48 h, 72 h

In [None]:
def _stays_of_at_least(min_hours):
    """Return the rows of `demographics_8_stats_cut` whose stay lasted at least `min_hours` hours."""
    return demographics_8_stats_cut[demographics_8_stats_cut.los_icu_hours >= min_hours]


# One sub-cohort per minimum length of stay (in hours).
demographics_16 = _stays_of_at_least(16)
demographics_24 = _stays_of_at_least(24)
demographics_32 = _stays_of_at_least(32)
demographics_40 = _stays_of_at_least(40)
demographics_48 = _stays_of_at_least(48)
demographics_72 = _stays_of_at_least(72)

In [None]:
# Column label -> cohort. The '>=8h' column uses the quantile-trimmed frame as is
# (no additional minimum-stay filter is applied to it here).
_cohorts_by_min_stay = {
    '>=8h': demographics_8_stats_cut,
    '>=16h': demographics_16,
    '>=24h': demographics_24,
    '>=32h': demographics_32,
    '>=40h': demographics_40,
    '>=48h': demographics_48,
    '>=72h': demographics_72,
}

# One column of outcome counts per cohort, filled in the same order as the mapping.
pat_window = pd.DataFrame()
for _column, _cohort in _cohorts_by_min_stay.items():
    pat_window[_column] = _cohort.label_death_icu.value_counts()

# Append a row holding the number of patients in each cohort.
pat_window.loc['Total']= pat_window.sum(numeric_only=True, axis=0)
pat_window
