In [None]:
import psycopg2
from datetime import timedelta
import pandas as pd
import matplotlib.pyplot as plt 
import numpy as np
import matplotlib

# Toggle for LaTeX-ready figure output: set to True to render through the PGF backend.
latex = False

if latex:
    # rcParams applied together with the PGF backend (pdflatex, serif font, TeX text).
    pgf_settings = {
        "pgf.texsystem": "pdflatex",
        'font.family': 'serif',
        'text.usetex': True,
        'pgf.rcfonts': False,
    }
    matplotlib.use("pgf")
    matplotlib.rcParams.update(pgf_settings)

In [None]:
# Minimum ICU length of stay (presumably in hours, given the `min{...}h` naming below).
# It selects which pre-filtered tables/files are loaded and is reused for histogram ranges.
MIN_LOS_ICU = 24

# Load Data

### From SQL

In [None]:
# Connect to db
# NOTE(review): the credentials are hard-coded; consider reading them from the
# environment (or a prompt) before sharing this notebook.
conn = psycopg2.connect(host='localhost', port=5433, dbname='mimic', user='postgres', password='postgres')

try:
    # Read vital signs
    vitals = pd.read_sql_query(f'SELECT * FROM mimiciii.vital_resampled_min{MIN_LOS_ICU:d}h;', conn)

    # Read in labs values
    labs = pd.read_sql_query(f'SELECT * FROM mimiciii.lab_resampled_min{MIN_LOS_ICU:d}h;', conn)

    # Read demographics
    demographics = pd.read_sql_query(f'SELECT * FROM mimiciii.demographics_min{MIN_LOS_ICU:d}h;', conn)
finally:
    # Close the connection even if a query fails, so the server can allocate
    # bandwidth to other requests. No cursor is opened: pandas runs the queries itself.
    conn.close()

### From File

In [None]:
# Directory and common file-name suffix of the pickled tables for this minimum stay.
data_path = f'data/min{MIN_LOS_ICU:d}h/'
pickle_suffix = f'_min{MIN_LOS_ICU:d}h.pickle'

# NOTE(review): pickles execute code on load; only read files from a trusted source.
demographics = pd.read_pickle(f'{data_path}demographics{pickle_suffix}')
vitals = pd.read_pickle(f'{data_path}vitals{pickle_suffix}')
labs = pd.read_pickle(f'{data_path}labs{pickle_suffix}')

# Preprocess data:

In [None]:
# Length of stay in hours (los_icu is multiplied by 24, i.e. presumably stored in days).
demographics['los_icu_hours'] = demographics['los_icu'] * 24

# Bin the length of stay into 8-hour windows. The bins are left-closed
# ([8, 16), [16, 24), ..., [72, inf)) so that fractional values land in the bin whose
# label covers them (e.g. 71.5 h -> '64-71', not '>72'). The open-ended last edge
# also keeps the edge list monotonic when no stay exceeds 72 h.
# Values below 8 h and missing values end up as the string 'nan'.
demographics['los_icu_bin'] = pd.cut(
    demographics['los_icu_hours'],
    bins=[8, 16, 24, 32, 40, 48, 56, 64, 72, np.inf],
    labels=['8-15', '16-23', '24-31', '32-39', '40-47', '48-55', '56-63', '64-71', '>72'],
    right=False,
).astype(str)

# Bin the admission age the same way: [0, 30), [30, 60), [60, 90), [90, inf).
# Left-closed bins include age 0 and assign e.g. 29.5 years to '0-29'.
demographics['admission_age_bin'] = pd.cut(
    demographics['admission_age'],
    bins=[0, 30, 60, 90, np.inf],
    labels=['0-29', '30-59', '60-89', '>90'],
    right=False,
).astype(str)

# Analysis

## Basic data description and overview

Describe vitals

In [None]:
# Preview the first five rows of the vitals table.
vitals.head(5)

In [None]:
# Summary statistics of the vitals table.
vitals.describe()

Describe labs

In [None]:
# Preview the first five rows of the labs table.
labs.head(5)

In [None]:
# Summary statistics of the labs table.
labs.describe()

Describe demographics

In [None]:
# Preview the stay identifier, admission/discharge timestamps and outcome label.
demographics[['icustay_id','intime','outtime','label_death_icu']].head(5)

In [None]:
# Summary statistics of the demographics table.
demographics.describe()

Check if there is empty data

In [None]:
# Number of missing values per demographics column.
demographics.isnull().sum()

In [None]:
# Total number of missing values across all columns of the labs table.
labs.isnull().sum().sum()

In [None]:
# Total number of missing values across all columns of the vitals table.
vitals.isnull().sum().sum()

## Check timings of measurements:

In [None]:
# Admission/discharge timestamps per ICU stay, ordered and indexed by the stay id.
timings = (
    demographics[['icustay_id', 'intime', 'outtime']]
    .copy()
    .sort_values('icustay_id')
    .set_index('icustay_id')
)

Check labs

In [None]:
# Attach each stay's intime/outtime to every lab row so the comparisons below are
# vectorized. This replaces one `get_group` lookup per stay, which was slow and
# raised a KeyError for any stay without lab rows.
lab_times = labs[['icustay_id', 'charttime']].join(timings[['intime', 'outtime']], on='icustay_id')
lab_times['is_early'] = lab_times['charttime'] < lab_times['intime']
lab_times['is_late'] = lab_times['charttime'] > lab_times['outtime']
labs_grouped = lab_times.groupby('icustay_id')

# Calculate offset between first lab and intime (in hours):
timings['offset_first_lab'] = labs_grouped['charttime'].min()
timings['offset_first_lab'] = (timings['offset_first_lab'] - timings['intime']) / timedelta(hours=1)

# Calculate offset between last lab and outtime (in hours):
timings['offset_last_lab'] = labs_grouped['charttime'].max()
timings['offset_last_lab'] = (timings['offset_last_lab'] - timings['outtime']) / timedelta(hours=1)

# Calculate number of labs preceding intime (0 for stays without any lab row):
timings['n_early_labs'] = labs_grouped['is_early'].sum().reindex(timings.index, fill_value=0).astype(int)

# Calculate number of labs after outtime:
timings['n_late_labs'] = labs_grouped['is_late'].sum().reindex(timings.index, fill_value=0).astype(int)

# Calculate number of labs in between intime and outtime:
timings['n_labs']  = labs_grouped.size().reindex(timings.index, fill_value=0).astype(int)
timings['n_labs'] -= timings['n_early_labs']
timings['n_labs'] -= timings['n_late_labs']

# Free the temporary per-row frame and its groupby.
del lab_times, labs_grouped

Check vitals

In [None]:
# Attach each stay's intime/outtime to every vitals row so the comparisons below are
# vectorized. This replaces one `get_group` lookup per stay, which was slow and
# raised a KeyError for any stay without vitals rows.
vital_times = vitals[['icustay_id', 'charttime']].join(timings[['intime', 'outtime']], on='icustay_id')
vital_times['is_early'] = vital_times['charttime'] < vital_times['intime']
vital_times['is_late'] = vital_times['charttime'] > vital_times['outtime']
vitals_grouped = vital_times.groupby('icustay_id')

# Calculate offset between first vital and intime (in hours):
timings['offset_first_vital'] = vitals_grouped['charttime'].min()
timings['offset_first_vital'] = (timings['offset_first_vital'] - timings['intime']) / timedelta(hours=1)

# Calculate offset between last vital and outtime (in hours):
timings['offset_last_vital'] = vitals_grouped['charttime'].max()
timings['offset_last_vital'] = (timings['offset_last_vital'] - timings['outtime']) / timedelta(hours=1)

# Calculate number of vitals preceding intime (0 for stays without any vitals row):
timings['n_early_vitals'] = vitals_grouped['is_early'].sum().reindex(timings.index, fill_value=0).astype(int)

# Calculate number of vitals after outtime:
timings['n_late_vitals'] = vitals_grouped['is_late'].sum().reindex(timings.index, fill_value=0).astype(int)

# Calculate number of vitals in between intime and outtime:
timings['n_vitals']  = vitals_grouped.size().reindex(timings.index, fill_value=0).astype(int)
timings['n_vitals'] -= timings['n_early_vitals']
timings['n_vitals'] -= timings['n_late_vitals']

# Free the temporary per-row frame and its groupby.
del vital_times, vitals_grouped

Describe timings

In [None]:
# True only if, for every stay, the first vital and the first lab share the same offset.
same_start = timings['offset_first_vital'].eq(timings['offset_first_lab']).all()
print('All labs and vitals start at the same time: ', same_start)

# Summary statistics of all timing columns.
timings.describe()

In [None]:
# Preview the first five rows of the per-stay timing table.
timings.head(5)

### Overview of the patients' length of stay per bin

In [None]:
# Count discharges (0) and deaths (1) per length-of-stay bin.
icu_length_by_class = demographics.groupby("los_icu_bin")['label_death_icu'].value_counts()
counts_per_bin = icu_length_by_class.unstack()

# The bin labels are strings, so their default (lexicographic) order would place
# '8-15' after '64-71'. Restore chronological order and keep any unexpected
# label (e.g. 'nan') at the end.
bin_order = ['8-15', '16-23', '24-31', '32-39', '40-47', '48-55', '56-63', '64-71', '>72']
ordered_bins = [label for label in bin_order if label in counts_per_bin.index]
ordered_bins += [label for label in counts_per_bin.index if label not in bin_order]

ax = counts_per_bin.loc[ordered_bins].plot(kind='bar', stacked=True)
ax.set_xlabel('Length of ICU stay (hours)')
ax.set_ylabel('Nº of patients')

# Not used in the visible cells; kept in case later cells rely on it.
data_imputed = demographics.drop(columns=['los_icu_bin'])
plt.legend(labels=["discharge","death"])
plt.show()

## Length of stay in ICU
Focusing on the demographics column "los_icu", we see that there is a big difference between the 90% quantile and the maximum value.

In [None]:
# 90% quantile of the ICU length of stay; multiplied by 24 to report it in hours.
demo_q = demographics['los_icu'].quantile(q=0.9)
print("90% Quantile length of stay: {:.2f}h".format(demo_q * 24.))

In [None]:
# Outcome labels of the stays that are longer than the 90% quantile.
labels_above_q = demographics.loc[demographics['los_icu'] > demo_q, 'label_death_icu']
print("Patients above 90% quantile: \n", labels_above_q.value_counts())

# Outcome label counts for the whole cohort, for comparison.
print("Total number of patients: \n", demographics['label_death_icu'].value_counts())

## Data distribution for ICU length of stay

Patient's ICU length of stay per hour

In [None]:
# One bin per hour. `np.arange(max)` stops below the maximum, which silently drops
# the longest stays, so the edges are extended to the next full hour above the maximum.
hour_edges = np.arange(np.ceil(demographics.los_icu_hours.max()) + 1)
demographics.hist(column='los_icu_hours', bins=hour_edges)
plt.ylabel('Nº of patients / hour')
plt.xlabel('Length of ICU stay (hours)')
plt.title("(a) Patient’s ICU length of stay per hour")
#plt.savefig("Exploratory analysis_a.pdf", format="pdf", bbox_inches="tight")
plt.show()

Patient's ICU length of stay for 72h in groups of 8 h

In [None]:
# Bin edges every 8 hours, starting at 8 h and ending at MIN_LOS_ICU + 48 h (inclusive).
eight_hour_edges = range(8, MIN_LOS_ICU + 49, 8)

demographics.hist(column='los_icu_hours', bins=eight_hour_edges)
plt.title("(b) Patient’s ICU length of stay for 72h in groups of 8h")
plt.xlabel('Length of ICU stay (hours)')
plt.ylabel('Nº of patients / 8h')
#plt.savefig("Exploratory analysis_b.pdf", format="pdf", bbox_inches="tight")
plt.show()

Death patient's ICU length of stay per 1 h

In [None]:
# Restrict to stays labelled as ICU death (label_death_icu == 1).
demographics_death = demographics[demographics.label_death_icu==1]

# Hourly bin edges from 0 up to MIN_LOS_ICU + 48 h inclusive, matching the upper edge
# of the 8-hour histograms. `np.arange(72)` ended at 71 h and dropped the last hour.
demographics_death.hist(column='los_icu_hours', bins=np.arange(MIN_LOS_ICU + 49), color='red')
plt.ylabel('Nº death of patients / 1h')
plt.xlabel('Length of ICU stay (hours)')
plt.title("(c) Death patient’s ICU length of stay per 1h")
#plt.savefig("Exploratory analysis_c.pdf", format="pdf", bbox_inches="tight")
plt.show()

Death patient's ICU length of stay for 72 h in groups of 8 h

In [None]:
# Same 8-hour bin edges as for the whole cohort, applied to the ICU deaths only.
eight_hour_edges = range(8, MIN_LOS_ICU + 49, 8)

demographics_death.hist(column='los_icu_hours', color='red', bins=eight_hour_edges)
plt.title("(d) Death patient’s ICU length of stay for 72h in groups of 8h")
plt.xlabel('Length of ICU stay (hours)')
plt.ylabel('Nº of death patients / 8h')
#plt.savefig("Exploratory analysis_d.pdf", format="pdf", bbox_inches="tight")
plt.show()

Patient's count for a minimum length of stay of 8 h, 16 h, 24 h, 32 h, 40 h, 48 h, 72 h

In [None]:
# Minimum-stay thresholds every 8 hours. The stop value is exclusive, so `+ 49`
# (as in the histogram cells) is needed to include the final MIN_LOS_ICU + 48 h
# threshold; `+ 48` silently dropped that last column.
min_stay_hours = range(8, MIN_LOS_ICU + 49, 8)

# One column per threshold with the outcome counts of all stays at least that long.
pat_window = pd.DataFrame({
    f'>={t:d}h': demographics[demographics.los_icu_hours >= t].label_death_icu.value_counts()
    for t in min_stay_hours
})

# Append the column-wise total over both outcome classes.
pat_window.loc['Total']= pat_window.sum(numeric_only=True, axis=0)
pat_window


## Print patient distribution:

In [None]:
# Percentage points contributed by a single ICU stay (100 / number of unique icustay_ids).
f = 100. / len(demographics.icustay_id.unique())

def print_distribution(column=None, data=None):
    """Print the ICU death/discharge distribution, overall or per value of `column`.

    For each group this prints the number of ICU deaths and discharges with their
    share within that group, followed by a LaTeX table row. The percentages in the
    LaTeX row are relative to the number of unique ICU stays in the whole frame,
    not to the group size.

    Parameters
    ----------
    column : str, optional
        Column whose unique values define the groups. If None, the distribution
        of the whole frame is printed.
    data : pandas.DataFrame, optional
        Frame with 'icustay_id' and 'label_death_icu' columns (plus `column`).
        Defaults to the notebook-level `demographics`.
    """
    if data is None:
        data = demographics

    # Percentage points per ICU stay, relative to the whole cohort. It is computed
    # here rather than read from a global so it cannot go stale.
    n_stays = len(data.icustay_id.unique())
    pct_per_stay = 100. / n_stays if n_stays else 0.

    # Actual printing happens here:
    def _print_group(group):
        # Empty groups print nothing.
        if len(group) == 0:
            return

        label_counts = group['label_death_icu'].value_counts()
        group_total = label_counts.sum()
        latex_row = ""

        # Deaths (label 1) first, then discharges (label 0). A missing label counts as 0.
        for label, heading in ((1, "  Number of ICU deaths:       "),
                               (0, "  Number of ICU discharges:   ")):
            n = int(label_counts[label]) if label in label_counts else 0
            share = n / group_total * 100. if n else 0.
            print(f"{heading}{n:6d} ({share:5.1f}%)")
            # `\\%` emits the literal LaTeX `\%` without an invalid escape sequence.
            latex_row += f"& ${n:6,d}$ & ${n * pct_per_stay:5.1f}\\%$ "

        latex_row += f"& ${group_total:6,d}$ "
        print(f"Latex: '{latex_row:s}'")

    # Print overall class distribution if column is None:
    if column is None:
        _print_group(data)

    # Print class distribution of column:
    else:
        for key in data[column].unique():
            # `!s` also accepts non-string keys (numbers, NaN), which `:s` rejects.
            print(f"'{key!s}':")
            _print_group(data[data[column] == key])
            print()

In [None]:
# Overall class distribution (no grouping column).
print_distribution()

Print gender distribution:

In [None]:
# Class distribution per value of the 'gender' column.
print_distribution('gender')

Print ethnicity distribution:

In [None]:
# Class distribution per value of the 'ethnicity_grouped' column.
print_distribution('ethnicity_grouped')

Print age distribution:

In [None]:
# Class distribution per age bin (the 'admission_age_bin' column built during preprocessing).
print_distribution('admission_age_bin')