In [None]:
# Import libraries
import numpy as np
import os
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.patches as patches
import matplotlib.path as path
from IPython.display import display, HTML

# Access data using Google BigQuery.
from google.colab import auth
from google.cloud import bigquery

In [None]:
# Google Cloud project that the BigQuery cells of this notebook run against;
# replace the value with your own project ID.
# The trailing `#@param` annotation is a Colab form-field marker and must stay on this line.
project_id='hst-eicu-data' #@param {type:"string"}
# Also export the project ID through the GOOGLE_CLOUD_PROJECT environment variable.
os.environ["GOOGLE_CLOUD_PROJECT"]=project_id

# Log in to BigQuery by authenticating the current Colab user.
auth.authenticate_user()

In [None]:
import pandas_gbq

# Helper that executes a BigQuery SQL statement and loads the result into a DataFrame.
def run_query(query, project_id, location=None):
    """Run ``query`` through ``pandas_gbq.read_gbq`` and return its result.

    Args:
        query: SQL text forwarded unchanged to ``pandas_gbq.read_gbq``.
        project_id: Google Cloud project ID forwarded as ``project_id``.
        location: Optional location forwarded as ``location`` (``None`` by default).

    Returns:
        Whatever ``pandas_gbq.read_gbq`` returns for the query.
    """
    # The BigQuery Storage API is always requested for the download.
    read_options = {
        "project_id": project_id,
        "location": location,
        "use_bqstorage_api": True,
    }
    return pandas_gbq.read_gbq(query, **read_options)

In [None]:
%%bigquery patients --project $project_id

-- Load every column of the eICU patient table; the magic stores the result in `patients`.
SELECT *
FROM `physionet-data.eicu_crd.patient`

# 1. Readmission (Jennifer)

In [None]:
# Readmission task: candidate features, grouped by data source.
# Feature selection based on: https://pmc.ncbi.nlm.nih.gov/articles/instance/6207111/bin/AnnalsATS.201710-787OC_rojas_data_supplement.pdf
# Original paper: https://pmc.ncbi.nlm.nih.gov/articles/PMC6207111/#sec10
#
# NOTE: `patient_info` and `vitals` are reassigned by later task sections of this
# notebook, so the values defined here only hold until those cells are run.

# Lab features (lab names, plus the stay identifier as the last entry).
selected_lab_features = [
    "sodium",
    "potassium",
    "creatinine",
    "BUN",
    "glucose",
    "WBC x 1000",
    "platelets x 1000",
    "Hgb",
    "total bilirubin",
    "AST (SGOT)",
    "ALT (SGPT)",
    "PT - INR",
    "PTT",
    "serum ketones",
    "lactate",
    "troponin - I",
    "troponin - T",
    "pH",
    "paO2",
    "paCO2",
    # Stay identifier, spelled exactly as in `patient_info` and `vitals` below
    # (previously misspelled "patientuintstayid").
    "patientunitstayid"
]
# icd9 codes (all, binary representation)
# medications (binary representation)
categories = [
    'IV Sedation',
    'IV Benzodiazepines',
    'Vasoactive',
    'Inotrope',
    'Nebulized Albuterol',
    'IV Anti-Psychotics',
    'IV Steroids',
    'PO Steroids',
    'PO Immunosuppression',
    'IV Immunosuppression',
    'IV Insulin Drip',
    'IV Diuretic',
    'IV AV-Nodal Blocking Agent',
    'IV Antiarrhythmic',
    'IV Anticoagulation',
    'PO Anticoagulation',
    'Lactulose',
    'IV Antibiotics',
    'PO Antibiotics',
    'PO Anti-Fungal',
    'Other'
]
# Patient-level columns.
patient_info = [
    'patientunitstayid',
    'gender',
    'age',
    'ethnicity',
    'uniquepid'
]
# Vital-sign columns, plus the stay identifier.
vitals = [
    'heartrate',
    'systemicdiastolic',
    'systemicsystolic',
    'respiration',
    "sao2",
    "patientunitstayid"
]

# 2. Mortality (Sukanya & Kevin)

In [None]:
# Mortality task: candidate columns, grouped by the table they come from.

# apachePatientResult.csv.gz -- mainly the *predicted* values, on the assumption
# that the actual values are still unknown when a prediction is made in the ICU.
patient_results = [
    "predictedICUMortality", "actualICUMortality", "predictedICULOS",
    "predictedHospitalMortality", "predictedHospitalLOS",
    "preopMI", "preopCardiacCath", "PTCAwithin24h",
]

# patient table
patient_info = ["patientunitstayid", "gender", "age", "ethnicity", "uniquepid"]

# treatment and diagnosis tables
treatment = ["treatmentID", "activeUponDischarge"]

# 3. Progression of Kidney Injury (Enqi)

In [None]:
# Kidney-injury progression task: candidate columns, one list per source table.

# patient table: demographics, then hospital-level and unit-level stay fields
patient_info = [
    "gender", "age", "ethnicity", "admissionheight",
    "hospitaladmitoffset", "hospitaldischargeyear",
    "hospitaldischargeoffset", "hospitaldischargestatus",
    "unittype", "unitvisitnumber", "unitstaytype",
    "admissionweight", "dischargeweight",
    "unitdischargeoffset", "unitdischargestatus",
]

# diagnosis table
diagnosis_info = [
    "activeupondischarge", "diagnosisoffset",
    "diagnosisstring", "diagnosispriority",
]

# medication table
medication_info = [
    "medicationid", "drugorderoffset", "drugstartoffset",
    "drugivadmixture", "drugordercancelled", "drugname",
    "drughiclseqno", "dosage", "routeadmin", "frequency",
    "loadingdose", "prn", "drugstopoffset", "gtc",
]

# lab table
labs_info = ["labname"]


Data columns (total 34 columns):
 #   Column                   Non-Null Count  Dtype
---  ------                   --------------  -----
 0   labname                  1311 non-null   object
 1   gender                   1311 non-null   object
 2   age                      1311 non-null   float64
 3   ethnicity                1299 non-null   object
 4   admissionheight          1293 non-null   float64
 5   hospitaladmitoffset      1311 non-null   int64
 6   hospitaldischargeyear    1311 non-null   int64
 7   hospitaldischargeoffset  1311 non-null   int64
 8   hospitaldischargestatus  1305 non-null   object
 9   unittype                 1311 non-null   object
 10  unitvisitnumber          1311 non-null   int64
 11  unitstaytype             1311 non-null   object
 12  admissionweight          1061 non-null   float64
 13  dischargeweight          890 non-null    float64
 14  unitdischargeoffset      1311 non-null   int64
 15  unitdischargestatus      1311 non-null   object
 16  activeupondischarge      1311 non-null   bool
 17  diagnosisoffset          1311 non-null   int64
 18  diagnosisstring          1311 non-null   object
 19  diagnosispriority        1311 non-null   object
 20  medicationid             1311 non-null   int64
 21  drugorderoffset          1311 non-null   int64
 22  drugstartoffset          1311 non-null   int64
 23  drugivadmixture          1311 non-null   object
 24  drugordercancelled       1311 non-null   object
 25  drugname                 910 non-null    object
 26  drughiclseqno            1040 non-null   float64
 27  dosage                   1149 non-null   object
 28  routeadmin               1311 non-null   object
 29  frequency                1112 non-null   object
 30  loadingdose              0 non-null      object
 31  prn                      1311 non-null   object
 32  drugstopoffset           1311 non-null   int64
 33  gtc                      1311 non-null   int64
dtypes: bool(1), float64(5), int64(11), object(17)

# 4. Failed Extubation (Nika)

In [None]:
# Failed-extubation task: candidate features.
# Feature selection follows: "Development and Validation of a Machine-Learning Model
# for Prediction of Extubation Failure in Intensive Care Units", Front Med (Lausanne).
# doi: 10.3389/fmed.2021.676343. PMID: 34079812; PMCID: PMC8165178.

# Patient-level information
patient_info = ["age", "body_mass_index", "stroke"]

# Vital signs and measurements
vitals = [
    "heart_rate", "respiratory_rate", "mean_arterial_pressure",
    "peripheral_oxygen_saturation", "temperature", "pH",
]

# Respiratory-related data
respiratory_features = [
    "central_venous_pressure", "tidal_volume",
    "positive_end_expiratory_pressure", "mean_airway_pressure",
    "pressure_support_ventilation_level", "mechanical_ventilation_durations",
    "spontaneous_breathing_trial_success_times",
]

# Treatment and fluid data
treatment_data = ["urine_output", "crystalloid_amount", "antibiotic_types"]

# All groups flattened into one ordered feature list.
selected_features = [*patient_info, *vitals, *respiratory_features, *treatment_data]
# Keep only the selected feature columns.
# NOTE(review): `data` is not defined in any visible cell of this notebook, so this
# line raises NameError on a fresh kernel -- confirm where `data` should be loaded.
X = data[selected_features]


# 5. Heart Failure (Adele)

In [None]:
#########################################################################################################################################################
# STEP 1: SELECT OUTCOMES BASED ON THE FOLLOWING CRITERIA:
# 1. We want outcomes that have a large proportion of patients
# 2. We want events that happen after 24h from admission and we should not use variables recorded after 24h to make predictions on these outcomes
#########################################################################################################################################################

# Condition: Heart failure diagnosed at admission. A small proportion of patients will instead develop heart failure during hospitalisation, but this is a small population.

# We need 5 events that happen after 24h from admission: 0-24h (extract variables in here), OR >= 24h (outcome variables that are derived after 24h)

# What are typical events that happen after 24h from admission?

# 1. AKI (good outcome, but AKI is difficult to diagnose in heart failure patients because they are treated with diuretics. Diuretics decrease eGFR, so it is sometimes difficult to tell whether the result is due to heart failure or to AKI)
# 1.1. Dialysis (instead, we can predict whether they will be on dialysis)
# 2. Infection (common infections are UTI and pneumonia; if it's difficult to extract both from the ICD codes, we can just select ONE of these as the target)
# 2.1. Pneumonia is related to intubation, so we would need to decide which code to focus on
# 3. Mortality
# 4. Length of stay (duration of hospitalisation)
# 5. Readmission
# 5.1. The problem with readmission is that it is not an outcome we want to predict at the time of admission (i.e. within the first 24h, you are not interested in predicting whether they will be readmitted). Instead,
# we are interested, at the time of discharge, in whether they will be readmitted. If the prediction is made at the time of discharge, then we are including variables AFTER the 24h window to make it, which we initially said
# we need to exclude.
# 6. Thrombosis (Very important clinical outcome for all patients, but the event ratio is very small, less common / rare)
# 6.1. In hospital we assess the risk of the patient who will develop thrombosis.
# 7. Failed Extubation (good, but the proportion of patients is very small)


# The outcomes that are likely to have the largest proportion of patients are:

# 1. AKI or Dialysis (depending on literature / better target)
# 2. Infection (depending on: pneumonia, pneumonia derived from intubation, UTI)
# 3. Mortality (in-hospital all-cause mortality)
# 4. Length of stay


# 5. ______ (pressure ulcer? need something with a good proportion of patients)

#########################################################################################################################################################
# STEP 2: FEATURE SELECTION FOR EACH OUTCOME
#########################################################################################################################################################

#########################################################################################################################################################
# STEP 3: WE GET THE SET OF UNIQUE TABLES REQUIRED FOR THE UNION OF ALL FEATURES
# Process each table individually and aggregate each table individually first (labs, diagnosis, vital signs)
#########################################################################################################################################################

#########################################################################################################################################################
# STEP 4: MERGE THE FEATURES ACROSS THE NEWLY AGGREGATED TABLES TO CREATE FINAL DATASET
#########################################################################################################################################################

#########################################################################################################################################################
# STEP 5: SPLIT THE DATA (80-20, subject to size of dataset / patient pool)
#########################################################################################################################################################

# NOTE: Since features may have NAs, even if we are doing a random split, we should do a sanity check to make sure that the proportion of NAs is evenly distributed across the splits

- Every patient has every feature, but it is fine for a feature to have a missing value

- Patient 1: Feature A, B, C, D, (every feature)
- Patient 2: Feature A, B, C, D, (every feature)
- Patient 3: Feature A, B, C, D, (every feature)
- Patient 4: Feature A, B, C, D, (every feature)

#########################################################################################################################################################
# STEP 6: TASK SPECIFIC PROCESSING OF DATA
# In outcome 1 or 2 we may require some additional processing / selection criteria
# For example, for AKI or Dialysis, we would exclude patients that are in end-stage of renal failure, even though that patient will still be relevant in
# something like mortality prediction
#########################################################################################################################################################

After some additional post-processing, some patients that were previously included will be excluded from some tasks.

For example, for AKI we may only have the following (Patient 2 is lost, and Feature B is not needed):

- Patient 1: Feature A, C, D, (subset of features)
- Patient 3: Feature A, C, D, (subset of features)
- Patient 4: Feature A, C, D, (subset of features)

#########################################################################################################################################################
# STEP 7: TRAIN THE MODEL (DIFFERENT MODELS)
#########################################################################################################################################################

# Everyone was going to just train the "best" model for each task: one task might use XGBoost, another a neural network

# Note: We might need to be mindful of multiple hypothesis testing

#########################################################################################################################################################
# STEP 8: WE NOW CREATE A NEW COLUMN FOR THE MISCLASSIFIED INSTANCES AND APPLY STATISTICAL ANALYSIS 1
#########################################################################################################################################################

We go back to the big table, where every feature is present (with missing values) and where some outcomes are missing because some patients were not used in
certain tasks. We now add 5 new columns, one per task, and report for each patient whether the prediction was classified correctly, misclassified, or NA

- Patient 1: Feature A, B, C, D, (every feature) | AKI (1,0,NA) | Mortality (1,0,NA) | Length of Stay (1,0,NA) | _____ | ??? (1,0,NA)
- Patient 2: Feature A, B, C, D, (every feature)
- Patient 3: Feature A, B, C, D, (every feature)
- Patient 4: Feature A, B, C, D, (every feature)

- Using multiple imputation we can create several datasets
- We can evaluate the table using MANOVA (we can use other features like race, gender, insurance, missing ratio of the variables to train a model on the misclassification)
- We could use things like the APACHE score, so that we get an integrated score of the patient's severity.
- We could also include the interaction of variables, sex, race or other terms.

Using statistical analysis, we can do a quantitative analysis of misclassification.


#########################################################################################################################################################
# STEP 9: ANALYSIS 2: CAN WE CREATE A SCORE?
#########################################################################################################################################################

The next step is to say, can we combine the outcomes to generate a score. HOWEVER, we have a problem due to the NAs, as some patients will not have a particular outcome.

- Patient 1: Feature A | Feature B | Feature C | Feature D | Outcome AKI (1,0,NA) | Outcome Mortality (1,0,NA) | Outcome Length of Stay (1,0,NA) | Outcome Infection (1,0,NA)
- Patient 2: Feature A, B, C, D, (every feature)
- Patient 3: Feature A, B, C, D, (every feature)
- Patient 4: Feature A, B, C, D, (every feature)


#########################################################################################################################################################
# STEP X: FOLLOW UP ANALYSIS
#########################################################################################################################################################

# Look at the Misclassification (General) vs Misclassification (FP) vs Misclassification (FN)
# Including more features or less features could change the misclassification for some patients



