### Import Pre-requisites

In [None]:
# Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import SGDRegressor
from sklearn.model_selection import GridSearchCV


### Read MIMIC-IV dataset

In [None]:
# Load four gzip-compressed MIMIC-IV CSV exports from ../mimic_iv_data into DataFrames.
# df_icu is not referenced again in the visible cells of this notebook.
df_icu = pd.read_csv('../mimic_iv_data/icustays.csv.gz', compression='gzip')
df_patients = pd.read_csv('../mimic_iv_data/patients.csv.gz', compression='gzip')
# NOTE(review): chartevents is presumably the largest table here and is read in full — confirm it fits in memory.
df_chart_events = pd.read_csv('../mimic_iv_data/chartevents.csv.gz', compression='gzip')
# d_items is used below to look up the label for each chart itemid.
df_d_items = pd.read_csv('../mimic_iv_data/d_items.csv.gz', compression='gzip')

In [None]:
# Load the lab measurements and the lab item dictionary (itemid -> label lookup used below).
df_lab_events = pd.read_csv('../mimic_iv_data/labevents.csv.gz', compression='gzip')
df_d_labitems = pd.read_csv('../mimic_iv_data/d_labitems.csv.gz', compression='gzip')

### Cleanup and Pre-process data

- Get age, gender from the patients table

In [None]:
# Print the column names, dtypes and non-null counts of the patients table.
df_patients.info()
# Plain alias (no copy); the next cell rebinds df_patients_f to a column subset.
df_patients_f = df_patients

In [None]:
# Select subject_id, gender, anchor_age, dod
# Note: the name selected_columns is reused with different lists in later cells.
selected_columns = ['subject_id', 'gender', 'anchor_age', 'dod']
df_patients_f = df_patients.loc[:, selected_columns]
df_patients_f.head()

In [None]:
# Derive the modelling view of the patients table directly from the raw frame,
# without in-place mutation, so this cell can be re-run safely (the previous
# in-place rename removed 'dod' and made a second run fail with a KeyError).
df_patients_f = (
    df_patients
    .loc[:, ['subject_id', 'gender', 'anchor_age', 'dod']]
    # Replace dod with a deceased flag: NaN -> 0, any recorded value -> 1
    .assign(deceased=lambda frame: frame['dod'].notna().astype(int))
    .drop(columns='dod')
    # Encode gender as M -> 0 and F -> 1; any other value becomes NaN
    .assign(gender=lambda frame: frame['gender'].map({'M': 0, 'F': 1}))
)
df_patients_f.head()

- Plot the age histogram

In [None]:
# Histogram (30 bins, with KDE overlay) of anchor_age, drawn on an explicit Axes
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(df_patients_f['anchor_age'], bins=30, kde=True, ax=ax)
ax.set_title('Distribution of Anchor Age')
ax.set_xlabel('Anchor Age')
ax.set_ylabel('Frequency')
plt.show()

In [None]:
# Include only patients with anchor_age > 0
# Rebinds df_patients_f to the filtered rows; the histogram above was drawn before this filter.
df_patients_f = df_patients_f[df_patients_f['anchor_age'] > 0]

- Find number of patients who are deceased

In [None]:
# Find number of patients who are deceased
# value_counts() tallies rows per distinct value of the 0/1 'deceased' flag.
deceased_count = df_patients_f['deceased'].value_counts()
print(deceased_count)

- Find the itemids for the chart events (Heart Rate, Blood Pressure, SpO2, Temperature)

In [None]:
df_d_items.info()

# Find candidate items for Heart Rate, Blood Pressure, SpO2, Temperature.
# A notebook cell only renders its last expression, so the per-term lookups are
# combined into one table (first five matches per term, keyed by search term)
# instead of being evaluated and silently discarded.
vital_search_terms = ['Heart Rate', 'Blood Pressure', 'SpO2', 'Temperature']
pd.concat({
    term: df_d_items[df_d_items['label'].str.contains(term, na=False)].head()
    for term in vital_search_terms
})


- Get only the critical vitals from the chart events

In [None]:
# itemids of the chart events kept as features.
# Note: this list is not in ascending itemid order.
chart_events_list = [220045, 220050, 220051, 226253, 223761, 220210, 
                    220179, 220180, 220181, 223762, 220277, 223835, 224700]
# Print "<itemid>: <label>" for each id using the d_items dictionary.
# .values[0] raises IndexError if an itemid has no row in df_d_items.
for itemid in chart_events_list:
    print(f'{itemid}: {df_d_items[df_d_items["itemid"] == itemid]["label"].values[0]}')

- Filter the chart events for only the desired events

In [None]:

# Keep only the selected chart items, then a single row per (subject_id, itemid).
# Chained rather than inplace=True so we never mutate a slice of df_chart_events
# (which raises SettingWithCopyWarning) and the cell stays re-runnable.
# drop_duplicates keeps the first matching row in file order.
# NOTE(review): that is not necessarily the earliest reading by charttime — confirm this is intended.
df_chart_events_f = (
    df_chart_events[df_chart_events['itemid'].isin(chart_events_list)]
    .drop_duplicates(subset=['subject_id', 'itemid'])
)
df_chart_events_f.head()


In [None]:
# Select the relevant columns
# 'value' holds the recorded measurement that is pivoted into feature columns below.
# NOTE(review): 'value' may be stored as text for some items — confirm it is numeric for every selected itemid.
selected_columns = ['subject_id', 'hadm_id', 'stay_id', 'itemid', 'value']
df_chart_events_f = df_chart_events_f.loc[:, selected_columns]
df_chart_events_f.head()

- Pivot and create new columns for the chart event list

In [None]:
# Pivot on itemid: one row per subject_id, one column per chart item.
# pivot() orders the new columns by sorted itemid, while chart_events_list is not
# sorted, so assigning names positionally from that list attached the wrong itemid
# to most columns. Each column is now named from its own itemid; the column order
# (and therefore the feature matrix) is unchanged.
chart_events_list_str = [str(itemid) for itemid in chart_events_list]
df_chart_events_pivot = (
    df_chart_events_f
    .pivot(index='subject_id', columns='itemid', values='value')
    .reset_index()
    .rename(columns=str)          # 220045 -> '220045'; 'subject_id' is unchanged
    .rename_axis(columns=None)    # drop the leftover 'itemid' axis name
    .dropna(axis=0)               # keep only patients with every selected item
)
df_chart_events_pivot.head()


- Find the critical lab events

In [None]:
# itemids of the lab events kept as features.
# Note: this list is not in ascending itemid order (50809 is listed fourth).
lab_events_list = [50813, 50882, 50912, 50809, 51221]
# Print "<itemid>: <label>" for each id using the d_labitems dictionary.
# .values[0] raises IndexError if an itemid has no row in df_d_labitems.
for itemid in lab_events_list:
    print(f'{itemid}: {df_d_labitems[df_d_labitems["itemid"] == itemid]["label"].values[0]}')

- Filter the lab events table

In [None]:
# Keep only the selected lab items, then a single row per (subject_id, itemid).
# Chained rather than inplace=True so we never mutate a slice of df_lab_events
# (which raises SettingWithCopyWarning) and the cell stays re-runnable.
# drop_duplicates keeps the first matching row in file order.
# NOTE(review): that is not necessarily the earliest result by charttime — confirm this is intended.
df_lab_events_f = (
    df_lab_events[df_lab_events['itemid'].isin(lab_events_list)]
    .drop_duplicates(subset=['subject_id', 'itemid'])
)
df_lab_events_f.head()


In [None]:
# Select the relevant columns
# 'value' holds the recorded result that is pivoted into feature columns below.
# NOTE(review): 'value' may be stored as text for some results — confirm it is numeric for every selected itemid.
selected_columns = ['subject_id', 'itemid', 'value']
df_lab_events_f = df_lab_events_f.loc[:, selected_columns]
df_lab_events_f.head()

In [None]:
# Pivot on itemid: one row per subject_id, one column per lab item.
# pivot() orders the new columns by sorted itemid (50809 comes first), while
# lab_events_list is not sorted, so assigning names positionally from that list
# mislabelled the columns. Each column is now named from its own itemid; the
# column order (and therefore the feature matrix) is unchanged.
lab_events_list_str = [str(itemid) for itemid in lab_events_list]
df_lab_events_pivot = (
    df_lab_events_f
    .pivot(index='subject_id', columns='itemid', values='value')
    .reset_index()
    .rename(columns=str)          # 50809 -> '50809'; 'subject_id' is unchanged
    .rename_axis(columns=None)    # drop the leftover 'itemid' axis name
    .dropna(axis=0)               # keep only patients with every selected item
)
df_lab_events_pivot.head()

- Merge the chart, patients and lab events table

In [None]:
# Inner-join chart vitals -> patient attributes -> lab results on subject_id,
# so only patients present in all three tables remain.
df_merged = (
    df_chart_events_pivot
    .merge(df_patients_f, how='inner', on='subject_id')
    .merge(df_lab_events_pivot, how='inner', on='subject_id')
)
df_merged.head()

- Create the label(target) and the features

In [None]:

# Save the merged dataframe
# Written to the notebook's working directory without the index; the next cell reads it back.
df_merged.to_csv('merged_data.csv', index=False)


In [None]:
# Re-imported here although pandas is already imported in the first cell.
# NOTE(review): presumably so the notebook can be resumed from the saved CSV — confirm.
import pandas as pd

# Read the merged dataframe
df_merged = pd.read_csv('merged_data.csv')
# Label: the 0/1 'deceased' flag. `target` and `y` refer to the same Series.
target = df_merged['deceased']
# Features: every remaining column except the subject_id identifier.
X = df_merged.drop(columns=['subject_id', 'deceased'])
y = target


### Model training

In [None]:

# Hold out 20% of the rows for testing (fixed seed for a reproducible split)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Min-max scale the features; the scaler is fitted on the training rows only
# and then applied to both partitions.
scaler = MinMaxScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

print("Training set has {} samples.".format(X_train.shape[0]))
print("Testing set has {} samples.".format(X_test.shape[0]))


#### Machine Learning Models

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

# Candidate regressors, each fitted on the 0/1 target and compared by R^2
models = [
    SGDRegressor(random_state=0),
    GradientBoostingRegressor(random_state=0),
    LinearRegression(),
    KNeighborsRegressor(),
    RandomForestRegressor(random_state=0),
]

# Maps estimator class name -> R^2 on the test split
results = {}

for model in models:
    # Fit on the training split and predict continuous scores for the test split
    reg_model = model
    reg_model.fit(X_train, y_train)
    y_test_preds = reg_model.predict(X_test)

    # Estimator class name (e.g. "SGDRegressor") used as the results key
    name = type(model).__name__

    r2 = r2_score(y_test, y_test_preds)
    mse = mean_squared_error(y_test, y_test_preds)
    results[name] = r2

    print('{} done.'.format(name))
    print("R^2 Score:", r2)
    print("Mean Squared Error:", mse)

    # Threshold the continuous scores at 0.5 to obtain 0/1 labels for the confusion matrix
    y_test_preds = (y_test_preds >= 0.5).astype(int)
    cm = confusion_matrix(y_test, y_test_preds)
    print(cm)

In [None]:
# Horizontal bar chart of the R^2 score per model, in the order the models were fitted
model_names = list(results.keys())
r2_values = list(results.values())
ind = range(len(results))

fig, ax = plt.subplots()
ax.barh(ind, r2_values, align='center', color='#55a868', alpha=0.8)
ax.set_yticks(ind)
ax.set_yticklabels(model_names)
ax.set_xlabel('R-squared score')
ax.tick_params(left=False, top=False, right=False)
ax.set_title('Comparison of Regression Models')

In [None]:
# GradientBoostingRegressor will be used as the deceased prediction model.
# Fit it with default hyper-parameters to get the baseline R^2 that the tuned
# model is compared against later.
reg_model = GradientBoostingRegressor(random_state=0)
reg_model.fit(X_train, y_train)
y_test_preds = reg_model.predict(X_test)
r2_not_refined = r2_score(y_test, y_test_preds)
# "{:2f}" set a minimum width of 2, not a precision; use 4 decimals to match
# the improvement message printed at the end of the tuning section.
print("R2 score is: {:.4f}".format(r2_not_refined))

#### Model finetuning

In [None]:
# Tune on the same scaled train/test split as the baseline model. Re-splitting the
# raw X here replaced the scaled X_train/X_test with unscaled frames, so later
# cells silently ran on differently preprocessed inputs.

# Hyper-parameter grid explored by cross-validation
tuned_parameters = [{'n_estimators': [200, 300],
                     'max_depth' : [3, 4],
                     'loss': ['squared_error', 'absolute_error']}]

# Grid-search a gradient boosting regressor. random_state matches the baseline
# model so the before/after R^2 comparison is not confounded by a different seed.
reg_model = GradientBoostingRegressor(random_state=0)
grid = GridSearchCV(reg_model, tuned_parameters)
grid.fit(X_train, y_train)
reg_model_optimized = grid.best_estimator_

# summarize the results of the grid search
print(grid.best_score_)
print(grid.best_estimator_)

In [None]:
# Score the best estimator found by the grid search on the held-out test split
y_test_preds = reg_model_optimized.predict(X_test)
r2_optimized = r2_score(y_test, y_test_preds)
# "{:2f}" set a minimum width of 2, not a precision; use 4 decimals to match
# the improvement message printed below.
print("Optimized R2 score is: {:.4f}".format(r2_optimized))

- Print the new confusion matrix after optimizing

In [None]:
# Threshold the tuned model's continuous predictions at 0.5 to get 0/1 labels.
# This overwrites y_test_preds with the binarised values.
y_test_preds = (y_test_preds >= 0.5).astype(int)
cm = confusion_matrix(y_test, y_test_preds)
print(cm)

In [None]:

# Difference between the tuned and baseline test-set R^2 (negative if tuning made it worse).
print('Model refinement improved R2 score by {:.4f}'.format(r2_optimized-r2_not_refined))

### Create a Deep Learning Model

- Convert the data frame to tensors

In [None]:
# Convert df_merged to a tensor
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from sklearn.metrics import accuracy_score

# Pick the best available device instead of assuming Apple-silicon MPS, so the
# notebook also runs on CUDA or CPU-only machines.
if torch.backends.mps.is_available():
    device = torch.device('mps')
elif torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Features: every column except the identifier and the label; label: 0/1 'deceased'
X = df_merged.drop(columns=['subject_id', 'deceased'])
y = df_merged['deceased']

# Convert all of the columns to float32 and move the tensors to the selected device
X = X.astype(np.float32)
X_tensor = torch.tensor(X.values, dtype=torch.float32).to(device)
print(X_tensor.shape)
y_tensor = torch.tensor(y.values, dtype=torch.float32).to(device)
print(y_tensor.shape)


- Normalize the data

In [None]:
# Normalize the features to zero mean / unit variance per column.
# NOTE(review): the statistics are computed over all rows, including the ones later
# used for validation — consider fitting them on the training rows only.
feature_mean = X_tensor.mean(dim=0)
feature_std = X_tensor.std(dim=0)
# A constant column has std 0 and would turn into NaN; leave it centred instead.
feature_std = torch.where(feature_std > 0, feature_std, torch.ones_like(feature_std))
X_tensor = (X_tensor - feature_mean) / feature_std

# y_tensor is deliberately NOT normalized: it is the 0/1 class label. BCELoss
# expects targets in [0, 1], and the accuracy computation casts labels to int,
# so z-scoring it corrupted both the loss and the reported accuracy.
 

- Split the data into training and test data

In [None]:
# Split the data into training (80%) and testing sets (20%)
n_samples = X_tensor.shape[0]
n_val = int(n_samples * 0.2)
n_train = n_samples - n_val
train_data = X_tensor[:n_train].to(device='mps')
val_data = X_tensor[n_train:].to(device='mps')
train_labels = y_tensor[:n_train].to(device='mps')
val_labels = y_tensor[n_train:].to(device='mps')

- Create a binary classification model

In [None]:
# Create a binary classification model
class BinaryClassificationModel(nn.Module):
    """Fully connected binary classifier: input -> 64 -> 32 -> 1 with a sigmoid output.

    Args:
        input_size: number of input features per sample.
    """

    def __init__(self, input_size):
        super().__init__()
        self.fc1 = nn.Linear(input_size, 64)
        self.fc2 = nn.Linear(64, 32)
        self.fc3 = nn.Linear(32, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Return one probability in (0, 1) per input row, shaped (..., 1)."""
        hidden = self.fc1(x).relu()
        hidden = self.fc2(hidden).relu()
        logits = self.fc3(hidden)
        return self.sigmoid(logits)

- Train and evaluate the model

In [None]:
# Train the model
input_size = X_tensor.shape[1]
model = BinaryClassificationModel(input_size).to(device='mps')
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)


In [None]:

# Training loop: full-batch gradient descent over the whole training split
num_epochs = 50

for epoch in range(num_epochs):
    model.train()
    optimizer.zero_grad()
    outputs = model(train_data)
    # squeeze(1) drops only the trailing feature dim, so a batch of one still matches the labels
    loss = criterion(outputs.squeeze(1), train_labels)
    loss.backward()
    optimizer.step()

    # Print loss every 10 epochs
    if (epoch+1) % 10 == 0:
        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

# Evaluate the model
model.eval()
with torch.inference_mode():
    val_probs = model(val_data).squeeze(1)
    val_loss = criterion(val_probs, val_labels)

    # Convert probabilities to binary predictions (0 or 1) for classification metrics
    val_preds = (val_probs >= 0.5).int()

    # Integer view of the labels for the metric. A separate name is used so the
    # float val_labels tensor is not overwritten; BCELoss needs float targets,
    # so overwriting it broke any re-run of this cell.
    val_targets = val_labels.int()

    accuracy = accuracy_score(val_targets.cpu(), val_preds.cpu())

print(f'Validation Loss: {val_loss.item():.4f}')
print(f'Validation Accuracy: {accuracy:.4f}')