In [1]:
# Phase6_1: Mount Drive & Load Data

from google.colab import drive
drive.mount('/content/drive')

import pandas as pd

# File paths
mm_path = "/content/drive/MyDrive/Rahul_DTSC5082_Project/Scenario2_MIMIC/data/RA_Multimodal_ClinicalBERT_FINAL.csv"
adm_path = "/content/drive/MyDrive/Rahul_DTSC5082_Project/Scenario2_MIMIC/data/admissions.csv"

# Load datasets
df_mm = pd.read_csv(mm_path)
df_adm = pd.read_csv(adm_path)

print("Multimodal DF shape:", df_mm.shape)
print("Admissions DF shape:", df_adm.shape)

# Preview
df_mm.head()


Mounted at /content/drive
Multimodal DF shape: (15462, 813)
Admissions DF shape: (546028, 16)


Unnamed: 0,subject_id,hadm_id,admittime,dischtime,deathtime,admission_type,admit_provider_id,admission_location,discharge_location,insurance,...,cb_mean_759,cb_mean_760,cb_mean_761,cb_mean_762,cb_mean_763,cb_mean_764,cb_mean_765,cb_mean_766,cb_mean_767,note_id
0,10002443,21329020,2183-10-17 23:20:00,2183-10-20 18:47:00,,EW EMER.,P343TV,TRANSFER FROM HOSPITAL,HOME,Private,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
1,10003203,25146996,2153-04-26 02:05:00,2153-04-29 14:19:00,,EU OBSERVATION,P57BOT,EMERGENCY ROOM,,Medicare,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
2,10010718,29947356,2169-01-20 13:21:00,2169-01-27 14:20:00,,OBSERVATION ADMIT,P50GUR,EMERGENCY ROOM,HOME HEALTH CARE,Medicare,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
3,10010997,20783870,2139-04-28 16:45:00,2139-05-02 12:20:00,,OBSERVATION ADMIT,P756E2,TRANSFER FROM HOSPITAL,HOME,Private,...,0.068613,-0.290431,-0.107542,0.25966,-0.086897,0.083217,0.208328,-0.113016,-0.178819,10010997-DS-9
4,10010997,20783870,2139-04-28 16:45:00,2139-05-02 12:20:00,,OBSERVATION ADMIT,P756E2,TRANSFER FROM HOSPITAL,HOME,Private,...,0.068613,-0.290431,-0.107542,0.25966,-0.086897,0.083217,0.208328,-0.113016,-0.178819,10010997-DS-9


In [2]:
# Phase6_2: Create 30-day readmission labels from admissions.csv

import pandas as pd
import numpy as np

adm = df_adm.copy()

# Convert times to datetime
adm['admittime'] = pd.to_datetime(adm['admittime'])
adm['dischtime'] = pd.to_datetime(adm['dischtime'])

# Sort by patient and admission time
adm = adm.sort_values(['subject_id', 'admittime'])

# Shift to get next admission for each patient
adm['next_admittime'] = adm.groupby('subject_id')['admittime'].shift(-1)

# Compute time difference between discharge and next admission
adm['days_to_next'] = (adm['next_admittime'] - adm['dischtime']).dt.days

# Create 30-day readmission label
adm['readmit_30d'] = np.where((adm['days_to_next'] >= 0) & (adm['days_to_next'] <= 30), 1, 0)

# Keep only hadm_id + label
readmit_df = adm[['hadm_id', 'readmit_30d']]

print("Readmission labels created:", readmit_df.shape)
readmit_df.head(10)


Readmission labels created: (546028, 2)


Unnamed: 0,hadm_id,readmit_30d
0,22595853,0
1,22841357,1
3,29079034,1
2,25742920,0
4,25022803,0
5,23052089,0
6,29888819,0
7,27250926,0
8,22927623,0
9,27988844,0


In [3]:
# Phase6_3: Merge readmission labels into multimodal dataset

df_merge = df_mm.merge(readmit_df, on="hadm_id", how="left")

print("Merged DF shape:", df_merge.shape)

# Check label distribution
print("\nReadmission label counts:")
print(df_merge['readmit_30d'].value_counts(dropna=False))

# Show preview
df_merge[['hadm_id', 'readmit_30d']].head(10)


Merged DF shape: (15462, 814)

Readmission label counts:
readmit_30d
NaN    7068
0.0    6366
1.0    2028
Name: count, dtype: int64


Unnamed: 0,hadm_id,readmit_30d
0,21329020,
1,25146996,
2,29947356,
3,20783870,0.0
4,20783870,0.0
5,28291416,
6,21854604,
7,22523752,
8,24521344,0.0
9,28118562,0.0


In [4]:
# Phase6_4: Filter valid labeled rows & build X and y

df_readmit = df_merge.dropna(subset=['readmit_30d']).copy()

print("Filtered readmission DF:", df_readmit.shape)

# Convert label to int
df_readmit['readmit_30d'] = df_readmit['readmit_30d'].astype(int)

# Show label counts again
print("\nUpdated readmission label balance:")
print(df_readmit['readmit_30d'].value_counts())

# Build X and y
# X = all features except identifiers and label
X_readmit = df_readmit.drop(columns=['readmit_30d', 'subject_id', 'hadm_id', 'note_id'])

y_readmit = df_readmit['readmit_30d'].values

print("\nX_readmit shape:", X_readmit.shape)
print("y_readmit shape:", y_readmit.shape)


Filtered readmission DF: (8394, 814)

Updated readmission label balance:
readmit_30d
0    6366
1    2028
Name: count, dtype: int64

X_readmit shape: (8394, 810)
y_readmit shape: (8394,)


In [7]:
# ======================================================
# Phase6_5 (FINAL): Clean X → Impute NaN → SMOTE → XGBoost
# ======================================================

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
import xgboost as xgb
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

# -------------------------------------------------------
# 1) Remove non-numeric columns
# -------------------------------------------------------
non_numeric_cols = X_readmit.select_dtypes(include=['object']).columns.tolist()
print("Non-numeric columns removed:", non_numeric_cols)

X_readmit_clean = X_readmit.drop(columns=non_numeric_cols)
print("Shape after removing non-numeric:", X_readmit_clean.shape)

# -------------------------------------------------------
# 2) Impute NaN values (SMOTE requires NO missing values)
# -------------------------------------------------------
X_readmit_clean = X_readmit_clean.fillna(0)

print("Any remaining NaNs?", X_readmit_clean.isna().sum().sum())

# -------------------------------------------------------
# 3) Train/Test Split
# -------------------------------------------------------
X_train, X_test, y_train, y_test = train_test_split(
    X_readmit_clean,
    y_readmit,
    test_size=0.20,
    random_state=42,
    stratify=y_readmit
)

print("\nTrain shape:", X_train.shape)
print("Test shape:", X_test.shape)
print("Train label counts:", np.bincount(y_train))
print("Test label counts:", np.bincount(y_test))

# -------------------------------------------------------
# 4) Apply SMOTE
# -------------------------------------------------------
sm = SMOTE(k_neighbors=5, random_state=42)
X_train_res, y_train_res = sm.fit_resample(X_train, y_train)

print("\nAfter SMOTE:", np.bincount(y_train_res))

# -------------------------------------------------------
# 5) Convert to DMatrix
# -------------------------------------------------------
dtrain = xgb.DMatrix(X_train_res, label=y_train_res)
dtest = xgb.DMatrix(X_test, label=y_test)

# -------------------------------------------------------
# 6) XGBoost Parameters
# -------------------------------------------------------
params = {
    "objective": "binary:logistic",
    "eval_metric": "logloss",
    "tree_method": "hist",
    "max_depth": 10,
    "eta": 0.03,
    "subsample": 0.9,
    "colsample_bytree": 0.9
}

# -------------------------------------------------------
# 7) Train Model
# -------------------------------------------------------
evals = [(dtrain, "train"), (dtest, "test")]
model_readmit = xgb.train(
    params,
    dtrain,
    num_boost_round=500,
    evals=evals,
    verbose_eval=50
)

# -------------------------------------------------------
# 8) Predictions
# -------------------------------------------------------
y_pred_prob = model_readmit.predict(dtest)
y_pred = (y_pred_prob >= 0.5).astype(int)

# -------------------------------------------------------
# 9) Evaluation
# -------------------------------------------------------
print("\nClassification Report:")
print(classification_report(y_test, y_pred, digits=4))

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

auc_score = roc_auc_score(y_test, y_pred_prob)
print("\nAUC Score:", auc_score)


Non-numeric columns removed: ['admittime', 'dischtime', 'deathtime', 'admission_type', 'admit_provider_id', 'admission_location', 'discharge_location', 'insurance', 'language', 'marital_status', 'race', 'edregtime', 'edouttime', 'curr_service']
Shape after removing non-numeric: (8394, 796)
Any remaining NaNs? 0

Train shape: (6715, 796)
Test shape: (1679, 796)
Train label counts: [5093 1622]
Test label counts: [1273  406]

After SMOTE: [5093 5093]
[0]	train-logloss:0.68232	test-logloss:0.68887
[50]	train-logloss:0.40081	test-logloss:0.58362
[100]	train-logloss:0.29724	test-logloss:0.55271
[150]	train-logloss:0.23577	test-logloss:0.53830
[200]	train-logloss:0.19906	test-logloss:0.53218
[250]	train-logloss:0.17236	test-logloss:0.53095
[300]	train-logloss:0.15085	test-logloss:0.53254
[350]	train-logloss:0.13525	test-logloss:0.53608
[400]	train-logloss:0.12148	test-logloss:0.54034
[450]	train-logloss:0.11029	test-logloss:0.54320
[499]	train-logloss:0.10071	test-logloss:0.54667

Classificat

In [10]:
# ==========================================================
# Phase6_5_LGBM_Fixed: LightGBM Model for Readmission Prediction
# ==========================================================

import lightgbm as lgb
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

# Create LightGBM Dataset
train_data = lgb.Dataset(X_train_res, label=y_train_res)
test_data = lgb.Dataset(X_test, label=y_test, reference=train_data)

# LightGBM Parameters (optimized for your task)
params_lgb = {
    'objective': 'binary',
    'metric': 'binary_logloss',
    'boosting_type': 'gbdt',
    'learning_rate': 0.02,
    'num_leaves': 64,
    'max_depth': -1,
    'feature_fraction': 0.85,
    'bagging_fraction': 0.85,
    'bagging_freq': 5,
    'lambda_l2': 1.0,
    'lambda_l1': 0.0,
    'min_data_in_leaf': 40,
    'class_weight': 'balanced'
}

# Use callbacks for logging
callbacks = [lgb.log_evaluation(period=50)]

# Train the model
model_lgb = lgb.train(
    params_lgb,
    train_data,
    num_boost_round=800,
    valid_sets=[train_data, test_data],
    valid_names=['train', 'test'],
    callbacks=callbacks
)

# Predictions
y_pred_prob_lgb = model_lgb.predict(X_test)
y_pred_lgb = (y_pred_prob_lgb >= 0.5).astype(int)

# Evaluation
print("\n=== LightGBM Model Performance ===")
print(classification_report(y_test, y_pred_lgb, digits=4))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_lgb))

auc_lgb = roc_auc_score(y_test, y_pred_prob_lgb)
print("\nAUC Score:", auc_lgb)


[LightGBM] [Info] Number of positive: 5093, number of negative: 5093
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.199590 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 202719
[LightGBM] [Info] Number of data points in the train set: 10186, number of used features: 796
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[50]	train's binary_logloss: 0.514541	test's binary_logloss: 0.610898
[100]	train's binary_logloss: 0.417587	test's binary_logloss: 0.577652
[150]	train's binary_logloss: 0.353869	test's binary_logloss: 0.559263
[200]	train's binary_logloss: 0.308151	test's binary_logloss: 0.549178
[250]	train's binary_logloss: 0.273162	test's binary_logloss: 0.542463
[300]	train's binary_logloss: 0.245308	test's binary_logloss: 0.538258
[350]	train's binary_logloss: 0.221454	test's binary_logloss: 0.535179
[400]	train's binary_logloss: 0.202229	test's binary_logloss: 0.