In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

In [2]:
admissions = pd.read_csv("admissions_202208161605.csv")
cptevents = pd.read_csv("cptevents_202208161605.csv", low_memory=False)
d_labitems = pd.read_csv("d_labitems_202208161605.csv")
diagnoses_icd = pd.read_csv("diagnoses_icd_202208161605.csv")
drgcodes = pd.read_csv("drgcodes_202208161605.csv")
labevents = pd.read_csv("labevents_202208161605.csv")
patients = pd.read_csv("patients_202208161605.csv")
procedures_icd = pd.read_csv("procedures_icd_202208161605.csv")

In [3]:
patients.head()

Unnamed: 0,row_id,subject_id,gender,dob,dod,dod_hosp,dod_ssn,expire_flag
0,234,249,F,2075-03-13 00:00:00.000,,,,0
1,235,250,F,2164-12-27 00:00:00.000,2188-11-22 00:00:00.000,2188-11-22 00:00:00.000,,1
2,236,251,M,2090-03-15 00:00:00.000,,,,0
3,237,252,M,2078-03-06 00:00:00.000,,,,0
4,238,253,F,2089-11-26 00:00:00.000,,,,0


In [4]:
admissions.head()

Unnamed: 0,row_id,subject_id,hadm_id,admittime,dischtime,deathtime,admission_type,admission_location,discharge_location,insurance,language,religion,marital_status,ethnicity,edregtime,edouttime,diagnosis,hospital_expire_flag,has_chartevents_data
0,21,22,165315,2196-04-09 12:26:00.000,2196-04-10 15:54:00.000,,EMERGENCY,EMERGENCY ROOM ADMIT,DISC-TRAN CANCER/CHLDRN H,Private,,UNOBTAINABLE,MARRIED,WHITE,2196-04-09 10:06:00.000,2196-04-09 13:24:00.000,BENZODIAZEPINE OVERDOSE,0,1
1,22,23,152223,2153-09-03 07:15:00.000,2153-09-08 19:10:00.000,,ELECTIVE,PHYS REFERRAL/NORMAL DELI,HOME HEALTH CARE,Medicare,,CATHOLIC,MARRIED,WHITE,,,CORONARY ARTERY DISEASE\CORONARY ARTERY BYPASS...,0,1
2,23,23,124321,2157-10-18 19:34:00.000,2157-10-25 14:00:00.000,,EMERGENCY,TRANSFER FROM HOSP/EXTRAM,HOME HEALTH CARE,Medicare,ENGL,CATHOLIC,MARRIED,WHITE,,,BRAIN MASS,0,1
3,24,24,161859,2139-06-06 16:14:00.000,2139-06-09 12:48:00.000,,EMERGENCY,TRANSFER FROM HOSP/EXTRAM,HOME,Private,,PROTESTANT QUAKER,SINGLE,WHITE,,,INTERIOR MYOCARDIAL INFARCTION,0,1
4,25,25,129635,2160-11-02 02:06:00.000,2160-11-05 14:55:00.000,,EMERGENCY,EMERGENCY ROOM ADMIT,HOME,Private,,UNOBTAINABLE,MARRIED,WHITE,2160-11-02 01:01:00.000,2160-11-02 04:27:00.000,ACUTE CORONARY SYNDROME,0,1


In [5]:
admissions["admittime"] = pd.to_datetime(admissions["admittime"])
admissions["dischtime"] = pd.to_datetime(admissions["dischtime"])

admissions = admissions.sort_values(["subject_id", "admittime"])

admissions["next_admit"] = admissions.groupby("subject_id")["admittime"].shift(-1)  # Next admission time
admissions["readmitted_30d"] = (admissions["next_admit"] - admissions["dischtime"]).dt.days <= 30
admissions["readmitted_30d"] = admissions["readmitted_30d"].astype(int)

admissions.drop(columns=["next_admit"], inplace=True)


In [6]:
df = admissions.merge(diagnoses_icd, on=["subject_id", "hadm_id"], how="left")

df.fillna(0, inplace=True)


In [7]:
df["length_of_stay"] = (df["dischtime"] - df["admittime"]).dt.days

df["previous_admissions"] = df.groupby("subject_id")["hadm_id"].cumcount()

heart_failure_codes = {'39891', '40201', '40211', '40291', '40401', '40403', '40411', '40413',
                       '40491', '40493', '4280', '4281', '42820', '42821', '42822', '42823', 
                       '42830', '42831', '42832', '42833', '42840', '42841', '42842', '42843', '4289'}

df["heart_failure"] = df["icd9_code"].astype(str).isin(heart_failure_codes).astype(int)


In [8]:
import pandas as pd

admissions["admittime"] = pd.to_datetime(admissions["admittime"], errors="coerce")
patients["dob"] = pd.to_datetime(patients["dob"], errors="coerce")

future_date_threshold = pd.Timestamp("2025-01-01")  # Adjust if needed
patients.loc[patients["dob"] > future_date_threshold, "dob"] = pd.NaT

admissions_patients = pd.merge(
    admissions[["subject_id", "admittime"]],
    patients[["subject_id", "dob"]],
    on="subject_id",
    how="left"
)

admissions_patients["age_at_admission"] = (
    (admissions_patients["admittime"].astype("int64") // 10**9 - 
     admissions_patients["dob"].astype("int64") // 10**9) / (60 * 60 * 24 * 365.25)
)

admissions_patients.loc[admissions_patients["age_at_admission"] > 300, "age_at_admission"] = 90
admissions_patients["age_at_admission"].fillna(admissions_patients["age_at_admission"].median(), inplace=True)

df = pd.merge(df, admissions_patients[["subject_id", "age_at_admission"]], on="subject_id", how="left")


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  admissions_patients["age_at_admission"].fillna(admissions_patients["age_at_admission"].median(), inplace=True)


In [9]:
from sklearn.model_selection import train_test_split

X = df[["length_of_stay", "previous_admissions", "age_at_admission", "heart_failure"]]  # Add more features as needed
y = df["readmitted_30d"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)


In [10]:
print(df.columns)


Index(['row_id_x', 'subject_id', 'hadm_id', 'admittime', 'dischtime',
       'deathtime', 'admission_type', 'admission_location',
       'discharge_location', 'insurance', 'language', 'religion',
       'marital_status', 'ethnicity', 'edregtime', 'edouttime', 'diagnosis',
       'hospital_expire_flag', 'has_chartevents_data', 'readmitted_30d',
       'row_id_y', 'seq_num', 'icd9_code', 'length_of_stay',
       'previous_admissions', 'heart_failure', 'age_at_admission'],
      dtype='object')


In [11]:
print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape)
print("Unique values in y_train:", y_train.unique())


X_train shape: (1099983, 4)
y_train shape: (1099983,)
Unique values in y_train: [0 1]


In [14]:
import xgboost as xgb
from sklearn.metrics import accuracy_score, roc_auc_score

dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

params = {
    'booster': 'gbtree',
    "objective": "binary:logistic",  # Binary classification
    "eval_metric": "auc",  # Use AUC as the metric
    "max_depth": 6,  # Slightly increased depth
    "learning_rate": 0.1,  # Lower LR for stable learning
    "scale_pos_weight": sum(y_train == 0) / sum(y_train == 1),  # Handle imbalance
    "subsample": 0.95,  # More diverse trees
    "colsample_bytree": 0.85,  # More feature selection
    "min_child_weight": 2,  # Prevents small data splits
    "gamma": 0.3,  # Prunes trees with low gain
    "reg_lambda": 1.5,  # L2 regularization
}

watchlist = [(dtrain, "train"), (dtest, "eval")]
model = xgb.train(
    params, dtrain, num_boost_round=500, evals=watchlist, 
    early_stopping_rounds=50, verbose_eval=50 # Logs progress every 50 rounds
)

y_pred = model.predict(dtest)
y_pred_binary = (y_pred > 0.5).astype(int)  # Convert probabilities to binary labels

accuracy = accuracy_score(y_test, y_pred_binary)
auc = roc_auc_score(y_test, y_pred)

print(f"\n🏆 Best Iteration: {model.best_iteration}")
print(f"🎯 Improved Accuracy: {accuracy:.4f}")
print(f"📊 Improved AUC Score: {auc:.4f}")


[0]	train-auc:0.56063	eval-auc:0.56090
[50]	train-auc:0.69928	eval-auc:0.70129
[100]	train-auc:0.70984	eval-auc:0.71138
[150]	train-auc:0.71560	eval-auc:0.71677
[200]	train-auc:0.72092	eval-auc:0.72174
[250]	train-auc:0.72450	eval-auc:0.72502
[300]	train-auc:0.72725	eval-auc:0.72743
[350]	train-auc:0.72996	eval-auc:0.72984
[400]	train-auc:0.73232	eval-auc:0.73201
[450]	train-auc:0.73370	eval-auc:0.73313
[499]	train-auc:0.73534	eval-auc:0.73447

🏆 Best Iteration: 499
🎯 Improved Accuracy: 0.7142
📊 Improved AUC Score: 0.7345
