# Process Data for Machine Learning

In [2]:
import os
import pandas as pd
import numpy as np
import sys
import zipfile
from dotenv import load_dotenv
from IPython.display import Markdown
import matplotlib.pyplot as plt
import seaborn as sns
load_dotenv()
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, confusion_matrix, make_scorer, f1_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import classification_report, confusion_matrix

In [3]:
ZIP_FILE = "datasets/hospital_readmission/diabetic_data_initial.csv.zip"

import zipfile
import pandas as pd

with zipfile.ZipFile(ZIP_FILE, 'r') as z:
    print(z.namelist())  # Optional: check file inside ZIP
    with z.open(z.namelist()[0]) as f:
        df = pd.read_csv(f)

df.head()



['diabetic_data_initial.csv', '__MACOSX/._diabetic_data_initial.csv']


Unnamed: 0,encounter_id,patient_nbr,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,...,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
0,2278392,8222157,Caucasian,Female,[0-10),?,6,25,1,1,...,No,No,No,No,No,No,No,No,No,NO
1,149190,55629189,Caucasian,Female,[10-20),?,1,1,7,3,...,No,Up,No,No,No,No,No,Ch,Yes,>30
2,64410,86047875,AfricanAmerican,Female,[20-30),?,1,1,7,2,...,No,No,No,No,No,No,No,No,Yes,NO
3,500364,82442376,Caucasian,Male,[30-40),?,1,1,7,2,...,No,Up,No,No,No,No,No,Ch,Yes,NO
4,16680,42519267,Caucasian,Male,[40-50),?,1,1,7,1,...,No,Steady,No,No,No,No,No,Ch,Yes,NO


In [4]:
#checking shape of the dataset
df.shape

#Checking data types of each variable
df.dtypes

encounter_id                 int64
patient_nbr                  int64
race                        object
gender                      object
age                         object
weight                      object
admission_type_id            int64
discharge_disposition_id     int64
admission_source_id          int64
time_in_hospital             int64
payer_code                  object
medical_specialty           object
num_lab_procedures           int64
num_procedures               int64
num_medications              int64
number_outpatient            int64
number_emergency             int64
number_inpatient             int64
diag_1                      object
diag_2                      object
diag_3                      object
number_diagnoses             int64
max_glu_serum               object
A1Cresult                   object
metformin                   object
repaglinide                 object
nateglinide                 object
chlorpropamide              object
glimepiride         

In [5]:
#Checking for missing values in dataset
#In the dataset missing values are represented as '?' sign
for col in df.columns:
    if df[col].dtype == object:
         print(col,df[col][df[col] == '?'].count())

race 2273
gender 0
age 0
weight 98569
payer_code 40256
medical_specialty 49949
diag_1 21
diag_2 358
diag_3 1423
max_glu_serum 0
A1Cresult 0
metformin 0
repaglinide 0
nateglinide 0
chlorpropamide 0
glimepiride 0
acetohexamide 0
glipizide 0
glyburide 0
tolbutamide 0
pioglitazone 0
rosiglitazone 0
acarbose 0
miglitol 0
troglitazone 0
tolazamide 0
examide 0
citoglipton 0
insulin 0
glyburide-metformin 0
glipizide-metformin 0
glimepiride-pioglitazone 0
metformin-rosiglitazone 0
metformin-pioglitazone 0
change 0
diabetesMed 0
readmitted 0


In [6]:
# gender was coded differently so we use a custom count for this one            
print('gender', df['gender'][df['gender'] == 'Unknown/Invalid'].count())   

gender 3


In [7]:
#dropping columns with large number of missing values
df = df.drop(['weight','payer_code','medical_specialty'], axis = 1)

In [8]:
drop_Idx = set(df[(df['diag_1'] == '?') & (df['diag_2'] == '?') & (df['diag_3'] == '?')].index)

drop_Idx = drop_Idx.union(set(df['diag_1'][df['diag_1'] == '?'].index))
drop_Idx = drop_Idx.union(set(df['diag_2'][df['diag_2'] == '?'].index))
drop_Idx = drop_Idx.union(set(df['diag_3'][df['diag_3'] == '?'].index))
drop_Idx = drop_Idx.union(set(df['race'][df['race'] == '?'].index))
drop_Idx = drop_Idx.union(set(df[df['discharge_disposition_id'] == 11].index))
drop_Idx = drop_Idx.union(set(df['gender'][df['gender'] == 'Unknown/Invalid'].index))
new_Idx = list(set(df.index) - set(drop_Idx))
df = df.iloc[new_Idx]

In [9]:
#Checking for missing values in the data
for col in df.columns:
    if df[col].dtype == object:
         print(col,df[col][df[col] == '?'].count())
            
print('gender', df['gender'][df['gender'] == 'Unknown/Invalid'].count())   

race 0
gender 0
age 0
diag_1 0
diag_2 0
diag_3 0
max_glu_serum 0
A1Cresult 0
metformin 0
repaglinide 0
nateglinide 0
chlorpropamide 0
glimepiride 0
acetohexamide 0
glipizide 0
glyburide 0
tolbutamide 0
pioglitazone 0
rosiglitazone 0
acarbose 0
miglitol 0
troglitazone 0
tolazamide 0
examide 0
citoglipton 0
insulin 0
glyburide-metformin 0
glipizide-metformin 0
glimepiride-pioglitazone 0
metformin-rosiglitazone 0
metformin-pioglitazone 0
change 0
diabetesMed 0
readmitted 0
gender 0


In [10]:
readmit_map = {'<30': 0, '>30': 1, 'NO': 2}
df['readmitted'] = df['readmitted'].map(readmit_map)

In [11]:
# Drop rows where mapping failed (if any)
df = df[df['readmitted'].notna()]
df['readmitted'] = df['readmitted'].astype(int)

In [12]:
# Step 3: Select Features
# -----------------------------
selected_cols = [
    'race', 'gender', 'age', 'admission_type_id', 'discharge_disposition_id',
    'admission_source_id', 'time_in_hospital', 'num_lab_procedures',
    'num_procedures', 'num_medications', 'number_outpatient', 'number_emergency',
    'number_inpatient', 'number_diagnoses', 'max_glu_serum', 'A1Cresult',
    'metformin', 'insulin', 'change', 'diabetesMed'
]

X = df[selected_cols]
y = df['readmitted']

In [13]:
X = df[selected_cols]
y = df['readmitted']

# -----------------------------
# Step 4: Encode Categorical Features
# -----------------------------
X = X.copy()
for col in X.select_dtypes(include='object').columns:
    X[col] = LabelEncoder().fit_transform(X[col])


In [14]:
print("Class distribution:\n", y.value_counts())


Class distribution:
 readmitted
2    50731
1    34649
0    11066
Name: count, dtype: int64


In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.3, random_state=42)



In [16]:
from imblearn.over_sampling import SMOTE
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

In [17]:
print("Resampled training set class distribution:")
print(pd.Series(y_train_resampled).value_counts())

Resampled training set class distribution:
readmitted
1    35512
2    35512
0    35512
Name: count, dtype: int64


In [18]:
from sklearn.utils.class_weight import compute_sample_weight
sample_weights = compute_sample_weight(class_weight='balanced', y=y_train_resampled)

In [19]:
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 5, 7, 10],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'gamma': [0, 0.1, 0.2],
    'min_child_weight': [1, 5, 10]
}

xgb = XGBClassifier(
    objective='multi:softprob',
    num_class=3,
    use_label_encoder=False,
    eval_metric='mlogloss',
    random_state=42
)

f1_macro = make_scorer(f1_score, average='macro')

clf = RandomizedSearchCV(
    estimator=xgb,
    param_distributions=param_grid,
    scoring=f1_macro,
    n_iter=25,
    cv=3,
    verbose=1,
    n_jobs=-1
)

clf.fit(X_train_resampled, y_train_resampled, sample_weight=sample_weights)



Fitting 3 folds for each of 25 candidates, totalling 75 fits


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [21]:
best_params = clf.best_params_

final_model = XGBClassifier(
    objective='multi:softprob',
    num_class=3,
    eval_metric='mlogloss',
    use_label_encoder=False,
    **best_params
)

final_model.fit(
    X_train_resampled,
    y_train_resampled,
    sample_weight=sample_weights,
    eval_set=[(X_test, y_test)],
    verbose=True
)

# Step 6: Threshold tuning for class <30>
probs = final_model.predict_proba(X_test)

# Custom threshold for class 0 ("<30")
threshold = 0.3
y_pred_custom = np.where(probs[:, 0] > threshold, 0, np.argmax(probs, axis=1))

# Step 7: Evaluation
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_custom))
print("Classification Report:\n", classification_report(y_test, y_pred_custom, target_names=['<30', '>30', 'NO']))

[0]	validation_0-mlogloss:1.08902
[1]	validation_0-mlogloss:1.07979
[2]	validation_0-mlogloss:1.07096
[3]	validation_0-mlogloss:1.06524


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[4]	validation_0-mlogloss:1.05910
[5]	validation_0-mlogloss:1.05266
[6]	validation_0-mlogloss:1.04723
[7]	validation_0-mlogloss:1.04211
[8]	validation_0-mlogloss:1.03796
[9]	validation_0-mlogloss:1.03353
[10]	validation_0-mlogloss:1.02888
[11]	validation_0-mlogloss:1.02563
[12]	validation_0-mlogloss:1.02206
[13]	validation_0-mlogloss:1.01968
[14]	validation_0-mlogloss:1.01639
[15]	validation_0-mlogloss:1.01352
[16]	validation_0-mlogloss:1.01085
[17]	validation_0-mlogloss:1.00850
[18]	validation_0-mlogloss:1.00637
[19]	validation_0-mlogloss:1.00445
[20]	validation_0-mlogloss:1.00292
[21]	validation_0-mlogloss:1.00138
[22]	validation_0-mlogloss:1.00022
[23]	validation_0-mlogloss:0.99868
[24]	validation_0-mlogloss:0.99733
[25]	validation_0-mlogloss:0.99602
[26]	validation_0-mlogloss:0.99480
[27]	validation_0-mlogloss:0.99368
[28]	validation_0-mlogloss:0.99270
[29]	validation_0-mlogloss:0.99139
[30]	validation_0-mlogloss:0.99045
[31]	validation_0-mlogloss:0.98972
[32]	validation_0-mlogloss

In [23]:
# Save the final trained model
final_model.save_model("xgb_model.json")


In [None]:

# Save model as JSON (recommended for MCP/XGBoost tools)
xgb.save_model("xgb_model.json")

In [None]:
y_pred = xgb.predict(X_test)
print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

In [None]:
samples = []
for class_label in [0, 1, 2]:
    subset = X_test[y_test == class_label]
    samples.append(subset.sample(n=20, random_state=42))

sample_df = pd.concat(samples)
sample_preds = xgb.predict(sample_df)

print("\nSample Predictions (20 per class):")
print(pd.DataFrame({
    'Actual': y.loc[sample_df.index],
    'Predicted': sample_preds
}))

In [None]:
# Combine the samples into one test DataFrame
sample_df = pd.concat(samples)

# Save the test input without 'readmitted' label to CSV
sample_df.to_csv("test_set_for_claude.csv", index=False)