<a href="https://colab.research.google.com/github/siddharth0517/Hospital-Readmission-Prediction-using-XGBoost/blob/main/Predicting_Hospital_Readmission_Using_XGBoost.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Importing Libraries**

In [6]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

**Loading Dataset and Preprocessing**

In [14]:
data = pd.read_csv('data.csv')

In [15]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25000 entries, 0 to 24999
Data columns (total 17 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   age                25000 non-null  object
 1   time_in_hospital   25000 non-null  int64 
 2   n_lab_procedures   25000 non-null  int64 
 3   n_procedures       25000 non-null  int64 
 4   n_medications      25000 non-null  int64 
 5   n_outpatient       25000 non-null  int64 
 6   n_inpatient        25000 non-null  int64 
 7   n_emergency        25000 non-null  int64 
 8   medical_specialty  25000 non-null  object
 9   diag_1             25000 non-null  object
 10  diag_2             25000 non-null  object
 11  diag_3             25000 non-null  object
 12  glucose_test       25000 non-null  object
 13  A1Ctest            25000 non-null  object
 14  change             25000 non-null  object
 15  diabetes_med       25000 non-null  object
 16  readmitted         25000 non-null  objec

In [16]:
# Handle missing values in 'medical_specialty' by replacing 'Missing' with NaN and imputing a value
data['medical_specialty'] = data['medical_specialty'].replace('Missing', np.nan)

data['medical_specialty'] = data['medical_specialty'].astype('object')

In [17]:
imputer = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
data['medical_specialty'] = imputer.fit(data[['medical_specialty']])

In [18]:
# Using LabelEncoder for binary features
binary_cols = ['glucose_test', 'A1Ctest', 'change', 'diabetes_med', 'readmitted']
le = LabelEncoder()

for col in binary_cols:
    data[col] = le.fit_transform(data[col])

In [21]:
# Use OneHotEncoder for diagnosis columns and medical_specialty
categorical_cols = ['age','diag_1', 'diag_2', 'diag_3', 'medical_specialty']
one_hot_encoder = OneHotEncoder(sparse_output=False, drop='first')
encoded_categorical = one_hot_encoder.fit_transform(data[categorical_cols])

In [22]:
encoded_df = pd.DataFrame(encoded_categorical, columns=one_hot_encoder.get_feature_names_out(categorical_cols))
data = pd.concat([data.drop(columns=categorical_cols), encoded_df], axis=1)


**Splitting the Dataset**

In [24]:
X= data.iloc[:,:-1].values
y= data.iloc[:,-1].values

In [25]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


**Training XGBoost Model**

In [26]:
from xgboost import XGBClassifier
classifier = XGBClassifier()
classifier.fit(X_train, y_train)


**Hypertuning**

In [29]:
from sklearn.model_selection import GridSearchCV
param_grid = {
    'max_depth': [3, 6, 9],
    'learning_rate': [0.01, 0.1, 0.3],
    'n_estimators': [100, 300, 500],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0],
    'gamma': [0, 0.1, 0.3]
}

grid_search = GridSearchCV(estimator=classifier, param_grid=param_grid, scoring='roc_auc', cv=3)
grid_search.fit(X_train, y_train)
best_model = grid_search.best_estimator_

In [30]:
from sklearn.metrics import classification_report, roc_auc_score

In [31]:
y_pred = best_model.predict(X_test)
print(classification_report(y_test, y_pred))
roc_auc = roc_auc_score(y_test, best_model.predict_proba(X_test)[:,1])
print(f'ROC-AUC Score: {roc_auc}')


              precision    recall  f1-score   support

         0.0       0.93      1.00      0.96      4648
         1.0       0.30      0.01      0.02       352

    accuracy                           0.93      5000
   macro avg       0.62      0.50      0.49      5000
weighted avg       0.89      0.93      0.90      5000

ROC-AUC Score: 0.8731767573540916


In [32]:
from sklearn.model_selection import cross_val_score
accuracies = cross_val_score(estimator = classifier, X = X_train, y = y_train, cv = 10)
print("Accuracy: {:.2f} %".format(accuracies.mean()*100))
print("Standard Deviation: {:.2f} %".format(accuracies.std()*100))

Accuracy: 91.69 %
Standard Deviation: 0.30 %
