In [1]:
#used Libraries
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier
from tensorflow.keras import Model
from tensorflow.keras.models import Model
import os
import joblib

In [2]:
# Loading the dataset
dataset = pd.read_csv("Dataset/hospital_readmissions.csv")

In [4]:
# Preprocessing and converting age-ranges to midpoints
def age_range_as_midpoint(age_range):
    start, end = age_range[1:-1].split("-")
    midpoint = (int(start) + int(end)) / 2
    return midpoint

dataset['age'] = dataset['age'].apply(age_range_as_midpoint)

In [5]:
# Encoding categorical columns
categorical_columns = ['medical_specialty', 'diag_1', 'diag_2', 'diag_3', 'glucose_test', 'A1Ctest', 'change', 'diabetes_med']
for column in categorical_columns:
    dataset[column] = LabelEncoder().fit_transform(dataset[column])

In [6]:
# Splitting the dataset into features (X) and target variable (y)
X = dataset.drop('readmitted', axis=1)
y = dataset['readmitted']

In [7]:
# Define the parameter grid with reduced size
param_grid = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.1, 0.5],
    'max_depth': [3, 5, 7]
}

# Initialize Gradient Boosting Classifier
gbc = GradientBoostingClassifier()

In [8]:
# Splitting the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [9]:
# Perform grid search with cross-validation
grid_search = GridSearchCV(estimator=gbc, param_grid=param_grid, cv=5, scoring='accuracy', verbose=2)
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 27 candidates, totalling 135 fits
[CV] END ..learning_rate=0.01, max_depth=3, n_estimators=100; total time=   2.1s
[CV] END ..learning_rate=0.01, max_depth=3, n_estimators=100; total time=   1.8s
[CV] END ..learning_rate=0.01, max_depth=3, n_estimators=100; total time=   1.9s
[CV] END ..learning_rate=0.01, max_depth=3, n_estimators=100; total time=   2.2s
[CV] END ..learning_rate=0.01, max_depth=3, n_estimators=100; total time=   1.9s
[CV] END ..learning_rate=0.01, max_depth=3, n_estimators=200; total time=   3.8s
[CV] END ..learning_rate=0.01, max_depth=3, n_estimators=200; total time=   3.9s
[CV] END ..learning_rate=0.01, max_depth=3, n_estimators=200; total time=   4.8s
[CV] END ..learning_rate=0.01, max_depth=3, n_estimators=200; total time=   4.7s
[CV] END ..learning_rate=0.01, max_depth=3, n_estimators=200; total time=   4.0s
[CV] END ..learning_rate=0.01, max_depth=3, n_estimators=300; total time=   6.1s
[CV] END ..learning_rate=0.01, max_depth=3, n_e

[CV] END ...learning_rate=0.5, max_depth=3, n_estimators=300; total time=   6.2s
[CV] END ...learning_rate=0.5, max_depth=3, n_estimators=300; total time=   6.0s
[CV] END ...learning_rate=0.5, max_depth=3, n_estimators=300; total time=   5.6s
[CV] END ...learning_rate=0.5, max_depth=3, n_estimators=300; total time=   5.5s
[CV] END ...learning_rate=0.5, max_depth=5, n_estimators=100; total time=   3.0s
[CV] END ...learning_rate=0.5, max_depth=5, n_estimators=100; total time=   3.1s
[CV] END ...learning_rate=0.5, max_depth=5, n_estimators=100; total time=   2.7s
[CV] END ...learning_rate=0.5, max_depth=5, n_estimators=100; total time=   3.0s
[CV] END ...learning_rate=0.5, max_depth=5, n_estimators=100; total time=   2.8s
[CV] END ...learning_rate=0.5, max_depth=5, n_estimators=200; total time=   5.8s
[CV] END ...learning_rate=0.5, max_depth=5, n_estimators=200; total time=   5.7s
[CV] END ...learning_rate=0.5, max_depth=5, n_estimators=200; total time=   5.7s
[CV] END ...learning_rate=0.

In [10]:
# Trainning the Random Forest Classifier
rfc = RandomForestClassifier(random_state=42, n_estimators=100, max_depth=None, min_samples_split=5, min_samples_leaf=2, n_jobs=-1)
rfc.fit(X_train, y_train)

In [11]:
# Evaluating the model
y_pred_train = rfc.predict(X_train)
y_pred_test = rfc.predict(X_test)

In [15]:
y_pred_train = rfc.predict(X_train)
y_pred_test = rfc.predict(X_test)

train_accuracy = accuracy_score(y_train, y_pred_train)
test_accuracy = accuracy_score(y_test, y_pred_test)

print(f'Training Accuracy: {train_accuracy}')
print(f'Testing Accuracy: {test_accuracy}')
print('Classification Report for Testing Set:')
print(classification_report(y_test, y_pred_test))

Training Accuracy: 0.9805
Testing Accuracy: 0.6086
Classification Report for Testing Set:
              precision    recall  f1-score   support

          no       0.62      0.69      0.65      2658
         yes       0.59      0.52      0.55      2342

    accuracy                           0.61      5000
   macro avg       0.61      0.60      0.60      5000
weighted avg       0.61      0.61      0.61      5000



In [24]:
# Define the directory for saving models
model_directory = "/Models"
if not os.path.exists(model_directory):
    os.makedirs(model_directory)

# Save the scikit-learn model
model_version = 1
model_path = f"model{model_version}.pkl"
joblib.dump(rfc, model_path)
print("Scikit-learn model saved successfully.")

Scikit-learn model saved successfully.
