In [1]:
# Machine Learning Models
from sklearn.linear_model import LogisticRegression
import pandas as pd
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 100)

# Model Evaluation
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score, f1_score, confusion_matrix, ConfusionMatrixDisplay

from sklearn.metrics import make_scorer
 
from sklearn.metrics import classification_report
import random
random.seed(100)
 
import time
import pyodbc
print(pyodbc.drivers())
import numpy as np
import os
import time
import matplotlib.pyplot as plt
import seaborn as sns
 
# Data Preprocessing
from sklearn.model_selection import train_test_split

# mlflow
import mlflow
import mlflow.sklearn
from sqlalchemy import create_engine, text

['SQL Server', 'ODBC Driver 17 for SQL Server', 'SQL Server Native Client RDA 11.0', 'Microsoft Access Driver (*.mdb, *.accdb)', 'Microsoft Excel Driver (*.xls, *.xlsx, *.xlsm, *.xlsb)', 'Microsoft Access Text Driver (*.txt, *.csv)', 'Microsoft Access dBASE Driver (*.dbf, *.ndx, *.mdx)']


In [2]:
# Loading validation data
# Reads the validation CSV from a path relative to the notebook's working directory.
val_data = pd.read_csv('../data/interim/Final_data_validation.csv')
# Shown as the cell output: (rows, columns) of the loaded frame.
val_data.shape

(16326, 27)

In [3]:
# Preview the first three rows to sanity-check the loaded columns and values.
# NOTE(review): the rendered preview includes encounter_id and patient_nbr values;
# consider clearing this output before sharing the notebook.
val_data.head(3)

Unnamed: 0.1,Unnamed: 0,encounter_id,patient_nbr,race,gender,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_diagnoses,max_glu_serum,A1Cresult,metformin,insulin,change,diabetesMed,readmitted,admission_type_desc,discharge_category,admission_category,Specialty_Group,payer_code_group,number_outpatient_log,number_inpatient_log,number_emergency_log,diag_3_cat,Patient_Age
0,0,268763496,89048466,AfricanAmerican,Female,3,14,0,12,9,No,Norm,1,1,Ch,Yes,1,Emergency,Discharged to Home,Emergency Admission,Other,Government Programs,0.693147,0.693147,0.0,Circulatory,52
1,1,268777020,50550156,Caucasian,Male,4,37,1,20,9,No,No,0,0,No,No,0,Emergency,Discharged to Home,Emergency Admission,Other,Self-Pay/Other,0.0,0.0,0.0,Respiratory,54
2,2,268780680,67522518,Caucasian,Female,4,63,0,29,9,No,No,0,2,Ch,Yes,0,Emergency,Transfers to Other Healthcare Facilities,Emergency Admission,Other,Government Programs,0.0,0.693147,0.693147,Genitourinary,88


In [4]:
# Remove the 'Unnamed: 0', patient_nbr and gender columns from the validation frame.
val_data = val_data.drop(columns=['Unnamed: 0', 'patient_nbr', 'gender'])

In [5]:
# Number of columns left in the validation frame.
val_data.shape[1]

24

In [6]:
# List the column labels currently present in the validation frame.
val_data.columns

Index(['encounter_id', 'race', 'time_in_hospital', 'num_lab_procedures',
       'num_procedures', 'num_medications', 'number_diagnoses',
       'max_glu_serum', 'A1Cresult', 'metformin', 'insulin', 'change',
       'diabetesMed', 'readmitted', 'admission_type_desc',
       'discharge_category', 'admission_category', 'Specialty_Group',
       'payer_code_group', 'number_outpatient_log', 'number_inpatient_log',
       'number_emergency_log', 'diag_3_cat', 'Patient_Age'],
      dtype='object')

In [7]:
# Class balance of the target: share of rows for each `readmitted` value.
val_data['readmitted'].value_counts(normalize=True)

readmitted
0    0.607313
1    0.392687
Name: proportion, dtype: float64

In [8]:
# Discard the two columns that are not carried forward: max_glu_serum and Specialty_Group.
val_data = val_data.drop(columns=['max_glu_serum', 'Specialty_Group'])

In [9]:
# Columns with object dtype; the same names are listed as cat_cols in the encoding step below.
val_data.select_dtypes(include='object').columns

Index(['race', 'A1Cresult', 'change', 'diabetesMed', 'admission_type_desc',
       'discharge_category', 'admission_category', 'payer_code_group',
       'diag_3_cat'],
      dtype='object')

In [10]:
### One-hot encoding
# Object-dtype columns that get expanded into indicator columns.
cat_cols = [
    'race',
    'A1Cresult',
    'change',
    'diabetesMed',
    'admission_type_desc',
    'discharge_category',
    'admission_category',
    'payer_code_group',
    'diag_3_cat',
]

# drop_first=True keeps k-1 indicator columns for a column with k levels.
# NOTE(review): the levels come from the validation file itself; confirm the
# resulting columns match the ones the saved model was trained on.
data_encoded = pd.get_dummies(data=val_data, columns=cat_cols, drop_first=True)
data_encoded.shape

(16326, 41)

In [11]:
# Target vector: the readmitted column.
y_val = data_encoded['readmitted']

# Feature frame: every other column (encounter_id is still included at this point).
X_val = data_encoded.drop('readmitted', axis=1)

In [12]:
## Loading model
# pd.read_pickle unpickles the saved estimator object from disk.
# NOTE(review): unpickling can execute arbitrary code, so only load model files
# that come from a trusted source.
model = pd.read_pickle('../model/lr_model.pkl')

In [13]:
# Predictions
DECISION_THRESHOLD = 0.45  # probability cut-off used to label a row as class 1

# Columns that later cells append to X_val. They are excluded here (when present)
# so this cell can be re-run after those cells without handing them to the model.
_result_columns = ['Readmission_Probabilities', 'Deciles', 'Actual_Target', 'Predicted_Target']
val_features = (
    X_val
    .drop(['encounter_id'], axis=1)  # identifier column is excluded before scoring
    .drop(columns=_result_columns, errors='ignore')
)

y_prob_val = model.predict_proba(val_features)[:, 1]  # Probability of positive class
# The separate model.predict() call was removed: its result was overwritten on the
# very next line, so it only added a second scoring pass over the data.
y_pred_val = (y_prob_val >= DECISION_THRESHOLD).astype(int)


# Metrics for validation Data
print("\nLogistic Regression Model Validation Report:")
print(classification_report(y_val, y_pred_val))
print(f"Validation Accuracy: {accuracy_score(y_val, y_pred_val)}")
print(f"Validation Precision: {precision_score(y_val, y_pred_val)}")
print(f"Validation Recall: {recall_score(y_val, y_pred_val)}")
print(f"Validation F1 Score: {f1_score(y_val, y_pred_val)}")


Logistic Regression Model Validation Report:
              precision    recall  f1-score   support

           0       0.75      0.46      0.57      9915
           1       0.48      0.77      0.59      6411

    accuracy                           0.58     16326
   macro avg       0.62      0.61      0.58     16326
weighted avg       0.65      0.58      0.58     16326

Validation Accuracy: 0.5799950998407448
Validation Precision: 0.4783158304161805
Validation Recall: 0.7672749961004524
Validation F1 Score: 0.5892782270140761


In [14]:
# Reuse the positive-class probabilities already computed in the prediction cell
# (y_prob_val) instead of scoring the validation set a second time. This also keeps
# the cell safe to re-run: it no longer feeds X_val, which by then carries the
# Readmission_Probabilities column, back into the model.
_val_probabilities = y_prob_val

X_val['Readmission_Probabilities'] = _val_probabilities


In [15]:
# Probability band (min, max) for each decile, ordered from decile 1 (highest
# probabilities) down to decile 10 (lowest). Adjacent bands do not touch exactly:
# small gaps exist between one band's max and the next band's min
# (e.g. 0.709294 -> 0.709328 and 0.309995 -> 0.310055).
decile_ranges = [
    (0.709328, 1.0),  # Decile 1
    (0.620649, 0.709294),  # Decile 2
    (0.550039, 0.620640),  # Decile 3
    (0.499012, 0.550039),  # Decile 4
    (0.461631, 0.499006),  # Decile 5
    (0.429849, 0.461628),  # Decile 6
    (0.396432, 0.429846),  # Decile 7
    (0.358978, 0.396431),  # Decile 8
    (0.310055, 0.358972),  # Decile 9
    (0.00, 0.309995)   # Decile 10
]

# Function to assign deciles based on training probability
def assign_decile(prob):
    """Return the decile (1 = highest band, 10 = lowest band) for ``prob``.

    Only the lower bound of each band is used as a cut-off, so a probability that
    lands in the gap between two adjacent bands (for example 0.7093, which lies
    between 0.709294 and 0.709328) is placed in the lower-probability band instead
    of being left unassigned. Values inside a band get the same decile as before.

    Relies on ``decile_ranges`` being ordered from the highest band to the lowest.

    Parameters
    ----------
    prob : float
        Predicted probability to classify.

    Returns
    -------
    int or None
        Decile number, or None when ``prob`` is NaN or lies outside the overall
        span covered by ``decile_ranges``.
    """
    overall_min = decile_ranges[-1][0]
    overall_max = decile_ranges[0][1]
    # A chained comparison involving NaN is False, so NaN is rejected here too.
    if not (overall_min <= prob <= overall_max):
        return None

    for decile, (lower_bound, _upper_bound) in enumerate(decile_ranges, start=1):
        if prob >= lower_bound:
            return decile
    return None

# Map every validation probability to its decile via assign_decile
X_val['Deciles'] = X_val['Readmission_Probabilities'].map(assign_decile)


In [16]:
# Attach the true label (y_val) and the thresholded prediction (y_pred_val from the
# prediction cell) as extra columns next to the features.
X_val['Actual_Target'] = y_val
X_val['Predicted_Target'] = y_pred_val

In [17]:
# Copying to excel for further analysis
# to_clipboard() places the whole X_val frame on the system clipboard for pasting.
# NOTE(review): this is a manual step that leaves no saved artifact and needs a
# session with clipboard access; the frame still contains encounter_id.
X_val.to_clipboard()