In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier



In [2]:
# diabetes_data = pd.read_csv("cleaned_diabetes_data.csv")
diabetes_data = pd.read_pickle("cleaned_diabetes_data.pkl")


In [3]:
diabetes_data.dtypes

race                        category
gender                      category
age                            int64
admission_type_id              int64
discharge_disposition_id       int64
admission_source_id            int64
time_in_hospital               int64
num_lab_procedures             int64
num_procedures                 int64
num_medications                int64
number_outpatient              int64
number_emergency               int64
number_inpatient               int64
diag_1                      category
diag_2                      category
diag_3                      category
number_diagnoses               int64
max_glu_serum               category
A1Cresult                   category
metformin                   category
repaglinide                 category
nateglinide                 category
chlorpropamide              category
glimepiride                 category
acetohexamide               category
glipizide                   category
glyburide                   category
t

In [4]:
diabetes_data.head()
# diabetes_data = diabetes_data.drop(columns=["A1Cresult","max_glu_serum",'metformin',
#  'repaglinide',
#  'nateglinide',
#  'chlorpropamide',
#  'glimepiride',
#  'acetohexamide',
#  'glipizide',
#  'glyburide',
#  'tolbutamide',
#  'pioglitazone',
#  'rosiglitazone',
#  'acarbose',
#  'miglitol',
#  'troglitazone',
#  'tolazamide',
#  'examide',
#  'citoglipton',
#  'insulin',
#  'glyburide-metformin',
#  'glipizide-metformin',
#  'glimepiride-pioglitazone',
#  'metformin-rosiglitazone',
#  'metformin-pioglitazone', ])


# diabetes_data["emergency_inpatient_outpatient"] = (diabetes_data['number_emergency'] +
#                                         diabetes_data['number_inpatient'] +
#                                         diabetes_data['number_outpatient'])
# diabetes_data = diabetes_data.drop(columns=['number_emergency', 'number_inpatient', 'number_outpatient'])

Unnamed: 0,race,gender,age,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,num_lab_procedures,num_procedures,num_medications,...,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
1,Caucasian,Female,15,1,1,7,3,59,0,18,...,No,Up,No,No,No,No,No,Ch,Yes,>30
2,AfricanAmerican,Female,25,1,1,7,2,11,5,13,...,No,No,No,No,No,No,No,No,Yes,NO
3,Caucasian,Male,35,1,1,7,2,44,1,16,...,No,Up,No,No,No,No,No,Ch,Yes,NO
4,Caucasian,Male,45,1,1,7,1,51,0,8,...,No,Steady,No,No,No,No,No,Ch,Yes,NO
5,Caucasian,Male,55,2,1,2,3,31,6,16,...,No,Steady,No,No,No,No,No,No,Yes,>30


In [5]:
target_data = diabetes_data["readmitted"]
# label_encoder = LabelEncoder()
# target_data = label_encoder.fit_transform(target_data)

In [6]:
target_data = target_data.replace({'<30': 1, '>30':1, "NO":0})


In [7]:
target_data.unique()

[1, 0]
Categories (2, int64): [1, 0]

In [8]:
features_diabetes_data = diabetes_data.drop(columns="readmitted")
features_diabetes_data.head()

Unnamed: 0,race,gender,age,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,num_lab_procedures,num_procedures,num_medications,...,examide,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed
1,Caucasian,Female,15,1,1,7,3,59,0,18,...,No,No,Up,No,No,No,No,No,Ch,Yes
2,AfricanAmerican,Female,25,1,1,7,2,11,5,13,...,No,No,No,No,No,No,No,No,No,Yes
3,Caucasian,Male,35,1,1,7,2,44,1,16,...,No,No,Up,No,No,No,No,No,Ch,Yes
4,Caucasian,Male,45,1,1,7,1,51,0,8,...,No,No,Steady,No,No,No,No,No,Ch,Yes
5,Caucasian,Male,55,2,1,2,3,31,6,16,...,No,No,Steady,No,No,No,No,No,No,Yes


In [9]:
#Using the train test split from Sklearn. note that I am using startify because I want the proportions
#of the target to be balanced

In [10]:
x_train, x_test, y_train, y_test = train_test_split(features_diabetes_data,target_data , train_size=.8,random_state=42, stratify=target_data)

In [11]:
# insanity test for confirming the split
print(f"x_train length  is {len(x_train)} which  should be equal to x_train of length {len(y_train)}")

x_train length  is 78500 which  should be equal to x_train of length 78500


In [12]:
x_train.head()

Unnamed: 0,race,gender,age,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,num_lab_procedures,num_procedures,num_medications,...,examide,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed
8029,Caucasian,Female,55,2,31,1,8,45,0,16,...,No,No,Steady,No,No,No,No,No,Ch,Yes
34129,Caucasian,Male,75,5,3,27,3,7,2,29,...,No,No,No,No,No,No,No,No,No,No
10672,Caucasian,Female,85,1,31,7,6,38,0,8,...,No,No,Up,No,No,No,No,No,Ch,Yes
56844,Caucasian,Female,75,5,1,27,4,70,6,18,...,No,No,Up,No,No,No,No,No,Ch,Yes
62690,Caucasian,Male,75,2,1,7,4,10,0,12,...,No,No,Steady,No,No,No,No,No,Ch,Yes


In [13]:
y_train.head()

8029     0
34129    0
10672    1
56844    1
62690    1
Name: readmitted, dtype: category
Categories (2, int64): [1, 0]

In [14]:
categorical_features = [feature for feature in x_train.columns[x_train.dtypes=="category"]]
numerical_features = [feature for feature in diabetes_data.columns[diabetes_data.dtypes=="int64"]]

In [15]:
categorical_features

['race',
 'gender',
 'diag_1',
 'diag_2',
 'diag_3',
 'max_glu_serum',
 'A1Cresult',
 'metformin',
 'repaglinide',
 'nateglinide',
 'chlorpropamide',
 'glimepiride',
 'acetohexamide',
 'glipizide',
 'glyburide',
 'tolbutamide',
 'pioglitazone',
 'rosiglitazone',
 'acarbose',
 'miglitol',
 'troglitazone',
 'tolazamide',
 'examide',
 'citoglipton',
 'insulin',
 'glyburide-metformin',
 'glipizide-metformin',
 'glimepiride-pioglitazone',
 'metformin-rosiglitazone',
 'metformin-pioglitazone',
 'change',
 'diabetesMed']

In [16]:
print(f"Length of {len(categorical_features)} + {len(numerical_features)} = {len(diabetes_data.columns)}")

Length of 32 + 12 = 45


# Numerical and categorical pipeline


In [17]:
num_pipeline = Pipeline(steps= [
                                ("imputer", SimpleImputer(strategy="mean")),
                                ("scaler", StandardScaler())
                        
])
cat_pipeline = Pipeline(steps=[
                                ("imputer",SimpleImputer(strategy="most_frequent")),
                                ("one_hot_encode", OneHotEncoder(drop="first", sparse_output=False,handle_unknown='ignore'))
])


# Column transfer to combine pipeline

In [18]:
preprocessor = ColumnTransformer(
                transformers = [ ("number", num_pipeline, numerical_features),
                                ("category", cat_pipeline, categorical_features)
                               ]
                    
)

# bulild a model using Logistic regression

In [19]:
log_reg_pipeline = Pipeline(steps=[("preprocessor", preprocessor),("classifier", LogisticRegression(max_iter=2000))
                                  ])


Now let us fit the pipeline on the data

# i first tried max iter = 1000 but it error out because the iteration was too small to reach a solution so
#I doubled to iteration to 2000

In [20]:
log_reg_pipeline.fit(x_train, y_train)

In [21]:
readmitanc_predict = log_reg_pipeline.predict(x_test)



In [22]:
print(f"Report {classification_report(y_test, readmitanc_predict)}")

Report               precision    recall  f1-score   support

           0       0.61      0.78      0.69     10474
           1       0.63      0.44      0.52      9151

    accuracy                           0.62     19625
   macro avg       0.62      0.61      0.60     19625
weighted avg       0.62      0.62      0.61     19625



In [23]:
print(f"Accuracy is {accuracy_score(y_test, readmitanc_predict)}")

Accuracy is 0.6199235668789809


In [24]:
readmitanc_predict[45:55]

array([0, 0, 0, 0, 0, 0, 1, 0, 0, 1])

In [25]:
y_test[45:55]

32017    0
74268    0
21660    0
18213    1
3119     0
38680    1
40687    0
56003    0
83949    0
4122     1
Name: readmitted, dtype: category
Categories (2, int64): [1, 0]

# Lets use Random Forest

In [26]:

random_forest_pipeline = Pipeline(steps=[("preprocessor", preprocessor),("classifier", RandomForestClassifier(random_state=42))
                                  ])
random_forest_pipeline.fit(x_train, y_train)


In [27]:
# Now predicting
readmitanc_predict_rf = random_forest_pipeline.predict(x_test)



In [28]:
print(f"Accuracy is {accuracy_score(y_test, readmitanc_predict_rf)}")

Accuracy is 0.6289426751592356


In [29]:
print(f"Report {classification_report(y_test, readmitanc_predict_rf)}")

Report               precision    recall  f1-score   support

           0       0.63      0.72      0.67     10474
           1       0.62      0.52      0.57      9151

    accuracy                           0.63     19625
   macro avg       0.63      0.62      0.62     19625
weighted avg       0.63      0.63      0.63     19625



#using class_weight parameter to give more wieght to the minority class and see the difference

In [30]:
rand_forest_cw_included = RandomForestClassifier(class_weight="balanced",random_state=42)
random_forest_pipeline = Pipeline(steps=[("preprocessor", preprocessor),
                                         ("classifier", rand_forest_cw_included)])


In [31]:
random_forest_pipeline.fit(x_train,y_train)
predict_random_forest = random_forest_pipeline.predict(x_test)



In [32]:
print(f"Accuracy is {accuracy_score(y_test, predict_random_forest)}")

Accuracy is 0.6305732484076433


In [33]:
print(f"Report {classification_report(y_test, predict_random_forest)}")

Report               precision    recall  f1-score   support

           0       0.63      0.73      0.68     10474
           1       0.63      0.51      0.57      9151

    accuracy                           0.63     19625
   macro avg       0.63      0.62      0.62     19625
weighted avg       0.63      0.63      0.63     19625



In [34]:
# XGBClassifier boost Pipeline
xgb_pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("classifier", XGBClassifier(use_label_encoder=False, eval_metric="mlogloss", random_state=42))
])



In [35]:
xgb_pipeline.fit(x_train, y_train)

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [36]:
readmitance_predict_xbg = xgb_pipeline.predict(x_test)



In [37]:
readmitance_predict_xbg
print(f"Accuracy is {accuracy_score(y_test, readmitance_predict_xbg)}")

Accuracy is 0.642547770700637


In [38]:
print(f"Report {classification_report(y_test, readmitance_predict_xbg)}")

Report               precision    recall  f1-score   support

           0       0.65      0.71      0.68     10474
           1       0.63      0.57      0.60      9151

    accuracy                           0.64     19625
   macro avg       0.64      0.64      0.64     19625
weighted avg       0.64      0.64      0.64     19625



In [39]:
diabetes_data.columns


Index(['race', 'gender', 'age', 'admission_type_id',
       'discharge_disposition_id', 'admission_source_id', 'time_in_hospital',
       'num_lab_procedures', 'num_procedures', 'num_medications',
       'number_outpatient', 'number_emergency', 'number_inpatient', 'diag_1',
       'diag_2', 'diag_3', 'number_diagnoses', 'max_glu_serum', 'A1Cresult',
       'metformin', 'repaglinide', 'nateglinide', 'chlorpropamide',
       'glimepiride', 'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide',
       'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone',
       'tolazamide', 'examide', 'citoglipton', 'insulin',
       'glyburide-metformin', 'glipizide-metformin',
       'glimepiride-pioglitazone', 'metformin-rosiglitazone',
       'metformin-pioglitazone', 'change', 'diabetesMed', 'readmitted'],
      dtype='object')

In [40]:
# Not impressive, not lets see what the confision matrix says
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

y_pred = xgb_pipeline.predict(x_test)
label_encoder = LabelEncoder()

cm = confusion_matrix(y_test, y_pred)


print("Confusion Matrix:")
print(cm)



Confusion Matrix:
[[7410 3064]
 [3951 5200]]


