In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_curve, precision_recall_curve, classification_report, confusion_matrix
from sklearn.metrics import recall_score, precision_score, f1_score, accuracy_score, roc_auc_score
import time
import warnings
# Notebook-wide settings: silence every warning category and let pandas show
# up to 100 rows / 100 columns before it truncates a displayed frame.
warnings.simplefilter('ignore')
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, 100)

In [None]:
# Colab only: attach Google Drive so the dataset under "My Drive" is readable
# through the local filesystem.
from google.colab import drive

drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Load the preprocessed (pre-feature-selection) diabetes dataset from the
# mounted Drive and preview the first rows.
csv_path = '/content/drive/My Drive/PGPDSE/Capstone/CSV Files/Diabetes_Preprocessed_Before_Feature_Selection.csv'
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,race,gender,age,discharge_disposition_id,admission_source_id,time_in_hospital,num_lab_procedures,num_procedures,num_medications,diag_1,diag_2,diag_3,number_diagnoses,max_glu_serum,A1Cresult,metformin,repaglinide,glimepiride,glipizide,glyburide,pioglitazone,rosiglitazone,insulin,diabetesMed,readmitted,preceding_year_visits,number_changes,insulin_treatment
0,Caucasian,0,5,Not Available,Referral,1,41,0,1,Diabetes,Not Required,Not Required,1,,,-2,-2,-2,-2,-2,-2,-2,-2,0,0,0,0,no_med
1,Caucasian,0,15,Discharged to home,Emergency,3,59,0,18,"Endocrine, Nutritional, Metabolic, Immunity",Diabetes,"Endocrine, Nutritional, Metabolic, Immunity",9,,,-2,-2,-2,-2,-2,-2,-2,1,1,0,0,1,insulin_only
2,AfricanAmerican,0,25,Discharged to home,Emergency,2,11,5,13,"Pregnancy, Childbirth",Diabetes,External causes of injury,6,,,-2,-2,-2,0,-2,-2,-2,-2,1,0,3,0,other_meds
3,Caucasian,1,35,Discharged to home,Emergency,2,44,1,16,Infectious and Parasitic,Diabetes,Circulatory,7,,,-2,-2,-2,-2,-2,-2,-2,1,1,0,0,1,insulin_only
4,Caucasian,1,45,Discharged to home,Emergency,1,51,0,8,Neoplasms,Neoplasms,Diabetes,5,,,-2,-2,-2,0,-2,-2,-2,0,1,0,0,0,insulin_combo


In [None]:
# Sanity check: (rows, columns) of the frame that was just loaded.
df.shape

(97070, 28)

In [None]:
# Separate the target from the predictors. `columns=` replaces the positional
# axis argument (`df.drop('readmitted', 1)`), which pandas deprecated in 1.3
# and rejects with a TypeError from 2.0 onwards.
X = df.drop(columns='readmitted')
y = df['readmitted']

# One-hot encode the categorical predictors; drop_first removes one level per
# feature so the resulting dummy columns are not perfectly collinear.
X_dum = pd.get_dummies(X, drop_first = True)

# 70/30 split, stratified on the target so both partitions keep the same
# class ratio; random_state pins the shuffle for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X_dum, y, test_size = 0.3, stratify = y, random_state = 0
)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((67949, 95), (29121, 95), (67949,), (29121,))

### Default Decision Tree with balanced class weights

In [None]:
# Baseline tree: only the seed and class weighting are set; every other
# hyperparameter is left at its scikit-learn default.
model = DecisionTreeClassifier(class_weight = 'balanced', random_state = 0)
model.fit(X_train, y_train)

# Pair each importance with its column name and list them from most to least important.
importance_feature_pairs = zip(model.feature_importances_, X_train.columns)
sorted(importance_feature_pairs, reverse = True)

[(0.12365442164036872, 'num_lab_procedures'),
 (0.11372636536368964, 'num_medications'),
 (0.06390367863223807, 'time_in_hospital'),
 (0.05042546073138741, 'preceding_year_visits'),
 (0.04697597986166735, 'age'),
 (0.04083221673070613, 'num_procedures'),
 (0.04053462174770501, 'number_diagnoses'),
 (0.019795677948474092, 'insulin'),
 (0.019714341130940054,
  'discharge_disposition_id_Transferred to another medical facility'),
 (0.015839912478576138, 'race_Caucasian'),
 (0.014140805503156769, 'diag_3_Circulatory'),
 (0.013267240400954844, 'admission_source_id_Referral'),
 (0.013253323481257273, 'diag_1_Circulatory'),
 (0.012905064547135187, 'diag_2_Circulatory'),
 (0.012632933443806543, 'gender'),
 (0.012450851695309047, 'number_changes'),
 (0.01171835396610171, 'glyburide'),
 (0.011100828594193526, 'diag_3_Endocrine, Nutritional, Metabolic, Immunity'),
 (0.010623295164622026, 'diag_3_Diabetes'),
 (0.010368378921011468, 'diag_1_Respiratory'),
 (0.010047383644997531, 'metformin'),
 (0.00

In [None]:
# Tabulate the fitted tree's importance for every training column.
fi_df = pd.DataFrame({"Feature" : X_train.columns, "Importance" : model.feature_importances_})

# Keep the features whose importance meets the threshold. With a threshold of
# 0 this retains every column (95 of 95 in the recorded run).
meets_threshold = fi_df['Importance'] >= 0
imp = fi_df.loc[meets_threshold, 'Feature']
len(imp)

95

In [None]:
def _print_split_metrics(split, y_true, y_pred, y_prob):
    """Print the classification metrics for one data split.

    Parameters
    ----------
    split : str
        Label prefixed to every printed line (e.g. "Train" or "Test").
    y_true : array-like
        Ground-truth labels for the split.
    y_pred : array-like
        Hard class predictions for the split.
    y_prob : array-like
        Positive-class scores for the split; used only for ROC AUC.
    """
    print(f"{split} Accuracy Score:", accuracy_score(y_true, y_pred))
    print(f"{split} Confusion Matrix:\n", confusion_matrix(y_true, y_pred), '\n')
    print(f"{split} F1 Score:", f1_score(y_true, y_pred))
    print(f"{split} Precision Score:", precision_score(y_true, y_pred))
    print(f"{split} Recall Score:", recall_score(y_true, y_pred))
    print(f"{split} ROC_AUC Score:", roc_auc_score(y_true, y_prob))


# Select the chosen feature columns once, outside the timed regions, so the
# timings measure fit/predict only and the frames are not re-copied per call.
X_train_imp = X_train[imp]
X_test_imp = X_test[imp]

model = DecisionTreeClassifier(random_state = 0, class_weight = 'balanced')

# time.perf_counter() is a monotonic, high-resolution clock intended for
# measuring short intervals; time.time() can jump if the system clock changes.
start_time = time.perf_counter()
model.fit(X_train_imp, y_train)
end_time = time.perf_counter()
print("Training Time:", end_time - start_time)

y_train_pred = model.predict(X_train_imp)
y_train_prob = model.predict_proba(X_train_imp)[:, 1]

start_time = time.perf_counter()
y_test_pred = model.predict(X_test_imp)
end_time = time.perf_counter()
print("Prediction Time:", end_time - start_time)
y_test_prob = model.predict_proba(X_test_imp)[:, 1]

# Same report for both splits; the gap between them shows how well the tree generalizes.
_print_split_metrics("Train", y_train, y_train_pred, y_train_prob)
print()
_print_split_metrics("Test", y_test, y_test_pred, y_test_prob)

Training Time: 1.3760809898376465
Prediction Time: 0.02542257308959961
Train Accuracy Score: 0.999985283079957
Train Confusion Matrix:
 [[60164     1]
 [    0  7784]] 

Train F1 Score: 0.9999357697989595
Train Precision Score: 0.9998715478484265
Train Recall Score: 1.0
Train ROC_AUC Score: 0.9999999989323639

Test Accuracy Score: 0.797980838570104
Test Confusion Matrix:
 [[22649  3136]
 [ 2747   589]] 

Test F1 Score: 0.1668318934995043
Test Precision Score: 0.1581208053691275
Test Recall Score: 0.17655875299760193
Test ROC_AUC Score: 0.5274688277301371
