In [1]:
import pandas as pd
import numpy as np
# Show up to 50 columns when displaying wide frames. The fully qualified key is
# used because the bare "max_columns" pattern matches more than one option in
# recent pandas releases, which makes set_option raise OptionError.
pd.set_option("display.max_columns", 50)

# Load the cleaned two-class dataset.
diab_final = pd.read_csv("data/clean_data_2class.csv")

# Columns regarded as adding no value; each is removed only if it exists.
# NOTE(review): "admisison_type" is spelled differently from the
# "admission_type" column that later cells depend on, so that column is kept
# as written -- confirm this is intentional before "fixing" the spelling.
del_cols = ["Unnamed: 0", "encounter_id", "patient_nbr", "weight", "discharge_disposition",
            "admission_source", "payer_code", "medical_specialty", "admisison_type"]

# Drop every listed column that is actually present, in a single call.
present_cols = [name for name in del_cols if name in diab_final.columns]
diab_final = diab_final.drop(columns=present_cols)

In [2]:
# Preview the first rows to check which columns remain after the drop step.
diab_final.head()

Unnamed: 0,race,gender,age,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,diag_1,diag_2,diag_3,number_diagnoses,max_glu_serum,A1Cresult,metformin,repaglinide,nateglinide,chlorpropamide,glimepiride,acetohexamide,glipizide,glyburide,tolbutamide,pioglitazone,rosiglitazone,acarbose,miglitol,troglitazone,tolazamide,examide,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted,admission_type
0,Caucasian,Female,1,3,59,0,18,0,0,0,ENMI,ENMI,ENMI,9,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Up,No,No,No,No,No,Ch,Yes,YES,Physician Referral
1,AfricanAmerican,Female,2,2,11,5,13,2,0,1,PCC,ENMI,CFI,6,,,No,No,No,No,No,No,Steady,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Yes,NO,Physician Referral
2,Caucasian,Male,3,2,44,1,16,0,0,0,IPD,ENMI,CSD,7,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Up,No,No,No,No,No,Ch,Yes,NO,Physician Referral
3,Caucasian,Male,4,1,51,0,8,0,0,0,NP,NP,ENMI,5,,,No,No,No,No,No,No,Steady,No,No,No,No,No,No,No,No,No,No,Steady,No,No,No,No,No,Ch,Yes,NO,Physician Referral
4,Caucasian,Male,5,3,31,6,16,0,0,0,CSD,CSD,ENMI,9,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Steady,No,No,No,No,No,No,Yes,YES,Clinic Referral


In [3]:
# for stratified training-testing split
from sklearn.model_selection import train_test_split
# for feature selection
from sklearn.feature_selection import SelectFromModel
# training a DecisionTreeClassifier
from sklearn.tree import DecisionTreeClassifier

In [4]:
# Separate the target label from the feature columns.
target_col = "readmitted"
outcome = diab_final[target_col]
predictors = diab_final.drop(columns=[target_col])

In [5]:
# Dummy-encoded feature names to keep, chosen from a random forest's feature
# importances. The order here is the column order used for modelling.
pick_list = [
    "number_inpatient",
    "num_medications",
    "number_emergency",
    "number_outpatient",
    "num_lab_procedures",
    "number_diagnoses",
    "age",
    "time_in_hospital",
    "num_procedures",
    "diag_1_NP",
    "admission_type_HMO Referral",
    "diabetesMed_Yes",
    "admission_type_Transfer from another health care facility",
    "admission_type_Physician Referral",
    "gender_Male",
    "diag_2_CSD",
    "metformin_No",
    "race_Caucasian",
    "gender_Female",
    "insulin_Steady",
    "insulin_No",
    "metformin_Steady",
    "diag_1_ENMI",
    "diag_3_ENMI",
    "A1Cresult_None",
    "admission_type_Clinic Referral",
    "diag_1_CSD",
    "diag_3_CSD",
    "diag_2_ENMI",
    "diabetesMed_No",
    "insulin_Down",
    "change_Ch",
    "diag_1_PCC",
    "race_AfricanAmerican",
    "change_No",
    "max_glu_serum_None",
    "diag_2_RSD",
    "diag_2_SSD",
]

In [6]:
# One-hot encode the predictors, then keep only the columns named in
# pick_list (in that order). A name missing from the encoded frame raises
# KeyError.
dummy_predictor = pd.get_dummies(predictors).loc[:, pick_list]

In [7]:
# dummy_predictor.to_csv("data/dummy_var.csv")

In [8]:
# Sweep over held-out fraction and tree depth, recording train/test scores.
#
# Result: `tuning_parameter`, a list with one row per (test_size, depth) pair:
#   [training fraction, tree depth, training score, testing score]
# where each score is whatever DecisionTreeClassifier.score returns on that split.
tuning_parameter = []
prev_error = 1  # not read anywhere in the visible cells; kept so the namespace is unchanged
# Held-out fractions: start at 0.40 and step down by 0.03 while above 0.
test_size_range = np.arange(.4,0,-0.03)
for test_size in test_size_range:
    # Stratify on the label and fix the seed so the split is repeatable.
    X_train, X_test, Y_train, Y_test = train_test_split(dummy_predictor,
                                                   outcome,
                                                   stratify=outcome,
                                                   test_size=test_size,
                                                   random_state = 0)
    for i in range(1,10):
        # Seed the tree as well as the split: without random_state, equally
        # good candidate splits can be resolved differently from run to run,
        # so the recorded scores would not be reproducible.
        dtree_model = DecisionTreeClassifier(max_depth = i, random_state = 0).fit(X_train, Y_train)
        clf_train = dtree_model.score(X_train, Y_train)
        clf_test = dtree_model.score(X_test, Y_test)
        # Round the training fraction to hide float noise from the arange steps.
        tuning_parameter.append([round(1-test_size,2),i, clf_train, clf_test])

In [17]:
# Turn the collected [training fraction, depth, train score, test score] rows
# into a DataFrame (columns are still the default integer labels here).
tuning_df = pd.DataFrame(tuning_parameter)

In [22]:
# Give the four positional columns descriptive names.
tuning_df.columns = [
    "training_size",
    "tree depth",
    "training efficiency",
    "testing efficiency",
]

In [23]:
# Best test-set score seen across every (training size, tree depth) combination.
max(tuning_df['testing efficiency'])

0.61771305021825151

In [24]:
# Sanity check: one row per (test_size, depth) pair and four columns.
tuning_df.shape

(126, 4)

In [26]:
# Persist the tuning results. The row index is written as the first column
# (to_csv default), so it comes back as an unnamed column on read_csv.
# NOTE(review): consider index=False if no downstream reader relies on that column.
tuning_df.to_csv("data/DTree.csv")