In [1]:
import pandas as pd
import numpy as np
from sklearn import svm
# Display up to 50 columns when rendering a DataFrame.  The option name is
# spelled out in full: the abbreviated "max_columns" pattern matches more than
# one registered option on recent pandas releases and raises OptionError.
pd.set_option("display.max_columns", 50)

# Load the CSV at data/clean_data_2class.csv into the working DataFrame.
diab_final = pd.read_csv(filepath_or_buffer="data/clean_data_2class.csv")

from sklearn.model_selection import train_test_split

# Columns removed before modelling; the original author judged them to add no
# value.
# NOTE(review): "admisison_type" is misspelled, so it only matches a column
# carrying that exact spelling.  Do not "correct" it blindly: pick_list later
# selects admission_type_* dummy columns, which would no longer exist if the
# admission_type column were dropped here -- confirm the intent.
del_cols = ["Unnamed: 0", "encounter_id", "patient_nbr", "weight", "discharge_disposition",
            "admission_source", "payer_code", "medical_specialty", "admisison_type"]

# Drop every listed column that is actually present, in one call.
# errors="ignore" skips names that are missing from the frame, and rebinding
# the result (instead of inplace=True inside a loop) keeps this a single step.
diab_final = diab_final.drop(columns=del_cols, errors="ignore")

In [2]:
from xgboost import XGBClassifier



In [3]:
# Feature columns kept for modelling.  Per the original author, this subset was
# chosen from random-forest feature importances.  The order is preserved as-is
# because it determines the column order of the predictor frame.
pick_list = [
    "number_inpatient",
    "num_medications",
    "number_emergency",
    "number_outpatient",
    "num_lab_procedures",
    "number_diagnoses",
    "age",
    "time_in_hospital",
    "num_procedures",
    "diag_1_NP",
    "admission_type_HMO Referral",
    "diabetesMed_Yes",
    "admission_type_Transfer from another health care facility",
    "admission_type_Physician Referral",
    "gender_Male",
    "diag_2_CSD",
    "metformin_No",
    "race_Caucasian",
    "gender_Female",
    "insulin_Steady",
    "insulin_No",
    "metformin_Steady",
    "diag_1_ENMI",
    "diag_3_ENMI",
    "A1Cresult_None",
    "admission_type_Clinic Referral",
    "diag_1_CSD",
    "diag_3_CSD",
    "diag_2_ENMI",
    "diabetesMed_No",
    "insulin_Down",
    "change_Ch",
    "diag_1_PCC",
    "race_AfricanAmerican",
    "change_No",
    "max_glu_serum_None",
    "diag_2_RSD",
    "diag_2_SSD",
]

In [4]:
# Target vector: the "readmitted" column of the working frame.
outcome = diab_final.loc[:, "readmitted"]

# Feature frame: every column except the target, expanded into indicator
# columns by pd.get_dummies and then narrowed to the names in pick_list.
predictors = diab_final.drop(columns=["readmitted"])
dummy_predictor = pd.get_dummies(predictors).loc[:, pick_list]

In [16]:
# X_train, X_test, Y_train, Y_test = train_test_split(dummy_predictor,
#                                                    outcome,
#                                                    stratify=outcome,
#                                                    test_size=0.2,
#                                                    random_state = 0)

In [24]:
# clf = XGBClassifier(max_depth=20, n_estimators=900, learning_rate=0.15)
# clf.fit(X_train, Y_train)

XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.15, max_delta_step=0, max_depth=20,
       min_child_weight=1, missing=None, n_estimators=900, nthread=-1,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1)

In [25]:
# clf.score(X_train, Y_train)  # score on the training split; the estimator fitted above is named clf, not model

0.63095829986869112

In [26]:
# clf.score(X_test, Y_test)  # score on the held-out split; the estimator fitted above is named clf, not model

0.62057008821579729

In [27]:
# # for alarm sound
# import winsound
# duration = 1000  # millisecond
# freq = 500  # Hz
# winsound.Beep(freq, duration)
# winsound.Beep(freq, duration)

In [44]:
# Grid search over hold-out fraction, number of boosted trees and tree depth.
# For every combination an XGBClassifier is fitted on the training split and
# its score on both splits is appended to tuning_parameter as
# [training fraction, number of trees, tree depth, train score, test score].
# The full grid is 8 fractions x 90 tree counts x 6 depths = 4320 fits, so this
# cell is slow.
num_trees = range(10, 100)
depth_range = range(4, 10)

# Hold-out fractions 0.40, 0.35, ..., 0.05.  linspace + round yields clean
# two-decimal values; a float-step np.arange produces values such as
# 0.30000000000000004, which would leak into the recorded training sizes.
test_size_range = np.round(np.linspace(0.40, 0.05, num=8), 2)

# Reset on every run so re-executing the cell does not append to old results.
# Rows collected so far remain in the list if the cell is interrupted.
tuning_parameter = []

for test_size in test_size_range:
    # The split depends only on test_size, so it is made once per fraction.
    X_train, X_test, Y_train, Y_test = train_test_split(
        dummy_predictor,
        outcome,
        stratify=outcome,
        test_size=test_size,
        random_state=0,
    )
    train_fraction = round(1 - test_size, 2)
    for num in num_trees:
        for depth in depth_range:
            clf = XGBClassifier(max_depth=depth, n_estimators=num, learning_rate=0.15)
            clf.fit(X_train, Y_train)
            clf_train = clf.score(X_train, Y_train)
            clf_test = clf.score(X_test, Y_test)
            tuning_parameter.append([train_fraction, num, depth, clf_train, clf_test])

KeyboardInterrupt: 

In [45]:
# Tabulate the grid-search rows.  The labels are passed through `columns=`
# rather than assigned to .columns afterwards so that an empty
# tuning_parameter (e.g. the search was interrupted before the first fit
# finished) still yields a frame with the expected five columns; assigning
# five labels to a zero-column frame raises ValueError.
tuning_df = pd.DataFrame(
    tuning_parameter,
    columns=["training_size", "number of trees", "tree depth",
             "training efficiency", "testing efficiency"],
)

In [46]:
# Persist the tuning table as CSV.  Default to_csv settings are used, so the
# row index is written as the first, unnamed column.
tuning_df.to_csv(path_or_buf="data/Boosted.csv")