In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy as sp

In [None]:
# Load the preprocessed, de-duplicated diabetes dataset from the working
# directory. The file is comma-separated, which is read_csv's default.
file = 'diabetes_data_preprocessed_dropped_duplicates_1.csv'
df = pd.read_csv(file)

In [None]:
# Encode the categorical `race` column as integer codes.
# NOTE(review): '?' looks like the dataset's placeholder for an unknown race
# and is given its own code (5) -- confirm against the data dictionary.
race_dict = {'Asian':0, 'Caucasian':1,'AfricanAmerican':2,'Hispanic':3,'Other':4,'?':5}

# Assign the result back instead of calling replace(..., inplace=True) on the
# column selection: that chained in-place form operates on a temporary object,
# triggers a chained-assignment warning in recent pandas, and leaves `df`
# unchanged when Copy-on-Write is enabled.
df["race"] = df["race"].replace(race_dict)


In [None]:
# Working copy of the data without its first three columns (selected by
# position, not by name).
df1 = df.drop(columns=[df.columns[position] for position in range(3)])


In [None]:
# Full candidate column list, grouped by theme. The order of the entries is
# kept exactly as before so positional use of the list is unaffected.
feature_set_1 = [
    # patient demographics
    'race', 'gender', 'age',
    # admission / discharge identifiers
    'admission_type_id', 'discharge_disposition_id', 'admission_source_id',
    # encounter volume
    'time_in_hospital', 'num_lab_procedures', 'num_procedures', 'num_medications',
    # prior visits
    'number_outpatient', 'number_emergency', 'number_inpatient',
    # diagnoses
    'diag_1', 'diag_2', 'diag_3', 'number_diagnoses',
    # lab results
    'max_glu_serum', 'A1Cresult',
    # single-agent medications
    'metformin', 'repaglinide', 'nateglinide', 'chlorpropamide',
    'glimepiride', 'acetohexamide', 'glipizide', 'glyburide',
    'tolbutamide', 'pioglitazone', 'rosiglitazone', 'acarbose',
    'miglitol', 'troglitazone', 'tolazamide', 'insulin',
    # combination medications
    'glyburide-metformin', 'glipizide-metformin', 'glimepiride-pioglitazone',
    'metformin-rosiglitazone', 'metformin-pioglitazone',
    # medication change flags
    'change', 'diabetesMed',
    # engineered features
    'service_utilization', 'numchange', 'nummed', 'level1_diag1',
    # log / log1p transforms of the count features
    'number_emergency_log', 'number_emergency_log1p',
    'number_inpatient_log', 'number_inpatient_log1p',
    'number_outpatient_log', 'number_outpatient_log1p',
    'service_utilization_log', 'service_utilization_log1p',
    # NOTE(review): by its name this looks derived from the `readmitted`
    # target -- confirm before ever feeding it to a model as a predictor.
    'readmitted_log',
]


In [None]:
from sklearn.model_selection import train_test_split

# Reduced feature set actually used for modelling; edit this list to try a
# different selection of predictors.
feature_set_2 = [
    'race',
    'age',
    'time_in_hospital',
    'num_lab_procedures',
    'num_procedures',
    'change',
    'service_utilization_log1p',
    'number_diagnoses',
    'num_medications',
    'number_outpatient',
    'number_emergency',
    'number_inpatient',
]

# Predictors and target. The shuffled 80% training / 20% dev split is not
# done here; the modelling cells below call train_test_split themselves.
train_input = df1[feature_set_2]
train_output = df1['readmitted']

In [None]:
########################################################

In [None]:
###########################
### Logistic regression ###
###########################
import statsmodels.api as sm
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as a dev set; the fixed seed keeps the split
# identical across re-runs.
X_train, X_dev, Y_train, Y_dev = train_test_split(
    train_input, train_output, test_size=0.20, random_state=0
)

# statsmodels does not add an intercept on its own, so without a constant
# column the model is forced through the origin and the coefficients are not
# comparable with the scikit-learn model fitted below (fit_intercept=True).
# add_constant returns a new frame with an extra 'const' column; X_train
# itself is left untouched for the cells that follow.
logit = sm.Logit(Y_train, sm.add_constant(X_train))
result = logit.fit()
print(result.summary())

In [None]:
# 
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

# Same 80/20 split (same seed) as the statsmodels cell, repeated so this cell
# can be run on its own.
X_train, X_dev, Y_train, Y_dev = train_test_split(train_input, train_output, test_size=0.20, random_state=0)

# The L1 penalty needs a solver that supports it. scikit-learn's default
# solver has been 'lbfgs' since 0.22, which rejects penalty='l1' with a
# ValueError; 'liblinear' was the default before that and handles L1.
logreg = LogisticRegression(fit_intercept=True, penalty='l1', solver='liblinear')

# 10-fold cross-validated accuracy on the training portion, then a final fit
# on the whole training portion scored against the held-out dev set.
print("Cross Validation Score: {:.2%}".format(np.mean(cross_val_score(logreg, X_train, Y_train, cv=10))))
logreg.fit(X_train, Y_train)
print("Dev Set score: {:.2%}".format(logreg.score(X_dev, Y_dev)))

In [None]:
# Confusion matrix and summary metrics for the logistic regression on the dev set
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score

Y_dev_predict = logreg.predict(X_dev)
# Score for the positive class (second entry of logreg.classes_). ROC AUC
# should be computed from a continuous score; with hard 0/1 predictions it
# collapses to a single operating point. This assumes a binary target, which
# the default precision/recall calls below require as well.
Y_dev_score = logreg.predict_proba(X_dev)[:, 1]

# Both sides are converted to plain arrays before being wrapped in a Series.
# Y_dev keeps the shuffled row labels from the split while the predictions get
# a fresh 0..n-1 index, and crosstab would otherwise pair rows by label and
# silently produce wrong counts.
confusion_logreg = pd.crosstab(
    pd.Series(np.asarray(Y_dev), name='Actual'),
    pd.Series(np.asarray(Y_dev_predict), name='Predict'),
    margins=True,
)

print("Accuracy is {0:.2f}".format(accuracy_score(Y_dev, Y_dev_predict)))
print("Precision is {0:.2f}".format(precision_score(Y_dev, Y_dev_predict)))
print("Recall is {0:.2f}".format(recall_score(Y_dev, Y_dev_predict)))
print("AUC is {0:.2f}".format(roc_auc_score(Y_dev, Y_dev_score)))

# Last expression of the cell so the notebook actually renders the table.
confusion_logreg

In [None]:
##########################
##### Decision Trees #####
##########################

# Data balancing applied using SMOTE
from imblearn.over_sampling import SMOTE
from collections import Counter
print('Original dataset shape {}'.format(Counter(train_output)))

# Split BEFORE oversampling. SMOTE builds synthetic rows by interpolating
# between existing minority-class rows; if it runs on the full data set first,
# information from dev rows leaks into training rows (and synthetic rows end
# up in the dev set), which inflates every dev-set metric. The dev set is
# therefore left as real, untouched data.
X_train, X_dev, Y_train, Y_dev = train_test_split(train_input, train_output, test_size=0.20, random_state=0)

smt = SMOTE(random_state=20)
# fit_resample is the current name of this method; the old fit_sample alias
# has been removed from imbalanced-learn.
train_input_new, train_output_new = smt.fit_resample(X_train, Y_train)
print('New dataset shape {}'.format(Counter(train_output_new)))

# Older imbalanced-learn versions return a bare array, so restore the column
# names either way, then train on the balanced training portion only.
train_input_new = pd.DataFrame(train_input_new, columns = list(train_input.columns))
X_train, Y_train = train_input_new, train_output_new


In [None]:
from sklearn.tree import DecisionTreeClassifier

# Entropy-based tree; 'gini' is the alternative criterion to try.
# TODO: tune max_depth / min_samples_split with a grid search instead of
# hand-picked values.
# random_state is fixed because the tree permutes features at every split, so
# without a seed the fitted tree and its scores can change between runs. The
# other stochastic steps in this notebook (split, SMOTE) are seeded as well.
dte = DecisionTreeClassifier(max_depth=28, criterion = "entropy", min_samples_split=10, random_state=0)

# 10-fold cross-validated accuracy on the training portion, then a final fit
# scored against the dev set.
print("Cross Validation score: {:.2%}".format(np.mean(cross_val_score(dte, X_train, Y_train, cv=10))))
dte.fit(X_train, Y_train)
print("Dev Set score: {:.2%}".format(dte.score(X_dev, Y_dev)))

In [None]:
# Confusion matrix and summary metrics for the decision tree on the dev set
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score

Y_dev_predict = dte.predict(X_dev)
# Score for the positive class (second entry of dte.classes_). ROC AUC should
# be computed from a continuous score rather than hard labels. This assumes a
# binary target, which the default precision/recall calls below require too.
Y_dev_score = dte.predict_proba(X_dev)[:, 1]

# Convert both sides to plain arrays first so crosstab pairs rows by position;
# if Y_dev carries its own row labels they would not match the fresh 0..n-1
# index of the predictions.
confusion_dte = pd.crosstab(
    pd.Series(np.asarray(Y_dev), name='Actual'),
    pd.Series(np.asarray(Y_dev_predict), name='Predict'),
    margins=True,
)

print("Accuracy is {0:.2f}".format(accuracy_score(Y_dev, Y_dev_predict)))
print("Precision is {0:.2f}".format(precision_score(Y_dev, Y_dev_predict)))
print("Recall is {0:.2f}".format(recall_score(Y_dev, Y_dev_predict)))
print("AUC is {0:.2f}".format(roc_auc_score(Y_dev, Y_dev_score)))

# Last expression of the cell so the notebook actually renders the table.
confusion_dte

In [None]:
import graphviz
import pydotplus
from IPython.display import Image
from sklearn import tree

# Write the top two levels of the fitted tree to a DOT file on disk.
# NOTE(review): export_graphviz is documented to return None when out_file is
# given, so dot_dt_q2 presumably holds None and the file is the real output.
dot_dt_q2 = tree.export_graphviz(
    dte,
    out_file="dt_q2.dot",
    max_depth=2,
    feature_names=X_train.columns,
    class_names=["No","Readm"],
    filled=True,
    rounded=True,
    special_characters=True,
)

# Read the DOT file back, render it to PNG and show it as the cell output.
graph_dt_q2 = pydotplus.graph_from_dot_file('dt_q2.dot')
Image(graph_dt_q2.create_png())

In [None]:
# Most important features
# Show the ten features with the highest importance in the fitted tree.
feature_names = X_train.columns
feature_imports = dte.feature_importances_

# Keep the ten largest importances, then order them ascending so the most
# important feature ends up at the top of the horizontal bar chart.
most_imp_features = (
    pd.DataFrame(list(zip(feature_names, feature_imports)), columns=["Feature", "Importance"])
    .nlargest(10, "Importance")
    .sort_values(by="Importance")
)

bar_positions = range(len(most_imp_features))
fig, ax = plt.subplots(figsize=(10,6))
ax.barh(bar_positions, most_imp_features.Importance, align='center', alpha=0.8)
ax.set_yticks(bar_positions)
ax.set_yticklabels(most_imp_features.Feature, fontsize=14)
ax.set_xlabel('Importance')
ax.set_title('Most important features - Decision Tree (entropy) (Question 2 - complex model)')
plt.show()