# Importing required packages

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import time

%config Completer.use_jedi = False

In [None]:
# Load the raw encounter table.
# NOTE(review): the relative "../input/..." path looks like a Kaggle dataset mount —
# adjust it when running the notebook elsewhere.
df = pd.read_csv("../input/diabetes-readmission-dataset/dataset_diabetes/diabetic_data.csv")
# Peek at 10 random rows (no random_state is given, so the rows differ on every run).
df.sample(10)

In [None]:
df.info()

In [None]:
# target column

df.groupby('readmitted').size()

In [None]:
# Drop encounters whose discharge disposition is one of these IDs.
# NOTE(review): the IDs presumably denote death/hospice discharges (patients who
# cannot be readmitted) — confirm against the dataset's ID mapping file.
excluded_dispositions = [11, 13, 14, 19, 20, 21]

# `.copy()` makes the filtered frame independent of the original, so the column
# assignments in the following cells do not trigger SettingWithCopyWarning.
df = df.loc[~df.discharge_disposition_id.isin(excluded_dispositions)].copy()

# Show only the first rows instead of rendering the whole frame.
df.head()

In [None]:
# Binary target: 1 when `readmitted` equals '<30', otherwise 0.
is_early_readmit = df['readmitted'].eq('<30')
df['OUTPUT_LABEL'] = is_early_readmit.astype('int')

# Spot-check a few labels.
df[['OUTPUT_LABEL']].sample(10)

In [None]:
# Prevalence = share of the population that is readmitted within 30 days

def cal_prevalance(y_actual):
    """Return the fraction of positive labels in ``y_actual``.

    Parameters
    ----------
    y_actual : sequence or array of 0/1 labels.

    Returns
    -------
    The sum of the labels divided by their count.
    """
    n_positive = sum(y_actual)
    n_total = len(y_actual)
    return n_positive / n_total

print(f"Prevalance : {round(cal_prevalance(df.OUTPUT_LABEL.values)*100, 3)} %")

In [None]:
# Replacing '?' with NAN
# Turning the '?' placeholder into NaN lets the pandas null-handling calls used
# below (isnull / fillna / notnull) recognise those cells as missing.
df = df.replace('?', np.nan)
# Re-check the non-null counts per column after the replacement.
df.info()

In [None]:
df.columns

In [None]:
# Collecting Numerical columns
# These eight columns are used as-is (no encoding) as numeric model features.
num_cols = [
    'time_in_hospital', 'num_lab_procedures',
    'num_procedures', 'num_medications',
    'number_outpatient', 'number_emergency',
    'number_inpatient', 'number_diagnoses'
]
# Hand-written header for the per-column null counts displayed as the cell output.
print('ColName              NullCount')
print('=============================')
df[num_cols].isnull().sum()

In [None]:
# Collecting categorical columns
# String-valued columns that are one-hot encoded later with pd.get_dummies.
# 'medical_specialty' is not listed here; it is added later through the grouped
# 'med_spec' column.
cat_cols = [
    'race', 'gender', 
    'max_glu_serum', 'A1Cresult',
    'metformin', 'repaglinide', 
    'nateglinide', 'chlorpropamide',
    'glimepiride', 'acetohexamide',
    'glipizide', 'glyburide',
    'tolbutamide', 'pioglitazone',
    'rosiglitazone', 'acarbose',
    'miglitol', 'troglitazone',
    'tolazamide', 'insulin',
    'glyburide-metformin', 'glipizide-metformin',
    'glimepiride-pioglitazone', 'metformin-rosiglitazone',
    'metformin-pioglitazone', 'change',
    'diabetesMed','payer_code'
]

# Hand-written header for the per-column null counts displayed as the cell output.
print('ColName                    NullCount')
print('======================================')
df[cat_cols].isnull().sum()


In [None]:
# Give missing categorical values an explicit 'UNK' level so that they are encoded
# as their own category instead of staying NaN.
# Fix: the filled payer codes used to be written to a new, misspelled 'paper_code'
# column, which left the NaNs in 'payer_code' — the column that cat_cols refers to.
df['race'] = df['race'].fillna('UNK')
df['payer_code'] = df['payer_code'].fillna('UNK')
df['medical_specialty'] = df['medical_specialty'].fillna('UNK')

# Number of distinct specialties, followed by their frequencies (most common first).
print(f"# medical specialty : {df.medical_specialty.nunique()}")
df.groupby('medical_specialty').size().sort_values(ascending = False)

In [None]:
# Specialties that keep their own category; every other value is pooled into 'Other'.
# NOTE(review): the list is hard-coded — presumably read off the frequency table in
# the previous cell; confirm it still matches the data.
top_10 = [
    'UNK',
    'InternalMedicine',
    'Emergency/Trauma',
    'Family/GeneralPractice',
    'Cardiology',
    'Surgery-General',
    'Nephrology',
    'Orthopedics',
    'Orthopedics-Reconstructive',
    'Radiologist',
]

# Keep the specialty where it is in the list, otherwise substitute 'Other'.
keep_specialty = df['medical_specialty'].isin(top_10)
df['med_spec'] = df['medical_specialty'].where(keep_specialty, 'Other')

df.groupby('med_spec').size()

In [None]:
# ID-coded columns: stored as numbers but treated here as categories.
cols_cat_num = ['admission_type_id', 'discharge_disposition_id', 'admission_source_id']

# Cast to str so that get_dummies one-hot encodes them rather than passing them through.
df[cols_cat_num] = df[cols_cat_num].astype('str')
# drop_first=True omits the first level of every column from the dummy set.
df_cat = pd.get_dummies(df[cat_cols + cols_cat_num + ['med_spec']], drop_first = True)

df_cat.sample(5)

In [None]:
# Append the dummy columns to the main frame.
# NOTE: not idempotent — re-running this cell appends the same columns a second time.
df = pd.concat([df, df_cat], axis = 1)

# To keep track of the categorical columns
cols_all_cat = list(df_cat.columns)

In [None]:
# Using 'age' and 'weight'

df[['age', 'weight']].sample(5)

In [None]:
df.groupby('age').size()

In [None]:
# Map every 10-year age bracket label (e.g. '[30-40)') to its lower bound (30).
# Brackets run from '[0-10)' up to '[90-100)'.
age_id = {f'[{low}-{low + 10})': low for low in range(0, 100, 10)}
df['age_group'] = df.age.replace(age_id)

In [None]:
# 'weight' has many NaN values, so only a 0/1 flag is kept: was a weight recorded
# for the encounter or not.
weight_recorded = df['weight'].notna()
df["has_weight"] = weight_recorded.astype('int')

# Engineered (non-raw) feature columns.
extra_cols = ["age_group", "has_weight"]

Up to this point we have created:

* Numerical features : 8
* Categorical features : 132
* Extra features : 2

In [None]:
# Full feature set: numeric + one-hot categorical + engineered columns.
# NOTE: both names are reassigned in the next code cell, which narrows the features
# down to a hand-picked list of 12.
cols2use = num_cols + cols_all_cat + extra_cols
df_data = df[cols2use + ['OUTPUT_LABEL']]  

# Building Training, Validation, Test Sets


In [None]:
# Reduced feature set actually used for modelling; replaces the full list built in
# the previous cell. The same 12 names are listed again as `imp_features` in the
# Feature Importance section.
cols2use = [
    'number_inpatient',
    'discharge_disposition_id_22',
    'number_emergency',
    'number_diagnoses',
    'num_medications',
    'time_in_hospital',
    'num_lab_procedures',
    'insulin_No',
    'age_group',
    'number_outpatient',
    'discharge_disposition_id_3',
    'num_procedures',
]

# Modelling frame: the selected features plus the binary target.
df_data = df[cols2use + ['OUTPUT_LABEL']] 

In [None]:
# Intended split: 70:15:15 (train : validation : test).
# NOTE(review): the original comment mentions stratification, but the sampling
# here is purely random.
# Shuffle every row with a fixed seed (42), then renumber the index from 0.
n_rows = len(df_data)
df_data = df_data.sample(n = n_rows, random_state = 42).reset_index(drop = True)

In [None]:
# Hold out 30% of the shuffled rows, then split that hold-out evenly, so the final
# proportions are 70% train / 15% validation / 15% test.
# Fix: the second sample used frac=0.3, which produced a 70 / 21 / 9 split instead.
df_valid_test = df_data.sample(frac = 0.3, random_state = 42)
df_test = df_valid_test.sample(frac = 0.5, random_state = 42)
# Validation = hold-out rows not drawn for test; train = rows not in the hold-out.
df_valid = df_valid_test.drop(df_test.index)
df_train_all = df_data.drop(df_valid_test.index)

In [None]:
# Size and positive-label share of each split.
test_prevalence = cal_prevalance(df_test.OUTPUT_LABEL.values)
valid_prevalence = cal_prevalance(df_valid.OUTPUT_LABEL.values)
train_all_prevalence = cal_prevalance(df_train_all.OUTPUT_LABEL.values)

print('Test prevalence(n = %d):%.3f' % (len(df_test), test_prevalence))
print('Valid prevalence(n = %d):%.3f' % (len(df_valid), valid_prevalence))
print('Train all prevalence(n = %d):%.3f' % (len(df_train_all), train_all_prevalence))

In [None]:
# Under-sample the negatives so the training set has equally many 0 and 1 labels.
rows_pos = df_train_all.OUTPUT_LABEL == 1
df_train_pos = df_train_all.loc[rows_pos]
df_train_neg = df_train_all.loc[~rows_pos]

# Draw as many negatives as there are positives and stack the two groups.
df_train_neg_sampled = df_train_neg.sample(n = len(df_train_pos), random_state = 42)
df_train = pd.concat([df_train_pos, df_train_neg_sampled], axis = 0)

# Shuffle so the positives are not all at the front, then renumber the index.
df_train = df_train.sample(n = len(df_train), random_state = 42)
df_train = df_train.reset_index(drop = True)

train_prevalence = cal_prevalance(df_train.OUTPUT_LABEL.values)
print('Train balanced prevalence(n = %d):%.3f' % (len(df_train), train_prevalence))

In [None]:
# Feature matrices as plain NumPy arrays.
# X_train      — balanced training rows (used to fit the models)
# X_train_all  — every training row before balancing (used to fit the scaler)
# X_valid      — validation rows
X_train = df_train[cols2use].values
X_train_all = df_train_all[cols2use].values
X_valid = df_valid[cols2use].values

# Matching target vectors.
y_train = df_train['OUTPUT_LABEL'].values
y_valid = df_valid['OUTPUT_LABEL'].values

print('Training All shapes:',X_train_all.shape)
print('Training shapes:',X_train.shape, y_train.shape)
print('Validation shapes:',X_valid.shape, y_valid.shape)

In [None]:
from sklearn.preprocessing import StandardScaler
import pickle

# Fit the scaler on the full (unbalanced) training split only; validation and test
# rows never contribute to the scaling statistics.
ss = StandardScaler()
ss.fit(X_train_all)

# Persist the fitted scaler. `with` guarantees the file handle is flushed and
# closed; the bare open() calls used before never closed their handles.
scalerfile = 'StndSclr.sav'
with open(scalerfile, 'wb') as scaler_fh:
    pickle.dump(ss, scaler_fh)

# load it back
with open(scalerfile, 'rb') as scaler_fh:
    ss = pickle.load(scaler_fh)

# Apply the training-set scaling to the balanced training and validation matrices.
X_train_tf = ss.transform(X_train)
X_valid_tf = ss.transform(X_valid)

# Building and testing models

In [None]:
X_train

In [None]:
# Creating helper functions
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score

def calc_specificity(y_actual, y_pred, thresh):
    """Return the specificity (true-negative rate) at a score threshold.

    A sample is predicted negative when its score is ``<= thresh``. This is the
    exact complement of the ``y_pred > thresh`` rule that ``print_report`` uses
    for the positive class, so a score equal to the threshold is counted as a
    negative prediction instead of being dropped from both classes.

    Parameters
    ----------
    y_actual : array-like of 0/1 ground-truth labels.
    y_pred : array-like of predicted scores, same length as ``y_actual``.
    thresh : float decision threshold.

    Returns
    -------
    Number of true negatives divided by the number of actual negatives
    (NaN when ``y_actual`` contains no negatives).
    """
    # Accept plain lists as well as arrays.
    y_actual = np.asarray(y_actual)
    y_pred = np.asarray(y_pred)

    is_negative = y_actual == 0
    predicted_negative = y_pred <= thresh
    return np.sum(predicted_negative & is_negative) / np.sum(is_negative)

def print_report(y_actual, y_pred, thresh):
    """Print and return the evaluation metrics for one set of predictions.

    AUC is computed from the raw scores; accuracy, recall and precision use the
    hard labels ``y_pred > thresh``; specificity comes from ``calc_specificity``.
    The prevalence of ``y_actual`` is printed but not returned.

    Parameters
    ----------
    y_actual : array of 0/1 ground-truth labels.
    y_pred : array of predicted scores for the positive class.
    thresh : float decision threshold.

    Returns
    -------
    tuple ``(auc, accuracy, recall, precision, specificity)``.
    """
    # Hard labels are derived once and shared by the three label-based metrics.
    y_label = y_pred > thresh

    auc = roc_auc_score(y_actual, y_pred)
    accuracy = accuracy_score(y_actual, y_label)
    recall = recall_score(y_actual, y_label)
    precision = precision_score(y_actual, y_label)
    specificity = calc_specificity(y_actual, y_pred, thresh)

    print('AUC:%.3f' % auc)
    print('accuracy:%.3f' % accuracy)
    print('recall:%.3f' % recall)
    print('precision:%.3f' % precision)
    print('specificity:%.3f' % specificity)
    print('prevalence:%.3f' % cal_prevalance(y_actual))
    print(' ')

    return auc, accuracy, recall, precision, specificity

In [None]:
thresh = 0.5

In [None]:
# KNN
from sklearn.neighbors import KNeighborsClassifier
# 100-neighbour KNN fitted on the standardized, class-balanced training set.
knn = KNeighborsClassifier(n_neighbors = 100)
knn.fit(X_train_tf, y_train)

# Column 1 of predict_proba holds the score for label 1.
y_train_preds = knn.predict_proba(X_train_tf)[:, 1]
y_valid_preds = knn.predict_proba(X_valid_tf)[:, 1]

print('KNN')
print('Training:')
(knn_train_auc, knn_train_accuracy, knn_train_recall,
 knn_train_precision, knn_train_specificity) = print_report(y_train, y_train_preds, thresh)
print('Validation:')
(knn_valid_auc, knn_valid_accuracy, knn_valid_recall,
 knn_valid_precision, knn_valid_specificity) = print_report(y_valid, y_valid_preds, thresh)

In [None]:
# logistic regression
from sklearn.linear_model import LogisticRegression
# Logistic regression fitted on the standardized, class-balanced training set.
lr = LogisticRegression(random_state = 42)
lr.fit(X_train_tf, y_train)

# Column 1 of predict_proba holds the score for label 1.
y_train_preds = lr.predict_proba(X_train_tf)[:, 1]
y_valid_preds = lr.predict_proba(X_valid_tf)[:, 1]

print('Logistic Regression')
print('Training:')
(lr_train_auc, lr_train_accuracy, lr_train_recall,
 lr_train_precision, lr_train_specificity) = print_report(y_train, y_train_preds, thresh)
print('Validation:')
(lr_valid_auc, lr_valid_accuracy, lr_valid_recall,
 lr_valid_precision, lr_valid_specificity) = print_report(y_valid, y_valid_preds, thresh)

In [None]:
# Navie Bayes
from sklearn.naive_bayes import GaussianNB

# Gaussian Naive Bayes fitted on the standardized, class-balanced training set.
nb = GaussianNB()
nb.fit(X_train_tf, y_train)

# Column 1 of predict_proba holds the score for label 1.
y_train_preds = nb.predict_proba(X_train_tf)[:, 1]
y_valid_preds = nb.predict_proba(X_valid_tf)[:, 1]

print('Naive Bayes')
print('Training:')
(nb_train_auc, nb_train_accuracy, nb_train_recall,
 nb_train_precision, nb_train_specificity) = print_report(y_train, y_train_preds, thresh)
print('Validation:')
(nb_valid_auc, nb_valid_accuracy, nb_valid_recall,
 nb_valid_precision, nb_valid_specificity) = print_report(y_valid, y_valid_preds, thresh)

In [None]:
# Decision Tree Classifier
from sklearn.tree import DecisionTreeClassifier

# Depth-limited decision tree fitted on the standardized, class-balanced training set.
tree = DecisionTreeClassifier(max_depth = 10, random_state = 42)
tree.fit(X_train_tf, y_train)

# Column 1 of predict_proba holds the score for label 1.
y_train_preds = tree.predict_proba(X_train_tf)[:, 1]
y_valid_preds = tree.predict_proba(X_valid_tf)[:, 1]

print('Decision Tree')
print('Training:')
(tree_train_auc, tree_train_accuracy, tree_train_recall,
 tree_train_precision, tree_train_specificity) = print_report(y_train, y_train_preds, thresh)
print('Validation:')
(tree_valid_auc, tree_valid_accuracy, tree_valid_recall,
 tree_valid_precision, tree_valid_specificity) = print_report(y_valid, y_valid_preds, thresh)

In [None]:
# Random Forest Classifier

from sklearn.ensemble import RandomForestClassifier
# Baseline random forest (depth 6) fitted on the standardized, class-balanced training set.
rf = RandomForestClassifier(max_depth = 6, random_state = 42)
rf.fit(X_train_tf, y_train)

# Column 1 of predict_proba holds the score for label 1.
y_train_preds = rf.predict_proba(X_train_tf)[:, 1]
y_valid_preds = rf.predict_proba(X_valid_tf)[:, 1]

print('Random Forest')
print('Training:')
(rf_train_auc, rf_train_accuracy, rf_train_recall,
 rf_train_precision, rf_train_specificity) = print_report(y_train, y_train_preds, thresh)
print('Validation:')
(rf_valid_auc, rf_valid_accuracy, rf_valid_recall,
 rf_valid_precision, rf_valid_specificity) = print_report(y_valid, y_valid_preds, thresh)

In [None]:
# One row per (classifier, data set) pair; every metric list follows the same order:
# KNN train/valid, LR train/valid, NB train/valid, DT train/valid, RF train/valid.
df_results = pd.DataFrame({
    'classifier': ['KNN', 'KNN', 'LR', 'LR', 'NB', 'NB', 'DT', 'DT', 'RF', 'RF'],
    'data_set': ['train', 'valid'] * 5,
    'auc': [
        knn_train_auc, knn_valid_auc,
        lr_train_auc, lr_valid_auc,
        nb_train_auc, nb_valid_auc,
        tree_train_auc, tree_valid_auc,
        rf_train_auc, rf_valid_auc,
    ],
    'accuracy': [
        knn_train_accuracy, knn_valid_accuracy,
        lr_train_accuracy, lr_valid_accuracy,
        nb_train_accuracy, nb_valid_accuracy,
        tree_train_accuracy, tree_valid_accuracy,
        rf_train_accuracy, rf_valid_accuracy,
    ],
    'recall': [
        knn_train_recall, knn_valid_recall,
        lr_train_recall, lr_valid_recall,
        nb_train_recall, nb_valid_recall,
        tree_train_recall, tree_valid_recall,
        rf_train_recall, rf_valid_recall,
    ],
    'precision': [
        knn_train_precision, knn_valid_precision,
        lr_train_precision, lr_valid_precision,
        nb_train_precision, nb_valid_precision,
        tree_train_precision, tree_valid_precision,
        rf_train_precision, rf_valid_precision,
    ],
    'specificity': [
        knn_train_specificity, knn_valid_specificity,
        lr_train_specificity, lr_valid_specificity,
        nb_train_specificity, nb_valid_specificity,
        tree_train_specificity, tree_valid_specificity,
        rf_train_specificity, rf_valid_specificity,
    ],
})

df_results

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
sns.set(style="darkgrid")

In [None]:
# Grouped bar chart: AUC per classifier, train vs. validation side by side.
plt.figure(figsize=(18, 7))
ax = sns.barplot(x="classifier", y="auc", hue="data_set", data=df_results)
ax.set_xlabel('Classifier',fontsize = 15)
ax.set_ylabel('AUC', fontsize = 15)
ax.tick_params(labelsize=15)

# Put the legend out of the figure
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0., fontsize = 15)

# Save BEFORE show(): once show() has rendered the figure the current figure is
# finished, so a later savefig() writes an empty image. bbox_inches="tight" keeps
# the legend, which sits outside the axes, inside the saved file.
plt.savefig("Model Comparision.jpeg", bbox_inches="tight")
plt.show()

In [None]:
import numpy as np
from sklearn.model_selection import learning_curve
from sklearn.model_selection import ShuffleSplit

def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None,
                        n_jobs=1, train_sizes=np.linspace(.1, 1.0, 5)):
    """
    Plot training and cross-validation ROC AUC against training-set size.

    Parameters
    ----------
    estimator : object implementing "fit" and "predict"
        Passed to ``learning_curve`` together with the data below.

    title : string
        Title for the chart.

    X : array-like, shape (n_samples, n_features)
        Training vectors.

    y : array-like, shape (n_samples,)
        Target relative to X.

    ylim : tuple (ymin, ymax), optional
        Limits of the y axis; left automatic when None.

    cv : int, cross-validation generator or iterable, optional
        Cross-validation splitting strategy, forwarded unchanged to
        ``learning_curve`` (None selects scikit-learn's default).

    n_jobs : integer, optional
        Number of jobs to run in parallel (default 1).

    train_sizes : array-like, optional
        Training-set sizes to evaluate, forwarded to ``learning_curve``
        (default: five fractions evenly spaced from 0.1 to 1.0).

    Returns
    -------
    The ``matplotlib.pyplot`` module, with the figure still open so the caller
    can show or save it.
    """
    plt.figure()
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("Training examples")
    plt.ylabel("AUC")

    sizes, train_scores, cv_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes,
        scoring='roc_auc')

    # (mean, std, colour, legend label) per curve; statistics are taken across
    # the CV splits (axis 1).
    curves = (
        (np.mean(train_scores, axis=1), np.std(train_scores, axis=1),
         "r", "Training score"),
        (np.mean(cv_scores, axis=1), np.std(cv_scores, axis=1),
         "b", "Cross-validation score"),
    )

    plt.grid()

    # Shaded +/- one standard deviation bands first, then the mean lines.
    for mean, std, color, _ in curves:
        plt.fill_between(sizes, mean - std, mean + std, alpha=0.1, color=color)
    for mean, _, color, label in curves:
        plt.plot(sizes, mean, 'o-', color=color, label=label)

    plt.legend(loc="best")
    return plt

In [None]:
# Learning curve for a fresh random forest with the same settings as the baseline `rf`.
title = "Learning Curves (Random Forest)"
# Cross validation with 5 iterations to get smoother mean test and train
# score curves, each time with 20% data randomly selected as a validation set.
cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=42)
estimator = RandomForestClassifier(max_depth = 6, random_state = 42)
# n_jobs=4 runs the fits in parallel; ylim fixes the AUC axis to 0.2–1.01.
plot_learning_curve(estimator, title, X_train_tf, y_train, ylim=(0.2, 1.01), cv=cv, n_jobs=4)

plt.show()

# Feature Importance

### LR

In [None]:
# Logistic-regression coefficients as a signed "importance" per feature, sorted from
# the most positive to the most negative. The model was fitted on standardized
# inputs (X_train_tf).
feature_importances = pd.DataFrame(lr.coef_[0],
                                   index = cols2use,
                                    columns=['importance']).sort_values('importance',
                                                                        ascending=False)

feature_importances.head()

In [None]:
# Maximum number of features per chart (also read by the next cell).
num = 12
ylocs = np.arange(num)

# Keep only features whose coefficient is actually positive. Without the sign
# filter, num == number of features made this "positive" chart show every
# coefficient, negative ones included.
positive_importances = feature_importances[feature_importances['importance'] > 0].iloc[:num]

# Reverse so that the largest coefficient is drawn at the top of the horizontal bars.
values_to_plot = positive_importances.values.ravel()[::-1]
feature_labels = list(positive_importances.index)[::-1]
# Bar positions sized to the filtered rows (may be fewer than `num`).
bar_locs = np.arange(len(feature_labels))

plt.figure(num=None, figsize=(8, 15), dpi=80, facecolor='w', edgecolor='k');
plt.barh(bar_locs, values_to_plot, align = 'center')
plt.ylabel('Features')
plt.xlabel('Importance Score')
plt.title('Positive Feature Importance Score - Logistic Regression')
plt.yticks(bar_locs, feature_labels)
plt.show()

In [None]:
# Keep only features whose coefficient is actually negative. Without the sign
# filter, num == number of features made this "negative" chart show every
# coefficient, positive ones included. `num` comes from the previous cell.
negative_importances = feature_importances[feature_importances['importance'] < 0].iloc[-num:]

# Rows are sorted in descending order, so the most negative coefficient comes last.
values_to_plot = negative_importances.values.ravel()
feature_labels = list(negative_importances.index)
# Bar positions sized to the filtered rows (may be fewer than `num`).
bar_locs = np.arange(len(feature_labels))

plt.figure(num=None, figsize=(8, 15), dpi=80, facecolor='w', edgecolor='k');
plt.barh(bar_locs, values_to_plot, align = 'center')
plt.ylabel('Features')
plt.xlabel('Importance Score')
plt.title('Negative Feature Importance Score - Logistic Regression')
plt.yticks(bar_locs, feature_labels)
plt.show()

### RFC

In [None]:
# Importances of the baseline random forest `rf`, sorted from most to least
# important. NOTE: this reuses (overwrites) the `feature_importances` name that held
# the logistic-regression coefficients.
feature_importances = pd.DataFrame(rf.feature_importances_,
                                   index = cols2use,
                                    columns=['importance']).sort_values('importance',
                                                                        ascending=False)

feature_importances.head()

In [None]:
# Horizontal bar chart of the `num` most important random-forest features.
num = 12
ylocs = np.arange(num)

# Take the top rows and reverse them so the most important feature is drawn on top.
top_importances = feature_importances.iloc[:num]
values_to_plot = top_importances.values.ravel()[::-1]
feature_labels = list(top_importances.index)[::-1]

fig, ax = plt.subplots(figsize=(8, 15), dpi=80, facecolor='w', edgecolor='k')
ax.barh(ylocs, values_to_plot, align = 'center')
ax.set_ylabel('Features')
ax.set_xlabel('Importance Score')
ax.set_title('Feature Importance Score - Random Forest')
ax.set_yticks(ylocs)
ax.set_yticklabels(feature_labels)
plt.show()

In [None]:
feature_importances[:12]

In [None]:
# Hand-copied list of 12 feature names; identical to the reduced `cols2use` list
# defined before the train/validation/test split.
# NOTE(review): presumably transcribed from the random-forest importance table
# above — confirm. The name is not referenced again in the cells that follow.
imp_features = [
    'number_inpatient',
    'discharge_disposition_id_22',
    'number_emergency',
    'number_diagnoses',
    'num_medications',
    'time_in_hospital',
    'num_lab_procedures',
    'insulin_No',
    'age_group',
    'number_outpatient',
    'discharge_disposition_id_3',
    'num_procedures',
]

# Hyperparameter Tuning

In [None]:
rf.get_params()

In [None]:
from sklearn.model_selection import RandomizedSearchCV

# Search space for the random-forest hyperparameter search.

# number of trees
n_estimators = range(200,1000,200)
# maximum number of features to use at each split
# Fix: 'auto' (an alias of 'sqrt' for classifiers) has been removed from current
# scikit-learn releases and is rejected at fit time; 'sqrt' and 'log2' are accepted
# by old and new versions alike.
max_features = ['sqrt','log2']
# maximum depth of the tree
max_depth = range(1,10,1)
# minimum number of samples to split a node
min_samples_split = range(2,10,2)
# criterion for evaluating a split
criterion = ['gini','entropy']

# random grid

random_grid = {'n_estimators':n_estimators,
              'max_features':max_features,
              'max_depth':max_depth,
              'min_samples_split':min_samples_split,
              'criterion':criterion}

print(random_grid)

In [None]:
from sklearn.metrics import make_scorer, roc_auc_score
# Score candidates by ROC AUC computed from continuous prediction scores.
# Fix: make_scorer(roc_auc_score) with default arguments feeds the hard 0/1 output
# of predict() to roc_auc_score, which reduces the ROC curve to a single operating
# point. The built-in 'roc_auc' scorer uses the estimator's scores instead.
auc_scoring = 'roc_auc'

# create the randomized search cross-validation
# 20 sampled parameter combinations, each evaluated with 2-fold CV.
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid,
                               n_iter = 20, cv = 2, scoring=auc_scoring,
                               verbose = 1, random_state = 42)

In [None]:
# fit the random search model (this will take a few minutes)
t1 = time.time()
rf_random.fit(X_train_tf, y_train)
t2 = time.time()
# Wall-clock duration of the search, in seconds.
print(t2-t1)

# Best parameter combination found by the search.
rf_random.best_params_

In [None]:
# --- Baseline forest (the `rf` fitted earlier) ---
y_train_preds = rf.predict_proba(X_train_tf)[:, 1]
y_valid_preds = rf.predict_proba(X_valid_tf)[:, 1]

print('Baseline Random Forest')
rf_train_auc_base = roc_auc_score(y_train, y_train_preds)
rf_valid_auc_base = roc_auc_score(y_valid, y_valid_preds)
print('Training AUC:%.3f' % rf_train_auc_base)
print('Validation AUC:%.3f' % rf_valid_auc_base)

# --- Best estimator found by the randomized search ---
print('Optimized Random Forest')
tuned_rf = rf_random.best_estimator_
y_train_preds_random = tuned_rf.predict_proba(X_train_tf)[:, 1]
y_valid_preds_random = tuned_rf.predict_proba(X_valid_tf)[:, 1]

# NOTE: these two names previously held the baseline forest's AUCs (they feed
# df_results); from here on they refer to the tuned model.
rf_train_auc = roc_auc_score(y_train, y_train_preds_random)
rf_valid_auc = roc_auc_score(y_valid, y_valid_preds_random)
print('Training AUC:%.3f' % rf_train_auc)
print('Validation AUC:%.3f' % rf_valid_auc)

# Downloading Model

In [None]:
# Persist the tuned forest with pickle protocol 4. `with` guarantees the file is
# flushed and closed; the bare open() call used before never closed its handle.
with open('reAdmissionDiabeticsModel.pkl', 'wb') as model_file:
    pickle.dump(rf_random.best_estimator_, model_file, protocol = 4)

# Model Evaluation

In [None]:
# Held-out test features and labels.
X_test = df_test[cols2use].values
y_test = df_test['OUTPUT_LABEL'].values

# Reload the persisted scaler and model through context managers so the file
# handles are closed (the bare open() calls used before leaked them).
# NOTE: pickle.load can execute arbitrary code — only load files written by this
# notebook or another trusted source.
with open('./StndSclr.sav', 'rb') as scaler_file:
    scaler = pickle.load(scaler_file)
X_test_tf = scaler.transform(X_test)

with open('./reAdmissionDiabeticsModel.pkl', 'rb') as model_file:
    best_model = pickle.load(model_file)

In [None]:
df_test[cols2use]

In [None]:
# Positive-class scores from the reloaded model on all three splits.
y_train_preds = best_model.predict_proba(X_train_tf)[:, 1]
y_valid_preds = best_model.predict_proba(X_valid_tf)[:, 1]
y_test_preds = best_model.predict_proba(X_test_tf)[:, 1]

thresh = 0.5

print('Training:')
(train_auc, train_accuracy, train_recall,
 train_precision, train_specificity) = print_report(y_train, y_train_preds, thresh)
print('Validation:')
(valid_auc, valid_accuracy, valid_recall,
 valid_precision, valid_specificity) = print_report(y_valid, y_valid_preds, thresh)
print('Test:')
(test_auc, test_accuracy, test_recall,
 test_precision, test_specificity) = print_report(y_test, y_test_preds, thresh)

In [None]:
from sklearn.metrics import roc_curve 

# ROC curve points for each split.
fpr_train, tpr_train, thresholds_train = roc_curve(y_train, y_train_preds)
fpr_valid, tpr_valid, thresholds_valid = roc_curve(y_valid, y_valid_preds)
fpr_test, tpr_test, thresholds_test = roc_curve(y_test, y_test_preds)

# Area under each curve, shown in the legend.
auc_train = roc_auc_score(y_train, y_train_preds)
auc_valid = roc_auc_score(y_valid, y_valid_preds)
auc_test = roc_auc_score(y_test, y_test_preds)

plt.figure(figsize = (20, 7))
roc_series = (
    (fpr_train, tpr_train, 'r-', 'Train AUC:%.3f'%auc_train),
    (fpr_valid, tpr_valid, 'b-', 'Valid AUC:%.3f'%auc_valid),
    (fpr_test, tpr_test, 'g-', 'Test AUC:%.3f'%auc_test),
)
for fpr, tpr, line_fmt, legend_label in roc_series:
    plt.plot(fpr, tpr, line_fmt, label = legend_label)
# Diagonal reference line (chance level).
plt.plot([0,1],[0,1],'k--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend()
plt.show()

In [None]:
cols2use

In [None]:
# One independent empty list per selected feature.
vals = [[] for _ in cols2use]
vals

In [None]:
dict(zip(cols2use, vals))

In [None]:
np.array(cols2use)

In [None]:
X_test[0]

In [None]:
X_test_tf[0]