# Capstone 2 - Diabetes Patients’ Readmission Prediction

## Modeling 

#### Code written by: Rayees Ahamed 

**Steps:**
1. Importing necessary packages
2. Loading and verifying the scaled data
3. Training & test split
4. Model 1: Logistic Regression
5. Model 2: k-Nearest Neighbors (kNN)
6. Model 3: Random Forest Model
7. Hyperparameter tuning
8. Final model selection
9. Making pipelines
10. Summary & deployment

## Imports

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_curve, auc

from sklearn.pipeline import make_pipeline

import warnings
warnings.filterwarnings('ignore')

## Loading the data

In [2]:
df = pd.read_csv('data/Diabetes_SOLVED_NEW.csv')
df.head()

Unnamed: 0,race,gender,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_diagnoses,max_glu_serum,A1Cresult,metformin,...,service_utilization_log1p,num_medications|time_in_hospital,num_medications|num_procedures,time_in_hospital|num_lab_procedures,num_medications|num_lab_procedures,num_medications|number_diagnoses,age_mean|number_diagnoses,change|num_medications,number_diagnoses|time_in_hospital,num_medications|numchange
0,Caucasian,0,3,59,0,18,9,-99,-99,0,...,0.0,54,0,177,1062,162,135.0,18,27,18
1,AfricanAmerican,0,2,11,5,13,6,-99,-99,0,...,1.386294,26,65,22,143,78,150.0,0,12,0
2,Caucasian,1,2,44,1,16,7,-99,-99,0,...,0.0,32,16,88,704,112,245.0,16,14,16
3,Caucasian,1,1,51,0,8,5,-99,-99,0,...,0.0,8,0,51,408,40,225.0,8,5,0
4,Caucasian,1,3,31,6,16,9,-99,-99,0,...,0.0,48,96,93,496,144,495.0,0,27,0


In [3]:
df.columns

Index(['race', 'gender', 'time_in_hospital', 'num_lab_procedures',
       'num_procedures', 'num_medications', 'number_diagnoses',
       'max_glu_serum', 'A1Cresult', 'metformin', 'repaglinide', 'nateglinide',
       'chlorpropamide', 'glimepiride', 'acetohexamide', 'glipizide',
       'glyburide', 'tolbutamide', 'pioglitazone', 'rosiglitazone', 'acarbose',
       'miglitol', 'troglitazone', 'tolazamide', 'insulin',
       'glyburide-metformin', 'glipizide-metformin', 'metformin-rosiglitazone',
       'metformin-pioglitazone', 'change', 'diabetesMed', 'readmitted',
       'numchange', 'age_mean', 'nummed', 'number_inpatient_log1p',
       'number_emergency_log1p', 'number_outpatient_log1p',
       'service_utilization_log1p', 'num_medications|time_in_hospital',
       'num_medications|num_procedures', 'time_in_hospital|num_lab_procedures',
       'num_medications|num_lab_procedures',
       'num_medications|number_diagnoses', 'age_mean|number_diagnoses',
       'change|num_medications

## Preprocessing steps

In [4]:
# X variable assignment 
X = df.drop('readmitted', axis=1)

In [5]:
# Getting dummies
X = pd.get_dummies(X)

In [6]:
X.head(10).T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
gender,0.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0
time_in_hospital,3.0,2.0,2.0,1.0,3.0,4.0,5.0,13.0,12.0,9.0
num_lab_procedures,59.0,11.0,44.0,51.0,31.0,70.0,73.0,68.0,33.0,47.0
num_procedures,0.0,5.0,1.0,0.0,6.0,1.0,0.0,2.0,3.0,2.0
num_medications,18.0,13.0,16.0,8.0,16.0,21.0,12.0,28.0,18.0,17.0
number_diagnoses,9.0,6.0,7.0,5.0,9.0,7.0,8.0,8.0,8.0,9.0
max_glu_serum,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0
A1Cresult,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0
metformin,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
repaglinide,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
# Multiclass to binary class
df['readmitted'].value_counts()

0    64163
1     6250
Name: readmitted, dtype: int64

In [10]:
# y variable assignment 
y = df['readmitted']

## Train/Test split

In [14]:
# Train & test set split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=110, stratify=y)

In [15]:
# Verifying X shape
X_train.shape, X_test.shape

((52809, 51), (17604, 51))

In [16]:
# Verifying y shape
y_train.shape, y_test.shape

((52809,), (17604,))

In [18]:
# Checking datatype once
X_train.dtypes != 'object'

gender                                 True
time_in_hospital                       True
num_lab_procedures                     True
num_procedures                         True
num_medications                        True
number_diagnoses                       True
max_glu_serum                          True
A1Cresult                              True
metformin                              True
repaglinide                            True
nateglinide                            True
chlorpropamide                         True
glimepiride                            True
acetohexamide                          True
glipizide                              True
glyburide                              True
tolbutamide                            True
pioglitazone                           True
rosiglitazone                          True
acarbose                               True
miglitol                               True
troglitazone                           True
tolazamide                      

## Scaling the data

In [19]:
# Making a scaler object
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

## Verifying the scaling

In [20]:
# Verifying the mean of scaled_df (mean =0)
X_train_scaled.mean()

7.281505234728174e-18

In [21]:
# Verifying the standard deviation (std = 1)
X_test_scaled.std(ddof=0)

0.992539162888569

## Model 1: Logistic regression

In [22]:
lg_model = LogisticRegression()
lg_model.fit(X_train_scaled, y_train)

LogisticRegression()

In [23]:
y_pred1 = lg_model.predict(X_test_scaled)

In [24]:
print(accuracy_score(y_pred1, y_test))

0.9112133605998637


### Tuning the regularization strength (C-value)

In [25]:
Cs = [0.001, 0.1, 1, 10, 100]

# Looping c values
for c in Cs:
    lr_Cs = LogisticRegression(C=c, n_jobs=-1)
    lr_Cs.fit(X_train_scaled, y_train)
    y_pred_Cs = lr_Cs.predict(X_test_scaled)
    print('C value:', c, ' - average CV score: ', accuracy_score(y_pred_Cs, y_test))

C value: 0.001  - average CV score:  0.9112133605998637
C value: 0.1  - average CV score:  0.9112133605998637
C value: 1  - average CV score:  0.9112133605998637
C value: 10  - average CV score:  0.9112133605998637
C value: 100  - average CV score:  0.9112133605998637


### Assessing the performance using cross validation

In [26]:
# CV=10
cv_lg_model = cross_validate(lg_model, X_train_scaled, y_train, cv=10)

In [27]:
cv_lg_model['test_score']

array([0.91138042, 0.91119106, 0.91119106, 0.91119106, 0.91119106,
       0.91119106, 0.91119106, 0.91119106, 0.91119106, 0.91136364])

## Model 2: KNearest Neighbors Classifier

In [28]:
# Initial model (k=3)
knn = KNeighborsClassifier(n_neighbors=3, n_jobs=-1)
knn.fit(X_train_scaled, y_train)
print('Train score:', knn.score(X_train_scaled, y_train))
print('Test score:', knn.score(X_test_scaled, y_test))

Train score: 0.9202787403662255
Test score: 0.8929788684389911


In [29]:
y_pred = knn.predict(X_test_scaled)

In [30]:
cr_knn = classification_report(y_test, y_pred)
print(cr_knn)

              precision    recall  f1-score   support

           0       0.91      0.98      0.94     16041
           1       0.12      0.03      0.05      1563

    accuracy                           0.89     17604
   macro avg       0.52      0.51      0.50     17604
weighted avg       0.84      0.89      0.86     17604



### Finding the right 'k' neighbors values

In [179]:
test_scores = []
train_scores = []

for k in range(1,10):
    knn = KNeighborsClassifier(n_neighbors=k, n_jobs=-1)
    knn.fit(X_train_scaled, y_train)
    
    train_scores.append(knn.score(X_train_scaled, y_train))
    test_scores.append(knn.score(X_test_scaled, y_test))

In [180]:
# Printing the scores according to k
knn_scores = pd.DataFrame({'train_scores':train_scores, 'test_scores':test_scores})
knn_scores['k'] = list(range(1,10))
knn_scores.set_index('k', inplace=True)
print(knn_scores)

   train_scores  test_scores
k                           
1      0.999813     0.543004
2      0.764233     0.551784
3      0.770665     0.553622
4      0.709920     0.559441
5      0.716369     0.556633
6      0.685979     0.563167
7      0.690522     0.566689
8      0.671941     0.566638
9      0.675957     0.565566


### Cross validation kNN

In [181]:
cv_knn = cross_validate(knn, X_train_scaled, y_train, cv=10)

In [183]:
print('CV-kNN, Test scores:', cv_knn['test_score'])

CV-kNN, Test scores: [0.56168113 0.56185128 0.56236175 0.57699507 0.54415518 0.58124894
 0.5746129  0.56678578 0.55946912 0.57291135]


In [184]:
y_pred = knn.predict(X_test_scaled)

In [185]:
cr_knn = classification_report(y_test, y_pred)
print(cr_knn)

              precision    recall  f1-score   support

          NO       0.57      0.64      0.61     10207
         YES       0.55      0.48      0.52      9384

    accuracy                           0.57     19591
   macro avg       0.56      0.56      0.56     19591
weighted avg       0.56      0.57      0.56     19591



### Grid Search CV - kNN

In [186]:
param_grid = {'n_neighbors':np.arange(1,50)}
knn = KNeighborsClassifier()
knn_gridCV = GridSearchCV(knn, param_grid, cv=5)
knn_gridCV.fit(X_train_scaled, y_train)

GridSearchCV(cv=5, estimator=KNeighborsClassifier(),
             param_grid={'n_neighbors': array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34,
       35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49])})

In [187]:
print('Best score:' + str(knn_gridCV.best_score_))
print('Best param:' + str(knn_gridCV.best_params_))

Best score:0.5780500255232262
Best param:{'n_neighbors': 49}
