# Capstone 2 - Diabetes Patients’ Readmission Prediction

## Modeling 

#### Code written by: Rayees Ahamed 

**Steps:**
1. Importing necessary packages
2. Loading and verifying the scaled data
3. Training & test split
4. Model 1: Logistic Regression
5. Model 2: k-Nearest Neighbors (kNN)
6. Model 3: Random Forest Model
7. Hyperparameter tuning
8. Final model selection
9. Making pipelines
10. Summary & deployment

## Imports

In [148]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_curve, auc

from sklearn.pipeline import make_pipeline

import warnings
warnings.filterwarnings('ignore')

## Loading the data

In [149]:
Diabetes = pd.read_csv('data/Diabetes_processed_NEW.csv')
Diabetes.head()

Unnamed: 0,gender,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,number_diagnoses,max_glu_serum,...,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,readmitted,age_mean
0,Female,3,59,0,18,0,0,0,9,,...,No,Up,No,No,No,No,No,Yes,>30,15.0
1,Female,2,11,5,13,2,0,1,6,,...,No,No,No,No,No,No,No,No,NO,25.0
2,Male,2,44,1,16,0,0,0,7,,...,No,Up,No,No,No,No,No,Yes,NO,35.0
3,Male,1,51,0,8,0,0,0,5,,...,No,Steady,No,No,No,No,No,Yes,NO,45.0
4,Male,3,31,6,16,0,0,0,9,,...,No,Steady,No,No,No,No,No,No,>30,55.0


In [150]:
Diabetes.columns

Index(['gender', 'time_in_hospital', 'num_lab_procedures', 'num_procedures',
       'num_medications', 'number_outpatient', 'number_emergency',
       'number_inpatient', 'number_diagnoses', 'max_glu_serum', 'A1Cresult',
       'metformin', 'repaglinide', 'nateglinide', 'chlorpropamide',
       'glimepiride', 'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide',
       'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone',
       'tolazamide', 'examide', 'citoglipton', 'insulin',
       'glyburide-metformin', 'glipizide-metformin',
       'glimepiride-pioglitazone', 'metformin-rosiglitazone',
       'metformin-pioglitazone', 'change', 'readmitted', 'age_mean'],
      dtype='object')

## Preprocessing steps

In [151]:
# X variable assignment 
X = Diabetes.drop('readmitted', axis=1)

In [152]:
# Getting dummies
X = pd.get_dummies(X)

In [153]:
# Multiclass to binary class
Diabetes['readmitted'].unique()

array(['>30', 'NO', '<30'], dtype=object)

In [158]:
df_copy = Diabetes

In [160]:
df_copy['readmitted'].replace(to_replace='>30', value='YES', inplace=True)
df_copy['readmitted'].replace(to_replace='<30', value='YES', inplace=True)
df_copy.readmitted.value_counts() / len(df_copy.readmitted) * 100

NO     52.235168
YES    47.764832
Name: readmitted, dtype: float64

In [161]:
y_copy = df_copy['readmitted']

In [162]:
# y variable assignment 
y = Diabetes['readmitted']

## Train/Test split

In [163]:
# Train & test set split
X_train, X_test, y_train, y_test = train_test_split(X, y_copy, test_size=0.25, random_state=111)

In [164]:
# Verifying X shape
X_train.shape, X_test.shape

((58770, 92), (19591, 92))

In [165]:
# Verifying y shape
y_train.shape, y_test.shape

((58770,), (19591,))

In [166]:
# Checking datatype once
X_train.dtypes != 'object'

time_in_hospital                  True
num_lab_procedures                True
num_procedures                    True
num_medications                   True
number_outpatient                 True
                                  ... 
metformin-rosiglitazone_Steady    True
metformin-pioglitazone_No         True
metformin-pioglitazone_Steady     True
change_No                         True
change_Yes                        True
Length: 92, dtype: bool

## Scaling the data

In [167]:
# Making a scaler object
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

## Verifying the scaling

In [168]:
# Verifying the mean of scaled_df (mean =0)
X_train_scaled.mean()

-3.49650725728912e-16

In [169]:
# Verifying the standard deviation (std = 1)
X_test_scaled.std(ddof=0)

1.012230708915198

## Model 1: Logistic regression

In [170]:
lg_model = LogisticRegression()
lg_model.fit(X_train_scaled, y_train)

LogisticRegression()

In [171]:
y_pred1 = lg_model.predict(X_test_scaled)

In [172]:
print(accuracy_score(y_pred1, y_test))

0.605992547598387


### Tuning the regularization strength (C-value)

In [174]:
Cs = [0.001, 0.1, 1, 10, 100]

# Looping c values
for c in Cs:
    lr_Cs = LogisticRegression(C=c, n_jobs=-1)
    lr_Cs.fit(X_train_scaled, y_train)
    y_pred_Cs = lr_Cs.predict(X_test_scaled)
    print('C value:', c, ' - average CV score: ', accuracy_score(y_pred_Cs, y_test))

C value: 0.001  - average CV score:  0.605277933745087
C value: 0.1  - average CV score:  0.605992547598387
C value: 1  - average CV score:  0.605992547598387
C value: 10  - average CV score:  0.605992547598387
C value: 100  - average CV score:  0.605992547598387


### Assessing the performance using cross validation

In [175]:
# CV=10
cv_lg_model = cross_validate(lg_model, X_train_scaled, y_train, cv=10)

In [176]:
cv_lg_model['test_score']

array([0.61068572, 0.59622256, 0.60762294, 0.60660201, 0.6062617 ,
       0.61374851, 0.60966479, 0.60864387, 0.60643185, 0.61783223])

## Model 2: KNearest Neighbors Classifier

In [177]:
# Initial model (k=3)
knn = KNeighborsClassifier(n_neighbors=3, n_jobs=-1)
knn.fit(X_train_scaled, y_train)
print('Train score:', knn.score(X_train_scaled, y_train))
print('Test score:', knn.score(X_test_scaled, y_test))

Train score: 0.7706653054279394
Test score: 0.553621560920831


### Finding the right 'k' neighbors values

In [179]:
test_scores = []
train_scores = []

for k in range(1,10):
    knn = KNeighborsClassifier(n_neighbors=k, n_jobs=-1)
    knn.fit(X_train_scaled, y_train)
    
    train_scores.append(knn.score(X_train_scaled, y_train))
    test_scores.append(knn.score(X_test_scaled, y_test))

In [180]:
# Printing the scores according to k
knn_scores = pd.DataFrame({'train_scores':train_scores, 'test_scores':test_scores})
knn_scores['k'] = list(range(1,10))
knn_scores.set_index('k', inplace=True)
print(knn_scores)

   train_scores  test_scores
k                           
1      0.999813     0.543004
2      0.764233     0.551784
3      0.770665     0.553622
4      0.709920     0.559441
5      0.716369     0.556633
6      0.685979     0.563167
7      0.690522     0.566689
8      0.671941     0.566638
9      0.675957     0.565566


### Cross validation kNN

In [181]:
cv_knn = cross_validate(knn, X_train_scaled, y_train, cv=10)

In [183]:
print('CV-kNN, Test scores:', cv_knn['test_score'])

CV-kNN, Test scores: [0.56168113 0.56185128 0.56236175 0.57699507 0.54415518 0.58124894
 0.5746129  0.56678578 0.55946912 0.57291135]


In [184]:
y_pred = knn.predict(X_test_scaled)

In [185]:
cr_knn = classification_report(y_test, y_pred)
print(cr_knn)

              precision    recall  f1-score   support

          NO       0.57      0.64      0.61     10207
         YES       0.55      0.48      0.52      9384

    accuracy                           0.57     19591
   macro avg       0.56      0.56      0.56     19591
weighted avg       0.56      0.57      0.56     19591



### Grid Search CV - kNN

In [186]:
param_grid = {'n_neighbors':np.arange(1,50)}
knn = KNeighborsClassifier()
knn_gridCV = GridSearchCV(knn, param_grid, cv=5)
knn_gridCV.fit(X_train_scaled, y_train)

GridSearchCV(cv=5, estimator=KNeighborsClassifier(),
             param_grid={'n_neighbors': array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34,
       35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49])})

In [187]:
print('Best score:' + str(knn_gridCV.best_score_))
print('Best param:' + str(knn_gridCV.best_params_))

Best score:0.5780500255232262
Best param:{'n_neighbors': 49}
