# Modeling

In [115]:
#import libraries
import pandas as pd
import numpy as np
import os
import pickle
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import __version__ as sklearn_version
from sklearn.model_selection import cross_validate

from sklearn.model_selection import GridSearchCV

from sklearn.linear_model import LogisticRegression
from imblearn.over_sampling import RandomOverSampler

from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import (roc_auc_score, accuracy_score, precision_score,
                             recall_score, f1_score)
from sklearn.metrics import (auc, roc_curve, precision_recall_curve,
                             plot_precision_recall_curve, average_precision_score)
from sklearn.metrics import precision_recall_fscore_support

from sklearn.feature_selection import SelectKBest, f_classif
from imblearn.pipeline import Pipeline

import os
import warnings
warnings.filterwarnings('ignore')

In [116]:
model_path = '../models/CVD_prediction_model.pkl'
if os.path.exists(model_path):
    with open(model_path, 'rb') as f:
        model = pickle.load(f)
else:
    print("Expected model not found")

In [117]:
model

Pipeline(steps=[('selectkbest', SelectKBest(k=5)), ('ros', RandomOverSampler()),
                ('LR', LogisticRegression(C=0.001))])

In [118]:
X_train = pd.read_csv('../data/X_train.csv')
y_train = pd.read_csv('../data/y_train.csv')
X_test = pd.read_csv('../data/X_test.csv')
y_test = pd.read_csv('../data/y_test.csv')
y_test = np.array(y_test).ravel()
y_train = np.array(y_train).ravel()

In [129]:
model.fit(X_train, y_train)

y_pred_final = model.predict(X_test)

y_predprob_final = model.predict_proba(X_test)[:,1]

print("\nConfusion Matrix: \n", confusion_matrix(y_test, y_pred_final))
print('')
print('Precision: %.4f' %(precision_score(y_test, y_pred_final)))
print('Recall: %.4f' %(recall_score(y_test, y_pred_final)))
print('')
precision, recall, thresholds = precision_recall_curve(y_test, y_predprob_final)

print('Area under PRC: %.4f'%(auc(recall, precision)))

fp, tp, th = roc_curve(y_test, y_pred_final)
print('Area under ROC: %.4f'%(auc(fp, tp)))


Confusion Matrix: 
 [[1831  745]
 [  68  254]]

Precision: 0.2543
Recall: 0.7888

Area under PRC: 0.3362
Area under ROC: 0.7498


In [122]:
X_train_feat = X_train[['age', 'family_history_cat', 'diabetes_cat', 'kidney_cat', 'smoking_cat']]
X_test_feat = X_test[['age', 'family_history_cat', 'diabetes_cat', 'kidney_cat', 'smoking_cat']]

model.fit(X_train_feat, y_train)

y_pred_final_feat = model.predict(X_test_feat)

y_predprob_final_feat = model.predict_proba(X_test_feat)[:,1]

print("\nConfusion Matrix: \n", confusion_matrix(y_test, y_pred_final_feat))
print('')
print('Precision: %.4f' %(precision_score(y_test, y_pred_final_feat)))
print('Recall: %.4f' %(recall_score(y_test, y_pred_final_feat)))
print('')
precision, recall, thresholds = precision_recall_curve(y_test, y_predprob_final_feat)

print('Area under PRC: %.4f'%(auc(recall, precision)))

fp, tp, th = roc_curve(y_test, y_pred_final_feat)
print('Area under ROC: %.4f'%(auc(fp, tp)))


Confusion Matrix: 
 [[1855  721]
 [  57  265]]

Precision: 0.2688
Recall: 0.8230

Area under PRC: 0.3615
Area under ROC: 0.7715


In [130]:
import ipywidgets as widgets
import ipywidgets
from IPython.display import clear_output

In [131]:
style = {'description_width':'350px'} 
layout = {'width': '450px'}

gender = widgets.Dropdown(
       options=['male', 'female'],
       value='male',
       description="What's your gender?", style=style, layout=layout)

age = widgets.IntText(
    value=35,
    description="What's your age?", style=style, layout=layout,
    disabled=False)

fam_history = widgets.Dropdown(
       options=['yes', 'no', "don't know"],
       value='yes',
       description='Do you have a family member/relative with heart disease?', style=style, layout=layout)

diabetes = widgets.Dropdown(
       options=['yes', 'no', 'borderline', "don't know"],
       value='yes',
       description='Do you have diabetes?', style=style, layout=layout)

kidney = widgets.Dropdown(
       options=['yes', 'no'],
       value='yes',
       description='Have you had kidney failure?', style=style, layout=layout)

smoking = widgets.Dropdown(
       options=['yes', 'no'],
       value='yes',
       description='Do you smoke or have smoked in the past?', style=style, layout=layout)

In [132]:
box = widgets.VBox([age, fam_history, diabetes, kidney, smoking])
display(box)
button = widgets.Button(description='Get my heart disease risk!', icon='fa-heartbeat', layout={'width': '200px'})
display(button)
out = widgets.Output()
display(out)

def onButtonClick(b):
    inp = pd.DataFrame(columns=['age', 'family_history_cat', 'diabetes_cat', 'kidney_cat', 'smoking_cat'])
    inp_df = inp.append({'gender': gender.value, 'age': age.value, 'family_history_cat': fam_history.value,
           'diabetes_cat': diabetes.value, 'kidney_cat': kidney.value, 'smoking_cat':smoking.value}, ignore_index=True)
    to_scale = 'age'
    not_to_scale = ['family_history_cat', 'diabetes_cat', 'kidney_cat', 'smoking_cat']
    scaler = StandardScaler()
    scaled_val = pd.DataFrame(scaler.fit_transform(np.array(X_tr[to_scale]).reshape(-1,1)))
    scaled_val = pd.DataFrame(scaler.transform(np.array(inp_df[to_scale]).reshape(-1,1)),
                          columns=['age']).reset_index(drop=True)
    risk_input=pd.concat([scaled_val, inp_df[not_to_scale].reset_index(drop=True)], axis=1)
    risk_input['family_history_cat'].replace(['yes','no', "don't know"],[1, 0, 0],inplace=True)
    risk_input['diabetes_cat'].replace(['yes','no', 'borderline', "don't know"], [1, 0, 1, 0],inplace=True)
    risk_input['kidney_cat'].replace(['yes','no'], [1, 0],inplace=True)
    risk_input['smoking_cat'].replace(['yes','no'],[1, 0],inplace=True)
    
    steps = [('rus',RandomUnderSampler(random_state=42)), ('LR', LogisticRegression(random_state=42))]
    pipeline = Pipeline(steps)
    params = [{'LR__C':[.001,.01,.1,1,10,100]}]
    lrus_cv = GridSearchCV(pipeline, params, cv=5)
    X_train_2 = X_train[['age', 'family_history_cat', 'diabetes_cat', 'kidney_cat', 'smoking_cat']]
    X_val_2 = risk_input
    lrus_cv.fit(X_train_2, y_train)
    y_pred = lrus_cv.predict(X_val_2)
    y_predprob = lrus_cv.predict_proba(X_val_2)[:,1]
    
    with out:
        clear_output(True)
        print('You have a {}% chance of having a cardiovascular disease.'.format(int(y_predprob*100)))

button.on_click(onButtonClick)

VBox(children=(IntText(value=35, description="What's your age?", layout=Layout(width='450px'), style=Descripti…

Button(description='Get my heart disease risk!', icon='heartbeat', layout=Layout(width='200px'), style=ButtonS…

Output()