## Building the Logistic Regression Model
## John Sokol
### 4/18/2018

### References: [Official Tableau Blog](https://www.tableau.com/about/blog/2017/1/building-advanced-analytics-applications-tabpy-64916), [UCI Machine Learning Repository](http://archive.ics.uci.edu/ml/datasets/Heart+Disease) 

In [1]:
cd /Users/johnsokol/Desktop/Health\ Disease\ ML\ Project/Datasets/

/Users/johnsokol/Desktop/Health Disease ML Project/Datasets


In [2]:
# Data manipulation and quantitative libraries
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
import seaborn as sns

# sci-kit functions for data processing and logisitic regression
from sklearn.preprocessing import StandardScaler
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn import metrics, svm, ensemble

# keras functions to build neural network
from keras.layers import Dense, Dropout
from keras.models import Sequential
from keras.utils import to_categorical
from keras.callbacks import EarlyStopping
from keras import regularizers 
from keras.layers import BatchNormalization, Dense, Flatten, InputLayer
from keras.models import load_model

# logistic regression functions
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.linear_model import LogisticRegressionCV
from sklearn.metrics import accuracy_score, classification_report
from sklearn.cross_validation import cross_val_score, cross_val_predict, StratifiedKFold 
from sklearn.naive_bayes import GaussianNB
from sklearn.grid_search import GridSearchCV

# TabPy client for function deployment
import tabpy_client

import warnings
warnings.filterwarnings("ignore")

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


Import the data from the Cleveland and Long Beach UCI datasets: 
Click [here](http://archive.ics.uci.edu/ml/datasets/Heart+Disease) for the link to the repository. 

In [3]:
cleveland = pd.read_csv('cleveland_clean_4.csv', sep = ',', header = 0)
longbeach = pd.read_csv('long-beach-va_storage_1.csv', sep = ',', header = 2)

# concatenate the two datasets into one master DataFrame
master_df = pd.concat([cleveland, longbeach])

Store pertinent features in separate DataFrame to build the neural network:

In [4]:
nn_attr = master_df.loc[:,['age','sex','resting_blood_pressure','cholesterol','cigarettes_per_day','years_as_smoker',
'fasting_blood_sugar','hist_heart_dis','resting_hr','max_hr_ach','mets', 'tpeakbps', 'exer_ind_angina', 'rldv5e']]

# view summary statistics for outliers to ensure data is clean
nn_attr.describe()

Unnamed: 0,age,sex,resting_blood_pressure,cholesterol,cigarettes_per_day,years_as_smoker,fasting_blood_sugar,hist_heart_dis,resting_hr,max_hr_ach,mets,tpeakbps,exer_ind_angina,rldv5e
count,375.0,375.0,375.0,375.0,375.0,375.0,375.0,375.0,375.0,375.0,375.0,375.0,375.0,375.0
mean,55.749333,0.744,132.584,246.674667,17.050667,16.730667,0.197333,0.589333,73.933333,143.002667,8.8576,166.592,0.418667,96.810667
std,9.113139,0.437005,17.93997,51.385101,18.39234,16.044873,0.398518,0.492612,13.678639,25.771051,3.195734,23.77516,0.494,61.925727
min,29.0,0.0,94.0,100.0,0.0,0.0,0.0,0.0,37.0,69.0,2.0,84.0,0.0,4.0
25%,50.0,0.0,120.0,213.0,0.0,0.0,0.0,0.0,64.0,125.0,6.95,151.0,0.0,29.5
50%,57.0,1.0,130.0,240.0,20.0,20.0,0.0,1.0,73.0,147.0,9.0,165.0,0.0,103.0
75%,62.0,1.0,142.0,274.0,30.0,30.0,0.0,1.0,83.0,162.0,11.0,182.0,1.0,140.0
max,77.0,1.0,200.0,564.0,99.0,54.0,1.0,1.0,139.0,202.0,18.0,232.0,1.0,270.0


In [5]:
nn_attr.shape

(375, 14)

Scale and preprocess the data:

In [6]:
scaler = preprocessing.StandardScaler().fit(nn_attr)
nn_attr_scaled = scaler.transform(nn_attr)

target = master_df[['DIAGNOSIS']].reset_index(drop = True).replace([2,3,4], value = [1,1,1])

X_train, X_test, y_train, y_test = train_test_split(
nn_attr_scaled, target, train_size = 0.75, random_state = 42)

y_train_cat = to_categorical(y_train)
y_test_cat = to_categorical(y_test)

In [7]:
y_test.head()

Unnamed: 0,DIAGNOSIS
167,0
33,0
15,0
316,0
57,1


In [8]:
y_test_cat[0:5,]

array([[1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [0., 1.]])

Compare predicted diagnosis to labeled diagnosis for test dataset:

In [9]:
import itertools
import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import confusion_matrix

# Required in Jupyter Notebook to show plots in line
%matplotlib inline

def conf_matrix(y_test, y_test_predict, classes, title='Confusion Matrix', out=None):
    """ Creates a Confusion Matrix comparing the output test data vs. predicted
        output test data. 

        Inputs:
        y_test: Actual output test data
        y_test_predict: Predicted output test data
        classes: List of valid classes
        title: Title of the plot [Default = 'Confusion Matrix']
        out: Path to save the plot to [Default = None => Only show it]
    """
    # Converts both output arrays into just one column based on the class
    y_test_predict_class = y_test_predict.argmax(1)
    y_test_class = y_test.argmax(1)

    # Creates confusion matrix
    cm_data = confusion_matrix(y_test_class, y_test_predict_class)
    np.set_printoptions(precision=2)

    # Plots Confusion Matrix
    plt.figure()
    plt.imshow(cm_data, interpolation='nearest', cmap=plt.cm.Blues)
    plt.title(title)
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=90)
    plt.xlabel('Predicted Label')
    plt.yticks(tick_marks, classes)
    plt.ylabel('True Label')

    # Plots data on chart
    thresh = cm_data.max() / 2.
    for i, j in itertools.product(range(cm_data.shape[0]), range(cm_data.shape[1])):
        plt.text(j, i, format(cm_data[i, j], 'd'),
                 horizontalalignment="center",
                 color="white" if cm_data[i, j] > thresh else "black")

    plt.tight_layout()

    # Saves or Shows Plot
    if out:
        plt.savefig(out)
    else:
        plt.show()

In [10]:
# List of valid classes
classes = {0: 'No Presence',
           1: 'Presence'}
classes_list = list(classes.values())

In [11]:
# conf_matrix(y_test_cat, pred, classes_list, title='Heart Disease NN Confusion Matrix')

Logistic Regression: K-folds cross validation.
K-folds cross validation should increase model error out accuracy considering the model uses _all_ observations for model training. The observations are randomly divided into train and test datasets, the model is fitted, then the model is randomly divided again, fitted, and so on, K times with resampling. So all observations are used as training data to fit the model. To determine final accuracy output, all of the model accuracies are averaged for a cumulative accuracy score. This model uses 10 fold cross validation. 

In [12]:
# establish K-folds cross validation
kf = StratifiedKFold(np.reshape(target.values, [375,]), n_folds = 10, random_state = None, shuffle = True)

In [13]:
# Logistic regression w/ 10 fold stratified cross-validation using model specific cross-validation in scikit-learn
lgcv = LogisticRegressionCV(Cs=list(np.power(10.0, np.arange(-10, 10))),penalty = 'l2',scoring = 'roc_auc',cv = kf)
lgcv.fit(nn_attr_scaled, target)
y_pred = lgcv.predict(nn_attr_scaled)

# Show classification report for the best model (set of parameters) run over the full dataset
print("Classification report:")
print(classification_report(target, y_pred))

# Show accuracy and area under ROC curve
print("Accuracy: %0.3f" % accuracy_score(target, y_pred, normalize=True))
print("Aucroc: %0.3f" % metrics.roc_auc_score(target, y_pred))

Classification report:
             precision    recall  f1-score   support

          0       0.76      0.80      0.78       178
          1       0.81      0.77      0.79       197

avg / total       0.78      0.78      0.78       375

Accuracy: 0.781
Aucroc: 0.782


In [14]:
lgcv.coef_

array([[ 0.2350051 ,  0.67490836,  0.36173709,  0.22880568, -0.06421661,
         0.08939087, -0.04642936,  0.10581774, -0.02301696, -0.38792221,
        -0.14987582, -0.19335718,  0.61012418, -0.05274703]])

In [15]:
coef_df = pd.DataFrame({'Features':nn_attr.columns, 'Coefficients':lgcv.coef_.ravel(),})
coef_df

Unnamed: 0,Coefficients,Features
0,0.235005,age
1,0.674908,sex
2,0.361737,resting_blood_pressure
3,0.228806,cholesterol
4,-0.064217,cigarettes_per_day
5,0.089391,years_as_smoker
6,-0.046429,fasting_blood_sugar
7,0.105818,hist_heart_dis
8,-0.023017,resting_hr
9,-0.387922,max_hr_ach


It looks like the logistic regression CV model has about a 5% greater prediction accuracy than the neural network. So logistic regression CV will be deployed to TabPy instead of the neural network.

In [16]:
# logisitic regression cross validation: binary response, at risk or no risk
def suggest_diag_binary(age, sex, resting_blood_pressure, cholesterol, cigarettes_per_day, 
years_as_smoker, fasting_blood_sugar, hist_heart_dis, resting_hr, max_hr_ach, mets, tpeakbps, exer_ind_angina, rldv5e):
    
    features = np.column_stack([age, sex, resting_blood_pressure, cholesterol, cigarettes_per_day, 
    years_as_smoker, fasting_blood_sugar, hist_heart_dis, resting_hr, max_hr_ach, mets, tpeakbps, exer_ind_angina, rldv5e])
    
    features_scaled = scaler.transform(features)
    prediction = lgcv.predict(features_scaled)[0].tolist()
    
    pred_str = []
    if (prediction == 1): 
        pred_str.append("At Risk")
    else: 
        pred_str.append("No Risk")
    
    return(pred_str)

suggest_diag_binary(18, 0, 120, 150, 0, 0, 0,0, 65, 200, 15, 140, 0, 93)

['No Risk']

In [17]:
# logistic regression cross validation: probability, 0 - 100%
def suggest_diag_prob(age, sex, resting_blood_pressure, cholesterol, cigarettes_per_day, 
years_as_smoker, fasting_blood_sugar, hist_heart_dis, resting_hr, max_hr_ach, mets, tpeakbps, exer_ind_angina, rldv5e):
    
    features = np.column_stack([age, sex, resting_blood_pressure, cholesterol, cigarettes_per_day, 
    years_as_smoker, fasting_blood_sugar, hist_heart_dis, resting_hr, max_hr_ach, mets, tpeakbps, exer_ind_angina, rldv5e])
    
    features_scaled = scaler.transform(features)
    predict_proba = lgcv.predict_proba(features_scaled)[0,1].tolist()
    
    return(predict_proba)
suggest_diag_prob(18, 0, 120, 150, 0, 0, 0,0, 65, 200, 15, 140, 0, 93)

0.01450475316957184

Connect to TabPy:

In [18]:
# Connect to TabPy server using the client library
connection = tabpy_client.Client('http://localhost:9004')

# Publish the suggest_diag_prob function to TabPy server so it can be used from Tableau
connection.deploy('heart_disease_logregcv_prob',
                  suggest_diag_prob, override = True)

# The function below was deployed initially to transmit binary result of logistic regression, however, to enhance 
# performance, only the probability is now deployed. Instead, a Tableau calculated field is used to translate
# 'At Risk' when probability >= 0.50 and 'No Risk' when probability < 0.50. 

# connection.deploy('heart_disease_logregcv',
# suggest_diag_binary, override = True)

ConnectionError: HTTPConnectionPool(host='localhost', port=9004): Max retries exceeded with url: /endpoints (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x1a1dd6b128>: Failed to establish a new connection: [Errno 61] Connection refused',))