We will load in the processed dataset from the previous notebook. 

In [1]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# dislpay full column widths and all columns
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', None)

In [3]:
proc_df = pd.read_csv('/media/veracrypt3/Cloud/Datasets/Kaggle/heart_processed.csv')

In [4]:
proc_df.head()

Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,HeartDisease,F,M,ASY,ATA,NAP,TA,LVH,Normal,ST,N,Y,Down,Flat,Up
0,40,140,289.0,0,172,0.0,0,0,1,0,1,0,0,0,1,0,1,0,0,0,1
1,49,160,180.0,0,156,1.0,1,1,0,0,0,1,0,0,1,0,1,0,0,1,0
2,37,130,283.0,0,98,0.0,0,0,1,0,1,0,0,0,0,1,1,0,0,0,1
3,48,138,214.0,0,108,1.5,1,1,0,1,0,0,0,0,1,0,0,1,0,1,0
4,54,150,195.0,0,122,0.0,0,0,1,0,0,1,0,0,1,0,1,0,0,0,1


# Create X and Y datasets

In [5]:
X = np.asarray(proc_df.loc[:, proc_df.columns != 'HeartDisease'])  # select all columns except 'HeartDisease'
X[0:5]

array([[ 40. , 140. , 289. ,   0. , 172. ,   0. ,   0. ,   1. ,   0. ,
          1. ,   0. ,   0. ,   0. ,   1. ,   0. ,   1. ,   0. ,   0. ,
          0. ,   1. ],
       [ 49. , 160. , 180. ,   0. , 156. ,   1. ,   1. ,   0. ,   0. ,
          0. ,   1. ,   0. ,   0. ,   1. ,   0. ,   1. ,   0. ,   0. ,
          1. ,   0. ],
       [ 37. , 130. , 283. ,   0. ,  98. ,   0. ,   0. ,   1. ,   0. ,
          1. ,   0. ,   0. ,   0. ,   0. ,   1. ,   1. ,   0. ,   0. ,
          0. ,   1. ],
       [ 48. , 138. , 214. ,   0. , 108. ,   1.5,   1. ,   0. ,   1. ,
          0. ,   0. ,   0. ,   0. ,   1. ,   0. ,   0. ,   1. ,   0. ,
          1. ,   0. ],
       [ 54. , 150. , 195. ,   0. , 122. ,   0. ,   0. ,   1. ,   0. ,
          0. ,   1. ,   0. ,   0. ,   1. ,   0. ,   1. ,   0. ,   0. ,
          0. ,   1. ]])

In [6]:
Y = np.asarray(proc_df['HeartDisease'])
Y[0:5]

array([0, 1, 0, 1, 0])

In [7]:
X = preprocessing.StandardScaler().fit(X).transform(X)
X[0:5]

array([[-1.43220634,  0.41462669,  0.94076249, -0.55173333,  1.38333943,
        -0.83150225, -0.51630861,  0.51630861, -1.08542493,  2.07378351,
        -0.53152374, -0.22981048, -0.50782627,  0.81501339, -0.49078105,
         0.82431012, -0.82431012, -0.27160724, -1.00109111,  1.14957339],
       [-0.47805725,  1.52635965, -0.99871403, -0.55173333,  0.75473573,
         0.10625149,  1.9368261 , -1.9368261 , -1.08542493, -0.48221041,
         1.88138352, -0.22981048, -0.50782627,  0.81501339, -0.49078105,
         0.82431012, -0.82431012, -0.27160724,  0.99891008, -0.86988791],
       [-1.75025603, -0.14123979,  0.83400232, -0.55173333, -1.52395266,
        -0.83150225, -0.51630861,  0.51630861, -1.08542493,  2.07378351,
        -0.53152374, -0.22981048, -0.50782627, -1.22697371,  2.0375685 ,
         0.82431012, -0.82431012, -0.27160724, -1.00109111,  1.14957339],
       [-0.58407381,  0.30345339, -0.3937397 , -0.55173333, -1.13107535,
         0.57512835,  1.9368261 , -1.9368261 ,  

# Train/test split dataset

In [8]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=4)
print('Train set:', X_train.shape, Y_train.shape)
print('Test set:', X_test.shape, Y_test.shape)

Train set: (733, 20) (733,)
Test set: (184, 20) (184,)


In [9]:
LR = LogisticRegression(C=0.1, solver='liblinear').fit(X_train, Y_train)

In [10]:
yhat = LR.predict(X_test)
yhat

array([1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1,
       1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1,
       1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0,
       1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1,
       1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0,
       1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1,
       1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1,
       0, 1, 1, 1, 0, 0, 1, 1])

In [14]:
yhat_prob = LR.predict_proba(X_test)
yhat_prob

array([[0.40374797, 0.59625203],
       [0.05930455, 0.94069545],
       [0.78707569, 0.21292431],
       [0.29459984, 0.70540016],
       [0.97380357, 0.02619643],
       [0.05084847, 0.94915153],
       [0.05662423, 0.94337577],
       [0.14324629, 0.85675371],
       [0.41103367, 0.58896633],
       [0.90902911, 0.09097089],
       [0.97472772, 0.02527228],
       [0.0257794 , 0.9742206 ],
       [0.58702534, 0.41297466],
       [0.94406069, 0.05593931],
       [0.00932095, 0.99067905],
       [0.77469372, 0.22530628],
       [0.98338647, 0.01661353],
       [0.76155855, 0.23844145],
       [0.95763811, 0.04236189],
       [0.06007153, 0.93992847],
       [0.94428863, 0.05571137],
       [0.05602015, 0.94397985],
       [0.10630286, 0.89369714],
       [0.98924075, 0.01075925],
       [0.96017899, 0.03982101],
       [0.07996723, 0.92003277],
       [0.98761422, 0.01238578],
       [0.3473298 , 0.6526702 ],
       [0.05247398, 0.94752602],
       [0.01581716, 0.98418284],
       [0.

# Evaluation

In [12]:
print(classification_report(Y_test, yhat))

              precision    recall  f1-score   support

           0       0.86      0.82      0.84        83
           1       0.86      0.89      0.87       101

    accuracy                           0.86       184
   macro avg       0.86      0.86      0.86       184
weighted avg       0.86      0.86      0.86       184



It looks like our model has an 86% accuracy, which is quite decent. But, it can likely also be improved upon.

# Further tuning

Let's try dropping some columns and see if we can improve the accuracy at all.

In [15]:
proc_df2 = proc_df

In [17]:
proc_df2 = proc_df2.drop('Cholesterol', axis=1)

In [19]:
proc_df2.head()

Unnamed: 0,Age,RestingBP,FastingBS,MaxHR,Oldpeak,HeartDisease,F,M,ASY,ATA,NAP,TA,LVH,Normal,ST,N,Y,Down,Flat,Up
0,40,140,0,172,0.0,0,0,1,0,1,0,0,0,1,0,1,0,0,0,1
1,49,160,0,156,1.0,1,1,0,0,0,1,0,0,1,0,1,0,0,1,0
2,37,130,0,98,0.0,0,0,1,0,1,0,0,0,0,1,1,0,0,0,1
3,48,138,0,108,1.5,1,1,0,1,0,0,0,0,1,0,0,1,0,1,0
4,54,150,0,122,0.0,0,0,1,0,0,1,0,0,1,0,1,0,0,0,1


In [21]:
X2 = np.asarray(proc_df2.loc[:, proc_df2.columns != 'HeartDisease'])
X2[0:5]

array([[ 40. , 140. ,   0. , 172. ,   0. ,   0. ,   1. ,   0. ,   1. ,
          0. ,   0. ,   0. ,   1. ,   0. ,   1. ,   0. ,   0. ,   0. ,
          1. ],
       [ 49. , 160. ,   0. , 156. ,   1. ,   1. ,   0. ,   0. ,   0. ,
          1. ,   0. ,   0. ,   1. ,   0. ,   1. ,   0. ,   0. ,   1. ,
          0. ],
       [ 37. , 130. ,   0. ,  98. ,   0. ,   0. ,   1. ,   0. ,   1. ,
          0. ,   0. ,   0. ,   0. ,   1. ,   1. ,   0. ,   0. ,   0. ,
          1. ],
       [ 48. , 138. ,   0. , 108. ,   1.5,   1. ,   0. ,   1. ,   0. ,
          0. ,   0. ,   0. ,   1. ,   0. ,   0. ,   1. ,   0. ,   1. ,
          0. ],
       [ 54. , 150. ,   0. , 122. ,   0. ,   0. ,   1. ,   0. ,   0. ,
          1. ,   0. ,   0. ,   1. ,   0. ,   1. ,   0. ,   0. ,   0. ,
          1. ]])

In [23]:
Y2 = np.asarray(proc_df['HeartDisease'])
Y2[0:5]

array([0, 1, 0, 1, 0])

In [24]:
X2 = preprocessing.StandardScaler().fit(X2).transform(X2)
X2[0:5]

array([[-1.43220634,  0.41462669, -0.55173333,  1.38333943, -0.83150225,
        -0.51630861,  0.51630861, -1.08542493,  2.07378351, -0.53152374,
        -0.22981048, -0.50782627,  0.81501339, -0.49078105,  0.82431012,
        -0.82431012, -0.27160724, -1.00109111,  1.14957339],
       [-0.47805725,  1.52635965, -0.55173333,  0.75473573,  0.10625149,
         1.9368261 , -1.9368261 , -1.08542493, -0.48221041,  1.88138352,
        -0.22981048, -0.50782627,  0.81501339, -0.49078105,  0.82431012,
        -0.82431012, -0.27160724,  0.99891008, -0.86988791],
       [-1.75025603, -0.14123979, -0.55173333, -1.52395266, -0.83150225,
        -0.51630861,  0.51630861, -1.08542493,  2.07378351, -0.53152374,
        -0.22981048, -0.50782627, -1.22697371,  2.0375685 ,  0.82431012,
        -0.82431012, -0.27160724, -1.00109111,  1.14957339],
       [-0.58407381,  0.30345339, -0.55173333, -1.13107535,  0.57512835,
         1.9368261 , -1.9368261 ,  0.92129817, -0.48221041, -0.53152374,
        -0.229

In [25]:
# train/test split
X2_train, X2_test, Y2_train, Y2_test = train_test_split(X2, Y2, test_size=0.2, random_state=4)
print('Train set:', X2_train.shape, Y2_train.shape)
print('Test set:', X2_test.shape, Y2_test.shape)

Train set: (733, 19) (733,)
Test set: (184, 19) (184,)


In [34]:
#model training
LR2 = LogisticRegression(C=0.1, solver='liblinear').fit(X2_train, Y2_train)

In [35]:
yhat2 = LR2.predict(X2_test)
yhat2

array([1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1,
       1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1,
       1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0,
       1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1,
       1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0,
       1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1,
       1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1,
       0, 1, 1, 1, 0, 0, 1, 1])

In [36]:
yhat_prob2 = LR2.predict_proba(X2_test)
yhat_prob2

array([[0.40547187, 0.59452813],
       [0.05869471, 0.94130529],
       [0.78905921, 0.21094079],
       [0.29764035, 0.70235965],
       [0.97368761, 0.02631239],
       [0.05127108, 0.94872892],
       [0.0570911 , 0.9429089 ],
       [0.14153043, 0.85846957],
       [0.4014083 , 0.5985917 ],
       [0.91047134, 0.08952866],
       [0.97469984, 0.02530016],
       [0.02601204, 0.97398796],
       [0.5927716 , 0.4072284 ],
       [0.9448021 , 0.0551979 ],
       [0.00920409, 0.99079591],
       [0.77869335, 0.22130665],
       [0.9839799 , 0.0160201 ],
       [0.7266483 , 0.2733517 ],
       [0.95621791, 0.04378209],
       [0.05923724, 0.94076276],
       [0.94219571, 0.05780429],
       [0.05534481, 0.94465519],
       [0.10692862, 0.89307138],
       [0.98954274, 0.01045726],
       [0.95839869, 0.04160131],
       [0.0783829 , 0.9216171 ],
       [0.98763651, 0.01236349],
       [0.3491428 , 0.6508572 ],
       [0.05363831, 0.94636169],
       [0.01538677, 0.98461323],
       [0.

In [37]:
print(classification_report(Y2_test, yhat2))

              precision    recall  f1-score   support

           0       0.86      0.82      0.84        83
           1       0.86      0.89      0.87       101

    accuracy                           0.86       184
   macro avg       0.86      0.86      0.86       184
weighted avg       0.86      0.86      0.86       184



No improvements from dropping cholesterol.