In [20]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC

In [2]:
# Display every column and row, and never truncate cell contents:
# passing None removes the corresponding pandas display limit.
for option_name in ('display.max_columns', 'display.max_rows', 'display.max_colwidth'):
    pd.set_option(option_name, None)

In [3]:
# Location of the pre-processed heart-disease CSV.
# The path used to be hard-coded to one machine's mount point; it can now be
# overridden with the HEART_DATA_CSV environment variable, while the original
# location remains the default so existing setups keep working unchanged.
HEART_DATA_CSV = (
    os.environ.get('HEART_DATA_CSV')
    or '/media/veracrypt3/Cloud/Datasets/Kaggle/heart_processed.csv'
)
proc_df = pd.read_csv(HEART_DATA_CSV)

In [4]:
# Preview the first five rows of the loaded frame.
proc_df.head(n=5)

Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,HeartDisease,F,M,ASY,ATA,NAP,TA,LVH,Normal,ST,N,Y,Down,Flat,Up
0,40,140,289.0,0,172,0.0,0,0,1,0,1,0,0,0,1,0,1,0,0,0,1
1,49,160,180.0,0,156,1.0,1,1,0,0,0,1,0,0,1,0,1,0,0,1,0
2,37,130,283.0,0,98,0.0,0,0,1,0,1,0,0,0,0,1,1,0,0,0,1
3,48,138,214.0,0,108,1.5,1,1,0,1,0,0,0,0,1,0,0,1,0,1,0
4,54,150,195.0,0,122,0.0,0,0,1,0,0,1,0,0,1,0,1,0,0,0,1


# Create X and Y datasets

In [5]:
# Feature matrix: every column except the 'HeartDisease' target, as a NumPy array.
is_feature_column = proc_df.columns != 'HeartDisease'
X = np.asarray(proc_df.loc[:, is_feature_column])
X[:5]

array([[ 40. , 140. , 289. ,   0. , 172. ,   0. ,   0. ,   1. ,   0. ,
          1. ,   0. ,   0. ,   0. ,   1. ,   0. ,   1. ,   0. ,   0. ,
          0. ,   1. ],
       [ 49. , 160. , 180. ,   0. , 156. ,   1. ,   1. ,   0. ,   0. ,
          0. ,   1. ,   0. ,   0. ,   1. ,   0. ,   1. ,   0. ,   0. ,
          1. ,   0. ],
       [ 37. , 130. , 283. ,   0. ,  98. ,   0. ,   0. ,   1. ,   0. ,
          1. ,   0. ,   0. ,   0. ,   0. ,   1. ,   1. ,   0. ,   0. ,
          0. ,   1. ],
       [ 48. , 138. , 214. ,   0. , 108. ,   1.5,   1. ,   0. ,   1. ,
          0. ,   0. ,   0. ,   0. ,   1. ,   0. ,   0. ,   1. ,   0. ,
          1. ,   0. ],
       [ 54. , 150. , 195. ,   0. , 122. ,   0. ,   0. ,   1. ,   0. ,
          0. ,   1. ,   0. ,   0. ,   1. ,   0. ,   1. ,   0. ,   0. ,
          0. ,   1. ]])

In [6]:
# Target vector: the 'HeartDisease' column as a NumPy array.
Y = np.asarray(proc_df.loc[:, 'HeartDisease'])
Y[:5]

array([0, 1, 0, 1, 0])

In [7]:
# Standardise every feature column (zero mean, unit variance).
# NOTE(review): the scaler is fitted on ALL rows, before the train/test split
# performed further down, so statistics of the held-out rows leak into the
# scaling. A leak-free version would fit the scaler on the training rows only
# (e.g. inside a Pipeline passed to GridSearchCV) -- confirm and restructure.
X = preprocessing.StandardScaler().fit_transform(X)
X[:5]

array([[-1.43220634,  0.41462669,  0.94076249, -0.55173333,  1.38333943,
        -0.83150225, -0.51630861,  0.51630861, -1.08542493,  2.07378351,
        -0.53152374, -0.22981048, -0.50782627,  0.81501339, -0.49078105,
         0.82431012, -0.82431012, -0.27160724, -1.00109111,  1.14957339],
       [-0.47805725,  1.52635965, -0.99871403, -0.55173333,  0.75473573,
         0.10625149,  1.9368261 , -1.9368261 , -1.08542493, -0.48221041,
         1.88138352, -0.22981048, -0.50782627,  0.81501339, -0.49078105,
         0.82431012, -0.82431012, -0.27160724,  0.99891008, -0.86988791],
       [-1.75025603, -0.14123979,  0.83400232, -0.55173333, -1.52395266,
        -0.83150225, -0.51630861,  0.51630861, -1.08542493,  2.07378351,
        -0.53152374, -0.22981048, -0.50782627, -1.22697371,  2.0375685 ,
         0.82431012, -0.82431012, -0.27160724, -1.00109111,  1.14957339],
       [-0.58407381,  0.30345339, -0.3937397 , -0.55173333, -1.13107535,
         0.57512835,  1.9368261 , -1.9368261 ,  

# Split the dataset into training and test sets

In [8]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    random_state=4,
    test_size=0.2,
)
# Report the (features, labels) shapes of each partition.
print('Train set:', *(part.shape for part in (X_train, Y_train)))
print('Test set:', *(part.shape for part in (X_test, Y_test)))

Train set: (733, 20) (733,)
Test set: (184, 20) (184,)


### Grid search logistic regression.

In [18]:
# Search space for logistic regression: three regularisation strengths,
# with the penalty and solver held fixed.
parameters = dict(
    C=[0.01, 0.1, 1],
    penalty=['l2'],
    solver=['liblinear'],
)

# 10-fold cross-validated grid search over `parameters`, scored by accuracy.
lr = LogisticRegression()
grid_lr = GridSearchCV(
    estimator=lr,
    param_grid=parameters,
    cv=10,
    scoring='accuracy',
)
# fit() returns the fitted search object itself, so logreg_cv and grid_lr
# refer to the same instance.
logreg_cv = grid_lr.fit(X_train, Y_train)

In [19]:
# Report the winning grid point and its score; best_score_ is the mean
# cross-validated accuracy on the training folds, not a test-set accuracy.
print(
    *(
        f'{caption} {value!s}'
        for caption, value in (
            ('tuned hyperparameters:', logreg_cv.best_params_),
            ('accuracy:', logreg_cv.best_score_),
        )
    ),
    sep='\n',
)

tuned hyperparameters: {'C': 0.01, 'penalty': 'l2', 'solver': 'liblinear'}
accuracy: 0.8608108108108109


### Grid search support vector machine.

In [22]:
# Search space for the support vector classifier.
# Each kernel is listed exactly once: the previous tuple contained 'rbf'
# twice, so every rbf (C, gamma) combination was cross-validated twice
# without ever being able to change the selected model.
# C and gamma each take 5 log-spaced values from 1e-3 to 1e3.
# NOTE(review): per the scikit-learn docs gamma only applies to the 'rbf',
# 'poly' and 'sigmoid' kernels, so the 'linear' candidates are still repeated
# once per gamma value -- consider a list of grids if search time matters.
parameters_svm = {
    'kernel': ('linear', 'rbf', 'poly', 'sigmoid'),
    'C': np.logspace(-3, 3, 5),
    'gamma': np.logspace(-3, 3, 5),
}

# Base support vector classifier, constructed with default arguments.
svm = SVC()

In [24]:
# 10-fold cross-validated grid search over `parameters_svm`, scored by accuracy.
grid_svm = GridSearchCV(
    estimator=svm,
    param_grid=parameters_svm,
    cv=10,
    scoring='accuracy',
)
# fit() returns the fitted search object itself, so svm_cv and grid_svm
# refer to the same instance.
svm_cv = grid_svm.fit(X_train, Y_train)

In [25]:
# Report the winning grid point and its score; best_score_ is the mean
# cross-validated accuracy on the training folds, not a test-set accuracy.
# The label previously read "hpyerparameters"; it now matches the wording
# used for the logistic-regression report.
print('tuned hyperparameters:', svm_cv.best_params_)
print('accuracy:', svm_cv.best_score_)

tuned hpyerparameters: {'C': 1.0, 'gamma': 0.001, 'kernel': 'rbf'}
accuracy: 0.8663087745279526
