In [17]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
import matplotlib.pyplot as plt
import seaborn as sns

In [7]:
# dislpay full column widths and all columns
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', None)

In [8]:
proc_df = pd.read_csv('/media/veracrypt3/Cloud/Datasets/Kaggle/heart_processed.csv')

In [9]:
proc_df.head()

Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,HeartDisease,F,M,ASY,ATA,NAP,TA,LVH,Normal,ST,N,Y,Down,Flat,Up
0,40,140,289.0,0,172,0.0,0,0,1,0,1,0,0,0,1,0,1,0,0,0,1
1,49,160,180.0,0,156,1.0,1,1,0,0,0,1,0,0,1,0,1,0,0,1,0
2,37,130,283.0,0,98,0.0,0,0,1,0,1,0,0,0,0,1,1,0,0,0,1
3,48,138,214.0,0,108,1.5,1,1,0,1,0,0,0,0,1,0,0,1,0,1,0
4,54,150,195.0,0,122,0.0,0,0,1,0,0,1,0,0,1,0,1,0,0,0,1


# Create X and Y datasets

In [10]:
X = np.asarray(proc_df.loc[:, proc_df.columns != 'HeartDisease'])
X[0:5]

array([[ 40. , 140. , 289. ,   0. , 172. ,   0. ,   0. ,   1. ,   0. ,
          1. ,   0. ,   0. ,   0. ,   1. ,   0. ,   1. ,   0. ,   0. ,
          0. ,   1. ],
       [ 49. , 160. , 180. ,   0. , 156. ,   1. ,   1. ,   0. ,   0. ,
          0. ,   1. ,   0. ,   0. ,   1. ,   0. ,   1. ,   0. ,   0. ,
          1. ,   0. ],
       [ 37. , 130. , 283. ,   0. ,  98. ,   0. ,   0. ,   1. ,   0. ,
          1. ,   0. ,   0. ,   0. ,   0. ,   1. ,   1. ,   0. ,   0. ,
          0. ,   1. ],
       [ 48. , 138. , 214. ,   0. , 108. ,   1.5,   1. ,   0. ,   1. ,
          0. ,   0. ,   0. ,   0. ,   1. ,   0. ,   0. ,   1. ,   0. ,
          1. ,   0. ],
       [ 54. , 150. , 195. ,   0. , 122. ,   0. ,   0. ,   1. ,   0. ,
          0. ,   1. ,   0. ,   0. ,   1. ,   0. ,   1. ,   0. ,   0. ,
          0. ,   1. ]])

In [11]:
Y = np.asarray(proc_df['HeartDisease'])
Y[0:5]

array([0, 1, 0, 1, 0])

In [12]:
X = preprocessing.StandardScaler().fit(X).transform(X)
X[0:5]

array([[-1.43220634,  0.41462669,  0.94076249, -0.55173333,  1.38333943,
        -0.83150225, -0.51630861,  0.51630861, -1.08542493,  2.07378351,
        -0.53152374, -0.22981048, -0.50782627,  0.81501339, -0.49078105,
         0.82431012, -0.82431012, -0.27160724, -1.00109111,  1.14957339],
       [-0.47805725,  1.52635965, -0.99871403, -0.55173333,  0.75473573,
         0.10625149,  1.9368261 , -1.9368261 , -1.08542493, -0.48221041,
         1.88138352, -0.22981048, -0.50782627,  0.81501339, -0.49078105,
         0.82431012, -0.82431012, -0.27160724,  0.99891008, -0.86988791],
       [-1.75025603, -0.14123979,  0.83400232, -0.55173333, -1.52395266,
        -0.83150225, -0.51630861,  0.51630861, -1.08542493,  2.07378351,
        -0.53152374, -0.22981048, -0.50782627, -1.22697371,  2.0375685 ,
         0.82431012, -0.82431012, -0.27160724, -1.00109111,  1.14957339],
       [-0.58407381,  0.30345339, -0.3937397 , -0.55173333, -1.13107535,
         0.57512835,  1.9368261 , -1.9368261 ,  

# Train/test split dataset

In [13]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=4)
print('Train set:', X_train.shape, Y_train.shape)
print('Test set:', X_test.shape, Y_test.shape)

Train set: (733, 20) (733,)
Test set: (184, 20) (184,)


In [18]:
# Create first pipeline for base without reducing features.

pipe = Pipeline([('classifier' , RandomForestClassifier())])

# Create param grid.

param_grid = [
    {'classifier' : [LogisticRegression()],
     'classifier__penalty' : ['l1', 'l2'],
    'classifier__C' : np.logspace(-4, 4, 20),
    'classifier__solver' : ['liblinear']},
    {'classifier' : [RandomForestClassifier()],
    'classifier__n_estimators' : list(range(10,101,10)),
    'classifier__max_features' : list(range(6,32,5))}
]

# Create grid search object

clf = GridSearchCV(pipe, param_grid = param_grid, cv = 5, verbose=True, n_jobs=-1)

# Fit on data

best_clf = clf.fit(X_train, Y_train)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


 0.4461094  0.85267915 0.4461094  0.85676079 0.80905787 0.85949119
 0.82540304 0.86084242 0.85673283 0.85674215 0.85675147 0.85400242
 0.8553816  0.85400242 0.85673283 0.85264188 0.85264188 0.85264188
 0.85264188 0.85264188 0.85264188 0.85264188 0.85264188 0.85264188
 0.85264188 0.85264188 0.85264188 0.85264188 0.85264188 0.85264188
 0.85264188 0.85264188 0.85264188 0.85264188 0.82543099 0.83087317
 0.85132793 0.84178548 0.83907371 0.84727425 0.85134657 0.85133725
 0.8499767  0.85135588 0.81586991 0.8363433  0.83225235 0.83636194
 0.84591371 0.83769453 0.84045289 0.84728357 0.84590439 0.83498276
 0.82679154 0.83771317 0.83091045 0.84041562 0.84178548 0.8363433
 0.83499208 0.84041562 0.83771317 0.83088249        nan        nan
        nan        nan        nan        nan        nan        nan
        nan        nan        nan        nan        nan        nan
        nan        nan        nan        nan        nan        nan
        nan        nan        nan        nan        nan        