In [49]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from scipy.stats.mstats import winsorize
import seaborn as sns
import matplotlib.pyplot as plt

In [50]:
df = pd.read_csv('heart.csv', sep=',')

In [51]:
df.columns = ["Age", "Sex", "Chest_Pain_Type", "Resting_Blood_Pressure", "Cholesterol", "Fasting_Blood_Sugar", "Resting_ECG", "Max_Heart_Rate", "Exercise_Induced_Angina", "Old_Peak", "Slope_Of_Peak_Exrcs", "Num_Major_Vessels", "Thalassemia", "Output"]

In [52]:
from ydata_profiling import ProfileReport
profile = ProfileReport(df, title="Profiling Report")
profile.to_notebook_iframe()

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Renaming columns to something readable

Check for wrong type and NULL:

In [53]:
# Check column types
df.dtypes

Age                          int64
Sex                          int64
Chest_Pain_Type              int64
Resting_Blood_Pressure       int64
Cholesterol                  int64
Fasting_Blood_Sugar          int64
Resting_ECG                  int64
Max_Heart_Rate               int64
Exercise_Induced_Angina      int64
Old_Peak                   float64
Slope_Of_Peak_Exrcs          int64
Num_Major_Vessels            int64
Thalassemia                  int64
Output                       int64
dtype: object

In [54]:
# Check for nulls
df.isnull().sum()

Age                        0
Sex                        0
Chest_Pain_Type            0
Resting_Blood_Pressure     0
Cholesterol                0
Fasting_Blood_Sugar        0
Resting_ECG                0
Max_Heart_Rate             0
Exercise_Induced_Angina    0
Old_Peak                   0
Slope_Of_Peak_Exrcs        0
Num_Major_Vessels          0
Thalassemia                0
Output                     0
dtype: int64

Data before pre-processing

In [55]:
df.hist(figsize=(30,15), bins = 20, layout=(3,5))

array([[<Axes: title={'center': 'Age'}>, <Axes: title={'center': 'Sex'}>,
        <Axes: title={'center': 'Chest_Pain_Type'}>,
        <Axes: title={'center': 'Resting_Blood_Pressure'}>,
        <Axes: title={'center': 'Cholesterol'}>],
       [<Axes: title={'center': 'Fasting_Blood_Sugar'}>,
        <Axes: title={'center': 'Resting_ECG'}>,
        <Axes: title={'center': 'Max_Heart_Rate'}>,
        <Axes: title={'center': 'Exercise_Induced_Angina'}>,
        <Axes: title={'center': 'Old_Peak'}>],
       [<Axes: title={'center': 'Slope_Of_Peak_Exrcs'}>,
        <Axes: title={'center': 'Num_Major_Vessels'}>,
        <Axes: title={'center': 'Thalassemia'}>,
        <Axes: title={'center': 'Output'}>, <Axes: >]], dtype=object)

Check for bad data:

In [56]:
# check for zeros
df[df == 0].count() 

Age                          0
Sex                         96
Chest_Pain_Type            143
Resting_Blood_Pressure       0
Cholesterol                  0
Fasting_Blood_Sugar        258
Resting_ECG                147
Max_Heart_Rate               0
Exercise_Induced_Angina    204
Old_Peak                    99
Slope_Of_Peak_Exrcs         21
Num_Major_Vessels          175
Thalassemia                  2
Output                     138
dtype: int64

In [57]:
# check for negatives
df[df<0].count()

Age                        0
Sex                        0
Chest_Pain_Type            0
Resting_Blood_Pressure     0
Cholesterol                0
Fasting_Blood_Sugar        0
Resting_ECG                0
Max_Heart_Rate             0
Exercise_Induced_Angina    0
Old_Peak                   0
Slope_Of_Peak_Exrcs        0
Num_Major_Vessels          0
Thalassemia                0
Output                     0
dtype: int64

In [58]:
# Check for duplicated rows:
len(df[df.duplicated()])

1

In [59]:
# Remove duplicated rows
df.drop_duplicates(inplace=True)

Impute missing values:

In [60]:
# no missing data for this dataset

In [61]:
Y = df.loc[:, df.columns == 'Output']
X = df.loc[:, df.columns != 'Output']

One Hot Encoding Categorical columns

In [62]:
# Categorical columns: 
# Sex
# Chest_Pain_Type 
# Fasting_Blood_Sugar
# Resting_ECG 
# Exercise_Induced_Angina
# Slope_Of_Peak_Exrcs
# Num_Major_Vessels
# Thalassemia
cols_to_encode = [
                    "Sex",
                    "Chest_Pain_Type",
                    "Fasting_Blood_Sugar",
                    "Resting_ECG",
                    "Exercise_Induced_Angina",
                    "Slope_Of_Peak_Exrcs",
                    "Num_Major_Vessels",
                    "Thalassemia"
                ]
dummies = pd.get_dummies(X, columns = cols_to_encode)
X = X.drop(cols_to_encode, axis=1)
X = pd.concat([X, dummies], axis=1)


In [63]:
len(X.columns)

35

In [64]:
# check skew of data
X.skew()

Age                          -0.203743
Resting_Blood_Pressure        0.716541
Cholesterol                   1.147332
Max_Heart_Rate               -0.532671
Old_Peak                      1.266173
Age                          -0.203743
Resting_Blood_Pressure        0.716541
Cholesterol                   1.147332
Max_Heart_Rate               -0.532671
Old_Peak                      1.266173
Sex_0                         0.786120
Sex_1                        -0.786120
Chest_Pain_Type_0             0.106640
Chest_Pain_Type_1             1.808554
Chest_Pain_Type_2             0.958590
Chest_Pain_Type_3             3.211733
Fasting_Blood_Sugar_0        -1.981201
Fasting_Blood_Sugar_1         1.981201
Resting_ECG_0                 0.053264
Resting_ECG_1                 0.000000
Resting_ECG_2                 8.558047
Exercise_Induced_Angina_0    -0.737281
Exercise_Induced_Angina_1     0.737281
Slope_Of_Peak_Exrcs_0         3.401543
Slope_Of_Peak_Exrcs_1         0.146814
Slope_Of_Peak_Exrcs_2    

In [65]:
X.kurtosis()

Age                           -0.527512
Resting_Blood_Pressure         0.922996
Cholesterol                    4.542591
Max_Heart_Rate                -0.062186
Old_Peak                       1.567876
Age                           -0.527512
Resting_Blood_Pressure         0.922996
Cholesterol                    4.542591
Max_Heart_Rate                -0.062186
Old_Peak                       1.567876
Sex_0                         -1.391273
Sex_1                         -1.391273
Chest_Pain_Type_0             -2.001930
Chest_Pain_Type_1              1.279297
Chest_Pain_Type_2             -1.088358
Chest_Pain_Type_3              8.370620
Fasting_Blood_Sugar_0          1.937947
Fasting_Blood_Sugar_1          1.937947
Resting_ECG_0                 -2.010522
Resting_ECG_1                 -2.013378
Resting_ECG_2                 71.715069
Exercise_Induced_Angina_0     -1.466170
Exercise_Induced_Angina_1     -1.466170
Slope_Of_Peak_Exrcs_0          9.634257
Slope_Of_Peak_Exrcs_1         -1.991680


Standardizing data

In [66]:
columns = X.columns
X = pd.DataFrame(StandardScaler().fit_transform(X))
X.columns = columns

Checking for outliers:

In [67]:
boxplot = X.boxplot() 
plt.xticks(rotation=90);

Winsorizing outliers

In [68]:
def winsorize_series(X):
    return winsorize(X, limits=[0.01,0.02])

X_win = X.apply(winsorize_series, axis=0)
boxplot = X_win.boxplot() 
plt.xticks(rotation=90);


In [69]:
X_win.hist(figsize=(30,15), bins = 20, layout=(30,50));

In [70]:
plt.rcParams['font.size'] = 14

f = sns.violinplot(data=X_win)

f.set_xticklabels(f.get_xticklabels(),
                  rotation=90)

plt.title("Violin plot of features",
          size=16)

ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().

Check for data imbalance

In [None]:
print(Y.value_counts())

Output
1         164
0         138
Name: count, dtype: int64


In [None]:
corr_matrix = X_win.corr()
sns.set (rc = {'figure.figsize':(9, 8)})
sns.heatmap(corr_matrix, annot=True)
plt.show()

  plt.show()


PCA

In [72]:
from sklearn.decomposition import PCA
pca = PCA(n_components=13)
X_new = pca.fit_transform(X_win)

# plt.style.use('ggplot')
# colors = {1:'tab:blue', 0:'tab:orange'}
# fig, axes = plt.subplots(1,1)
# axes.scatter(X_new[:,0], X_new[:,1], c=Y['Output'].map(colors))
# axes.set_xlabel('PC1')
# axes.set_ylabel('PC2')
# axes.set_title('After PCA')
# plt.show()

Model Training and testing

In [73]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_win, Y, test_size=0.20, random_state=42, shuffle=True)

In [74]:
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from sklearn.pipeline import make_pipeline

names = [
    "Logistic Regression",
    "SVC",
    "Random Forest",
    "Multi-Layer Perceptron",
    "GaussianNB",
    "XGB",
    "LightGBM",
    "CatBoost",
]

classifiers = [
    LogisticRegression(max_iter=1000),
    SVC(kernel="linear", C=0.025),
    RandomForestClassifier(n_estimators=25, max_depth=8, max_features="log2"),
    MLPClassifier(alpha=1, max_iter=1000),
    GaussianNB(var_smoothing=0.0107),
    XGBClassifier(),
    LGBMClassifier(num_leaves=8 ),
    CatBoostClassifier(verbose=0, n_estimators=100, max_depth=7)#, iterations=400),
]

for name, clf in zip(names, classifiers):
        clf = make_pipeline(StandardScaler(), clf)
        clf.fit(X_train, y_train.values.ravel())
        score = clf.score(X_test, y_test.values.ravel())
        print(name, ": ", score)

Logistic Regression :  0.8852459016393442
SVC :  0.8360655737704918
Random Forest :  0.8360655737704918
Multi-Layer Perceptron :  0.8360655737704918
GaussianNB :  0.8688524590163934
XGB :  0.8524590163934426
[LightGBM] [Info] Number of positive: 132, number of negative: 109
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001246 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 470
[LightGBM] [Info] Number of data points in the train set: 241, number of used features: 28
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.547718 -> initscore=0.191454
[LightGBM] [Info] Start training from score 0.191454
LightGBM :  0.819672131147541
CatBoost :  0.8524590163934426


In [76]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint as sp_randint

# Create the random grid
random_grid = {
                "max_depth": [3, 5,8,10,12], 
                'n_estimators': [25,35,50,70,75,100,125,150,175],
                "max_features": [3,"sqrt","log2"], 
                "min_samples_split": [2,3], 
                "bootstrap": [True, False], 
                "criterion": ["gini", "entropy","log_loss"],
                } 

# Use the random grid to search for best hyperparameters
# First create the base model to tune
rf = RandomForestClassifier()
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)
# Fit the random search model
rf_random.fit(X_train, y_train)
print(rf_random.best_params_)
print(rf_random.best_score_)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


  return fit_method(estimator, *args, **kwargs)


{'n_estimators': 150, 'min_samples_split': 3, 'max_features': 3, 'max_depth': 5, 'criterion': 'gini', 'bootstrap': True}
0.8297839506172839


In [35]:
gs_RF.best_params_

{'max_features': 'log2', 'n_estimators': 100}

In [34]:
from sklearn.model_selection import GridSearchCV
classifier = RandomForestClassifier()
params_RF = {
             'max_features': ["sqrt","log2"], # most important
             'n_estimators': [25,35,50,70,75,100,125,150,175] # most important
             }
gs_RF = GridSearchCV(estimator=classifier, 
                 param_grid=params_RF, 
                 #cv=cv_method,   # use any cross validation technique 
                 verbose=1, 
                 scoring='accuracy') 
gs_RF.fit(X_train, y_train)

gs_RF.best_params_


Fitting 5 folds for each of 18 candidates, totalling 90 fits


  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **

0.8381802721088436

In [None]:
from sklearn.model_selection import GridSearchCV
classifier = CatBoostClassifier()

params_NB = {#'iterations': [350,375,400,425,450], # default is 500
             'depth': [7,8,9], # default is 6
             'n_estimators': [30,35,40] # default 1
             }

#{'max_depth': [3,4,5],'n_estimators':[100, 200, 300]}

gs_NB = GridSearchCV(estimator=classifier, 
                 param_grid=params_NB, 
                 #cv=cv_method,   # use any cross validation technique 
                 verbose=1, 
                 scoring='accuracy') 
gs_NB.fit(X_train, y_train)

gs_NB.best_params_

Fitting 5 folds for each of 10 candidates, totalling 50 fits
Learning rate set to 0.110152
0:	learn: 0.6680621	total: 2.03ms	remaining: 69ms
1:	learn: 0.6321051	total: 2.78ms	remaining: 45.9ms
2:	learn: 0.6082426	total: 3.83ms	remaining: 40.9ms
3:	learn: 0.5895321	total: 4.98ms	remaining: 38.6ms
4:	learn: 0.5760545	total: 6.24ms	remaining: 37.4ms
5:	learn: 0.5578591	total: 7.49ms	remaining: 36.2ms
6:	learn: 0.5423860	total: 7.83ms	remaining: 31.3ms
7:	learn: 0.5297960	total: 8.9ms	remaining: 30ms
8:	learn: 0.5151559	total: 9.83ms	remaining: 28.4ms
9:	learn: 0.5011637	total: 10.8ms	remaining: 27ms
10:	learn: 0.4875775	total: 11.7ms	remaining: 25.4ms
11:	learn: 0.4757611	total: 12.5ms	remaining: 23.9ms
12:	learn: 0.4639202	total: 13.3ms	remaining: 22.6ms
13:	learn: 0.4547838	total: 14.3ms	remaining: 21.4ms
14:	learn: 0.4440850	total: 15.2ms	remaining: 20.2ms
15:	learn: 0.4365940	total: 16.5ms	remaining: 19.6ms
16:	learn: 0.4270858	total: 17.8ms	remaining: 18.9ms
17:	learn: 0.4184771	tota

{'depth': 8, 'n_estimators': 35}