In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from scipy.stats import skew, kurtosis
import warnings
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set_style('darkgrid')

# Data Processing libraries
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# Importing models
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier


# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and echo the full path of every file in it.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for full_path in (os.path.join(dirname, filename) for filename in filenames):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the heart-attack dataset from the Kaggle input directory.
heart_csv_path = '/kaggle/input/heart-attack-analysis-prediction-dataset/heart.csv'
data = pd.read_csv(heart_csv_path)

In [None]:
# Peek at the first five rows (five is the default for head()).
data.head()

In [None]:
# Print a summary of the frame (columns, non-null counts, dtypes).
data.info()

In [None]:
# Count missing values per column to confirm there are none.
data.isna().sum()

In [None]:
# List the numeric columns with more than 5 distinct values, together with
# that count; these are the ones treated as continuous rather than discrete.
distinct_counts = data.select_dtypes(np.number).nunique()
for col, n_unique in distinct_counts[distinct_counts > 5].items():
    print(col, n_unique)

In [None]:
# Show every column name as a plain Python list.
print(list(data.columns))

In [None]:
# Columns treated as discrete (five or fewer distinct values), including the
# target column 'output'.
disc_vars = [
    'sex', 'cp', 'fbs', 'restecg', 'exng',
    'slp', 'caa', 'thall', 'output',
]
disc_vars

In [None]:
# One count plot per discrete variable on a 3x3 grid, with bars split by the
# target so class differences within each category are visible.
fig, axes = plt.subplots(nrows=3, ncols=3, figsize=(14, 14))
for ax, var in zip(axes.ravel(), disc_vars):
    sns.countplot(x=var, hue='output', data=data, ax=ax)

Some observations:
- Samples with a 'thall' value of 2 are VERY likely to be positive instances
- Samples with an 'exng' value of 1 are more likely to be positive instances
- People with 'restecg' value 1 are slightly more likely to be at risk
- Samples with an 'slp' value of 2 are very likely to be positive instances
- Samples with a 'caa' value of 0 are VERY likely to be positive instances
- It seems that females (sex=0) are more likely to be positive instances
- 'fbs' does not appear to be a useful feature
- Those with a 'cp' value of 2 are more at risk
- Our target variable ('output') is reasonably balanced across its classes

In [None]:
# Everything that is not in the discrete list is treated as continuous.
cont_vars = list(filter(lambda name: name not in disc_vars, data.columns))
print(len(cont_vars))
cont_vars

In [None]:
# Plot the distribution of every continuous feature on a 3x2 grid.
# `sns.distplot` is deprecated; `sns.histplot(..., kde=True, stat="density")`
# is its replacement and draws the same density histogram with a KDE overlay.
fig, axes = plt.subplots(3,2,figsize=(14,14))
for var,ax in zip(cont_vars, axes.flat):
    sns.histplot(data=data,x=var,ax=ax,bins=45,kde=True,stat="density")
# Hide any grid cell that did not receive a plot so no empty axes are drawn.
for unused_ax in axes.flat[len(cont_vars):]:
    unused_ax.set_visible(False)

Apart from 'oldpeak', every feature has a roughly balanced distribution. 'oldpeak' is positively skewed, so we apply a log transform (`log1p`) to reduce the skew.

In [None]:
# Fixing the positive skewness with log transform
# (log1p(x) = log(1 + x), so zero values stay finite).
# NOTE(review): this overwrites 'oldpeak' in place, so running this cell twice
# applies the transform twice, and every later use of `data` sees the
# transformed values rather than the raw ones.
data['oldpeak'] = np.log1p(data['oldpeak'])

In [None]:
# Re-plot 'oldpeak' to inspect its distribution after the log transform.
# `sns.distplot` is deprecated; `histplot` with a KDE overlay on a density
# scale is its replacement.
sns.histplot(data=data, x='oldpeak', kde=True, stat="density")

In [None]:
# List any columns whose dtype is not numeric (i.e. candidate categoricals).
list(data.select_dtypes(exclude='number').columns)

So there are no null values and no non-numeric (categorical-dtype) columns. Recall, however, that several columns take on only a few discrete values, so we should one-hot encode them. On top of this, we should scale the continuous features.

In [None]:
# Build a separate feature table without the target column so it can be fed
# through the preprocessing transformer; `drop` returns a new frame, so `data`
# keeps its 'output' column for the later plain-vs-transformed comparison.
data_copy = data.drop(columns='output')

# The target stays as its own Series.
y = data['output']

In [None]:
# Column-wise preprocessing: standard-scale the continuous columns and
# one-hot encode the discrete ones (the target is excluded from the latter).
disc = [name for name in disc_vars if name != "output"]
pipeline = ColumnTransformer(
    transformers=[
        ("numeric", StandardScaler(), cont_vars),
        ("discrete", OneHotEncoder(), disc),
    ]
)

In [None]:
# Fit the Transformations
# Fits the transformer on the whole feature table and replaces `data_copy`
# with the transformed output.
# NOTE(review): the transformer is fitted on all rows before any
# cross-validation or train/test split, so held-out rows influence the fitted
# scaling. Also, because `data_copy` is rebound to the transformed output,
# re-running this cell without first rebuilding `data_copy` would feed
# already-transformed data back in.
data_copy = pipeline.fit_transform(data_copy)

In [None]:
# Does the preprocessing help? Compare the mean 10-fold cross-validation score
# of a default logistic regression on the transformed features against the
# same model on the untransformed feature columns of `data`.
model, model2 = LogisticRegression(), LogisticRegression()
plain_features = data.drop(columns='output')

transformed_mean = cross_val_score(model, data_copy, y, cv=10).mean()
print("Scores with transformed dataset:")
print(transformed_mean)

plain_mean = cross_val_score(model2, plain_features, y, cv=10).mean()
print("Scores with plain dataset:")
print(plain_mean)

So the transformed dataset appears to work slightly better for the model, though not by much. Perhaps the difference will be more apparent with a more powerful model such as a random forest.

In [None]:
# Repeat the transformed-vs-plain comparison with a random forest.
# Both forests are seeded so the two scores are reproducible from run to run
# and differ only because of the input features, not because of random
# variation between the forests.
model = RandomForestClassifier(random_state=42)
print("Scores with transformed dataset:")
print(cross_val_score(model,data_copy,y,cv=10).mean())
model2 = RandomForestClassifier(random_state=42)
print("Scores with plain dataset:")
print(cross_val_score(model2,data.drop('output',axis=1),y,cv=10).mean())

Again it does not look like much of an improvement but we shall stick with it and train an ensemble.

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the transformed samples as a test set (fixed seed for a
# repeatable split). The transformed features are passed in directly instead of
# rebinding `data`, so `data` stays the original DataFrame and the earlier
# cells that call `data.drop(...)` or index it by column can still be re-run.
X_train,X_test,y_train,y_test = train_test_split(data_copy,y,test_size=0.2,random_state=42)

In [None]:
# Make our models and train them
# Lets add in LinearSVC too
from sklearn.svm import LinearSVC
rf = RandomForestClassifier(n_estimators=120,random_state=42)
# NOTE(review): max_iter=100 caps the solver's iterations; warnings are
# silenced at the top of the notebook, so a fit that stops at the cap would go
# unnoticed. Confirm the cap is intentional.
svm = SVC(max_iter=100)
mlp = MLPClassifier(random_state=42)
log_reg = LogisticRegression(random_state=42)
knn = KNeighborsClassifier()
# Seeded like the other estimators so its printed score is repeatable.
ada = AdaBoostClassifier(random_state=42)
# NOTE(review): tol=20 is an unusually loose stopping tolerance and max_iter is
# capped at 100; confirm these values are intentional.
lsvc = LinearSVC(max_iter=100, tol=20, random_state=42)

# Candidate estimators, in the order they will be trained and scored.
estimators = [rf,svm,mlp,log_reg,knn,ada,lsvc]

In [None]:
# Fit each candidate on the training split and report its score on the
# held-out test split.
for estimator in estimators:
    print("Training the ", estimator)
    print(estimator.fit(X_train, y_train).score(X_test, y_test))
print("Done")

KNN worked very well here. For our ensemble we shall keep only the four best estimators.

In [None]:
# Combine the four selected estimators into a voting ensemble.
ensemble_members = {"rf": rf, "svm": svm, "log_reg": log_reg, "knn": knn}
named_estimators = list(ensemble_members.items())
voting_classifier = VotingClassifier(estimators=named_estimators)

print("Training...")
voting_classifier.fit(X_train, y_train)
print("Done.\n Score: ")
# Score of the ensemble on the held-out test split
ensemble_score = voting_classifier.score(X_test, y_test)
print(ensemble_score)

The Ensemble was disappointing. The KNN Classifier won out in the end with an accuracy of $90.16$%

In [None]:
from sklearn.metrics import confusion_matrix

# confusion_matrix expects (y_true, y_pred): rows are the true labels and
# columns are the predicted labels. Passing the predictions first would
# transpose the matrix and swap false positives with false negatives.
print("Confusion Matrix for the Hard Voting Classifier:")
print(confusion_matrix(y_test, voting_classifier.predict(X_test)))
print("Confusion Matrix for the KNN Classifier:")
print(confusion_matrix(y_test, knn.predict(X_test)))

The confusion matrices are similar, and since KNN also achieved the higher test accuracy, we pick the KNN classifier as the winner.