In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_paths = (
    os.path.join(folder, file_name)
    for folder, _, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_path in input_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Libraries required

In [None]:
import warnings

import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import xgboost
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

matplotlib.style.use('ggplot')
warnings.filterwarnings('ignore')

# Reading data

In [None]:
# Read the heart-failure CSV from the Kaggle input directory and preview the first rows.
csv_path = '/kaggle/input/heart-failure-prediction/heart.csv'
data = pd.read_csv(csv_path)
data.head()

# EDA

In [None]:
# Normalise every column name to lower case so later cells can use one spelling.
data.columns = [column.lower() for column in data.columns]

In [None]:
# Preview the first rows again; the column names were lower-cased in the previous cell.
data.head()

In [None]:
# (number of rows, number of columns)
data.shape

In [None]:
# Column dtypes and non-null counts.
data.info()

In [None]:
# Number of missing values in each column.
data.isnull().sum()

In [None]:
# Keep only the rows where the target column `heartdisease` equals 1.
heart_prob = data[data['heartdisease'] == 1]

# Count of those rows per value of `sex`.
fig, ax = plt.subplots(figsize=(12, 8))
# The column is passed by keyword: in current seaborn releases the first
# positional argument of countplot is `data`, not `x`, so the original
# positional call no longer plots the intended per-category counts.
sns.countplot(x=heart_prob['sex'], ax=ax)
ax.set_title('Heart disease cases by sex')
for bar in ax.patches:
    # Counts are whole numbers, so print them without a decimal part and
    # centre the label over its bar.
    ax.annotate(f'{bar.get_height():.0f}',
                (bar.get_x() + bar.get_width() / 2., bar.get_height()),
                ha='center',
                xytext=(0, 9),
                textcoords='offset points')
plt.show()

The plot shows that, among the patients with heart disease, males outnumber females.

In [None]:
# Step histogram of age for the heart-disease subset.
fig, ax = plt.subplots(figsize=(12, 8))
ax.hist(heart_prob['age'], histtype='step', color='black')
ax.set_xlabel('age')
ax.set_ylabel('count')
plt.xticks(rotation=45)
plt.show()

From the graph above, most heart-disease cases fall between the ages of 50 and 60.

In [None]:
# Density histogram of age with a KDE overlay for the heart-disease subset.
# `sns.distplot` is deprecated (and its warning is silenced by the global
# warnings filter), so the supported `histplot` is used instead;
# stat='density' keeps the y-axis on the same density scale distplot used.
fig, ax = plt.subplots(figsize=(12, 8))
sns.histplot(data=heart_prob, x='age', kde=True, stat='density', ax=ax)
ax.set_title('Age distribution of heart disease cases')
plt.show()

The age feature is approximately Gaussian.

# Data Preparation

In [None]:
# Integer code assigned to each category, per categorical column.
category_codes = {
    'sex': {'M': 1, 'F': 0},
    'chestpaintype': {'ASY': 0, 'NAP': 1, 'ATA': 2, 'TA': 3},
    'restingecg': {'Normal': 0, 'LVH': 1, 'ST': 2},
    'exerciseangina': {'Y': 1, 'N': 0},
    'st_slope': {'Flat': 0, 'Up': 1, 'Down': 2},
}

# Assign the replaced column back to the frame. Calling
# `data[col].replace(..., inplace=True)` mutates a temporary column selection;
# under pandas Copy-on-Write that never updates `data`, leaving the string
# categories in place. Re-running this cell is a no-op because the string keys
# no longer match the already-encoded integers.
for column, codes in category_codes.items():
    data[column] = data[column].replace(codes)
data.head()

In [None]:
# One density histogram (with KDE overlay) per column of `data`.
# Each figure gets its own title: a single plt.title() call before the loop
# only labels the first figure. `sns.distplot` is deprecated, so the supported
# `histplot` is used with stat='density' to keep the same y-axis scale.
for column in data.columns:
    fig, ax = plt.subplots()
    sns.histplot(data=data, x=column, kde=True, stat='density', ax=ax)
    ax.set_title(f'Distribution of {column}')
    plt.show()

All columns appear approximately Gaussian.

In [None]:
# Separate the target column from the predictor columns.
Y = data['heartdisease']
X = data.drop(columns=['heartdisease'])

In [None]:
# Hold out 10% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.1,
    random_state=999,
)

# Model Training and validation

In [None]:
# Candidate classifiers as (display name, unfitted estimator) pairs.
# Every estimator is seeded with random_state=0.
#
# Logistic regression and the SVM are sensitive to feature scale, so each is
# wrapped in a pipeline that standardises the features first. The scaler is
# fitted inside `fit`, i.e. on the training split only, so no test-set
# statistics leak into training. The tree-based models are scale-invariant and
# are left as they were.
# NOTE(review): assumes the numeric features are on noticeably different
# scales — confirm against data.describe().
models = [("Logistic Regression", make_pipeline(StandardScaler(),
                                                LogisticRegression(random_state=0, max_iter=1000))),
          ("Support vectors", make_pipeline(StandardScaler(), SVC(random_state=0))),
          ("Random Forest", RandomForestClassifier(random_state=0)),
          ("Decision Trees", DecisionTreeClassifier(random_state=0)),
          ("XGBoost", xgboost.XGBClassifier(random_state=0)),
          ('Gradient Boosting', GradientBoostingClassifier(random_state=0))
         ]

In [None]:
# Fit every candidate on the training split and record its test-set accuracy.
# `finalresults` holds (name, accuracy) pairs; `names` and `results` are the
# same information as two parallel lists, in the order of `models`.
finalresults = []
for name, model in models:
    model.fit(X_train, Y_train)
    predictions = model.predict(X_test)
    finalresults.append((name, accuracy_score(Y_test, predictions)))

names = [model_name for model_name, _ in finalresults]
results = [accuracy for _, accuracy in finalresults]

In [None]:
# Bar chart of the test accuracy of each classifier, one bar per model,
# with the accuracy printed as a percentage just above each bar.
plt.rcParams['figure.figsize'] = (15, 8)
plt.style.use('dark_background')

ax = sns.barplot(x=names, y=results, palette="rocket", saturation=1.0)
ax.set_xlabel("Classifier Models", fontsize=20)
ax.set_ylabel("% of Accuracy", fontsize=20)
ax.set_title("Accuracy of different Classifier Models", fontsize=20)
plt.xticks(fontsize=13, horizontalalignment='center', rotation=0)
plt.yticks(fontsize=13)

for bar in ax.patches:
    left, bottom = bar.get_xy()
    bar_width = bar.get_width()
    bar_height = bar.get_height()
    # Place the label at the bar's horizontal centre, 2% above its top.
    label_position = (left + bar_width/2, bottom + bar_height*1.02)
    ax.annotate(f'{bar_height:.2%}', label_position, ha='center', fontsize='x-large')
plt.show()

In [None]:
# Position of the highest accuracy; on a tie the earliest model in the list wins.
index = max(range(len(results)), key=results.__getitem__)
print(f"{names[index]} has the highest accuracy of {results[index]}")