# Intro
Welcome to the [Heart Attack Analysis & Prediction Dataset](https://www.kaggle.com/rashikrahmanpritom/heart-attack-analysis-prediction-dataset)

![](https://storage.googleapis.com/kaggle-datasets-images/1226038/2046696/2465e7cd117a6954befa50eff39d236f/dataset-cover.jpg)

<span style="color: royalblue;">Please upvote the notebook if it helps you, and feel free to leave a comment. Thank you.</span>

# Libraries 

In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

import warnings
warnings.filterwarnings("ignore")

# Path

In [None]:
# Directory that holds the dataset files. The trailing slash is needed because
# the file name is appended to this string by plain concatenation when loading.
path = '/kaggle/input/heart-attack-analysis-prediction-dataset/'
# Last expression of the cell: list the entries found in that directory.
os.listdir(path)

# Functions

In [None]:
def plot_hist(data, feature, bins=10):
    """Plot side-by-side histograms of ``feature``, one per ``Target`` class.

    The left panel shows the rows where ``Target == 0`` and the right panel
    the rows where ``Target == 1``. Both panels share their x and y axes.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing a ``'Target'`` column and the column ``feature``.
    feature : str
        Name of the numeric column to plot.
    bins : int, str or sequence, default 10
        Bin specification. It is resolved into explicit bin edges once, over
        the whole column, and the same edges are used in both panels.
    """
    # Resolve the bin edges a single time over the full column. Passing an
    # integer to each ``hist`` call would derive the edges from each subset's
    # own min/max, so the bars of the two panels would have different widths
    # and positions and could not be compared despite the shared axes.
    edges = np.histogram_bin_edges(data[feature].dropna(), bins=bins)

    fig, axs = plt.subplots(1, 2, figsize=(14, 3), sharey=True, sharex=True)
    panels = (
        (0, 'yellowgreen', 'Less chance of heart attack'),
        (1, 'tomato', 'More chance of heart attack'),
    )
    for ax, (target, color, title) in zip(axs, panels):
        subset = data.loc[data['Target'] == target, feature]
        ax.hist(subset, bins=edges, color=color, alpha=0.7)
        ax.set_title(title)
        ax.grid()
    
def plot_bar(data, feature):
    """Plot side-by-side bar charts of category counts per ``Target`` class.

    The left panel shows the rows where ``Target == 0`` and the right panel
    the rows where ``Target == 1``. Both panels share their x and y axes and
    show the same categories in the same (sorted) order.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing a ``'Target'`` column and the column ``feature``.
    feature : str
        Name of the categorical column to count.
    """
    # Fixed, sorted list of categories taken from the whole column so both
    # panels use the same bar order.
    categories = np.sort(data[feature].dropna().unique())
    positions = np.arange(len(categories))

    fig, axs = plt.subplots(1, 2, figsize=(14, 3), sharey=True, sharex=True)
    panels = (
        (0, 'yellowgreen', 'Less chance of heart attack'),
        (1, 'tomato', 'More chance of heart attack'),
    )
    for ax, (target, color, title) in zip(axs, panels):
        # ``value_counts`` orders by frequency, not by category, and omits
        # categories that do not occur in the subset. Re-align the counts to
        # ``categories`` (zero for absent ones) so each bar sits under the
        # tick of the category it actually counts.
        counts = (
            data.loc[data['Target'] == target, feature]
            .value_counts()
            .reindex(categories, fill_value=0)
        )
        ax.bar(positions, counts.to_numpy(), color=color, alpha=0.7)
        ax.set_xticks(positions)
        ax.set_xticklabels(categories)
        ax.set_title(title)
        ax.grid()

# Load Data

In [None]:
# Read the heart.csv file from the dataset directory into a DataFrame.
data = pd.read_csv(path+'heart.csv')

# Overview

In [None]:
# Rows are samples, columns are features (the label column is counted too).
n_samples, n_features = data.shape
print('Number of samples: ', n_samples)
print('Number of features: ', n_features)

In [None]:
# Preview the first rows of the raw frame (column names are still the original ones here).
data.head()

**About this dataset**

* age : Age of the patient
* sex : Sex of the patient
* cp : [Chest pain](https://en.wikipedia.org/wiki/Chest_pain) type
    * Value 1: typical angina
    * Value 2: atypical angina
    * Value 3: non-anginal pain
    * Value 4: asymptomatic
* trtbps : resting [blood pressure](https://en.wikipedia.org/wiki/Blood_pressure)  (in mm Hg)
* chol : [cholesterol](https://en.wikipedia.org/wiki/Cholesterol) in mg/dl fetched via BMI sensor
* fbs : (fasting [blood sugar](https://en.wikipedia.org/wiki/Blood_sugar_level) > 120 mg/dl) (1 = true; 0 = false)
* restecg : resting [electrocardiographic](https://en.wikipedia.org/wiki/Electrocardiography) results
    * Value 0: normal
    * Value 1: having ST-T wave abnormality (T wave inversions and/or ST elevation or depression of > 0.05 mV)
    * Value 2: showing probable or definite left ventricular hypertrophy by Estes' criteria
* thalachh : maximum [heart rate](https://en.wikipedia.org/wiki/Heart_rate) achieved
* exng: exercise induced angina (1 = yes; 0 = no)
* oldpeak: previous peak
* slp: slope
* caa: number of major [vessels](https://en.wikipedia.org/wiki/Blood_vessel) (0-3)
* thall: thale rate
* output 
     * Value 0: less chance of heart attack
     * Value 1: more chance of heart attack

In [None]:
# Maps each raw column name of the CSV to the readable name used in the rest
# of the notebook.
dict_names = {
    'age': 'Age',
    'sex': 'Sex',
    'cp': 'Chest_Pain',
    'trtbps': 'Blood_Pressure',
    'chol': 'Cholestoral',
    'fbs': 'Blood_Sugar',
    'restecg': 'ECG_Results',
    'thalachh': 'Heart_Rate',
    'exng': 'Angina',
    'oldpeak': 'Previous_Peak',
    'slp': 'Slope',
    'caa': 'Vessels',
    'thall': 'Thale_Rate',
    'output': 'Target',
}

In [None]:
# Rename the raw columns in place using the dict_names mapping; the plotting
# helpers and the modelling cells rely on the new names (e.g. 'Target').
data.rename(columns=dict_names, inplace=True)

# Exploratory Data Analysis

In [None]:
# Age distribution for each Target class, using 20 bins.
plot_hist(data, 'Age', bins=20)

In [None]:
# Count of each Sex value for each Target class.
plot_bar(data, 'Sex')

In [None]:
# Heart_Rate distribution for each Target class, using 20 bins.
plot_hist(data, 'Heart_Rate', bins=20)

# Train And Test Data

In [None]:
# Every column except the label is used as a predictor.
feature_columns = data.columns.difference(['Target'])
X, y = data[feature_columns], data['Target']

# Hold out 20% of the rows for testing; the seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=2021
)

print(f'Train samples: {len(X_train)}')
print(f'Test samples: {len(X_test)}')

# Simple Model

In [None]:
# Baseline: a decision tree with default hyper-parameters.
# The seed is fixed (same value as the train/test split and the grid search)
# because an unseeded tree permutes features randomly at each split, so the
# score and the feature importances could change from run to run.
model = DecisionTreeClassifier(random_state=2021)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
# Fraction of correctly classified test samples.
print('Test Score:', accuracy_score(y_test, y_pred, normalize=True))

In [None]:
# Horizontal bar chart of the fitted tree's feature importances, in percent.
importance = model.feature_importances_
x = X_train.columns.values

fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(x, 100*importance)
ax.set_title('Feature Importance', loc='left')
ax.set_xlabel('Percentage')
ax.grid()
plt.show()

# Grid Search

In [None]:
# Hyper-parameter grid for the decision tree.
# 'auto' is intentionally not listed under max_features: for classifiers it was
# only an alias of 'sqrt', and recent scikit-learn releases reject it, which
# would make a quarter of the candidates fail (silently, since warnings are
# suppressed in this notebook).
param_grid = {'criterion': ['gini'],
              'max_features': [None, 'sqrt', 'log2'],
              'max_depth': list(range(1, 6)),
              'class_weight': [None, 'balanced'],
              'min_samples_split': [2, 4, 6, 8, 10, 12],
              'min_samples_leaf': [1, 2, 3, 4],
              # Kept inside the grid so that best_params carries the seed along.
              'random_state': [2021]}

# Exhaustive search with 5-fold cross-validation on the training data only.
grid = GridSearchCV(DecisionTreeClassifier(), param_grid=param_grid, cv=5, scoring='accuracy')
grid.fit(X_train, y_train)
best_params = grid.best_params_
print('Best score of cross validation: {:.3f}'.format(grid.best_score_))
print('Best parameters:', best_params)

In [None]:
# Refit a tree with the best parameters found by the grid search and score it
# on the held-out test set.
model = DecisionTreeClassifier(**best_params)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred, normalize=True)
print(test_accuracy)

In [None]:
# Horizontal bar chart of the tuned tree's feature importances, in percent.
importance = model.feature_importances_
x = X_train.columns.values

fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(x, 100*importance)
ax.set_title('Feature Importance', loc='left')
ax.set_xlabel('Percentage')
ax.grid()
plt.show()