In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv) 
import matplotlib.pyplot as plt
import seaborn as sns
import pandas_profiling as pp 
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, classification_report, plot_roc_curve
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
import plotly.express as px
sns.set_theme(style='darkgrid')

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily enumerate every file found under the read-only Kaggle input directory
# and print one full path per line.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _subfolders, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the heart-attack CSV from the Kaggle input mount into a DataFrame.
heart_csv_path = "/kaggle/input/heart-attack-analysis-prediction-dataset/heart.csv"
df = pd.read_csv(heart_csv_path)

In [None]:
# Preview the first rows to sanity-check that the CSV loaded as expected.
df.head()

In [None]:
# (number of rows, number of columns) of the loaded frame.
df.shape

In [None]:
# Summary statistics for each numeric column.
# `describe` is a method: without the call parentheses the cell only displays the
# bound-method repr instead of the statistics table.
df.describe()

In [None]:
# Data type of every column.
df.dtypes

Let's perform auto EDA using Pandas Profiling as the data set isn't that big.

In [None]:
pip install openpyxl

In [None]:
# Build the pandas-profiling report for the whole frame; as the cell's last expression it is rendered inline.
pp.ProfileReport(df)

Pandas Profiling has provided us some quick insights. 
Luckily, there are no missing values. 

Let's manually work out, by sex (male/female), the percentage of patients affected by heart attack.

In [None]:
# Share of women whose `output` value is set.
# The code treats sex == 0 as female; assumes `output` is a 0/1 indicator so that
# sum / len is a rate — TODO confirm against the dataset description.
is_woman = df.sex == 0
women_stroke = df.loc[is_woman, 'output']
women_stroke_percentage = sum(women_stroke) / len(women_stroke)
print('The % of women prone to heart-attack: {}%'.format(women_stroke_percentage*100))

**Correlation**

In [None]:
def find_correlational_map(data, target='output'):
    """Plot the pairwise correlation heatmap of `data` and print correlations with `target`.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose columns are correlated via ``data.corr()``.
    target : str, default 'output'
        Column whose correlations with every column are printed, sorted in
        descending order.

    Returns
    -------
    None
        The heatmap is drawn on a new 16x12 matplotlib figure and the sorted
        correlations are printed.
    """
    # Compute the correlation matrix once and reuse it for both the plot and the printout.
    correlations = data.corr()

    plt.figure(figsize=(16,12))
    sns.heatmap(correlations, annot=True, cmap='OrRd')
    plt.title('Correlational Map', weight='bold')
    print('---'*50)
    # Item access (not attribute access) so any column name can be used as `target`.
    print(correlations[target].sort_values(ascending = False))
    plt.tight_layout()


find_correlational_map(df)

**Data Splitting**

In [None]:
df.head()

In [None]:
# Positional split: every column except the last becomes a feature,
# and the last column becomes the target.
feature_columns, target_column = df.iloc[:, :-1], df.iloc[:, -1]
X, y = feature_columns.values, target_column.values

In [None]:
# Let's split the data into train and test sets.
# test_size = 0.2 holds out 20% of the rows; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)
# Display the shapes of the two feature arrays.
X_train.shape, X_test.shape

In [None]:
# Shapes of the train and test label arrays.
y_train.shape, y_test.shape

**Model Building**

In [None]:
# let's create a pipeline
# Preprocessing-only pipeline (RobustScaler). Kept as a notebook-level name for any
# code that refers to `pipeline` directly.
pipeline = make_pipeline(RobustScaler()) # creating pipeline for model building


def _scaled(estimator):
    """Return a pipeline that applies a fresh RobustScaler pipeline, then `estimator`.

    A new scaling step is built for every model so the models stay independent:
    when one `pipeline` object is nested inside every model, fitting any of them
    re-fits the single scaler that all the others use. The nested step layout is
    the same as ``make_pipeline(pipeline, estimator)``.
    """
    return make_pipeline(make_pipeline(RobustScaler()), estimator)


LR = _scaled(LogisticRegression(random_state=0)) # LogisticRegression pipeline
DT = _scaled(DecisionTreeClassifier(random_state=0)) # DecisionTree Classifier pipeline
RF = _scaled(RandomForestClassifier(random_state=0)) # RandomForest Classifier pipeline
AC = _scaled(AdaBoostClassifier(random_state=0)) # Adaboost Classifier pipeline
NB = _scaled(GaussianNB()) # Naive bayes pipeline
KN = _scaled(KNeighborsClassifier()) # KNeighbor pipeline
SV = _scaled(SVC(random_state=0)) # Support vector pipeline

In [None]:
# Registry of candidate models: display name -> pipeline, in evaluation order.
model_dictionary = dict(
    Logistic_Regression=LR,
    DecisionTree_Classifier=DT,
    RandomForest_classifier=RF,
    Adaboost_Classifier=AC,
    Naivebayes_Classifier=NB,
    KNeighbors_classifier=KN,
    Support_Vector=SV,
)

In [None]:
# Show the name -> pipeline mapping to verify each entry's steps.
print(model_dictionary)

In [None]:
# Helper that fits a model and prints its accuracy, confusion matrix and classification report
def model_fitting(model):
    """Fit `model` on the training split and print its test-split metrics.

    Reads the notebook-level `X_train`, `y_train`, `X_test` and `y_test`.
    Prints the accuracy as a percentage, a separator line, the confusion
    matrix and the classification report. Nothing is returned.
    """
    model.fit(X_train, y_train)
    test_predictions = model.predict(X_test)
    accuracy_pct = accuracy_score(y_test, test_predictions) * 100
    print('The accuracy score of the model is: {}%'.format(accuracy_pct))
    print('-----'*20)
    # Confusion matrix first, then the per-class report.
    for metric in (confusion_matrix, classification_report):
        print(metric(y_test, test_predictions))

**Selecting the best model**

In [None]:
# Fit and report every candidate in dictionary order, preceded by a separator and its name.
for name, model in model_dictionary.items():
    print('---'*10, name, sep='\n')
    model_fitting(model)

As we can see:

AdaBoost achieves 90% accuracy with only 6 misclassified samples.
It has a precision of 0.86 for class 0 and 0.94 for class 1, which is better than all the other algorithms.
**Let's use the AdaBoost model**

In [None]:
# Train a standalone AdaBoost classifier directly on the training features.
# NOTE(review): unlike the 'Adaboost_Classifier' pipeline compared earlier, this model has no RobustScaler step — confirm that is intended.
model = AdaBoostClassifier(random_state=0)
model.fit(X_train, y_train)

In [None]:
# Predict the held-out test split and display the accuracy as a fraction.
y_pred = model.predict(X_test)
accuracy_score(y_test, y_pred)

In [None]:
def find_confusion_matrix(y_test, y_pred, estimator=None):
    """Plot the confusion matrix, print the classification report and draw the ROC curve.

    Parameters
    ----------
    y_test : array-like
        True labels.
    y_pred : array-like
        Predicted labels for the same samples as `y_test`.
    estimator : fitted classifier, optional
        Estimator used for the ROC curve, evaluated on the notebook-level `X_test`.
        Defaults to the notebook-level `model`, which this function previously
        always used.

    Returns
    -------
    None
    """
    if estimator is None:
        # Backward-compatible default: fall back to the globally trained model.
        estimator = model

    cm = confusion_matrix(y_test, y_pred)
    # fmt='d' prints the counts as plain integers; seaborn's default '.2g' format
    # switches to scientific notation once a count reaches three digits.
    sns.heatmap(cm, annot=True, fmt='d', cmap='OrRd')
    plt.title('Confusion Matrix', weight='bold')
    # confusion_matrix places true labels on rows and predicted labels on columns.
    plt.xlabel('Predicted label')
    plt.ylabel('True label')
    print(classification_report(y_test, y_pred))
    # NOTE(review): plot_roc_curve was deprecated in scikit-learn 1.0 and removed in 1.2;
    # on newer versions switch to RocCurveDisplay.from_estimator (the top-level import must change too).
    plot_roc_curve(estimator, X_test, y_test)


find_confusion_matrix(y_test, y_pred)

In [None]:
# Report the final test accuracy as a percentage rounded to two decimals.
print('The accuracy of the model is: {}%'.format(round(accuracy_score(y_test, y_pred)*100, 2)))