In [None]:
import numpy as np
import pandas as pd
from datetime import datetime as dt

import plotly
import plotly.graph_objs as go
import plotly.express as px
import plotly.figure_factory as ff
from plotly.subplots import make_subplots
from plotly.offline import iplot
import cufflinks as cf

# Switch cufflinks to offline mode and store the plotting defaults once:
# shared figures are world-readable, the 'space' theme is used, and offline
# mode is enabled.
cf.go_offline()
cf.set_config_file(world_readable=True, theme='space', offline=True)

# Raw dataset; every later cell derives its inputs from `data`.
data = pd.read_csv("../input/heart-attack-analysis-prediction-dataset/heart.csv")
# Preview only the first rows instead of rendering the whole frame.
data.head()

# Exploratory Data Analysis

In [None]:
# Presentation copy of `data` with readable labels; `data` itself is left
# untouched so the modelling cells keep the numeric codes.
sex_labels = {0: 'Female', 1: 'Male'}
# NOTE(review): the code -> label mapping for `cp` is kept exactly as it was
# written here; confirm it against the dataset documentation.
cp_labels = {
    0: 'Asymptomatic',
    1: 'Typical Angina',
    2: 'Atypical Angina',
    3: 'Non-anginal pain',
}

# Build new label columns instead of writing strings into integer columns with
# `.loc`, which relies on silent dtype upcasting. Codes that have no label are
# passed through unchanged.
pres = data.assign(
    sex=data['sex'].map(lambda code: sex_labels.get(code, code)),
    cp=data['cp'].map(lambda code: cp_labels.get(code, code)),
).rename(columns={'cp': 'Chest Pain Type', 'age': 'Age', 'sex': 'Sex'})

## Gender-Age Distribution

In [None]:
# Per-sex counts of negative ('-') and positive ('+') diagnoses.
outcome_by_sex = pres.loc[:, ['Sex', 'output']].groupby('Sex')['output']
positives_by_sex = outcome_by_sex.sum()
gend = pd.DataFrame({
    '-': outcome_by_sex.count() - positives_by_sex,
    '+': positives_by_sex,
})
gend.iplot(kind='bar')

# Age histogram of `output` split by sex, with a violin marginal on top.
age_hist = px.histogram(
    data_frame=pres,
    x='Age',
    y='output',
    color='Sex',
    barmode='overlay',
    color_discrete_sequence=['DodgerBlue', 'FireBrick'],
    marginal='violin',
    opacity=0.6,
    template='plotly_dark',
    labels={'output': 'diagnoses'},
)
iplot(age_hist)

***Conclusion:*** *In this sample, women are more likely than men to have a positive heart-attack diagnosis.*

## Chest Pain Type Distributions

In [None]:
# Negative ('-') and positive ('+') diagnosis counts per chest pain type.
# `output` is selected before aggregating so that the unrelated columns
# (including the string-valued `Sex`) are not counted and summed as well.
cp_outcome = pres.groupby('Chest Pain Type')['output']
cp_positives = cp_outcome.sum()
cp_data = pd.DataFrame({
    '-': cp_outcome.count() - cp_positives,
    '+': cp_positives,
}).sort_values(by='+', ascending=False)  # most positive diagnoses first
cp_data.iplot(kind='bar', title='Chest Pain Type')

***Conclusions:***  
1) *Asymptomatic chest pain is the type least associated with a predisposition to heart attack.*  
2) *Atypical angina is the most common chest pain type among people prone to heart attack.*

In [None]:
# Only the rows with a positive diagnosis are plotted.
positive_cases = pres[pres.output == 1]
pain_by_age = px.violin(
    positive_cases,
    x='Chest Pain Type',
    y='Age',
    color='Sex',
    template='plotly_dark',
    box=True,
    title='Distribution Of Pain Types By Age In Men and Women With Attacks',
)
iplot(pain_by_age)

***Note:***  *the age distribution of women with non-anginal pain looks unusual (possibly an artifact of this sample?)*

## Other categorical features

In [None]:
# Remaining categorical columns, their plot titles, and the code -> label
# mapping applied to each column's group index. The three lists are parallel.
categ_feats = ['fbs', 'restecg', 'exng', 'slp', 'caa', 'thall']
titles = ['Fasting blood sugar',
          'Resting electrocardiographic results',
          'Exercise induced angina',
          'The slope of the peak exercise ST segment',
          'Number of major vessels colored by flourosopy',
          'Thal']
# An empty mapping (`caa`) leaves the numeric codes as they are; codes missing
# from a mapping (e.g. 0 for `thall`) are likewise shown unchanged.
# NOTE(review): label meanings are kept as written; confirm against the
# dataset documentation.
values = [{0: '<= 120 mg/dl', 1: '> 120 mg/dl'},
          {0: 'hypertrophy', 1: 'normal', 2: 'having ST-T wave abnormality'},
          {0: 'no', 1: 'yes'},
          {2: 'upsloping', 1: 'flat', 0: 'downsloping'},
          {},
          {2: 'normal', 1: 'fixed defect', 3: 'reversable defect'}]
# NOTE(review): this subplot grid is created but not used by the loop below,
# which draws one standalone figure per feature.
categ_fig = make_subplots(2, 3);

for f, t, v in zip(categ_feats, titles, values):
    # Select `output` before aggregating so the other columns (some of which
    # hold strings) are not counted and summed for every feature.
    outcome = pres.groupby(f)['output']
    positives = outcome.sum()
    f_data = pd.DataFrame({
        '-': outcome.count() - positives,   # negative diagnoses
        '+': positives,                     # positive diagnoses
    }).rename(index=v)
    iplot(px.bar(f_data, barmode='group', opacity=0.6, title=t, labels={f: '', 'value': ''},
                 color_discrete_sequence=['DarkOrange', 'DodgerBlue'], template='plotly_dark'))

***Conclusion:*** *characteristics that are more common among people prone to heart attack*:
1. *Resting electrocardiographic results: normal*
2. *No exercise induced angina*
3. *The slope of the peak exercise ST segment: upsloping*
4. *0 major vessels colored by fluoroscopy*
5. *Thal: normal*

## Numerical features

In [None]:
num_feats = ['age', 'trtbps', 'chol', 'thalachh', 'oldpeak']

# Pairwise scatter plots of the numeric columns, coloured by the target.
fig = px.scatter_matrix(data, dimensions=num_feats, color='output', template='plotly_dark')
fig.update_layout(dragmode='select', width=1000, height=600, hovermode='closest')
iplot(fig)

# Styling shared by every per-feature histogram below.
hist_style = dict(
    color='output',
    barmode='overlay',
    color_discrete_sequence=['DodgerBlue', 'Yellow'],
    marginal='box',
    opacity=0.6,
    template='plotly_dark',
)

# One overlaid histogram per numeric feature, split by the target.
for feat in num_feats:
    feat_hist = px.histogram(data[[feat, 'output']], labels={'value': feat}, **hist_style)
    iplot(feat_hist)

***Conclusions:*** *feature values that are more common among people prone to heart attack:*
1. *Age: below 55*
2. *Thalachh: above 150*
3. *Oldpeak: 0-0.7, 1.3-1.7*

# Model Selection

In [None]:
import warnings
warnings.filterwarnings("ignore")

from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_validate, ShuffleSplit, GridSearchCV
from sklearn import preprocessing
from sklearn import metrics

# Target vector and feature matrix shared by every model below.
y = data['output']
features = data.drop(columns=['output'])

# Two preprocessing passes: `scale` on the feature table, then `normalize` on
# the scaled result.
# NOTE(review): both are computed on the full dataset before any
# cross-validation split, so held-out folds are not fully unseen - confirm this
# is acceptable.
standardized = preprocessing.scale(features)
X = preprocessing.normalize(standardized)

# One column per evaluated model is added to this frame by test_model.
models_stats = pd.DataFrame(index=["Neg logloss", "Mean accuracy"])

# Fixed-seed random splits used for the hyper-parameter search.
cv_train = ShuffleSplit(n_splits=5, test_size=0.2, random_state=0)

def build_model(est, params, n_jobs=8):
    """Grid-search `params` for `est` and return an unfitted, tuned estimator.

    The search runs on the module-level `X`, `y` with the `cv_train` splitter
    and prints the best parameter combination.

    Parameters
    ----------
    est : estimator
        Template estimator; it is not modified.
    params : dict
        Parameter grid passed to ``GridSearchCV``.
    n_jobs : int, default 8
        Number of parallel jobs used by the search.

    Returns
    -------
    estimator
        A fresh copy of `est` with the best grid parameters applied.
    """
    # Imported locally so this cell does not depend on edits to the import cell.
    from sklearn.base import clone

    search = GridSearchCV(estimator=est, param_grid=params, cv=cv_train,
                          n_jobs=n_jobs, verbose=False)
    bp = search.fit(X, y).best_params_
    print(bp)
    # Clone the template instead of calling `est.__class__(**bp)`: constructor
    # arguments that are not part of the grid are preserved, and nested
    # parameter names (``step__param``) are handled by set_params.
    return clone(est).set_params(**bp)

def test_model(model, model_name):
    """Cross-validate `model` on the module-level `X`, `y` and record its scores.

    Runs 10-fold cross-validation scoring negative log loss and accuracy,
    plots the per-fold values under the title `model_name`, and stores the
    rounded means as a new `model_name` column of `models_stats`
    (negative log loss first, accuracy second).
    """
    metric_columns = ['test_neg_log_loss', 'test_accuracy']
    cv_result = cross_validate(model, X, y, cv=10, scoring=['neg_log_loss', 'accuracy'])

    # Per-fold scores as a line chart.
    pd.DataFrame(cv_result)[metric_columns].iplot(title=model_name)

    mean_nll = cv_result['test_neg_log_loss'].mean()
    mean_acc = cv_result['test_accuracy'].mean()
    # Doubling before rounding and halving afterwards snaps the means to steps
    # of 0.05 (log loss) and 0.005 (accuracy).
    models_stats[model_name] = [round(mean_nll * 2, 1) / 2, round(mean_acc * 2, 2) / 2]

In [None]:
# Support Vector Machines
svm = build_model(
    est=SVC(probability=True),
    params=dict(
        kernel=['linear', 'poly', 'rbf', 'sigmoid'],
        tol=[1e-4],
        # test_model scores neg_log_loss, which needs probability estimates.
        probability=[True],
    ),
)

In [None]:
# Decision Tree
decision_tree = build_model(
    est=DecisionTreeClassifier(random_state=0),
    params=dict(
        min_samples_leaf=range(2, 10),
        # The seed is searched over as well as the leaf size.
        random_state=[1, 2, 3],
    ),
)

In [None]:
# Random Forest
random_forest = build_model(
    est=RandomForestClassifier(),
    params=dict(
        n_estimators=[300, 400, 500],
        min_samples_split=[60],
        min_samples_leaf=[25, 30, 35],
        max_depth=[5, 6, 7],
        # Single-valued entries pin a setting rather than search over it.
        criterion=['gini'],
        bootstrap=[False],
        random_state=[1, 2, 3],
    ),
)

In [None]:
# Extra Trees
extra_trees = build_model(
    est=ExtraTreesClassifier(random_state=0),
    params=dict(
        min_samples_leaf=range(10, 60, 10),  # 10, 20, 30, 40, 50
        random_state=[1, 2, 3],
    ),
)

In [None]:
# Light GBM
light_gbm = build_model(
    est=LGBMClassifier(),
    params=dict(
        n_estimators=[500, 1000, 5000],
        max_depth=range(4, 9),  # depths 4 through 8
        learning_rate=[0.005, 0.01, 0.05],
    ),
)

In [None]:
# CatBoost
catboost = build_model(
    est=CatBoostClassifier(verbose=False),
    params=dict(
        n_estimators=[200, 300, 400],
        learning_rate=[0.003, 0.005, 0.01],
        max_depth=[4, 5, 6],
        # Listed in the grid too, so it is part of the returned best parameters.
        verbose=[False],
    ),
)

In [None]:
# Gaussian Naive Bayes
naive_bayes = build_model(
    est=GaussianNB(),
    params=dict(var_smoothing=[1e-8, 1e-9, 1e-10]),
)

In [None]:
# k-Nearest Neighbors
kNN = build_model(
    est=KNeighborsClassifier(),
    params=dict(
        n_neighbors=[7, 10, 13, 15],
        weights=['uniform', 'distance'],
        p=[1, 2, 3],
    ),
)

In [None]:
# Gaussian Process
gauss_proc = build_model(
    est=GaussianProcessClassifier(),
    params=dict(
        max_iter_predict=[100, 200],
        warm_start=[True, False],
        n_restarts_optimizer=range(3),  # 0, 1 or 2 restarts
        random_state=[1, 2, 3],
    ),
)

In [None]:
# Display name -> tuned estimator produced by build_model, in evaluation order.
models = dict([
    ("Support Vector Machines", svm),
    ("Decision Tree", decision_tree),
    ("Random Forest", random_forest),
    ("Extra Trees Classifier", extra_trees),
    ("LightGBM", light_gbm),
    ("CatBoost", catboost),
    ("Gaussian Naive Bayes", naive_bayes),
    ("k-Nearest Neighbors", kNN),
    ("Gaussian Process", gauss_proc),
])

# Cross-validate every candidate; test_model adds one column per model to
# models_stats.
for name in models:
    test_model(models[name], name)

# One row per model, best negative log loss first (ties broken by accuracy).
# NOTE(review): models_stats is reassigned to its transpose here, so re-running
# this cell without first re-running the setup cell makes test_model assign a
# two-element list to a frame with one row per model, which is expected to fail.
models_stats = models_stats.transpose().sort_values(
    by=['Neg logloss', 'Mean accuracy'],
    ascending=False,
)
models_stats.iplot(kind="bar")

### ***Best Models*** 
Values in parentheses: (mean cross-validated accuracy, log loss).  
Random Forest (85%, 0.45)  
Gaussian Process (84%, 0.4)  
Support Vector Machines (83.5%, 0.4)

# Final Model

In [None]:
from sklearn.metrics import accuracy_score

# The tuned random forest is refit on the complete dataset.
final_model = random_forest
final_model.fit(X, y)

def classify(input_data):
    """Return the class predicted by `final_model` for each row of `input_data`.

    `input_data` must be preprocessed the same way as `X`.
    """
    return final_model.predict(input_data)

# NOTE: this is accuracy on the very data the model was fitted on, so it is a
# training score rather than an estimate of performance on unseen patients.
train_accuracy = accuracy_score(y, classify(X))  # (y_true, y_pred) order
# Format as a whole percentage directly; `int(round(a, 2) * 100)` truncates on
# floating-point error (e.g. 0.57 * 100 == 56.99999999999999 -> "56%").
print(f"{train_accuracy:.0%}")