In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Collect the full path of every file found anywhere under the Kaggle input
# directory, then print them one per line (same order os.walk yields them).
input_file_paths = [
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
]
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# A Silent Killer?

Heart disease is the leading cause of death across ethnicities, with one person succumbing to it every **36 seconds** in the US. Over half a million Americans die of heart disease each year - almost **1 in every 4** deaths - with the costs of health services, care and lost productivity amounting to over **$200 billion** each year.

More specifically, **1 in every 5** heart attacks is silent - the damage is done, but the person affected is unaware of it. Early action is critical, and knowing the signs and symptoms of a heart attack could help. It would be even more beneficial if we could effectively predict the possibility of a heart attack and take preventive action.

A heart attack (a.k.a. *myocardial infarction*) happens when a part of the heart muscle doesn't get enough blood. The more time passes without restoration of blood flow, the greater the damage to the heart. Coronary artery disease (**CAD**) is the major cause of heart attacks and is caused by plaque build-up in the walls of the arteries that supply blood to the heart and other parts of the body. Plaque is made up of deposits of cholesterol and other substances. Plaque build-up causes the arteries to narrow and, over time, this can block (partially or wholly) blood flow.

Risk factors include age, family history and lifestyle choices. While age and family history can't be controlled, appropriate lifestyle choices can help mitigate the incidence of heart attacks.

# About the data set
This database contains 76 attributes, but all published experiments refer to using a subset of 14 of them. In particular, the Cleveland database is the only one that has been used by ML researchers to this date. The "target" field refers to the presence of heart disease in the patient.

**Attribute Information**
1. Age
2. Sex - 1 : male, 0: female
3. CP - Chest pain type, 0 : typical angina,1 : atypical angina, 2 : non-anginal pain, 3: asymptomatic
4. trestbps - Resting BP
5. chol - Serum Cholesterol (mg/dl)
6. fbs - Fasting blood sugar > 120 mg/dl, 1: true, 0: false
7. restecg - Resting ECG results, 0: normal, 1: ST-T wave abnormality, 2: left ventricular hypertrophy
8. thalach - Maximum heart rate achieved
9. exang - Exercise induced angina, 1: yes, 0: no
10. OldPeak - ST depression induced by exercise relative to rest
11. Slope - slope of the peak exercise ST segment, 0: upsloping, 1: flat, 2: downsloping
12. ca - Number of major vessels colored by fluoroscopy
13. Thal - 0 : normal, 1 : fixed defect, 2 : reversible defect
14. Target - 0 : less chance of heart attack, 1 : more chance of attack

This dataset is used for learning purposes. Source of the data: https://archive.ics.uci.edu/ml/datasets/Heart+Disease

# Knowing the data



In [None]:
import numpy as np
import pandas as pd
import pandas_profiling as pp

In [None]:
# Read the heart-disease CSV from the Kaggle input directory, then show
# summary statistics of the raw frame as the cell output.
heart_csv_path = '../input/health-care-data-set-on-heart-attack-possibility/heart.csv'
dat = pd.read_csv(heart_csv_path)
dat.describe()

In [None]:
# Build the pandas-profiling report for the raw frame; leaving it as the last
# expression of the cell makes the notebook render it inline.
profile_report = pp.ProfileReport(dat)
profile_report

# Preprocessing the data

In [None]:
# Remove exact duplicate rows, keeping the first occurrence of each.
# The result is bound to a new name, so the raw frame `dat` stays available.
df = dat.drop_duplicates(keep='first')
df.describe()

In [None]:
# Separate the label column (`target`) from the predictor columns.
y = df['target']
X = df.drop(columns=['target'])

# DataFrame.info() writes its report straight to stdout and returns None,
# so it is called directly: wrapping it in print() would emit a stray "None"
# line after the report.
X.info()

# Modeling

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, roc_auc_score, accuracy_score, precision_score, recall_score, classification_report
from sklearn.metrics import ConfusionMatrixDisplay, PrecisionRecallDisplay, RocCurveDisplay
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier

try:
    # Legacy plotting helpers: deprecated in scikit-learn 1.0 and removed in 1.2.
    # They are only needed as a fallback on old scikit-learn versions.
    from sklearn.metrics import plot_roc_curve, plot_confusion_matrix, plot_precision_recall_curve
except ImportError:
    # Newer scikit-learn: use the *Display.from_estimator API imported above.
    pass

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)

# Learn the scaling statistics from the training rows only, then apply the
# same transformation to both partitions so no test information leaks in.
ss = StandardScaler().fit(X_train)
X_train = ss.transform(X_train)
X_test = ss.transform(X_test)

**Build Functions**

In [None]:
def model_fit_summarize(model, X_train, y_train, X_test, y_test):
    """Fit ``model``, report its test-set performance and return the scores.

    The model is fitted in place on the training data, evaluated on the test
    data, and the results are printed (accuracy, precision, recall, ROC AUC
    and the classification report) and plotted (ROC curve, precision-recall
    curve and confusion matrix).

    The metrics assume a binary classification target: precision/recall use
    scikit-learn's default binary averaging and the ROC AUC uses the score of
    the second class (``model.classes_[1]``).

    Parameters
    ----------
    model : estimator
        Unfitted scikit-learn compatible classifier exposing ``fit`` and
        ``predict``.
    X_train, y_train :
        Training features and labels passed to ``model.fit``.
    X_test, y_test :
        Held-out features and labels used for every reported metric and plot.

    Returns
    -------
    tuple of float
        ``(accuracy, precision, recall, roc_auc)``, each as a percentage.
    """
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # ROC AUC measures how well the model *ranks* positives above negatives,
    # so it needs continuous scores. Feeding it hard 0/1 predictions collapses
    # the curve to a single operating point and misreports the AUC.
    if hasattr(model, 'predict_proba'):
        y_score = model.predict_proba(X_test)[:, 1]
    elif hasattr(model, 'decision_function'):
        y_score = model.decision_function(X_test)
    else:
        # No score output available: fall back to the predicted labels.
        y_score = y_pred

    acc = accuracy_score(y_test, y_pred) * 100
    pre = precision_score(y_test, y_pred) * 100
    rec = recall_score(y_test, y_pred) * 100
    roc_auc = roc_auc_score(y_test, y_score) * 100

    print('Accuracy score : {:.2f} %'.format(acc))
    print('Precision score : {:.2f} %'.format(pre))
    print('Recall score : {:.2f} %'.format(rec))
    print('ROC_AUC score : {:.2f} %'.format(roc_auc))

    print('\nClassification report :\n', classification_report(y_test, y_pred))

    if hasattr(RocCurveDisplay, 'from_estimator'):
        # scikit-learn >= 1.0: the plot_* helpers were removed in 1.2.
        RocCurveDisplay.from_estimator(model, X_test, y_test)
        PrecisionRecallDisplay.from_estimator(model, X_test, y_test)
        ConfusionMatrixDisplay.from_estimator(model, X_test, y_test)
    else:
        # Older scikit-learn without from_estimator: use the legacy helpers.
        plot_roc_curve(model, X_test, y_test)
        plot_precision_recall_curve(model, X_test, y_test)
        plot_confusion_matrix(model, X_test, y_test)

    return acc, pre, rec, roc_auc

**Logistic Regression**

In [None]:
# Baseline model: logistic regression using the liblinear solver.
lr = LogisticRegression(solver='liblinear')
lr_scores = model_fit_summarize(lr, X_train, y_train, X_test, y_test)

# Start the comparison table: one row per metric, one column per model.
summary = pd.DataFrame(
    {'LogisticRegression': list(lr_scores)},
    index=['Accuracy', 'Precision', 'Recall', 'ROC_AUC'],
)

**Decision Tree**

In [None]:
# Single decision tree; the seed makes tie-breaking between splits repeatable.
dt = DecisionTreeClassifier(random_state=0)
dt_scores = model_fit_summarize(dt, X_train, y_train, X_test, y_test)

# Add this model's metrics as a new column of the comparison table.
summary['DecisionTree'] = list(dt_scores)

**Random Forest**

In [None]:
# Random forest with a fixed seed so the fitted ensemble is repeatable.
rf = RandomForestClassifier(random_state=101)
rf_scores = model_fit_summarize(rf, X_train, y_train, X_test, y_test)

# Add this model's metrics as a new column of the comparison table.
summary['RandomForest'] = list(rf_scores)

**XGBoost**

In [None]:
# Gradient-boosted trees: 25 boosting rounds at a learning rate of 0.01.
xg = XGBClassifier(
    random_state=123,
    n_estimators=25,
    learning_rate=0.01,
)
xg_scores = model_fit_summarize(xg, X_train, y_train, X_test, y_test)

# Add this model's metrics as a new column of the comparison table.
summary['XGBoost'] = list(xg_scores)

**Summary**

In [None]:
# Show the model comparison table with one decimal place.
# The fully qualified option name is used because the bare 'precision' pattern
# is ambiguous in newer pandas releases (it also matches the Styler precision
# option) and is rejected with an OptionError.
pd.set_option('display.precision', 1)

# Leaving the frame as the last expression lets the notebook render it as a table.
summary