In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session


# 1. Importing exploratory libraries 

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# 2. Take a look at the data set

In [None]:
# Read the heart-attack dataset into a DataFrame, then print its column
# names, dtypes and non-null counts.
df = pd.read_csv(
    '../input/heart-attack-analysis-prediction-dataset/heart.csv'
)
df.info()

In [None]:
# Preview the first five rows (5 is the default for head()).
df.head(5)

### 2.1 Meaning of the columns 
1. Age : Age of the patient

2. Sex : Sex of the patient

3. exng : exercise-induced angina (1 = yes; 0 = no)

4. caa : number of major vessels (0-3)

5. cp : chest pain type

    - Value 1: typical angina

    - Value 2: atypical angina

    - Value 3: non-anginal pain

    - Value 4: asymptomatic

6. trtbps : resting blood pressure (in mm Hg)

7. chol : cholesterol in mg/dl fetched via BMI sensor

8. fbs : (fasting blood sugar > 120 mg/dl) (1 = true; 0 = false)

9. restecg : resting electrocardiographic results

    - Value 0: normal

    - Value 1: having ST-T wave abnormality (T wave inversions and/or ST elevation or depression of > 0.05 mV)

    - Value 2: showing probable or definite left ventricular hypertrophy by Estes' criteria

10. thalachh : maximum heart rate achieved

11. output : 0 = less chance of heart attack, 1 = more chance of heart attack

### 2.2 Let's see which features are continuous and which are discrete

In [None]:
# Number of distinct values in each column (axis=0 is the default).
df.nunique(axis=0)

# 3. Exploring the data

### 3.1 For binary classification, it is important to check if the data is skewed 

In [None]:
# Apply the seaborn theme first: it changes matplotlib defaults, so calling it
# after drawing (as before) made the first run of this cell look different
# from a re-run.
sns.set_theme(style="white")

# Share of patients in each class. normalize=True divides by the number of
# non-missing rows instead of a hard-coded 303, and sort_index() pins the bar
# order to 0, 1 so every label is paired with its own proportion rather than
# relying on unique() and value_counts() happening to agree on ordering.
class_share = df['output'].value_counts(normalize=True).sort_index()

fig, ax = plt.subplots(figsize=(5, 4))
# Palette order matches the hue colours used in the later plots
# (0 -> "C9", 1 -> "C3").
sns.barplot(x=class_share.index, y=class_share.values,
            palette=["C9", "C3"], ax=ax)
ax.set_xlabel("Heart attack likelihood")
ax.set_ylabel('Proportion of patients');

#### We see that our data is not skewed

### 3.2 We look at each feature now, starting with continuous features. We analyze their significance using stacked histogram with kernel density estimation and joint plot. 

In [None]:
# Stacked histograms (with a KDE overlay) of each continuous feature, split by
# outcome. Each entry is (column, bins, x-axis label); "auto" is histplot's
# default binning rule and a label of None keeps the column name as the label.
continuous_features = [
    ("age", "auto", "Age"),
    ("trtbps", 14, "Resting blood pressure"),
    ("chol", 16, "Cholesterol"),
    ("thalachh", "auto", "Maximum heart rate"),
    ("oldpeak", "auto", None),
]

fig, ax = plt.subplots(len(continuous_features), 1, figsize=(8, 20))
# Draw directly on the axes returned by plt.subplots (ax=panel) instead of
# re-selecting each slot with plt.subplot; this also lets the axis labels be
# set on the very axes that were drawn on.
for panel, (column, bins, label) in zip(ax, continuous_features):
    sns.histplot(data=df, x=column, hue="output", bins=bins, multiple="stack",
                 palette=["C9", "C3"], kde=True, ax=panel)
    if label is not None:
        panel.set_xlabel(label)

#### There are three features that strongly correlate with the output: age of the patient,  maximum heart rate and old peak.  We can thus draw three conclusions. First, people who are younger than 50 have much higher risk of getting a heart attack compared to older people.  People around 40 are particularly vulnerable. Second, maximum heart rate beyond 150 is a strong indicator of heart attacks. Third, small values (smaller than one) of old peak are worrisome. 
#### We draw some joint plots to further confirm our observations about these features. We can see a sharp slope in each of the figures below.

In [None]:
# Joint plot with a fitted regression line: age against the outcome.
sns.jointplot(x='age', y='output', data=df, color='b', kind='reg')

In [None]:
# Joint plot with a fitted regression line: maximum heart rate against the outcome.
sns.jointplot(x='thalachh', y='output', data=df, color='b', kind='reg')

In [None]:
# Joint plot with a fitted regression line: oldpeak against the outcome.
sns.jointplot(x='oldpeak', y='output', data=df, color='b', kind='reg')

### 3.3 Next, let's analyze the discrete features using count plots

In [None]:
# Count plots of each discrete feature, split by outcome. Each entry is
# (column, x-axis label); a label of None keeps the column name as the label.
discrete_features = [
    ("sex", "Sex"),
    ("cp", "Type of chest pain"),
    ("fbs", "Fasting blood sugar"),
    ("restecg", "Resting electrocardiographic results"),
    ("exng", "Exercise induced angina"),
    ("slp", None),
    ("caa", "Number of major vessels"),
    ("thall", None),
]

fig, ax = plt.subplots(4, 2, figsize=(16, 16))
# ax.flat walks the 4x2 grid row by row, the same order plt.subplot(4, 2, k)
# numbers its slots. Drawing with ax=panel avoids re-selecting axes through
# the pyplot state machine and lets the labels be set on the drawn axes.
for panel, (column, label) in zip(ax.flat, discrete_features):
    sns.countplot(data=df, x=column, hue='output',
                  palette=["C9", "C3"], ax=panel)
    if label is not None:
        panel.set_xlabel(label)

#### We observe some strong correlations from these plots. The following feature values are sharp indicators of heart attack:

    - Number of major vessels equal to zero
    - Thall equal to two
    - slp equal to two 
    - Anginas that are not exercise induced
    - Having ST-T wave abnormality in resting electrocardiographic results
    - Having atypical angina or non-anginal chest pain
#### Moreover, men are more likely to have heart attack than women.

# 4. Building models

## 4.1 Importing libraries.  We are going to use Logistic regression, Random Forest and Boosted Trees to make predictions.

In [None]:
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import xgboost
from xgboost import XGBClassifier
from xgboost import plot_tree
from sklearn.metrics import  accuracy_score

## 4.2 Prepare the train and test sets

In [None]:
# Keep the label column as its own Series before it is removed from the features.
target = df.loc[:, 'output']

In [None]:
# Remove the label column so `df` holds only the features. errors='ignore'
# keeps this cell re-runnable: executing it a second time is a no-op instead
# of raising KeyError because 'output' is already gone.
df.drop(columns='output', inplace=True, errors='ignore')

In [None]:
# Hold out 33% of the rows as a test set; random_state fixes the shuffle so
# the split is the same on every run. Then print the shapes of the four parts.
X_train, X_test, y_train, y_test = train_test_split(
    df, target, random_state=42, test_size=0.33
)
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

## 4.3 Logistic regression 

In [None]:
# Fit a logistic-regression classifier (lbfgs solver, up to 5000 iterations)
# on the training split.
logreg = LogisticRegression(max_iter=5000, solver='lbfgs')
logreg.fit(X=X_train, y=y_train)

In [None]:
# Predict on the held-out split and report accuracy as a percentage.
Y_pred = logreg.predict(X_test)
predictions = list(map(round, Y_pred))
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (100.0 * accuracy))

## 4.4 Random forest

In [None]:
# Fit a 100-tree random forest on the training split; random_state makes the
# fitted forest reproducible.
random_forest = RandomForestClassifier(random_state=1, n_estimators=100)
random_forest.fit(X=X_train, y=y_train)

In [None]:
# Predict on the held-out split and report accuracy as a percentage.
Y_pred = random_forest.predict(X_test)
predictions = list(map(round, Y_pred))
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (100.0 * accuracy))

## 4.5 XGBoost

In [None]:
# Fit a gradient-boosted tree classifier with its default hyper-parameters
# on the training split.
xgb = XGBClassifier(use_label_encoder=False)
xgb.fit(X=X_train, y=y_train)

In [None]:
# Predict on the held-out split and report accuracy as a percentage.
Y_pred = xgb.predict(X_test)
predictions = list(map(round, Y_pred))
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (100.0 * accuracy))

### Conclusion: the three methods have comparable accuracy

## 4.6 Visualize the decision tree

#### By default, XGBoost combines 100 trees to make the final decision. The figure below shows the tree with index 30 (`num_trees=30`) in the ensemble. We can see that the features near the root are the ones deemed significant by our exploratory analysis. This is one big advantage of decision trees over logistic regression: they are more interpretable.

In [None]:
# Draw a single tree of the fitted XGBoost model (selected with num_trees=30)
# on a large canvas so the node labels stay readable.
fig, ax = plt.subplots(figsize=(15, 10))
plot_tree(xgb, ax=ax, num_trees=30)
plt.show()