

# Contents of the Notebook :
 
# Part1: Exploratory Data Analysis(EDA):
#### 1)Analysis of the features.

#### 2)Finding any relations or trends considering multiple features.

# Part2: Feature Engineering and Data Cleaning:

#### 1)Converting features into suitable form for modeling.

# Part3: Predictive Modeling
#### 1)Running Basic Algorithms.



In [None]:
import numpy as np
import pandas as pd 
import matplotlib as mpl
import matplotlib.pyplot as plt 
import seaborn as sns 
import warnings 
warnings.filterwarnings('ignore')

## Data Check 

In [None]:
# Read the heart-attack dataset from the relative '../input' directory.
# The column to predict is 'output'.
csv_path = '../input/heart-attack-analysis-prediction-dataset/heart.csv'
data = pd.read_csv(csv_path)
data.head()

The target value is column 'output' !!! 

In [None]:
# Summary statistics for every column; include='all' asks pandas to describe all dtypes.
data.describe(include='all')

In [None]:
# Number of missing entries per column (isna is the canonical name of isnull).
data.isna().sum()

Great! There are no missing values in our data.

Now let's check out all the data **dtypes** and **unique values** before the EDA. We are going to conduct the EDA separately for each group of dtypes.

In [None]:
# Data type of each column
data.dtypes

In [None]:
# Also check out the number of unique values in each column.
# `DataFrame.nunique` counts distinct non-null values per column, which is what
# `value_counts().shape[0]` computed by hand; using it also avoids binding a
# variable called `dict`, which would hide the builtin for the rest of the session.
unique_counts = data.nunique()

unique_counts.to_frame(name='unique count')

Now we can separate our columns into categorical features and continuous features. 

**Categorical Features**:   sex, exng, caa, cp, fbs, restecg, slp, thall

**Continuous Features**:   age, trtbps, chol, thalachh, oldpeak

**Target Feature**:   output

# Part1: Exploratory Data Analysis(EDA):

* **Age** : Age of the patient
* **Sex** : Sex of the patient
* **exng**: exercise induced angina (1 = yes; 0 = no)
* **cp** : chest pain type

     -Value 1: typical angina

     -Value 2: atypical angina

     -Value 3: non-anginal pain

     -Value 4: asymptomatic
* **caa**: number of major vessels (0-3)
* **trtbps** : resting blood pressure (in mm Hg)
* **chol** : cholesterol in mg/dl fetched via BMI sensor
* **fbs** : (fasting blood sugar > 120 mg/dl) (1 = true; 0 = false)
* **restecg** : resting electrocardiographic results

     -Value 0: normal

     -Value 1: having ST-T wave abnormality (T wave inversions and/or ST elevation or depression of > 0.05 mV)

     -Value 2: showing probable or definite left ventricular hypertrophy by Estes' criteria

* **thalachh** : maximum heart rate achieved
* **output** : 0= less chance of heart attack 1= more chance of heart attack

**First of all, we are going to check out all the features of our data**

## Distribution of Every Categorical Feature

**Categorical Features**:   sex, exng, caa, cp, fbs, restecg, slp, thall


In [None]:
# One count plot per categorical feature on a 3x3 grid; the ninth panel is
# created but intentionally left blank.
categorical_features = ['sex', 'exng', 'caa', 'cp', 'fbs', 'restecg', 'slp', 'thall']

fig = plt.figure(figsize=(18, 15))
gs = fig.add_gridspec(3, 3)
panels = [fig.add_subplot(gs[row, col]) for row in range(3) for col in range(3)]

# zip stops after the eight features, so the last panel receives no plot.
for feature, panel in zip(categorical_features, panels):
    sns.countplot(x=feature, data=data, ax=panel, palette='YlGnBu')

# Remove the top/right spines of every panel in the current figure.
sns.despine()

# Strip spines, ticks and tick labels from the unused panel so it renders empty.
spare_panel = panels[-1]
for side in ("bottom", "left", "top", "right"):
    spare_panel.spines[side].set_visible(False)
spare_panel.tick_params(left=False, bottom=False)
spare_panel.set_xticklabels([])
spare_panel.set_yticklabels([])

plt.show()

## Categorical Features & Target Feature

In [None]:
# Count plots of each categorical feature split by the target ('output') on a
# 3x3 grid; the ninth panel is created but intentionally left blank.
# The style/context calls that were parked in a no-op string literal are not
# executed here, exactly as before.
categorical_features = ['sex', 'exng', 'caa', 'cp', 'fbs', 'restecg', 'slp', 'thall']

fig = plt.figure(figsize=(18, 15))
gs = fig.add_gridspec(3, 3)
panels = [fig.add_subplot(gs[row, col]) for row in range(3) for col in range(3)]

# zip stops after the eight features, so the last panel receives no plot.
for feature, panel in zip(categorical_features, panels):
    sns.countplot(x=feature, hue='output', data=data, ax=panel, palette='YlGnBu')

# Remove the top/right spines of every panel in the current figure.
sns.despine()

# Strip spines, ticks and tick labels from the unused panel so it renders empty.
spare_panel = panels[-1]
for side in ("bottom", "left", "top", "right"):
    spare_panel.spines[side].set_visible(False)
spare_panel.tick_params(left=False, bottom=False)
spare_panel.set_xticklabels([])
spare_panel.set_yticklabels([])

plt.show()

In [None]:
# Print, for every categorical feature, the number of rows per
# (feature value, output) pair. Tables are numbered from 1 and separated by
# two blank lines.
grouped_features = ['sex', 'exng', 'caa', 'cp', 'fbs', 'restecg', 'slp', 'thall']

for number, feature in enumerate(grouped_features, start=1):
    if number > 1:
        print('')
        print('')
    print(f'{number})', data.groupby([feature, 'output'])['output'].count())


## Continuous Features & Target

**Continuous Features**:   age, trtbps, chol, thalachh, oldpeak

In [None]:
# Density of each continuous feature, one filled KDE curve per value of the
# target ('output'), on a 2x3 grid whose sixth cell stays unused.
continuous_features = ['age', 'trtbps', 'chol', 'thalachh', 'oldpeak']

fig = plt.figure(figsize=(18, 15))
gs = fig.add_gridspec(2, 3)

for position, feature in enumerate(continuous_features):
    panel = fig.add_subplot(gs[position // 3, position % 3])
    # `fill=True` alone requests the shaded area under the curve; the
    # deprecated `shade` alias is intentionally not passed.
    sns.kdeplot(x=feature, hue='output', data=data, fill=True, alpha=.5,
                linewidth=0, ax=panel, palette='YlGnBu')

# Remove the top/right spines of every panel in the current figure.
sns.despine()

plt.show()



## Check Out the Outliers in Continuous Features

In [None]:
# Box plot of each continuous feature on a 2x3 grid (sixth cell unused), used
# to eyeball outliers.
boxplot_features = ['age', 'trtbps', 'chol', 'thalachh', 'oldpeak']

fig = plt.figure(figsize=(18, 15))
gs = fig.add_gridspec(2, 3)

for position, feature in enumerate(boxplot_features):
    row, col = divmod(position, 3)
    panel = fig.add_subplot(gs[row, col])
    sns.boxplot(x=feature, data=data, ax=panel, palette='YlGnBu')

# Remove the top/right spines of every panel in the current figure.
sns.despine()


We can see that there are some outliers in the continuous features. 

## Target Feature

In [None]:
# Class balance of the target: a count plot of 'output' drawn in the top-left
# 2x2 region of a 3x4 grid.
fig = plt.figure(figsize=(14, 8))
gs = fig.add_gridspec(nrows=3, ncols=4)
# Style and context are set before the axes is created so the axes picks them up.
sns.set_style(style="white")
sns.set_context(context="poster", font_scale=0.5)

ax_target = fig.add_subplot(gs[0:2, 0:2])
sns.countplot(data=data, x='output', palette='YlGnBu', ax=ax_target)
sns.despine()



In [None]:
# Age distribution for each target class, side by side.
f, ax = plt.subplots(1, 2, figsize=(15, 8))

# Select the 'age' column explicitly: calling .plot.hist on the whole filtered
# frame would histogram every column, which is not what the titles describe.
data.loc[data['output'] == 0, 'age'].plot.hist(ax=ax[0], bins=20, edgecolor='black', color='lightgray')
ax[0].set_title('Age & target = 0')
ax[0].set_xlabel('age')

data.loc[data['output'] == 1, 'age'].plot.hist(ax=ax[1], bins=20, edgecolor='black', color='red')
ax[1].set_title('Age & target=1')
ax[1].set_xlabel('age')

plt.show()

## Correlation Between Features 

In [None]:
# Pairwise correlation matrix of the columns (features and target alike).
data.corr()

In [None]:
# Annotated heat map of the pairwise correlation matrix.
correlations = data.corr()

fig = plt.figure(figsize=(14, 8))
sns.heatmap(correlations, annot=True, cmap='YlGnBu')
plt.show()

# Part2: Feature Engineering and Data Cleaning:
First divide the columns into categorical features and continuous features.

I am going to one-hot encode the categorical features (dummy variables) and scale the continuous features with RobustScaler.

In [None]:
# Column roles used by the preprocessing steps that follow.
# Categorical columns are the ones turned into dummy variables.
cat_columns = [
    'sex', 'exng', 'caa', 'cp',
    'fbs', 'restecg', 'slp', 'thall',
]
# Continuous columns are the ones passed to the scaler.
con_columns = ['age', 'trtbps', 'chol', 'thalachh', 'oldpeak']
# Name of the label column, kept as a one-element list.
target_column = ['output']


Categorical Features

In [None]:
# Replace every categorical column with its dummy (indicator) columns,
# then list the resulting column names.
data = pd.get_dummies(data, columns=cat_columns)
data.columns

Continuous Features

We found some outliers in the continuous features. So before we start feature engineering, the plan was to delete all the outliers in the continuous features. 

### But since the dataset is so small, the results were better when I did not delete the outliers, so the outlier-removal cells below are disabled.

In [None]:
# Outlier removal (disabled).
# With so few rows the results were better when the outliers were kept, so the
# helper is only defined here and the calls that used it stay commented out.
def outliers_iqr(data):
    """Return the positions of the IQR outliers in a 1-D sequence of numbers.

    A value counts as an outlier when it lies more than 1.5 * IQR below the
    first quartile or more than 1.5 * IQR above the third quartile.

    Parameters
    ----------
    data : array-like
        One-dimensional numeric values (list, NumPy array or pandas Series).

    Returns
    -------
    tuple of numpy.ndarray
        The result of ``np.where``: a 1-tuple whose only element holds the
        zero-based positions of the outliers (empty when there are none).
        These are positions, not index labels, so they match ``.loc`` only
        for a default 0..n-1 index.
    """
    values = np.asarray(data)
    if values.size == 0:
        # Percentiles of nothing are undefined; report "no outliers" instead.
        return (np.empty(0, dtype=np.intp),)

    q1, q3 = np.percentile(values, [25, 75])
    iqr = q3 - q1
    lower_bound = q1 - (iqr * 1.5)
    upper_bound = q3 + (iqr * 1.5)

    return np.where((values > upper_bound) | (values < lower_bound))


# To re-enable, compute the outlier positions of 'trtbps', 'chol', 'thalachh'
# and 'oldpeak':
# trtbps_outlier = outliers_iqr(data['trtbps'])[0]
# chol_outlier = outliers_iqr(data['chol'])[0]
# thalachh_outlier = outliers_iqr(data['thalachh'])[0]
# oldpeak_outlier = outliers_iqr(data['oldpeak'])[0]

In [None]:
"""data.loc[trtbps_outlier, 'trtbps']"""

In [None]:
"""data.loc[chol_outlier, 'chol']"""

In [None]:
"""data.loc[thalachh_outlier, 'thalachh']"""

In [None]:
"""data.loc[oldpeak_outlier, 'oldpeak']"""

In [None]:
"""# Concatenate all the array 
lead_outlier_index = np.concatenate((trtbps_outlier,
                                     chol_outlier,
                                     thalachh_outlier,
                                     oldpeak_outlier), axis=None)

print(len(lead_outlier_index))
lead_outlier_index"""

In [None]:
"""# Put into 'lead_not_outlier_index' which is not the outliers data 
lead_not_outlier_index = []

for i in data.index:
    if i not in lead_outlier_index:
        lead_not_outlier_index.append(i)
"""

In [None]:
"""data = data.loc[lead_not_outlier_index]
data = data.reset_index(drop=True)
data.columns"""

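The outlier-removal cells above are disabled, so all the rows, including the outliers in the continuous columns, are kept. 

Let's use the RobustScaler, which is less sensitive to outliers, to scale the continuous features. 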

In [None]:
from sklearn.preprocessing import RobustScaler

# Rescale the continuous columns in place with a RobustScaler and display them.
# NOTE(review): the scaler is fitted on the whole dataset before the train/test
# split, so the held-out rows influence the scaling statistics; consider
# fitting on the training rows only.
scaler = RobustScaler()
scaled_values = scaler.fit_transform(data[con_columns])
data[con_columns] = scaled_values

data[con_columns]

In [None]:
# Preview the first rows after encoding and scaling.
data.head()

In [None]:
# Number of rows per target class. The call parentheses are required:
# without them the cell only shows the bound method, not the counts.
data['output'].value_counts()

NOW! The feature engineering is done! Next we are going to split the data into train and test sets and move on to modeling!

## Train - Test split

In [None]:
# Separate the label (as a NumPy array) from the predictor columns.
y = data['output'].to_numpy()
x = data.drop(columns='output')

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the rows for testing; shuffling with a fixed seed keeps the
# split reproducible.
split_options = {'test_size': 0.1, 'random_state': 52, 'shuffle': True}
x_train, x_test, y_train, y_test = train_test_split(x, y, **split_options)

# Part3: Predictive Modeling

## 1) Running Basic Algorithms

We have gained some insights from the EDA part. But with that alone, we cannot accurately predict whether a heart attack will occur or not. So now we will predict it by using some great classification algorithms. Following are the algorithms I will use to make the model:

1) Logistic Regression

2) Support Vector Machines(Linear and radial)

3) Random Forest

4) LightGBM

5) KNeighborsClassifier

6) XGBoost

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier

from sklearn.metrics import f1_score

In [None]:
# 1. LogisticRegression

lr = LogisticRegression()

lr.fit(x_train, y_train)

y_pred = lr.predict(x_test)

# Micro-averaged F1 on a single-label target is numerically the same as
# accuracy, so this first figure is effectively the accuracy.
print(f"Logistic Regression F1 Score: {f1_score(y_test, y_pred, average='micro')}")
# F1 of the positive class (output == 1), i.e. f1_score's default binary mode.
print(f"Logistic Regression F1 Score (positive class): {f1_score(y_test, y_pred)}")

In [None]:
# 2. Support Vector Machine

# NOTE(review): probability=True enables predict_proba, which this cell never
# calls; it can be dropped if no probabilities are needed.
svc = SVC(probability=True)

svc.fit(x_train, y_train)

y_pred = svc.predict(x_test)

# Micro-averaged F1 on a single-label target is numerically the same as
# accuracy, so this first figure is effectively the accuracy.
print(f"Support Vector Machine F1 Score: {f1_score(y_test, y_pred, average='micro')}")
# F1 of the positive class (output == 1), i.e. f1_score's default binary mode.
print(f"Support Vector Machine F1 Score (positive class): {f1_score(y_test, y_pred)}")

In [None]:
# 3. Random Forest

# A fixed seed makes the forest, and therefore the printed scores, reproducible
# from run to run.
rf = RandomForestClassifier(random_state=52)

rf.fit(x_train, y_train)

y_pred = rf.predict(x_test)

# Micro-averaged F1 on a single-label target is numerically the same as
# accuracy, so this first figure is effectively the accuracy.
print(f"RandomForest F1 Score: {f1_score(y_test, y_pred, average='micro')}")
# F1 of the positive class (output == 1), i.e. f1_score's default binary mode.
print(f"RandomForest F1 Score (positive class): {f1_score(y_test, y_pred)}")

In [None]:
# 4. LightGBM

lgb = LGBMClassifier()

lgb.fit(x_train, y_train)

y_pred = lgb.predict(x_test)

# Micro-averaged F1 on a single-label target is numerically the same as
# accuracy, so this first figure is effectively the accuracy.
print(f"LightGBM F1 Score: {f1_score(y_test, y_pred, average='micro')}")
# F1 of the positive class (output == 1), i.e. f1_score's default binary mode.
print(f"LightGBM F1 Score (positive class): {f1_score(y_test, y_pred)}")

In [None]:
# 5. KNeighborsClassifier

knn = KNeighborsClassifier()

knn.fit(x_train, y_train)

y_pred = knn.predict(x_test)

# Micro-averaged F1 on a single-label target is numerically the same as
# accuracy, so this first figure is effectively the accuracy.
print(f"KNeighborsClassifier F1 Score: {f1_score(y_test, y_pred, average='micro')}")
# F1 of the positive class (output == 1), i.e. f1_score's default binary mode.
print(f"KNeighborsClassifier F1 Score (positive class): {f1_score(y_test, y_pred)}")

### If you liked the notebook, consider giving an upvote. 
### Feel free to give me any comments !!!