# Necessary Libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Import the Dataset

In [None]:
# Location of the heart-failure CSV, relative to the notebook's working directory.
DATA_PATH = '../input/heart-failure-prediction/heart.csv'
df = pd.read_csv(DATA_PATH)

# Dataset Overview

| Variable | Definition | Guide |
| --- | --- | --- |
| Age | age of the patient | years |
| Sex | sex of the patient | M: Male, F: Female |
| ChestPainType | chest pain type | <br>TA: Typical Angina<br><br>ATA: Atypical Angina<br><br>NAP: Non-Anginal Pain<br><br>ASY: Asymptomatic<br> |
| RestingBP | resting blood pressure | mm Hg |
| Cholesterol | serum cholesterol | mg/dl |
| FastingBS | fasting blood sugar | 1: if FastingBS > 120 mg/dl<br><br>0: otherwise |
| RestingECG | resting electrocardiogram results | Normal: Normal<br><br>ST: having ST-T wave abnormality<br><br>LVH: showing probable or definite left ventricular hypertrophy by Estes' criteria |
| MaxHR | maximum heart rate achieved | Numeric value between 60 and 202 |
| ExerciseAngina | exercise-induced angina | Y: Yes, N: No |
| Oldpeak | ST depression induced by exercise relative to rest | Numeric value (depression of the ST segment) |
| ST_Slope | the slope of the peak exercise ST segment | Up: upsloping, Flat: flat, Down: downsloping |
| HeartDisease | output class | 1: heart disease, 0: Normal |

In [None]:
# Preview only the first rows; the full shape is reported in the next cell.
df.head()

In [None]:
# (number of rows, number of columns) of the loaded dataset.
df.shape

In [None]:
# Print the column names, dtypes and non-null counts.
df.info()

# EDA (Exploratory Data Analysis)

In [None]:
# Summary statistics of the dataset's columns.
df.describe()

In [None]:
# Correlation is only defined for numeric columns. Selecting them explicitly
# keeps this cell working on pandas >= 2.0, where DataFrame.corr() raises
# instead of silently dropping the string-typed categorical columns.
corr = df.select_dtypes(include='number').corr()

# Boolean mask that hides the upper triangle (diagonal included). Building it
# from ones rather than from the correlation values guarantees that a
# correlation of exactly 0 is masked as well.
upper_mask = np.triu(np.ones_like(corr, dtype=bool))

fig, ax = plt.subplots(figsize=(18, 11))
sns.heatmap(corr, center=0, annot=True, linewidths=0.8, mask=upper_mask, ax=ax)
ax.set_title('Features Correlation Heatmap', fontsize=25);

In [None]:
# Numeric features to plot, listed in row-major panel order.
hist_columns = ['Age', 'RestingBP', 'Oldpeak', 'Cholesterol']

fig, axes = plt.subplots(2, 2, figsize=(15, 8))
fig.suptitle('Useful Histograms', fontsize=18)

# axes.flat walks the 2x2 grid row by row, so each column lands in the same
# panel as before; looping also avoids echoing the last Axes repr as output.
for column, ax in zip(hist_columns, axes.flat):
    sns.histplot(data=df, x=column, kde=True, bins=25, ax=ax)

In [None]:
# Categorical / binary features to plot, listed in row-major panel order.
count_columns = [
    'HeartDisease', 'Sex',
    'RestingECG', 'ChestPainType',
    'ExerciseAngina', 'FastingBS',
]

fig, axes = plt.subplots(3, 2, figsize=(15, 15))
fig.suptitle('Useful Countplots', fontsize=18)

# One count plot per feature; axes.flat iterates the 3x2 grid row by row.
for column, ax in zip(count_columns, axes.flat):
    sns.countplot(data=df, x=column, ax=ax)

In [None]:
# (grouping column, value column) pairs, listed in row-major panel order.
box_pairs = [
    ('HeartDisease', 'Age'),
    ('Sex', 'Age'),
    ('HeartDisease', 'MaxHR'),
    ('FastingBS', 'Cholesterol'),
]

fig, axes = plt.subplots(2, 2, figsize=(16, 10))
fig.suptitle('Useful Boxplots', fontsize=18)

# One box plot per pair; axes.flat iterates the 2x2 grid row by row.
for (x_col, y_col), ax in zip(box_pairs, axes.flat):
    sns.boxplot(data=df, x=x_col, y=y_col, ax=ax)

# Data Cleaning

In [None]:
# Count of missing (NaN) entries per column.
df.isna().sum()

There are no missing (NaN) values in any column.

According to the Oldpeak histogram, there are some negative values in the dataset. We need to deal with them, because we assume that Oldpeak (an ST depression) cannot be negative.

In [None]:
# Rows whose Oldpeak value is negative.
df.query('Oldpeak < 0')

In [None]:
# Flip the sign of negative Oldpeak values; non-negative values are unchanged.
df['Oldpeak'] = df['Oldpeak'].abs()

In [None]:
# Sanity check: this selection should now be empty.
df.query('Oldpeak < 0')

The cholesterol histogram also shows a number of zero values that we need to deal with.

In [None]:
# Fraction of rows whose cholesterol is recorded as 0.
float(df['Cholesterol'].eq(0).mean())

18.7% of the rows in our dataset have a cholesterol value of zero.

**Cholesterol** is correlated with **FastingBS**, so it is better to estimate the missing **cholesterol** of a patient from their **FastingBS** class.

In this section we fill the zeros with the **median** of the non-zero values in each fasting-blood-sugar class, so that the median cholesterol of each class stays unchanged.

In [None]:
# A cholesterol of 0 is a placeholder for "not measured". Turn the zeros into
# NaN on a float copy so that a non-integer median can be stored without an
# implicit dtype upcast of the integer column.
chol_measured = df['Cholesterol'].astype(float).where(df['Cholesterol'] != 0)

# Median of the measured values within each FastingBS class, aligned row by
# row (NaN entries are ignored by the median). Grouping on the column itself
# avoids hard-coding the class labels.
class_median = chol_measured.groupby(df['FastingBS']).transform('median')

# NOTE(review): the medians are computed on the full dataset, i.e. before the
# train/test split further down, so test rows influence the imputed values.
df['Cholesterol'] = chol_measured.fillna(class_median)

The next feature that needs to be cleaned is **RestingBP**.
According to the histogram it has some zero values, and we know that blood pressure cannot possibly be zero.
According to the heatmap, **RestingBP** is correlated with **Age**.

For simplicity, however, we fill the zero values of resting blood pressure with the overall median resting blood pressure.

In [None]:
# Rows whose resting blood pressure is recorded as 0.
df.query('RestingBP == 0')

In [None]:
# A resting blood pressure of 0 is treated as "not recorded". Mask those
# entries before taking the median so the placeholder itself does not pull
# the fill value down.
bp_recorded = df['RestingBP'].where(df['RestingBP'] != 0)
df['RestingBP'] = bp_recorded.fillna(bp_recorded.median())

# Categorical Features (Dummy variables)

In [None]:
# One-hot encode the categorical columns. drop_first=True drops the first
# level of each encoded feature, so k categories become k-1 indicator columns.
df = pd.get_dummies(df, drop_first=True)

In [None]:
# Preview the frame after dummy encoding.
df.head()

Ready to build some models

# Model Building

In [None]:
# Target vector first, then every remaining column as the feature matrix.
y = df['HeartDisease']
X = df.drop(columns='HeartDisease')

### defining an evaluation function

In [None]:
#libraries
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay, PrecisionRecallDisplay, RocCurveDisplay

try:
    # Legacy plotting helpers: deprecated in scikit-learn 1.0 and removed in 1.2.
    # Kept only so that cells written against the old API still find them.
    from sklearn.metrics import plot_confusion_matrix, plot_precision_recall_curve, plot_roc_curve
except ImportError:
    pass

In [None]:
def Eval_Report(model, y_pred, X_eval=None, y_eval=None):
    """Print and plot an evaluation report for a fitted classifier.

    Parameters
    ----------
    model : fitted estimator
        Classifier (or fitted search object) handed to the display helpers,
        which call it on ``X_eval`` to draw the plots.
    y_pred : array-like
        Labels predicted for the evaluation set; used for the printed
        confusion matrix and classification report.
    X_eval, y_eval : optional
        Evaluation features and true labels. When omitted, the notebook-level
        ``X_test`` / ``y_test`` are used, which matches the original behaviour.

    The function prints the confusion matrix and the classification report and
    draws the confusion-matrix, precision-recall and ROC plots. It returns
    nothing.
    """
    # Fall back to the notebook's global hold-out split so existing
    # two-argument calls keep working.
    if X_eval is None:
        X_eval = X_test
    if y_eval is None:
        y_eval = y_test

    print('********* EVALUATION REPORT ********* \n')
    print('CONFUSION MATRIX: \n')
    print(confusion_matrix(y_eval, y_pred))
    print('\nCLASSIFICATION REPORT: \n')
    print(classification_report(y_eval, y_pred))
    print('\nPLOTS:\n')

    # The plot_* helpers were removed in scikit-learn 1.2; the Display
    # classes' from_estimator constructors are their replacements.
    ConfusionMatrixDisplay.from_estimator(model, X_eval, y_eval)
    PrecisionRecallDisplay.from_estimator(model, X_eval, y_eval)
    RocCurveDisplay.from_estimator(model, X_eval, y_eval)

# Split the Dataset to Train and Test

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=101,
)

# 1) SVM

In [None]:
# libraries
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

### Pipeline

In [None]:
# Standardise the features, then fit a support vector classifier.
svm_steps = [
    ('scaler', StandardScaler()),
    ('svm', SVC(random_state=101)),
]
svm_pipe = Pipeline(steps=svm_steps)

In [None]:
# Search space for the 'svm' step of the pipeline (the "svm__" prefix routes
# each parameter to that step).
param_grid = {
    'svm__C': [0.01, 0.1, 1, 10, 100, 1000],
    'svm__gamma': ['auto', 'scale', 1, 0.1, 0.01, 0.001, 0.0001],
    'svm__kernel': ['rbf', 'poly'],
    'svm__shrinking': [True, False],
}

# 6 * 7 * 2 * 2 = 168 candidates with 5-fold CV means 840 fits. n_jobs=-1
# spreads them over all available cores; the selected model is unaffected.
svm_grid = GridSearchCV(svm_pipe, param_grid, cv=5, n_jobs=-1)

In [None]:
# Run the cross-validated grid search for the SVM pipeline on the training split.
svm_grid.fit(X_train, y_train)

In [None]:
# Hyperparameter combination selected by the SVM grid search.
svm_grid.best_params_

### Predict

In [None]:
# Predict labels for the held-out test set with the tuned SVM pipeline.
ypred_svm_grid= svm_grid.predict(X_test)

### Evaluation

In [None]:
# Print the report and draw the evaluation plots for the SVM.
Eval_Report(svm_grid, ypred_svm_grid)

# 2) Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier

### Pipeline

In [None]:
# Standardise the features, then fit a decision tree classifier.
dt_steps = [
    ('scaler', StandardScaler()),
    ('DT', DecisionTreeClassifier(random_state=101)),
]
DT_pipe = Pipeline(steps=dt_steps)

In [None]:
# Search space for the 'DT' step of the pipeline.
# 'auto' is intentionally absent: for classifiers it was an alias of 'sqrt',
# and it was removed in scikit-learn 1.3, where it makes every such fit fail.
param_grid = {
    'DT__max_features': ['sqrt', 'log2'],
    'DT__criterion': ['gini', 'entropy'],
}

# 5-fold cross-validated search; n_jobs=-1 runs the fits in parallel.
DT_grid = GridSearchCV(DT_pipe, param_grid, cv=5, n_jobs=-1)

In [None]:
# Run the cross-validated grid search for the decision tree pipeline on the training split.
DT_grid.fit(X_train,y_train)

In [None]:
# Hyperparameter combination selected by the decision tree grid search.
DT_grid.best_params_

### Predict

In [None]:
# Predict labels for the held-out test set with the tuned decision tree pipeline.
ypred_DT_grid = DT_grid.predict(X_test)

### Evaluation

In [None]:
# Print the report and draw the evaluation plots for the decision tree.
Eval_Report(DT_grid, ypred_DT_grid)

# 3) Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

### Pipeline

In [None]:
# Standardise the features, then fit a random forest classifier.
rf_steps = [
    ('scaler', StandardScaler()),
    ('RF', RandomForestClassifier(random_state=101)),
]
RF_pipe = Pipeline(steps=rf_steps)

In [None]:
# Search space for the 'RF' step of the pipeline.
# 'auto' is intentionally absent: for classifiers it was an alias of 'sqrt',
# and it was removed in scikit-learn 1.3, where it makes every such fit fail.
param_grid = {
    'RF__n_estimators': [64, 70, 78, 88, 95, 100, 120],
    'RF__bootstrap': [True, False],
    'RF__max_features': ['sqrt', 'log2'],
}

# 5-fold cross-validated search; n_jobs=-1 runs the forest fits in parallel
# without changing which candidate is selected.
RF_grid = GridSearchCV(RF_pipe, param_grid, cv=5, n_jobs=-1)

In [None]:
# Run the cross-validated grid search for the random forest pipeline on the training split.
RF_grid.fit(X_train, y_train)

In [None]:
# Hyperparameter combination selected by the random forest grid search.
RF_grid.best_params_

### Predict

In [None]:
# Predict labels for the held-out test set with the tuned random forest pipeline.
ypred_RF_grid= RF_grid.predict(X_test)

### Evaluation

In [None]:
# Print the report and draw the evaluation plots for the random forest.
Eval_Report(RF_grid, ypred_RF_grid)

# Conclusion

Our SVM model is the best: it reaches the same accuracy as the Random Forest, but with a higher average precision (AP) and a higher area under the ROC curve (AUC).

Decision Tree is the worst by far :)