## Importing libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
# model selection and preprocessing
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import VarianceThreshold
from sklearn.model_selection import train_test_split 
from sklearn.preprocessing import StandardScaler, MinMaxScaler,RobustScaler
#models
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
import xgboost as xgb
from sklearn.neighbors import KNeighborsClassifier  
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB



#Boosting Algorithms
from sklearn.ensemble import GradientBoostingClassifier
from lightgbm import LGBMClassifier

from sklearn.ensemble import AdaBoostClassifier
from xgboost import XGBClassifier


# metrics
from sklearn.metrics import accuracy_score,classification_report
from sklearn.metrics import confusion_matrix

# To deal with those annoying deprecated warnings
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Load the heart dataset CSV; the path is relative to the notebook's working directory.
raw_data = pd.read_csv("../input/heart-attack-analysis-prediction-dataset/heart.csv")

In [None]:
# Take a real copy: plain assignment would only create a second name for the
# same DataFrame, so any later in-place cleaning would also modify `raw_data`.
raw_data_copy = raw_data.copy()

In [None]:
# Preview the first rows to check that the file was parsed as expected.
raw_data_copy.head()

In [None]:
# (number of rows, number of columns) of the loaded frame.
raw_data_copy.shape

# Data Description

In [None]:
# Number of missing values in each column.
raw_data_copy.isnull().sum()

There are no null values in any column.

In [None]:
# Column dtypes and non-null counts.
raw_data_copy.info()

### Duplicates

In [None]:
# Remove exact duplicate rows and renumber the index from 0.
# The result is bound to a new object instead of mutating in place, so any
# other reference to the original frame is left untouched.
raw_data_copy = raw_data_copy.drop_duplicates().reset_index(drop=True)
# Re-check missing values on the de-duplicated frame.
raw_data_copy.isnull().sum()

## Checking if the data is balanced for output or not

In [None]:
# Class balance of the target column.
# The column is passed by keyword: recent seaborn releases interpret the first
# positional argument as `data` rather than `x`, so a positional Series is not
# counted per category as intended.
sns.countplot(x="output", data=raw_data_copy)

In [None]:
# Class balance of the target as a pie chart; no `values` are given, so each
# slice is sized by the number of rows carrying that label.
ax = px.pie(
    raw_data_copy['output'],
    names="output",
    title="Output",
)
ax.show()

# EDA

In [None]:
# Distribution of selected features, one panel each on a 3x2 grid.
# `sns.distplot` is deprecated; `histplot` with a KDE overlay on a density
# scale draws the same kind of picture with the supported API.
columns = ["age", "cp", "trtbps", "chol", "thalachh", "oldpeak"]
fig, ax1 = plt.subplots(3, 2, figsize=(20, 20))
# `ax1.flat` walks the grid row by row, matching the order of `columns`.
for column, axis in zip(columns, ax1.flat):
    sns.histplot(raw_data_copy[column], kde=True, stat="density", color='green', ax=axis)
plt.show()

"oldpeak" is the only feature that is highly skewed.

In [None]:
# Age distribution on a wide figure.
# `histplot` (density scale + KDE overlay) replaces the deprecated `distplot`.
plt.figure(figsize=(20, 6))
sns.histplot(raw_data_copy["age"], kde=True, stat="density", color="red", bins="auto")
plt.title("Total age distribution")
plt.show()

In [None]:
# Share of rows per value of the `sex` column.
ax = px.pie(
    raw_data_copy['sex'],
    names="sex",
    title="Gender Distribution",
)
ax.show()

In [None]:
# Split the rows by the value of `sex`. `reset_index()` gives each subset a
# fresh 0..n-1 index and keeps the previous row labels in an "index" column.
is_sex_one = raw_data_copy['sex'].eq(1)
is_sex_zero = raw_data_copy['sex'].eq(0)
data1 = raw_data_copy.loc[is_sex_one].reset_index()
data0 = raw_data_copy.loc[is_sex_zero].reset_index()

In [None]:

# Target distribution among the rows where sex == 1.
ax = px.pie(
    data1,
    names='output',
    title="Fatality in Gender 1",
)
ax.show()

In [None]:
# Target distribution among the rows where sex == 0.
ax = px.pie(
    data0,
    names="output",
    title="Fatality in Gender 0",
)
ax.show()

We can clearly see that gender "0" is at higher risk of a heart attack.

In [None]:
# Value counts for every column, one panel per column (the 7x2 grid has room
# for 14 columns).
fig, ax = plt.subplots(7, 2, figsize=(20, 36))
# Pair columns with axes instead of indexing by a manual counter, so a frame
# with fewer columns than panels does not raise an IndexError.
for column, axis in zip(raw_data_copy.columns, ax.flat):
    # Keyword arguments: recent seaborn treats the first positional argument
    # as `data`, not `x`.
    sns.countplot(x=column, data=raw_data_copy, ax=axis)
plt.show()

In [None]:
# Box plots of selected features on a 3x2 grid, to eyeball outliers.
columns = ["age", "cp", "trtbps", "chol", "thalachh", "oldpeak"]
fig, ax1 = plt.subplots(3, 2, figsize=(20, 20))
for column, axis in zip(columns, ax1.flat):
    # Pass the values as `x=` explicitly: a positional Series is read as `x`
    # by older seaborn but as `data` by newer releases, which changes the
    # orientation of the box.
    sns.boxplot(x=raw_data_copy[column], ax=axis, color='red', width=0.5)
plt.show()

There are some outliers present in our features.

# Data Pre-Processing

In [None]:
# Pairwise correlation between all columns (pandas' default correlation method).
corr = raw_data_copy.corr()

In [None]:
# Annotated heatmap of the correlation matrix.
fig_corr, ax_corr = plt.subplots(figsize=(16, 8))
sns.heatmap(corr, annot=True, ax=ax_corr)
plt.show()

In [None]:
# Correlation of every column with the target column `output`.
corr['output']

We can see that the least related to our 'output' are 'fbs' and 'chol'.

# Outliers


Outliers:
These are data points that are far larger or far smaller than the rest of the data. They strongly affect summary measures, because many measures depend directly on every data point.

IQR(inter quartile range):
It is the difference between Q1 and Q3 and is the range of middle 50% of the data. Ways to identify outliers:

- Find Q1,Q2,Q3
- Find IQR = Q3 - Q1
- Multiply IQR by 1.5
- Subtract this number from Q1 and add this number to Q3
- If a point lies within the range obtained above, it is not an outlier; if it lies outside that range, it is an outlier.

In [None]:
# Per-column quartiles and inter-quartile range.
q1 = raw_data_copy.quantile(0.25)
q3 = raw_data_copy.quantile(0.75)
IQR = q3 - q1

# Fences: anything further than 1.5 * IQR beyond a quartile counts as an outlier.
whisker = IQR * 1.5
lower, upper = q1 - whisker, q3 + whisker

# Plain dicts keyed by column name for convenient lookup in later cells.
lower_dict, upper_dict = dict(lower), dict(upper)

In [None]:
# Report, for each column, how many values lie strictly outside the IQR fences.
# The comparison is strict on purpose: a value sitting exactly on a fence is
# inside the accepted range. With inclusive comparisons, a column whose IQR is
# zero (both fences equal to its typical value) would be reported as 100%
# outliers, and the count would disagree with the capping step, which only
# changes values strictly beyond the fences.
n_rows = len(raw_data_copy)
for i, v in raw_data_copy.items():
    v_col = v[(v < lower_dict[i]) | (v > upper_dict[i])]
    perc = len(v_col) * 100.0 / n_rows
    print("Column {} outliers = {} => {}%".format(i, len(v_col), round(perc, 3)))

We have "caa" , "trtbps" ,"chol" and "oldpeak" with some amount of outliers.

### Since the dataset is already very small, we won't remove the outliers; instead we will replace them with the acceptable upper or lower limit.

In [None]:
# Cap values above the upper IQR fence at the fence itself.
# The clipped column is assigned back to the frame: calling
# `replace(..., inplace=True)` on a column selection is not guaranteed to write
# through to the parent DataFrame (it does not under pandas copy-on-write).
for column in ('trtbps', 'chol', 'oldpeak'):
    raw_data_copy[column] = raw_data_copy[column].clip(upper=upper_dict[column])

In [None]:
# Raise values below the lower IQR fence up to the fence itself.
# The clipped column is assigned back to the frame: calling
# `replace(..., inplace=True)` on a column selection is not guaranteed to write
# through to the parent DataFrame (it does not under pandas copy-on-write).
for column in ('trtbps', 'chol', 'oldpeak'):
    raw_data_copy[column] = raw_data_copy[column].clip(lower=lower_dict[column])

# Train Test Split

In [None]:
# RobustScaler centres and scales each feature with its median and
# interquartile range rather than its mean and variance.
feature_names = ['age', 'sex', 'cp', 'trtbps', 'chol', 'fbs', 'restecg',
                 'thalachh', 'exng', 'oldpeak', 'slp', 'caa', 'thall']
scaler = RobustScaler()
# The first 13 columns are used as the feature block.
robust_values = scaler.fit_transform(raw_data_copy.iloc[:, :13])
robust_df = pd.DataFrame(robust_values, columns=feature_names)
robust_df

In [None]:
# Features come from the robust-scaled frame, labels from the `output` column.
X = robust_df.to_numpy()
Y = raw_data_copy['output'].to_numpy()

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=0
)

# Standardise with statistics learned from the training rows only, then apply
# the same transform to the test rows.
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)


# Logistic Regression

In [None]:
# Logistic regression baseline; `fit` returns the estimator, so it can be chained.
model_LG = LogisticRegression(random_state=0).fit(X_train, Y_train)
Y_pred = model_LG.predict(X_test)

# Accuracy as a percentage, kept for later comparison.
model_LG_accuracy = round(accuracy_score(Y_test, Y_pred), 4) * 100
print(classification_report(Y_test, Y_pred))

# KNN

In [None]:
# k-nearest neighbours with k = 7.
model_KNN = KNeighborsClassifier(n_neighbors=7).fit(X_train, Y_train)
Y_pred = model_KNN.predict(X_test)

# Accuracy as a percentage, kept for later comparison.
model_KNN_accuracy = round(accuracy_score(Y_test, Y_pred), 4) * 100
print(classification_report(Y_test, Y_pred))

# Gaussian NB

In [None]:
# Gaussian naive Bayes with default settings.
model_NB = GaussianNB().fit(X_train, Y_train)
predicted = model_NB.predict(X_test)
print(classification_report(Y_test, predicted))

# SVM

In [None]:
# Support vector classifier with an RBF kernel.
model_svm = SVC(kernel="rbf", random_state=0).fit(X_train, Y_train)
Y_pred = model_svm.predict(X_test)

# Accuracy as a percentage, kept for later comparison.
model_svm_accuracy = round(accuracy_score(Y_test, Y_pred), 4) * 100
print(classification_report(Y_test, Y_pred))

# Random Forest Classifier

In [None]:

# Random forest of 100 trees with a fixed seed.
model_RF = RandomForestClassifier(n_estimators=100, random_state=0).fit(X_train, Y_train)
predicted = model_RF.predict(X_test)
print(classification_report(Y_test, predicted))

# XGboost Classifier

In [None]:
# XGBoost classifier; `use_label_encoder=False` is passed through unchanged.
model_XG = XGBClassifier(use_label_encoder=False)
model_XG.fit(X_train, Y_train)

predicted = model_XG.predict(X_test)
print(classification_report(Y_test, predicted))

# Ada Boost Classifier

In [None]:
# AdaBoost with 25 estimators and a learning rate of 0.15.
model_ADA = AdaBoostClassifier(n_estimators=25, learning_rate=0.15, random_state=0)
model_ADA.fit(X_train, Y_train)
Y_pred = model_ADA.predict(X_test)

# Accuracy as a percentage, kept for later comparison.
model_ADA_accuracy = round(accuracy_score(Y_test, Y_pred), 4) * 100
print(classification_report(Y_test, Y_pred))


# Gradient Boost Classifier

In [None]:

# Gradient boosting with 20 stages and a learning rate of 0.29.
# The `loss` argument is deliberately omitted: the old spelling "deviance" was
# deprecated and later removed from scikit-learn, so passing it fails on
# current releases. The default loss is the same one under either name.
model_GB = GradientBoostingClassifier(random_state=0, n_estimators=20, learning_rate=0.29)
model_GB.fit(X_train, Y_train)
Y_pred = model_GB.predict(X_test)

# Accuracy as a percentage, kept for later comparison.
model_GB_accuracy = round(accuracy_score(Y_test, Y_pred), 4) * 100
print(classification_report(Y_test, Y_pred))

In [None]:

# LightGBM classifier with default hyper-parameters and a fixed seed.
lgbm = LGBMClassifier(random_state=0)
lgbm.fit(X_train, Y_train)

y_pred = lgbm.predict(X_test)
# Report on this model's own predictions (`y_pred`). The upper-case `Y_pred`
# still holds the predictions of whichever model was run before this cell.
print(classification_report(Y_test, y_pred))

# Applying Grid Search on SVM

In [None]:
# Hyper-parameter grid for an RBF-kernel SVM.
# `gamma` is tuned rather than `degree`: `degree` only affects the polynomial
# kernel and is ignored by 'rbf', so searching over it just refits identical
# models. 'scale' (the library default) is included so the untuned
# configuration is part of the search.
param_grid = {'C': [0.1, 1, 10, 100, 1000],
              'gamma': ['scale', 1, 0.1, 0.01, 0.001, 0.0001],
              'kernel': ['rbf']}

# refit=True retrains the best configuration on the whole training set, so
# `grid` can be used directly for prediction afterwards.
grid = GridSearchCV(SVC(), param_grid, refit=True, verbose=5)

# fitting the model for grid search
grid.fit(X_train, Y_train)

In [None]:

# Show the winning hyper-parameter combination, then the refitted estimator.
for tuned in (grid.best_params_, grid.best_estimator_):
    print(tuned)

In [None]:

# Predict on the held-out rows with the tuned model and print its report.
grid_predictions = grid.predict(X_test)
tuned_report = classification_report(Y_test, grid_predictions)
print(tuned_report)

In [None]:
# Test-set accuracy of the tuned SVM, as a percentage.
accuracy_score(Y_test,grid_predictions) * 100

# Conclusion:
* Most of the models are working brilliantly on this dataset after normalising the dataset.
* SVM and ADAboost are particularly best models.
* After using Grid search , Adaboost beats SVM.
* Looking only at accuracy as the evaluation metric could be dangerous in this case, because we need to watch for **False Negatives**.
* Hence, we look at the complete classification report, especially **Recall**.