#### Hello all, welcome to my kernel. I'm going to perform a complete EDA of the factors associated with a higher chance of heart attack, followed by prediction with 93% accuracy. Let's start!

# Features:

1. age: Age of the patient

2. sex: Sex of the patient

3. cp: Chest pain type, 0 = Typical Angina, 1 = Atypical Angina, 2 = Non-anginal Pain, 3 = Asymptomatic

4. trtbps: Resting blood pressure (in mm Hg)

5. chol: Cholesterol in mg/dl fetched via BMI sensor

6. fbs: (fasting blood sugar > 120 mg/dl), 1 = True, 0 = False

7. restecg: Resting electrocardiographic results, 0 = Normal, 1 = ST-T wave abnormality, 2 = showing probable or definite left ventricular hypertrophy by Estes' criteria

8. thalachh: Maximum heart rate achieved

9. oldpeak: Previous peak (ST depression induced by exercise relative to rest)

10. slp: Slope

11. caa: Number of major vessels

12. thall: Thalium Stress Test result, (0-3)

13. exng: Exercise induced angina, 1 = Yes, 0 = No

14. output:  0 = less chance of heart attack, 1 = more chance of heart attack

# Importing Libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
import xgboost as xgb

# Loading the dataset (heart.csv)

In [None]:
# Read the heart-attack dataset from the relative input directory and preview
# its first rows.
csv_path = '../input/heart-attack-analysis-prediction-dataset/heart.csv'
data = pd.read_csv(csv_path)
data.head()

**I'm going to take a copy of the original data for visualization**

In [None]:
# Independent working copy for the EDA, so changes made to it leave `data` intact.
df1 = data.copy(deep=True)

# Data Preprocessing

In [None]:
# Print a summary of the working copy: column names, non-null counts and dtypes.
df1.info()

In [None]:
# (rows, columns) of the working copy before de-duplication.
df1.shape

In [None]:
# Number of rows that are exact repeats of an earlier row.
df1.duplicated().sum()

In [None]:
# Show the repeated rows themselves (the first occurrence of each is not flagged).
df1[df1.duplicated()]

In [None]:
# Remove exact duplicate rows, renumber the index from 0, and show the new shape.
df1 = df1.drop_duplicates().reset_index(drop=True)
df1.shape

In [None]:
# Count missing (null) values in each column.
df1.isnull().sum()

**Now, I'm going to replace the coded column values with descriptive labels for better readability**

In [None]:
# Replace the integer codes of selected categorical columns with readable
# labels. Each entry is: column name -> {code: label}.
label_maps = {
    'exng': {1: 'yes', 0: 'no'},
    'cp': {
        0: 'typical angina',
        1: 'atypical angina',
        2: 'non-anginal pain',
        3: 'asymptomatic',
    },
    'fbs': {1: 'true', 0: 'false'},
    'restecg': {
        0: 'normal',
        1: 'having ST-T wave abnormality',
        2: 'showing probable or definite left ventricular hypertrophy',
    },
    'output': {
        0: 'less chance of heart attack',
        1: 'more chance of heart attack',
    },
}
for column_name, code_to_label in label_maps.items():
    df1[column_name] = df1[column_name].map(code_to_label)

# Preview five random rows to check the relabelling.
df1.sample(5)

# Correlation

In [None]:
# Pairwise correlation heatmap of the numeric columns of `df1`.
# Several columns of `df1` hold text labels at this point, and a correlation
# cannot be computed on strings: recent pandas raises on them, older pandas
# silently dropped them. Selecting the numeric columns explicitly gives the
# same matrix on every pandas version.
plt.figure(figsize=(15, 10))
numeric_columns = df1.select_dtypes(include='number')
sns.heatmap(numeric_columns.corr(), annot=True, cmap='coolwarm')

# Countplot

In [None]:
# 4x2 grid of count plots: for each categorical feature, count the rows in each
# `output` class, split by that feature's value.
plt.figure(figsize=(20, 30))
plt.subplots_adjust(left=0.1,
                    bottom=0.1,
                    right=0.9,
                    top=0.9,
                    wspace=0.4,
                    hspace=0.4)

# (panel title, column used for the hue split), in display order.
countplot_panels = [
    ('Prevalence of Heart attack by Sex', 'sex'),
    ('Prevalence of Heart attack by Chest Pain', 'cp'),
    ('Prevalence of Heart attack by fasting blood sugar > 120 mg/dl', 'fbs'),
    ('Prevalence of Heart attack by restecg', 'restecg'),
    ('Prevalence of Heart attack by Exercise induced angina', 'exng'),
    ('Prevalence of Heart attack by slp', 'slp'),
    ('Prevalence of Heart attack by number of major vessels', 'caa'),
    ('Prevalence of Heart attack by thall', 'thall'),
]

for position, (panel_title, hue_column) in enumerate(countplot_panels, start=1):
    plt.subplot(4, 2, position)
    plt.title(panel_title, fontsize=15)
    # Pass the variables by keyword: newer seaborn releases no longer accept
    # the data vector as a positional argument.
    sns.countplot(x='output', hue=hue_column, data=df1)

# Distplot

In [None]:
# 2x2 grid of histograms of continuous features, coloured by `output` class.
plt.figure(figsize=(20, 15))

# (panel title, column to plot), in display order.
# The second panel plots `trtbps` (resting blood pressure); its title
# previously said "Maximum heart rate", which did not match the plotted data.
histogram_panels = [
    ('Prevalence of Heart attack by age', 'age'),
    ('Prevalence of Heart attack by Resting blood pressure', 'trtbps'),
    ('Prevalence of Heart attack by cholestoral in mg/dl', 'chol'),
    ('Prevalence of Heart attack by old peak', 'oldpeak'),
]

for position, (panel_title, column) in enumerate(histogram_panels, start=1):
    plt.subplot(2, 2, position)
    plt.title(panel_title, fontsize=15)
    sns.histplot(data=df1, x=column, hue='output')

# Jointplot

In [None]:
# Three bivariate views, each with marginal distributions:
# 2-D histogram, hexbin and kernel-density estimate.
sns.jointplot(data=df1, x='chol', y='trtbps', kind='hist', color='pink')
sns.jointplot(data=df1, x='trtbps', y='thalachh', kind='hex', color='green')
sns.jointplot(data=df1, x='chol', y='thalachh', kind='kde', color='grey')

# FacetGrid

In [None]:
# One scatter panel (age vs. cholesterol) per chest-pain type, coloured by
# `output` class.
# FacetGrid creates its own figure, so the former `plt.figure(...)` call only
# produced an extra empty figure and has been removed.
g = sns.FacetGrid(df1, col="cp", hue="output")
g.map(sns.scatterplot, 'age', 'chol')
g.set_axis_labels('age', 'cholestoral')
g.add_legend()

# Lineplot & Regplot

In [None]:
# Line plot of `thall` against age (dashed red line, star markers).
plt.figure(figsize=(20, 8))
sns.lineplot(data=df1, x='age', y='thall',
             marker='*', linestyle='--', color='red')

# Scatter of `oldpeak` against age with a fitted regression line.
plt.figure(figsize=(20, 5))
sns.regplot(data=df1, x='age', y='oldpeak', color='black')

# Barplot & Boxplot

In [None]:
# Bar chart of `thalachh` per `slp` value, split by `output` class.
plt.figure(figsize=(20, 5))
sns.barplot(data=df1, x='slp', y='thalachh', hue='output', palette='rainbow')

# Box plot of age per `restecg` category, split by `output` class.
plt.figure(figsize=(20, 10))
plt.xticks(fontsize=13)
sns.boxplot(data=df1, x='restecg', y='age', hue='output', palette='rainbow')

# Swarmplot

In [None]:
# Swarm plot: one point per row, age against `caa`, coloured by `output` class.
plt.figure(figsize=(10, 10))
sns.swarmplot(data=df1, x='caa', y='age', hue='output')

# Violinplot

In [None]:
# Violin plot of the age distribution per `thall` value, split by `output` class.
plt.figure(figsize=(20, 8))
sns.violinplot(data=df1, x='thall', y='age', hue='output')

## Conditions that lead to a higher chance of heart attack are:

* In general, people aged 40 to 58 have a higher chance of heart attack.

* Chest Pain - Non-anginal type

* Resting Blood Pressure - range of 120 to 140

* Cholesterol level - 200 to 260

* Electrocardiograph - having ST-T wave abnormality

* Maximum Heart Rate - above 150

* Slope - 2

* Number of major vessels - 0

* Thalium Stress Test Result - 2

* Old peak - between 0-0.5

# Prediction

Taking a copy of the original data

In [None]:
# Fresh independent copy of the loaded data for modelling, then a preview.
df2 = data.copy(deep=True)
df2.head()

## Dropping Duplicate values

In [None]:
# Remove exact duplicate rows and renumber the index from 0, then count the
# missing values in each column.
df2 = df2.drop_duplicates().reset_index(drop=True)
df2.isnull().sum()

In [None]:
# Preview the de-duplicated modelling frame. The index was already reset in
# the previous cell, so it is not reset a second time here.
df2.head()

Now, I'm going to perform baseline modelling

In [None]:
# Split the frame into a feature matrix (every column except the last) and a
# target vector (the last column), both as NumPy arrays.
feature_frame = df2.iloc[:, :-1]
target_column = df2.iloc[:, -1]
x, y = feature_frame.to_numpy(), target_column.to_numpy()

## Standardization

In [None]:
# Splitting the data: hold out 20% of the rows for testing, with a fixed seed
# so the split is repeatable.
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=0
)

# Learn the scaling statistics from the training rows only, then apply the
# same transformation to both partitions.
sc = StandardScaler().fit(x_train)
x_train = sc.transform(x_train)
x_test = sc.transform(x_test)

## Machine Learning Models

In [None]:
# Registry of baseline classifiers: display name -> unfitted estimator.
# The tree-based and ensemble estimators are seeded so that their reported
# accuracies are the same on every run; the other estimators keep their
# default settings.
models = {
    'LogisticRegression': LogisticRegression(),
    'KNeighborsClassifier': KNeighborsClassifier(),
    'SVC': SVC(),
    'DecisionTreeClassifier': DecisionTreeClassifier(random_state=0),
    'RandomForestClassifier': RandomForestClassifier(random_state=0),
    'GradientBoostingClassifier': GradientBoostingClassifier(random_state=0),
    'AdaBoostClassifier': AdaBoostClassifier(random_state=0),
    'XGBClassifier': xgb.XGBClassifier(random_state=0),
}

# `key` and `value` are kept as separate lists in registry order; `key` is
# used later as the bar labels of the accuracy chart.
key = list(models)
value = list(models.values())
print(models)

In [None]:
# Fit every registered model on the training split and record its accuracy on
# the test split. `predicted` holds the accuracies in registry order.
predicted = []
for name, estimator in models.items():
    estimator.fit(x_train, y_train)
    test_accuracy = accuracy_score(y_test, estimator.predict(x_test))
    predicted.append(test_accuracy)
    print(name, test_accuracy)

## Accuracy Visualization

In [None]:
# Horizontal bar chart: one bar per model, bar length = test accuracy.
_, accuracy_ax = plt.subplots(figsize=(10, 5))
sns.barplot(x=predicted, y=key, ax=accuracy_ax)

This shows that SVC is the best fit for this dataset, with an accuracy of 93%. Let's try to improve it with hyperparameter tuning.

## Hyperparameter tuning

In [None]:
# Fit an RBF-kernel SVC with C=1 and report test accuracy and confusion matrix.
# `degree` only affects the 'poly' kernel, so the former `degree = 5` argument
# had no effect with kernel='rbf' and has been removed.
# NOTE: kernel='rbf' and C=1 are also scikit-learn's defaults, so this model is
# configured the same way as the baseline `SVC()` fitted earlier.
svc = SVC(kernel='rbf', C=1, random_state=0)
svc.fit(x_train, y_train)
svc_pred = svc.predict(x_test)
print("Accuracy:", accuracy_score(y_test, svc_pred))
print("Confusion matrix:\n", confusion_matrix(y_test, svc_pred))

The accuracy remains the same.

## Visualizing Best fit model

In [None]:
# Overlay the distribution of predicted labels on the distribution of actual
# test labels.
# `sns.distplot` is deprecated and removed in newer seaborn; `histplot` with
# `kde=True` and a density scale draws the equivalent histogram + KDE curve.
plt.figure(figsize=(10, 5))
sns.histplot(x=svc_pred, kde=True, stat='density', label='Predicted')
sns.histplot(x=y_test, kde=True, stat='density', label='Actual')
# The labels above are only displayed once a legend is drawn.
plt.legend()

**Hope you like my work. Please upvote if you like it and leave your feedback in the comments. Share this with your friends. Thanks for viewing.**