In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

### About this dataset
*  Age : Age of the patient

* Sex : Sex of the patient

* exng: exercise induced angina (1 = yes; 0 = no)

* caa: number of major vessels (0-3)

* cp : chest pain type

* Value 1: typical angina
* Value 2: atypical angina
* Value 3: non-anginal pain
* Value 4: asymptomatic
* trtbps : resting blood pressure (in mm Hg)

* chol : cholesterol in mg/dl fetched via BMI sensor

* fbs : (fasting blood sugar > 120 mg/dl) (1 = true; 0 = false)

* rest_ecg : resting electrocardiographic results

* Value 0: normal
* Value 1: having ST-T wave abnormality (T wave inversions and/or ST elevation or depression of > 0.05 mV)
* Value 2: showing probable or definite left ventricular hypertrophy by Estes' criteria
* thalachh : maximum heart rate achieved

* output : 0 = less chance of heart attack, 1 = more chance of heart attack

# <center> Heart-Attack Analysis & Prediction  </center>

 <center> <img src="https://source.wustl.edu/wp-content/uploads/2019/02/HeartImage.jpg" height=300 width= 500 alt="Heart Attack Analysis"  > </center>


##### Importing libraries 

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

#### The following algorithms are used in this notebook

| **Algorithm**       |
| :-----------------: |
| Logistic Regression | 
| Decision Tree       | 
| Random Forest       | 
| XGBoost             | 
| KNeighbours         |
| SVM                 | 
| AdaBoost            | 

##### In the following cell we read the dataset using pandas.
##### It is considered good practice to keep the original data untouched and work on a copy of the dataset.

In [None]:
main_df = pd.read_csv('/kaggle/input/heart-attack-analysis-prediction-dataset/heart.csv')
df = main_df.copy()

In [None]:
# Getting top 5 rows
df.head()

In above cell we have listed out top 5 rows of the dataset.

In [None]:
# Dimension of dataframe
df.shape

We have 303 rows and 14 columns in our dataset

In [None]:
# List of all columns present in dataframe
df.columns

In [None]:
# To view some basic statistical details 
df.describe()

In our dataset Mean age is 54, Minimum age is 29, maximum age is 77, 25% of the people in our dataset have age less than 47 and 75% of the people in our dataset have age less than 61.

In [None]:
# getting the information about dataframe
df.info()

From the table above we can see that none of our columns is of object type; all of them are numerical, with no missing values.

In [None]:
#  check for null value 
df.isnull().sum()

In [None]:
# checking number of unique values in each column
df.nunique()

## Visualization 

In [None]:
# Checking null value using heatmap
sns.heatmap(df.isnull())

No dots/marks are present in the graph (red region), which means we do not have any missing values.

In [None]:
# correlation heatmap
plt.figure(figsize=(10,8))
sns.heatmap(df.corr(), annot = True, cmap='coolwarm')

Two features can be positively or negatively correlated. If two features are highly correlated with each other, we should drop one of them, because it carries largely redundant information.

In [None]:
# Number of patients per `sex` value; tick positions 0 and 1 are relabelled as female and male.
ax = sns.countplot(data=df, x="sex", saturation=0.8)
ax.set_xticks([0, 1])
ax.set_xticklabels(["female", "male"])
plt.show()

In [None]:
# Visualizing dataset and also checking for outliers
# One box plot per column, laid out on a grid that is 7 panels wide. The number
# of rows is derived from the number of columns, so the cell keeps working if
# columns are added or removed (a fixed 7x2 grid fails beyond 14 columns).

ncols = 7
nrows = max(1, -(-len(df.columns) // ncols))  # ceiling division, at least one row
fig, ax = plt.subplots(ncols=ncols, nrows=nrows, figsize=(20, 5 * nrows), squeeze=False)
ax = ax.flatten()

for index, col in enumerate(df.columns):
    sns.boxplot(y=col, data=df, ax=ax[index])

# Hide panels left empty when the column count is not a multiple of `ncols`.
for spare_axis in ax[len(df.columns):]:
    spare_axis.set_visible(False)
plt.tight_layout(pad=0.5, w_pad=0.7, h_pad=5.0)

In [None]:
# Individual box plot for each feature
def Box(df):
    """Draw a box plot titled "Box Plot" for one set of values and show it.

    Parameters
    ----------
    df : array-like
        The values to plot. Despite the name this is a single vector
        (the notebook passes one DataFrame column, e.g. ``df['age']``),
        and it is handed to seaborn as the ``x`` variable.
    """
    plt.title("Box Plot")
    # Pass the values by keyword: depending on the seaborn release, a positional
    # argument is bound to `x` or to `data`, so a positional call is ambiguous.
    sns.boxplot(x=df)
    plt.show()
Box(df['age'])

In [None]:
sns.histplot(x = "age", data=df)

In [None]:
# Min-Max normalization
# Here we rescale five columns to the range [0, 1], because their raw values are
# much larger than those of the other columns.
# NOTE(review): the minimum and maximum are computed on the full dataset, before the
# train/test split further down, so test rows influence the scaling - confirm this is acceptable.

cols = ['trtbps', 'chol', 'thalachh', 'oldpeak', 'age']
col_min = df[cols].min()
col_max = df[cols].max()
# A constant column has max == min; divide by 1 there so it becomes 0 instead of NaN (0/0).
col_span = (col_max - col_min).replace(0, 1)
df[cols] = (df[cols] - col_min) / col_span

In [None]:
df.head()

In [None]:
# Mean of `output` for each value of `sex`
print(df.groupby('sex')[['output']].mean())

In [None]:
# Mean of `output` for each value of `cp`
print(df.groupby('cp')[['output']].mean())

In [None]:
# Mean of `output` for each value of `fbs`
print(df.groupby('fbs')[['output']].mean())


In [None]:
# Mean of `output` for each value of `exng`
print(df.groupby('exng')[['output']].mean())

In [None]:
# Mean of `output` for each value of `slp`
print(df[["slp", "output"]].groupby(['slp']).mean())

Here we are grouping the data based on different categories and therefore we can also check other features for more information.

In [None]:
# Visualizing after min-max normalization
# One box plot per column on a grid that is 7 panels wide; the row count follows
# the number of columns instead of assuming exactly 14 of them.
ncols = 7
nrows = max(1, -(-len(df.columns) // ncols))  # ceiling division, at least one row
fig, ax = plt.subplots(ncols=ncols, nrows=nrows, figsize=(20, 5 * nrows), squeeze=False)
ax = ax.flatten()

for index, col in enumerate(df.columns):
    sns.boxplot(y=col, data=df, ax=ax[index])

# Hide panels left empty when the column count is not a multiple of `ncols`.
for spare_axis in ax[len(df.columns):]:
    spare_axis.set_visible(False)
plt.tight_layout(pad=0.5, w_pad=0.7, h_pad=5.0)

In [None]:
# Here we can see that after min-max normalization values now ranges from 0 to 1
df.head()

In [None]:
# Distribution of `thalachh` over the rows whose `output` is 1
df1 = df.loc[df["output"] == 1]
sns.histplot(data=df1["thalachh"], bins=25, color="lightgreen").set_xlabel("Heart rate when outcome is 1")
plt.show()

In [None]:
# Distribution of `thalachh` over the rows whose `output` is 0
df2 = df.loc[df["output"] == 0]
sns.histplot(data=df2["thalachh"], bins=25, color="red").set_xlabel("Heart rate when outcome is 0")
plt.show()

In [None]:
# Feature matrix: every column of the dataframe except the `output` label
X = df.drop(columns=["output"])
X.shape

In [None]:
df.nunique()

In [None]:
fig =  px.pie (df, names = "sex", hole = 0.4, template = "plotly_dark")
fig.show ()

In [None]:
fig =  px.pie (df, names = "cp", hole = 0.4, template = "plotly_dark")
fig.show ()

In [None]:
fig =  px.pie (df, names = "slp", hole = 0.4, template = "gridon")
fig.show ()

In [None]:
fig =  px.pie (df, names = "caa", hole = 0.4, template = "gridon")
fig.show ()

In [None]:
fig = px.histogram (df, x = "chol",  facet_row = "output",  template = 'plotly_dark')
fig.show ()

In [None]:
fig = px.histogram (df, x = "thalachh",  facet_row = "output",  template = 'gridon')
fig.show ()

In [None]:
fig = px.scatter (df, x = "thalachh", y = "oldpeak", color = "output", template = "plotly_dark",  trendline="ols")
fig.show ()

In [None]:
fig = px.scatter (df, x = "trtbps", y = "chol", color = "output", template = "gridon",  trendline="ols")
fig.show ()

In [None]:
fig = px.scatter (df, x = "thalachh", y = "chol", color = "output", template = "plotly_dark",  trendline="lowess")
fig.show ()

In [None]:
# Pairwise regression plots of three columns, coloured by `output`,
# with a KDE of each column on the diagonal.
sns.pairplot(
    data=df,
    vars=['thalachh', 'chol', 'trtbps'],
    hue='output',
    kind='reg',
    diag_kind='kde',
    markers=['*', '.'],
    height=5,  # `size` is the deprecated name of this parameter
    palette='husl',
)

In [None]:
X.head()

In [None]:
# y have only 'output' column 
y = df['output']
y.shape

#### Performing train_test_split

In [None]:
from sklearn.model_selection import  train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y,train_size=0.8,random_state=42)

### Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score

In [None]:
# Creating model object
# n_jobs is left at its default: in scikit-learn it only parallelises over classes,
# and with a two-valued target there is nothing to spread across 20 workers.
model_lg = LogisticRegression(max_iter=120, random_state=0)

In [None]:
# Training Model
model_lg.fit(X_train, y_train)

In [None]:
# Making Prediction
pred_lg = model_lg.predict(X_test)

In [None]:
# Calculating Accuracy Score
lg = accuracy_score(y_test, pred_lg)
print(lg)

In [None]:
# Confusion matrix
cm1 = confusion_matrix(y_test, pred_lg)
sns.heatmap(cm1/np.sum(cm1), annot = True, fmt=  '0.2%', cmap = 'Reds')

* Here,  Type-1 Error is 6.56% which is also known as False Positive.
* Type-2 Error is 9.84% which is also known as False Negative.
* while other % value in the confusion matrix represents that they are correctly  predicted in their specific categories.

## Decision Tree Classifier

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Creating model object
model_dt = DecisionTreeClassifier( max_depth=4, random_state=42)

In [None]:
# Training Model
model_dt.fit(X_train,y_train)

In [None]:
# Making Prediction
pred_dt = model_dt.predict(X_test)

In [None]:
# Calculating Accuracy Score
dt = accuracy_score(y_test, pred_dt)
print(dt)

In [None]:
# Confusion matrix
cm2 = confusion_matrix(y_test, pred_dt)
sns.heatmap(cm2/np.sum(cm2), annot = True, fmt=  '0.2%', cmap = 'Reds')

* Here,  Type-1 Error is 4.92% which is also known as False Positive.
* Type-2 Error is 9.84% which is also known as False Negative.
* while other % value in the confusion matrix represents that they are correctly  predicted in their specific categories.

## Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Creating model object
model_rf = RandomForestClassifier(n_estimators=300,min_samples_leaf=0.16, random_state=42)

In [None]:
# Training Model
model_rf.fit(X_train, y_train)

In [None]:
# Making Prediction
pred_rf = model_rf.predict(X_test)

In [None]:
# Calculating Accuracy Score
rf = accuracy_score(y_test, pred_rf)
print(rf)

In [None]:
# Confusion matrix
cm3 = confusion_matrix(y_test, pred_rf)
sns.heatmap(cm3/np.sum(cm3), annot = True, fmt=  '0.2%', cmap = 'Reds')

* Here,  Type-1 Error is 4.92% which is also known as False Positive.
* Type-2 Error is 4.92% which is also known as False Negative.
* while other % value in the confusion matrix represents that they are correctly  predicted in their specific categories.

### XGBoost

In [None]:
from xgboost import XGBClassifier

In [None]:
# Creating model object
model_xgb = XGBClassifier(max_depth= 8, n_estimators= 125, random_state= 0,  learning_rate= 0.03, n_jobs=5)

In [None]:
# Training Model
model_xgb.fit(X_train, y_train)

In [None]:
# Making Prediction
pred_xgb = model_xgb.predict(X_test)

In [None]:
# Calculating Accuracy Score
xgb = accuracy_score(y_test, pred_xgb)
print(xgb)

In [None]:
# Confusion matrix
cm4 = confusion_matrix(y_test, pred_xgb)
sns.heatmap(cm4/np.sum(cm4), annot = True, fmt=  '0.2%', cmap = 'Reds')

* Here,  Type-1 Error is 4.92% which is also known as False Positive.
* Type-2 Error is 9.84% which is also known as False Negative.
* while other % value in the confusion matrix represents that they are correctly  predicted in their specific categories.

### KNeighbours

In [None]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
# Creating model object
model_kn = KNeighborsClassifier(n_neighbors=9, leaf_size=20)

In [None]:
# Training Model
model_kn.fit(X_train, y_train)

In [None]:
# Making Prediction
pred_kn = model_kn.predict(X_test)


In [None]:
# Calculating Accuracy Score
kn = accuracy_score(y_test, pred_kn)
print(kn)

In [None]:
# Confusion matrix
cm5 = confusion_matrix(y_test, pred_kn)
sns.heatmap(cm5/np.sum(cm5), annot = True, fmt=  '0.2%', cmap = 'Reds')

* Here,  Type-1 Error is 6.56% which is also known as False Positive.
* Type-2 Error is 3.28% which is also known as False Negative.
* while other % value in the confusion matrix represents that they are correctly  predicted in their specific categories.

##  SVM

In [None]:
from sklearn.svm import SVC, LinearSVC

In [None]:
model_svm = SVC(kernel='rbf', random_state = 42)

In [None]:
model_svm.fit(X_train, y_train)

In [None]:
# Making Prediction
pred_svm = model_svm.predict(X_test)

In [None]:
# Calculating Accuracy Score
sv = accuracy_score(y_test, pred_svm)
print(sv)

In [None]:
# Confusion matrix
cm6 = confusion_matrix(y_test, pred_svm)
sns.heatmap(cm6/np.sum(cm6), annot = True, fmt=  '0.2%', cmap = 'Reds')

* Here,  Type-1 Error is 6.56% which is also known as False Positive.
* Type-2 Error is 6.56% which is also known as False Negative.
* while other % value in the confusion matrix represents that they are correctly  predicted in their specific categories.

## AdaBoost Classifier

In [None]:
from sklearn.ensemble import AdaBoostClassifier

In [None]:
model_ada = AdaBoostClassifier(learning_rate= 0.002,n_estimators= 205,random_state=42)

In [None]:
model_ada.fit(X_train, y_train)

In [None]:
# Making Prediction
pred_ada = model_ada.predict(X_test)

In [None]:
# Calculating Accuracy Score
ada = accuracy_score(y_test, pred_ada)
print(ada)

In [None]:
# Confusion matrix
cm7 = confusion_matrix(y_test, pred_ada)
sns.heatmap(cm7/np.sum(cm7), annot = True, fmt=  '0.2%', cmap = 'Reds')

### Accuracy score dataframe

In [None]:
# Test-set accuracy of every model, ranked best-first. Sorting once up front
# keeps the bar chart and the displayed table in the same order.
models = pd.DataFrame({
    'Model':['Logistic Regression', 'Decision Tree', 'Random Forest', 'XGBoost', 'KNeighbours', 'SVM', 'AdaBoost'],
    'Accuracy_score' :[lg, dt, rf, xgb, kn, sv, ada]
}).sort_values(by='Accuracy_score', ascending=False)

sns.barplot(x='Accuracy_score', y='Model', data=models)

# Last expression of the cell: renders the ranked table.
models

### Conclusion: after some hyperparameter tuning, Random Forest and KNeighbours achieved the highest accuracy here.