In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
import warnings  
warnings.filterwarnings('ignore')
sns.set(font_scale=1.3)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found beneath the Kaggle input directory.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

### **Loading and Understanding the Data**

In [None]:
# Read the heart-attack dataset CSV from the Kaggle input directory into `df`.
df = pd.read_csv('/kaggle/input/heart-attack-analysis-prediction-dataset/heart.csv')

In [None]:
# Preview the first rows of the loaded data.
df.head()

### **Getting Information from the Data**

In [None]:
# Summarise the columns: names, dtypes and non-null counts.
df.info()

### **Checking Null Values**

In [None]:
# Count the missing values in each column.
df.isnull().sum()

### **Checking Duplicates and Removing**

In [None]:
# Count the rows flagged as duplicates of an earlier row.
df.duplicated().sum()

In [None]:
# Keep only the first occurrence of each duplicated row (index is left as-is).
df = df.drop_duplicates()

# **Exploratory Data Analysis**

### **Percentage of Heart Attack Chances**

In [None]:
# Bar chart of the target classes, each bar annotated with its share of all rows.
plt.figure(figsize=(8,7))

my_ticks = ['Less Chance', 'More Chance']
# Denominator for the percentages: the actual row count rather than a
# hard-coded 302, so the labels stay correct if the data changes.
total_rows = len(df)

ax = sns.countplot(x='output', data=df, palette='rocket')
ax.set_xticklabels(my_ticks)
plt.xlabel('Output')
plt.ylabel('Count')
plt.title('Chances of Heart Attack', fontsize=18)

for p in ax.patches:
    width = p.get_width()
    height = p.get_height()
    x, y = p.get_xy() 
    # Place the percentage just above the top centre of the bar.
    ax.annotate(f'{round(height*100/total_rows,2)}%', (x + width/2, y + height*1.01), ha='center')

plt.show()

As we can see above, 54.3% of the records in the dataset have a high chance of heart attack, whereas 45.7% have a low chance.

### **Gender Distribution in DataSet**

In [None]:
# Bar chart of the `sex` column, each bar annotated with its share of all rows.
plt.figure(figsize=(8,7))

my_ticks = ['Female', 'Male']
# Denominator for the percentages: the actual row count rather than a
# hard-coded 302, so the labels stay correct if the data changes.
total_rows = len(df)

ax = sns.countplot(x='sex', data=df, palette='viridis')
ax.set_xticklabels(my_ticks)
plt.xlabel('Sex')
plt.ylabel('Count')
plt.title('Gender Distribution', fontsize=18)

for p in ax.patches:
    width = p.get_width()
    height = p.get_height()
    x, y = p.get_xy() 
    # Place the percentage just above the top centre of the bar.
    ax.annotate(f'{round(height*100/total_rows,2)}%', (x + width/2, y + height*1.01), ha='center')

plt.show()

We have 68.21% Male and 31.79% Female in our DataSet. (Assuming, 1=Male and 0=Female)

### **Gender Distribution wrt Heart Attack**

In [None]:
# Count of each target class, split by the `sex` column.
plt.figure(figsize=(9,7))

my_ticks = ['No Heart Attack', 'Heart Attack']
my_legends = ['Female', 'Male']

sns.set(font_scale=1.3)
ax = sns.countplot(x = 'output', hue='sex', data=df, palette='husl')
ax.set_xticklabels(my_ticks)

plt.xlabel('Output')
plt.ylabel('Count')
# Reuse the handles seaborn registered for the hue levels. Passing only
# `labels=` lets matplotlib pair the names with the first artists on the axes,
# which need not be one artist per hue level.
hue_handles, _ = ax.get_legend_handles_labels()
ax.legend(handles=hue_handles, labels=my_legends, title='Gender')
plt.title('Gender Distribution wrt Heart Attack', fontsize=18)


plt.show()

### **Age Distribution in our DataSet**

In [None]:
# Histogram of the `age` column.
# `sns.displot` is figure-level: it creates its own figure, sized by
# `height`/`aspect`. A preceding `plt.figure(...)` only left an extra, empty
# figure in the output, so it is not created here.
sns.set(font_scale=1.3)
sns.displot(x='age', data=df, color='Chocolate', height=7, aspect=1.1)

plt.xlabel('Age')
plt.ylabel('Count')
plt.title('Age Distribution', fontsize=18)

plt.show()

As we can see, most of the people in our dataset are between 50 and 60 years old.

### **Plotting No of People vs Age with Low or High Risk of Heart Attack**

In [None]:
# Per-age counts of each target class, flattened to a plain three-column table.
v = (
    pd.crosstab(df["age"], df["output"])
    .reset_index()
    .set_axis(['Age', 'Low Risk', 'High Risk'], axis=1)
)

v.head()

In [None]:
# Number of low-risk and high-risk people at each age, drawn as two lines.
plt.figure(figsize=(12,8))

# Label each line where it is drawn so the legend entry is bound to the
# right curve instead of depending on the order artists were added.
sns.lineplot(x = 'Age', y='Low Risk', data=v, 
             marker = 'o' ,color = 'darkgreen', lw=2, label='Low Risk')
sns.lineplot(x = 'Age', y='High Risk', data=v, 
             marker = 'o', color = 'red', lw=2, label='High Risk')

plt.legend(title = 'Risk')
plt.xlabel('Age')
plt.ylabel('No. of People')
plt.title('Low Risk vs High Risk wrt Age', fontsize=18)

# Render the figure explicitly, as the other plotting cells do.
plt.show()

- Here we can see that there is no significant relationship between age and heart attack risk.
- One thing that stands out is the larger number of people aged 55-60 who have a low risk of heart attack.

## **CATEGORICAL FEATURES**

### **CountPlot of Categorical Features**

In [None]:
cat = ['sex', 'cp', 'fbs', 'restecg', 'exng', 'slp', 'caa', 'thall']

# One tall figure with a row per categorical feature:
# left = overall counts, right = counts split by the target.
# Previously a new 20x55 figure was opened on every iteration and only its
# first subplot row was used, giving eight mostly empty figures.
fig, axes = plt.subplots(len(cat), 2, figsize=(20,55))

for i, feature in enumerate(cat):
    left_ax, right_ax = axes[i]

    sns.countplot(x=feature, data=df, palette='mako', ax=left_ax)
    left_ax.set_xlabel(feature.capitalize(), fontsize=20)
    left_ax.set_ylabel('Count', fontsize=20)

    sns.countplot(x=feature, hue='output', data=df, palette='magma_r', ax=right_ax)
    right_ax.set_xlabel(feature.capitalize(), fontsize=20)
    right_ax.set_ylabel('Count', fontsize=20)
    right_ax.legend(title='Output')

    # Column titles go on the first row only.
    if i == 0:
        left_ax.set_title('Countplot of Categorical Features', fontsize=22)
        right_ax.set_title('Countplot of Categorical Features wrt Target', fontsize=22)

plt.tight_layout()
plt.show()

## **CONTINUOUS FEATURES**

### **Histplot of Continuous Features**

In [None]:
cont = ['age', 'trtbps', 'chol', 'thalachh', 'oldpeak']

# One tall figure with a row per continuous feature:
# left = overall histogram, right = histogram split by the target.
# Previously a new 20x40 figure was opened on every iteration and only its
# first subplot row was used, giving five mostly empty figures.
fig, axes = plt.subplots(len(cont), 2, figsize=(20,40))

for i, feature in enumerate(cont):
    left_ax, right_ax = axes[i]

    sns.histplot(x=feature, data=df, color='Olive', kde=True, ax=left_ax)
    left_ax.set_xlabel(feature.capitalize(), fontsize=20)
    left_ax.set_ylabel('Count', fontsize=20)

    sns.histplot(x=feature, hue='output', data=df, palette='magma_r', kde=True, ax=right_ax)
    right_ax.set_xlabel(feature.capitalize(), fontsize=20)
    right_ax.set_ylabel('Count', fontsize=20)

    # Column titles go on the first row only.
    if i == 0:
        left_ax.set_title('Histplot of Continuous Features', fontsize=22)
        right_ax.set_title('Histplot of Continuous Features wrt Target', fontsize=22)

plt.tight_layout()
plt.show()

### **Pairplot of Continuous Features**

Plotting pairplots for Continuous features to see any significant correlation among the features.

In [None]:
pair = df[['age', 'trtbps', 'chol', 'thalachh', 'oldpeak', 'output']]

# Display names for the two target classes, matching the tick labels used by
# the count plot of `output` split by `sex` (0 -> no heart attack, 1 -> heart attack).
output_names = {0: 'No Heart Attack', 1: 'Heart Attack'}

ax = sns.pairplot(pair, hue='output', palette='rocket')

# Relabel the legend seaborn already built for the hue levels. Each entry is
# renamed from its own hue value, so a name can never end up on the wrong
# colour (a positional `labels=[...]` list gives no such guarantee).
ax._legend.set_title('Output')
for entry in ax._legend.texts:
    entry.set_text(output_names[int(float(entry.get_text()))])
ax.fig.subplots_adjust(right=0.8, top=1)

plt.show()

#### Plotting Correlation Heatmap to find the correlation among columns

In [None]:
# Annotated heatmap of the pairwise correlations between all columns.
fig, heat_ax = plt.subplots(figsize=(20,10))
correlations = df.corr()
sns.heatmap(correlations, annot=True, cmap='YlGnBu', ax=heat_ax)

plt.show()

# **MACHINE LEARNING & PREDICTION**

### **Data Preparation**

**Converting the data to similar typecast**

In [None]:
# Column dtypes before the cast to float below.
df.info()

In [None]:
# Cast every column, including the `output` target, to float.
df = df.astype(float)

In [None]:
# Column dtypes after the cast to float.
df.info()

In [None]:
# Preview the data after the cast.
df.head()

**Performing Standardization (Normalizing the data)**

In [None]:
# Z-score every column: subtract the column mean, divide by the column std.
# NOTE(review): the mean/std are taken over all rows, i.e. before the
# train/test split further down, so the later test rows contribute to them.
column_means = df.mean()
column_stds = df.std()
norm_df = df.sub(column_means).div(column_stds)

In [None]:
# Preview the standardized data (the target column is standardized too at this point).
norm_df.head()

**The target variable must not be standardized, so we replace it with the original values**

In [None]:
# Overwrite the standardized target with the original `output` values from `df`.
norm_df['output'] = df['output']

In [None]:
# Preview again after restoring the target column.
norm_df.head()

**Shuffling the DataSet before splitting**

In [None]:
# Shuffle all rows (frac=1 samples every row); the fixed seed keeps the order reproducible.
norm_df = norm_df.sample(frac=1, random_state=123)

**Splitting 75% of the data into training set and 25% into test set**

In [None]:
# Shape of the shuffled data and the row count that 75% of it corresponds to.
# Uses the actual length instead of a hard-coded 302.
print(norm_df.shape)
print(len(norm_df)*0.75)

In [None]:
# Positional 75% / 25% split of the shuffled rows.
# The cut-off is computed from the data (it was a hard-coded 226), and both
# parts are copied so that adding columns to them later does not operate on a
# slice of `norm_df`.
split_at = int(len(norm_df) * 0.75)
train = norm_df.iloc[:split_at].copy()
test = norm_df.iloc[split_at:].copy()

**Determining the Correlation of Features wrt to the Target Feature**

In [None]:
# Absolute correlation of every column with the target.
norm_df.corr()['output'].abs()

#### **Plotting Correlation Heatmap wrt to Output (Target Label)**

In [None]:
# Single-column heatmap: correlation of each column with the target.
plt.figure(figsize=(5,10))

# Slice the `output` column out of the correlation matrix as a one-column
# frame and give it the display name used on the plot.
cor_df = norm_df.corr()[['output']].rename(columns={'output': 'Output'})

sns.heatmap(cor_df, annot=True, cmap='viridis')

plt.show()

**We keep the features whose absolute correlation with the output is greater than 0.25**

In [None]:
# Absolute correlation with the target, filtered to values above 0.25.
corr = norm_df.corr()['output'].abs()
corr.loc[corr > 0.25]

In [None]:
# Names of the columns that pass the 0.25 threshold (the target itself is among them).
corr[corr>0.25].index

In [None]:
# Predictor columns used by every model below.
features = ['sex', 'cp', 'thalachh', 'exng', 'oldpeak', 'slp', 'caa', 'thall']
# Target column name as a plain string, so `train[target]` is a 1-D Series.
# A one-element list selects a single-column DataFrame, which scikit-learn
# has to flatten (with a DataConversionWarning) on every `fit`.
target = 'output'

### **Importing Necessary Libraries for ML**

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error

### **Comparing Models with their Default Parameters**

In [None]:
# Baseline: fit each classifier with default hyper-parameters and compare
# test-set accuracy and mean squared error.
# The two randomized ensembles are seeded so the table is reproducible on a
# fresh run; 123 matches the seed used elsewhere in this notebook.
model_KNN = KNeighborsClassifier()
model_RF = RandomForestClassifier(random_state=123)
model_GB = GradientBoostingClassifier(random_state=123)
model_LG = LogisticRegression()

for model in (model_RF, model_KNN, model_GB, model_LG):
    model.fit(train[features], train[target])

acc_RF = model_RF.score(test[features], test[target])
acc_KNN = model_KNN.score(test[features], test[target])
acc_GB = model_GB.score(test[features], test[target])
acc_LG = model_LG.score(test[features], test[target])

ind = ['Random Forest', 'Gradient Boost', 'K Nearest Neighbor', 'Logistic Regression']
pd.DataFrame({'Models': ind, 'Accuracy':[round(acc_RF,2), 
                                         round(acc_GB,2), 
                                         round(acc_KNN,2),
                                        round(acc_LG,2)],
              'Mean Sqaured Error' : [mean_squared_error(test['output'], model_RF.predict(test[features])),
                                      mean_squared_error(test['output'], model_GB.predict(test[features])),
                                      mean_squared_error(test['output'], model_KNN.predict(test[features])),
                                      mean_squared_error(test['output'], model_LG.predict(test[features]))
                  
              ] }).set_index('Models')

# **Hyperparameter Optimization** 

### **Random Forest Classifier**

**Estimating Optimum values for n_estimator and max_depth**

In [None]:
# Grid search for the random forest: test accuracy (in %) for every
# (n_estimators, max_depth) pair, stored in `n_est` keyed by that pair.
# NOTE(review): candidates are scored on `test`, the same rows used for the
# final comparison, so the selected accuracy is optimistic; cross-validating
# on `train` would avoid this.
n_est = {}
for i in range(10,100):
    for k in range(1,25):
        model_RF = RandomForestClassifier(n_estimators=i, max_depth=k, criterion='entropy', random_state=123)
        model_RF.fit(train[features], train[target])
        # `score` predicts internally; the separate unused `predict` call was dropped.
        accuracy = model_RF.score(test[features], test[target])
        n_est[i, k] = round(accuracy*100,2)

In [None]:
# Report the first grid point that reaches the highest accuracy.
rf_best_params = max(n_est, key=n_est.get)
print(f'(n_estimator, max_depth) @ max accuracy: {rf_best_params}')
print(f'Max Accuracy: {n_est[rf_best_params]}%')

**It can be seen that with n_estimator=26 and max_depth=2, we will have maximum accuracy=89.47%.**

### **Plotting the parameters for Maximum Accuracy**

In [None]:
# Split the (n_estimators, max_depth) keys into two parallel lists for plotting.
ls_a = [params[0] for params in n_est]
ls_b = [params[1] for params in n_est]

In [None]:
# Accuracy of every random-forest grid point, plotted against max_depth and
# against n_estimators on a shared x-axis, with the best point marked.
plt.figure(figsize=(15,8))

# Read the best point and the plotted x-values straight from `n_est`, so the
# markers and annotation always agree with the search results and the cell
# does not depend on helper lists built elsewhere.
rf_best_n, rf_best_depth = max(n_est, key=n_est.get)
rf_best_acc = n_est[rf_best_n, rf_best_depth]
rf_n_values = [params[0] for params in n_est]
rf_depth_values = [params[1] for params in n_est]
rf_accuracies = list(n_est.values())

# estimator=None plots every observation, so no confidence interval is drawn
# and the deprecated `ci` argument is not needed.
sns.lineplot(x = rf_depth_values, y = rf_accuracies, color = 'darkgreen', 
             lw=2, estimator=None, label='max_depth')

sns.lineplot(x = rf_n_values, y = rf_accuracies,
             color = 'darkblue', lw=2, estimator=None, label='n_estimator')

plt.axhline(y = rf_best_acc, linestyle='-.',color='red')
plt.axvline(x = rf_best_n, linestyle='-.',color='red')
plt.axvline(x = rf_best_depth, linestyle='-.',color='red')

plt.text(x=29, y = 74,
         s=f'@ n_estimator={rf_best_n} & max_depth={rf_best_depth}, Max Accuracy = {rf_best_acc}%',
         fontsize=19)
plt.title('Accuracy vs n_estimator & max_depth', fontsize=20)
plt.xlabel('n_estimator & max_depth')
plt.ylabel('Accuracy')
plt.legend(loc='lower right')

plt.show()

### **Gradient Boost Classifier**

#### *For n_estimators and max_depth*

In [None]:
# Grid search for gradient boosting: test accuracy (in %) for every
# (n_estimators, max_depth) pair, stored in `gb` keyed by that pair.
# NOTE(review): candidates are scored on `test`, the same rows used for the
# final comparison, so the selected accuracy is optimistic; cross-validating
# on `train` would avoid this.
gb = {}
for i in range(10,100):
    for k in range(1,25):
        model_GB = GradientBoostingClassifier(n_estimators=i, max_depth=k, random_state=123)
        model_GB.fit(train[features], train[target])
        # `score` predicts internally; the separate unused `predict` call was dropped.
        accuracy = model_GB.score(test[features], test[target])
        gb[i, k] = round(accuracy*100,2)

In [None]:
# Report the first grid point that reaches the highest accuracy.
gb_best_params = max(gb, key=gb.get)
print(f'(n_estimator, max_depth) @ max accuracy: {gb_best_params}')
print(f'Max Accuracy: {gb[gb_best_params]}%')

**It can be seen that with n_estimator=11 and max_depth=1, we will have maximum accuracy=90.79%.**

### **Plotting the parameters for Maximum Accuracy**

In [None]:
# Split the (n_estimators, max_depth) keys into two parallel lists for plotting.
ls_a = [params[0] for params in gb]
ls_b = [params[1] for params in gb]

In [None]:



# Accuracy of every gradient-boosting grid point, plotted against max_depth
# and against n_estimators on a shared x-axis, with the best point marked.
plt.figure(figsize=(15,8))

# Read the best point and the plotted x-values straight from `gb`. The
# n_estimators guide line used to be drawn at a hard-coded x=10 while the
# annotation said 11; deriving both from the results keeps them consistent.
gb_best_n, gb_best_depth = max(gb, key=gb.get)
gb_best_acc = gb[gb_best_n, gb_best_depth]
gb_n_values = [params[0] for params in gb]
gb_depth_values = [params[1] for params in gb]
gb_accuracies = list(gb.values())

# estimator=None plots every observation, so no confidence interval is drawn
# and the deprecated `ci` argument is not needed.
sns.lineplot(x = gb_depth_values, y = gb_accuracies, color = 'darkgreen', 
             lw=2, estimator=None, label='max_depth')

sns.lineplot(x = gb_n_values, y = gb_accuracies,
             color = 'darkblue', lw=2, estimator=None, label='n_estimator')

plt.axhline(y = gb_best_acc, linestyle='-.',color='red')
plt.axvline(x = gb_best_n, linestyle='-.',color='red')
plt.axvline(x = gb_best_depth, linestyle='-.',color='red')

plt.text(x=29, y = 89,
         s=f'@ n_estimator={gb_best_n} & max_depth={gb_best_depth}, Max Accuracy = {gb_best_acc}%',
         fontsize=18)
plt.title('Accuracy vs n_estimator & max_depth', fontsize=20)
plt.xlabel('n_estimator & max_depth')
plt.ylabel('Accuracy')
plt.legend(loc='lower right')

plt.show()

#### *For learning_rate*

In [None]:
# Sweep the gradient-boosting learning rate from 0.1 to 2.0 in steps of 0.1
# (n_estimators=11, max_depth=1 fixed) and record test accuracy in %.
# NOTE(review): scored on `test`, the same rows used for the final comparison.
gb_lr = {}
for k in range(1,21):
    model_GB = GradientBoostingClassifier(n_estimators=11, max_depth=1, learning_rate= k/10,
                                                  random_state=123)
    model_GB.fit(train[features], train[target])
    # `score` predicts internally; the separate unused `predict` call was dropped.
    accuracy = model_GB.score(test[features], test[target])
    gb_lr[k/10] = round(accuracy*100,2)

In [None]:
# Report the first learning rate that reaches the highest accuracy.
lr_at_max = max(gb_lr, key=gb_lr.get)
print(f'learning_rate @ max accuracy: {lr_at_max}')
print(f'Max Accuracy: {gb_lr[lr_at_max]}%')

**At learning_rate = 0.1, we will have maximum accuracy=90.79%.**

In [None]:
# Accuracy as a function of the gradient-boosting learning rate, with the
# best value marked.
plt.figure(figsize=(13,8))

# Take the marked point from the sweep results instead of hard-coding it, so
# the guide lines and annotation cannot drift from what was measured.
best_lr = max(gb_lr, key = gb_lr.get)
best_lr_acc = gb_lr[best_lr]

sns.lineplot(x = list(gb_lr.keys()), y = list(gb_lr.values()),
            color = 'darkgreen', lw=2)
# xmax / ymax are fractions of the axes extent and only shorten the guide lines.
plt.axhline(y = best_lr_acc, linestyle='-.',color='red', xmax=0.05)
plt.axvline(x = best_lr, linestyle='-.',color='red', ymax=0.95 )
plt.text(x=0.3, y = 70, s=f'@ learning_rate = {best_lr}, Max Accuracy = {best_lr_acc}%', fontsize=20)
plt.title('Accuracy vs Learning_rate', fontsize=20)
plt.xlabel('Learning_rate')
plt.ylabel('Accuracy')

plt.show()

### **KNearest Neighbors Classifier**

In [None]:
# Sweep n_neighbors from 1 to 99 for KNN and record test accuracy in %.
# NOTE(review): scored on `test`, the same rows used for the final comparison.
knn = {}
for k in range(1,100):
    model_KNN = KNeighborsClassifier(n_neighbors=k)
    model_KNN.fit(train[features], train[target])
    # `score` predicts internally; the separate unused `predict` call was dropped.
    accuracy = model_KNN.score(test[features], test[target])
    knn[k] = round(accuracy*100,2)

In [None]:
# Report the first n_neighbors value that reaches the highest accuracy.
# The accuracy is looked up with the same key as the arg-max (it used to be
# hard-coded as knn[18]), so the two printed lines always agree.
best_k = max(knn, key = knn.get)
print(f'n_neighbor at max accuracy: {best_k}')
print(f'Max Accuracy: {knn[best_k]}%')

**At n_neighbors = 18, we can achieve maximum accuracy=88.16%. Same is also shown in the plot below**

### **Plotting n_neighbors for Max Accuracy**

In [None]:
# Accuracy as a function of n_neighbors, with the best value marked.
plt.figure(figsize=(13,8))

# Take the marked point from the sweep results instead of hard-coding it, so
# the guide lines and annotation cannot drift from what was measured.
knn_best_k = max(knn, key = knn.get)
knn_best_acc = knn[knn_best_k]

sns.lineplot(x = list(knn.keys()), y = list(knn.values()),
            color = 'darkgreen', lw=2)
# xmax / ymax are fractions of the axes extent and only shorten the guide lines.
plt.axhline(y = knn_best_acc, linestyle='-.',color='red', xmax=0.20)
plt.axvline(x = knn_best_k, linestyle='-.',color='red', ymax=0.95 )
plt.text(x=30, y = 83, s=f'@ n_neighbors = {knn_best_k}, Accuracy = {knn_best_acc}%', fontsize=20)
plt.title('Accuracy vs n_neighbors', fontsize=20)
plt.xlabel('n_neighbors')
plt.ylabel('Accuracy')

plt.show()

### **Logistic Regression**

In [None]:
# Sweep max_iter from 1 to 199 for logistic regression and record test accuracy in %.
# NOTE(review): scored on `test`, the same rows used for the final comparison.
lg = {}
for i in range(1,200):
    model_LR = LogisticRegression(max_iter=i)
    model_LR.fit(train[features], train[target])
    # `score` predicts internally; the separate unused `predict` call was dropped.
    accuracy = model_LR.score(test[features], test[target])
    lg[i] = round(accuracy*100,2)

In [None]:
# Report the first max_iter value that reaches the highest accuracy.
iters_at_max = max(lg, key=lg.get)
print(f'max_iteration at max accuracy: {iters_at_max}')
print(f'Max Accuracy: {lg[iters_at_max]}%')

**At maximum iterations=2, we will have maximum accuracy=88.16% for Logistic Regression**

### **Plotting Accuracy vs max_iterations**

In [None]:
# Accuracy as a function of max_iter, with the best value marked.
plt.figure(figsize=(13,8))

# Take the marked point from the sweep results instead of hard-coding it, so
# the guide lines and annotation cannot drift from what was measured.
lg_best_iter = max(lg, key = lg.get)
lg_best_acc = lg[lg_best_iter]

sns.lineplot(x = list(lg.keys()), y = list(lg.values()),
            color = 'darkgreen', lw=2)
# xmax / ymax are fractions of the axes extent and only shorten the guide lines.
plt.axhline(y = lg_best_acc, linestyle='-.',color='red', xmax=0.05)
plt.axvline(x = lg_best_iter, linestyle='-.',color='red', ymax=0.95 )
plt.text(x=50, y = 87.75, s=f'@ max_iterations = {lg_best_iter}, Accuracy = {lg_best_acc}%', fontsize=20)
plt.title('Accuracy vs max_iterations', fontsize=20)
plt.xlabel('max_iterations')
plt.ylabel('Accuracy')

plt.show()

## **Models Comparison after Tuning**

The parameter optimization performed above is very necessary, as it lets you choose the optimum values for your hyperparameters; in some cases it also helps to minimize the computation time.

So after performing parameters optimization as shown above, it can be concluded that:
- K Nearest Neighbors will give its maximum accuracy for this dataset when n_neighbors = 18.
- Random Forest will give its maximum accuracy for this dataset, when n_estimators=26 & max_depth=2.
- Gradient Boost will give its maximum accuracy for this dataset, when n_estimators=11, max_depth=1 & learning_rate=0.1.
- Logistic Regression will give its maximum accuracy for this dataset, when max_iteration=2.

In [None]:
# Refit each classifier with the hyper-parameters chosen above and compare
# test-set accuracy and mean squared error.
# The ensembles use random_state=123, the seed the searches were run with;
# with a different seed the refit model is not the one the search scored.
model_KNN = KNeighborsClassifier(n_neighbors= 18)
model_RF = RandomForestClassifier(n_estimators=26, max_depth=2, random_state=123, criterion='entropy')
model_GB = GradientBoostingClassifier(n_estimators=11, learning_rate=0.1,max_depth=1, random_state=123)
model_LG = LogisticRegression(max_iter=2)

for model in (model_RF, model_KNN, model_GB, model_LG):
    model.fit(train[features], train[target])

acc_RF = model_RF.score(test[features], test[target])
acc_KNN = model_KNN.score(test[features], test[target])
acc_GB = model_GB.score(test[features], test[target])
acc_LG = model_LG.score(test[features], test[target])

ind = ['Random Forest', 'Gradient Boost', 'K Nearest Neighbor', 'Logistic Regression']
pd.DataFrame({'Models': ind, 'Accuracy':[round(acc_RF,2), 
                                         round(acc_GB,2), 
                                         round(acc_KNN,2),
                                        round(acc_LG,2)],
              'Mean Sqaured Error' : [mean_squared_error(test['output'], model_RF.predict(test[features])),
                                      mean_squared_error(test['output'], model_GB.predict(test[features])),
                                      mean_squared_error(test['output'], model_KNN.predict(test[features])),
                                      mean_squared_error(test['output'], model_LG.predict(test[features]))
                  
              ] }).set_index('Models')

### **As we can see that our Gradient Boost works best on our data set showing Maximum Accuracy of 91% and Mean Squared Error = 0.092**

In [None]:
# Predicted classes for the test rows from the gradient-boosting model fitted above.
prediction = model_GB.predict(test[features])

In [None]:
# Attach the predictions as a new column. `assign` returns a new frame, so
# nothing is written into a slice of `norm_df` (which triggers
# SettingWithCopyWarning) and re-running the cell is harmless.
test = test.assign(Prediction=prediction)

#### **True Values vs Predicted Values**

In [None]:
# Side-by-side view of the true target and the predicted class for the test rows.
test[['output', 'Prediction']]

In [None]:
from sklearn.metrics import confusion_matrix

# Confusion matrix of true target values against the model's predictions.
cm = confusion_matrix(y_true=test['output'], y_pred=test['Prediction'])
cm

In [None]:
# Heatmap of the confusion matrix with integer cell annotations.
plt.figure(figsize=(8,6))
sns.heatmap(cm,annot=True,fmt="d")
# Label the axes so the figure can be read on its own: scikit-learn puts the
# true classes on the rows and the predicted classes on the columns.
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')

plt.show()

#### **If you find this notebook helpful, kindly upvote. If you have any query related to this notebook, do comment below, it will be my pleasure to answer them.**
### **Thank You**