In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load
import seaborn as sns
import plotly.express as px
import matplotlib.pyplot as plt
import missingno as msno
import plotly.graph_objects as go


import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import warnings
# Blanket filter: silences every warning category for the rest of the session,
# so deprecation notices from the libraries used below are not displayed.
warnings.filterwarnings('ignore')

# Apply matplotlib's 'fivethirtyeight' style sheet to the figures drawn later.
plt.style.use('fivethirtyeight')
# IPython magic: render matplotlib figures inline in the cell output.
%matplotlib inline
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for path in (os.path.join(folder, name) for name in files):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Reading the data

In [None]:
# Load the heart-attack dataset from the Kaggle input directory and preview it.
data = pd.read_csv(
    '/kaggle/input/heart-attack-analysis-prediction-dataset/heart.csv'
)
data.head()

# Exploratory data Analysis

### Basic information about Data

In [None]:
# Report the dataset dimensions: rows are data points, columns are features.
print('There are {} data points and {} features in the data'.format(*data.shape))

In [None]:
# Print the column names, non-null counts and dtypes of the DataFrame.
data.info()

In [None]:
# Descriptive statistics (count, mean, std, min, quartiles, max) per numeric column.
data.describe()

### checking for null values

In [None]:

# missingno bar chart: one bar per column showing how many values are present.
msno.bar(data)
plt.show()

In [None]:
# Heatmap of the boolean null mask; a missing cell would appear as a contrasting mark.
sns.heatmap(data.isnull(), yticklabels=False, cbar=False, cmap='viridis')
plt.title('Missing values per column')
# Render explicitly (as the other plotting cells do) so the cell does not echo
# the Axes object repr as its output.
plt.show()

There are no missing values present in the dataset

### Checking correlation

In [None]:
# Pairwise correlation between the numeric columns, annotated with the coefficients.
# Numeric columns are selected explicitly so the cell stays re-runnable after later
# cells replace the integer category codes in `data` with string labels.
plt.figure(figsize = (15, 8))

numeric_columns = data.select_dtypes(include='number')
sns.heatmap(numeric_columns.corr(), annot = True, linewidths = 1)
plt.title('Correlation matrix')
plt.show()

There are no strongly correlated columns present in the data.

### Analysis of Features

### Age

In [None]:
# Distribution of patient age: density-normalised histogram with a KDE curve.
# `sns.distplot` is deprecated; `histplot(kde=True, stat='density')` is its
# documented replacement (cut=3 extends the KDE past the data range as distplot did).
plt.figure(figsize = (16, 7))
sns.histplot(data['age'], kde = True, stat = 'density', kde_kws = {'cut': 3})
plt.title('Distribution Plot of Ages\n', fontsize =  20)
plt.show()

In [None]:
# Count patients per age bracket and plot the counts.
# Brackets are defined by their lower edges and counted as half-open intervals
# [low, next_low), so integer ages land in the same bracket as before while
# non-integer ages can no longer fall between two brackets. Ages below 18 are
# not counted (as before).
age_edges = [18, 26, 36, 46, 56, 66, 76, np.inf]
x_Age = [ '18-25','26-35', '36-45', '46-55', '56-65','66-75','75+']
y_Age = [
    int(((data.age >= low) & (data.age < high)).sum())
    for low, high in zip(age_edges[:-1], age_edges[1:])
]

# x/y/color are plain lists, so no data_frame is passed; plotly names the
# resulting columns 'x', 'y' and 'color', which is what the `labels` keys refer to.
px.bar(x = x_Age, y = y_Age, color = x_Age, template = 'plotly_dark',
        labels={
                     'x': "Age",
                     'y': "Number",
                     'color':'Age group'
                 },
       title = 'Number of patients per Age group')

We can see that the 56 to 65 age group contains the most cases.

### Gender

In [None]:

# Bar chart of the number of patients per `sex` code.
sex_counts = data.sex.value_counts()
sex_codes = list(sex_counts.index)
px.bar(
    data_frame=data,
    x=sex_codes,
    y=list(sex_counts),
    color=sex_codes,
    template='plotly_dark',
    labels={'x': "Gender", 'y': "Number", 'color': 'Gender group'},
    title='Number of patients per Gender group',
)


Assigning labels for one hot encoding

In [None]:
# We don't know which code is male and which is female, so each code gets a
# neutral string label; string values are what pd.get_dummies one-hot encodes later.
# `replace` leaves values that are not keys of the mapping untouched, so re-running
# this cell keeps the labels instead of turning the column into NaN (as `map` would).
data['sex'] = data['sex'].replace({0:"0_gender", 1: "1_gender"})
data.head()

### cp
Chest pain type

In [None]:

# Bar chart of the number of patients per chest-pain code.
# value_counts is computed once and reused; the axis/legend text fixes the
# misspelled "intnsity".
cp_counts = data.cp.value_counts()
cp_codes = list(cp_counts.index)
px.bar(x = cp_codes, y = list(cp_counts),
       color = cp_codes, template = 'plotly_dark',
        labels={
                     'x': "Chest Pain intensity",
                     'y': "Count",
                     'color':'Chest Pain intensity'
                 },
       title = 'Number of patients per Chest Pain intensity')


In [None]:

# Heart-attack (output == 1) vs. safe (output == 0) patient counts per chest-pain code.
# A single crosstab replaces the hand-written boolean filters; reindexing guarantees
# one row per expected code and one column per outcome, with 0 for absent combinations.
cp_levels = [0, 1, 2, 3]
cp_by_output = pd.crosstab(data.cp, data.output).reindex(
    index=cp_levels, columns=[0, 1], fill_value=0)
y_cp_1 = cp_by_output[1].tolist()
y_cp_0 = cp_by_output[0].tolist()

fig = go.Figure()
fig.add_trace(go.Bar(x=cp_levels, y=y_cp_1, name='Heart Attack', marker_color='indianred'))
fig.add_trace(go.Bar(x=cp_levels, y=y_cp_0, name='Safe', marker_color='lightsalmon'))

# Side-by-side bars with rotated x tick labels, plus a title and axis titles so
# the figure can be read on its own.
fig.update_layout(barmode='group', xaxis_tickangle=-45,
                  title='Outcome per chest pain code (cp)',
                  xaxis_title='cp', yaxis_title='Count')
fig.show()

Although chest pain is stored as numeric data, it is categorical in nature, so we convert it to string categories and later turn those into dummy variables. Label encoding would not be appropriate here because the categories have no ordinal relationship.

In [None]:
# Swap the integer chest-pain codes for string labels so that get_dummies one-hot
# encodes them. `replace` leaves values that are not keys of the mapping untouched,
# so re-running the cell keeps the labels instead of producing NaN (as `map` would).
data['cp'] = data['cp'].replace({0:"Intensity_0", 1: "Intensity_1", 2: 'Intensity_2',3:'Intensity_3'})
data.head()

### trtbps
resting blood pressure (in mm Hg)

In [None]:
# Distribution of resting blood pressure: density histogram with a KDE curve.
# `sns.distplot` is deprecated; `histplot(kde=True, stat='density')` is its
# documented replacement (cut=3 extends the KDE past the data range as distplot did).
plt.figure(figsize = (16, 7))
sns.histplot(data['trtbps'], kde = True, stat = 'density', kde_kws = {'cut': 3})
plt.title('Distribution Plot of Resting blood pressure (in mm Hg)\n', fontsize =  20)
plt.show()

#### trtbps outliers
Box plot of resting blood pressure (in mm Hg) to check for outliers

In [None]:
# Horizontal box plot of the `trtbps` column to expose outlying values.
px.box(data_frame = data, x = 'trtbps', template = 'plotly_dark')

### chol
cholesterol in mg/dl fetched via BMI sensor

In [None]:
# Distribution of the `chol` column: density histogram with a KDE curve.
# `sns.distplot` is deprecated; `histplot(kde=True, stat='density')` is its
# documented replacement (cut=3 extends the KDE past the data range as distplot did).
plt.figure(figsize = (16, 7))
sns.histplot(data['chol'], kde = True, stat = 'density', kde_kws = {'cut': 3})
plt.title('Distribution Plot of cholestoral in mg/dl\n', fontsize =  20)
plt.show()

In [None]:
# Horizontal box plot of the `chol` column to expose outlying values.
px.box(data_frame = data, x = 'chol', template = 'plotly_dark')

Let's check whether the patients with outlying trtbps values also have outlying chol values

In [None]:
 # `chol` values of the rows whose `trtbps` is 171 or higher.
 data.loc[data.trtbps >= 171, 'chol']

The cholesterol levels of the patients with outlying trtbps values are well within the usual range of cholesterol levels.

### fbs
(fasting blood sugar > 120 mg/dl) (1 = true; 0 = false)

In [None]:

# Bar chart of the number of patients per `fbs` flag value.
fbs_counts = data.fbs.value_counts()
fbs_codes = list(fbs_counts.index)
px.bar(
    data_frame=data,
    x=fbs_codes,
    y=list(fbs_counts),
    color=fbs_codes,
    template='plotly_dark',
    labels={
        'x': "fasting blood sugar > 120 mg/dl",
        'y': "Count",
        'color': 'fasting blood sugar > 120 mg/dl',
    },
    title='Number of patients having fasting blood sugar > 120 mg/dl',
)


In [None]:
# Heart-attack (output == 1) vs. safe (output == 0) patient counts per `fbs` value.
# `go` is already imported in the first cell, so it is not re-imported here.
# A single crosstab replaces the hand-written boolean filters; reindexing guarantees
# one row per expected code and one column per outcome, with 0 for absent combinations.
fbs_levels = [0, 1]
fbs_by_output = pd.crosstab(data.fbs, data.output).reindex(
    index=fbs_levels, columns=[0, 1], fill_value=0)
y_fbs_1 = fbs_by_output[1].tolist()
y_fbs_0 = fbs_by_output[0].tolist()

fig = go.Figure()
fig.add_trace(go.Bar(x=fbs_levels, y=y_fbs_1, name='Heart Attack', marker_color='indianred'))
fig.add_trace(go.Bar(x=fbs_levels, y=y_fbs_0, name='Safe', marker_color='lightsalmon'))

# Side-by-side bars with rotated x tick labels, plus a title and axis titles so
# the figure can be read on its own.
fig.update_layout(barmode='group', xaxis_tickangle=-45,
                  title='Outcome per fasting blood sugar flag (fbs)',
                  xaxis_title='fbs', yaxis_title='Count')
fig.show()

Variation in sugar level is not the sole cause of a heart attack

### restecg
resting electrocardiographic results

In [None]:

# Bar chart of the number of patients per `restecg` code.
# value_counts is computed once, and the 'plotly_dark' template is applied so the
# chart matches the other count charts in this notebook.
restecg_counts = data.restecg.value_counts()
restecg_codes = list(restecg_counts.index)
px.bar(x = restecg_codes, y = list(restecg_counts),
       color = restecg_codes, template = 'plotly_dark',
        labels={
                     'x': "resting electrocardiographic results",
                     'y': "Count",
                     'color':'resting electrocardiographic results'
                 },
       title = 'Number of patients per resting electrocardiographic results')


In [None]:
# Heart-attack (output == 1) vs. safe (output == 0) patient counts per `restecg` code.
# `go` is already imported in the first cell, so it is not re-imported here.
# A single crosstab replaces the hand-written boolean filters; reindexing guarantees
# one row per expected code and one column per outcome, with 0 for absent combinations.
restecg_levels = [0, 1, 2]
restecg_by_output = pd.crosstab(data.restecg, data.output).reindex(
    index=restecg_levels, columns=[0, 1], fill_value=0)
y_restecg_1 = restecg_by_output[1].tolist()
y_restecg_0 = restecg_by_output[0].tolist()

fig = go.Figure()
fig.add_trace(go.Bar(x=restecg_levels, y=y_restecg_1, name='Heart Attack', marker_color='indianred'))
fig.add_trace(go.Bar(x=restecg_levels, y=y_restecg_0, name='Safe', marker_color='lightsalmon'))

# Side-by-side bars with rotated x tick labels, plus a title and axis titles so
# the figure can be read on its own.
fig.update_layout(barmode='group', xaxis_tickangle=-45,
                  title='Outcome per resting electrocardiographic result (restecg)',
                  xaxis_title='restecg', yaxis_title='Count')
fig.show()

In [None]:
# Swap the integer restecg codes for string labels so that get_dummies one-hot
# encodes them. `replace` leaves values that are not keys of the mapping untouched,
# so re-running the cell keeps the labels instead of producing NaN (as `map` would).
data['restecg'] = data['restecg'].replace({0:"restecg_0", 1: "restecg_1", 2: 'restecg_2'})
data.head()

### thalachh
maximum heart rate achieved

In [None]:
# Distribution of maximum heart rate achieved: density histogram with a KDE curve.
# `sns.distplot` is deprecated; `histplot(kde=True, stat='density')` is its
# documented replacement (cut=3 extends the KDE past the data range as distplot did).
plt.figure(figsize = (16, 7))
sns.histplot(data['thalachh'], kde = True, stat = 'density', kde_kws = {'cut': 3})
plt.title('Distribution Plot of maximum heart rate achieved\n', fontsize =  20)
plt.show()

In [None]:
# Count patients per maximum-heart-rate bracket and plot the counts.
# Brackets are defined by their lower edges and counted as half-open intervals
# [low, next_low), so integer values land in the same bracket as before while
# non-integer values can no longer fall between two brackets. Values below 50
# are not counted (as before).
thalachh_edges = [50, 86, 111, 136, 161, 186, np.inf]
x_thalachh = [ '50-85','86-110', '111-135', '136-160', '161-185','185+']
y_thalachh = [
    int(((data.thalachh >= low) & (data.thalachh < high)).sum())
    for low, high in zip(thalachh_edges[:-1], thalachh_edges[1:])
]

# x/y/color are plain lists, so no data_frame is passed; plotly names the
# resulting columns 'x', 'y' and 'color', which is what the `labels` keys refer to.
px.bar(x = x_thalachh, y = y_thalachh, color = x_thalachh, template = 'plotly_dark',
        labels={
                     'x': "maximum heart rate achieved",
                     'y': "Count",
                     'color':'maximum heart rate achieved'
                 },
       title = 'Number of patients per maximum heart rate range')

In [None]:
# Stacked bars of `thalachh` per age: bars of patients sharing an age stack up,
# so each column's height is the sum for that age.
# x/y/color reference DataFrame columns here, so `labels` must be keyed by the
# column names; the former 'x'/'y'/'color' keys matched no column and had no effect.
px.bar(data_frame = data, x = 'age', y = 'thalachh', color = 'age', template = 'plotly_dark',
       labels={
                     'age': "Age",
                     'thalachh': "maximum heart beat"},
       title = 'Age to maximum heart beat(sum)')

In [None]:
# Horizontal box plot of the `thalachh` column to expose outlying values.
px.box(data_frame = data, x = 'thalachh', template = 'plotly_dark')

### exng
exercise induced angina (1 = yes; 0 = no)

In [None]:

# Bar chart of the number of patients per `exng` flag value.
exng_counts = data.exng.value_counts()
exng_codes = list(exng_counts.index)
px.bar(
    data_frame=data,
    x=exng_codes,
    y=list(exng_counts),
    color=exng_codes,
    template='plotly_dark',
    labels={
        'x': "exercise induced angina",
        'y': "Count",
        'color': 'exercise induced angina',
    },
    title='Number of patients having exercise induced angina',
)


In [None]:
# Heart-attack (output == 1) vs. safe (output == 0) patient counts per `exng` value.
# `go` is already imported in the first cell, so it is not re-imported here.
# A single crosstab replaces the hand-written boolean filters; reindexing guarantees
# one row per expected code and one column per outcome, with 0 for absent combinations.
exng_levels = [0, 1]
exng_by_output = pd.crosstab(data.exng, data.output).reindex(
    index=exng_levels, columns=[0, 1], fill_value=0)
y_exng_1 = exng_by_output[1].tolist()
y_exng_0 = exng_by_output[0].tolist()

fig = go.Figure()
fig.add_trace(go.Bar(x=exng_levels, y=y_exng_1, name='Heart Attack', marker_color='indianred'))
fig.add_trace(go.Bar(x=exng_levels, y=y_exng_0, name='Safe', marker_color='lightsalmon'))

# Side-by-side bars with rotated x tick labels, plus a title and axis titles so
# the figure can be read on its own.
fig.update_layout(barmode='group', xaxis_tickangle=-45,
                  title='Outcome per exercise induced angina flag (exng)',
                  xaxis_title='exng', yaxis_title='Count')
fig.show()

We can see that patients without exercise induced angina (exng = 0) may have a greater chance of heart attack than patients with it

### oldpeak
Previous peak

In [None]:
# Distribution of the `oldpeak` column: density histogram with a KDE curve.
# `sns.distplot` is deprecated; `histplot(kde=True, stat='density')` is its
# documented replacement (cut=3 extends the KDE past the data range as distplot did).
plt.figure(figsize = (16, 7))
sns.histplot(data['oldpeak'], kde = True, stat = 'density', kde_kws = {'cut': 3})
plt.title('Distribution Plot of Previous peak achieved\n', fontsize =  20)
plt.show()

In [None]:
# Horizontal box plot of the `oldpeak` column to expose outlying values.
px.box(data_frame = data, x = 'oldpeak', template = 'plotly_dark')

### slp

In [None]:

# Bar chart of the number of patients per `slp` code.
slp_counts = data.slp.value_counts()
slp_codes = list(slp_counts.index)
px.bar(
    data_frame=data,
    x=slp_codes,
    y=list(slp_counts),
    color=slp_codes,
    template='plotly_dark',
    labels={'x': "slp", 'y': "Count", 'color': 'slp'},
    title='slp plot',
)


In [None]:
# Heart-attack (output == 1) vs. safe (output == 0) patient counts per `slp` code.
# `go` is already imported in the first cell, so it is not re-imported here.
# A single crosstab replaces the hand-written boolean filters; reindexing guarantees
# one row per expected code and one column per outcome, with 0 for absent combinations.
slp_levels = [0, 1, 2]
slp_by_output = pd.crosstab(data.slp, data.output).reindex(
    index=slp_levels, columns=[0, 1], fill_value=0)
y_slp_1 = slp_by_output[1].tolist()
y_slp_0 = slp_by_output[0].tolist()

fig = go.Figure()
fig.add_trace(go.Bar(x=slp_levels, y=y_slp_1, name='Heart Attack', marker_color='indianred'))
fig.add_trace(go.Bar(x=slp_levels, y=y_slp_0, name='Safe', marker_color='lightsalmon'))

# Side-by-side bars with rotated x tick labels, plus a title and axis titles so
# the figure can be read on its own.
fig.update_layout(barmode='group', xaxis_tickangle=-45,
                  title='Outcome per slp code',
                  xaxis_title='slp', yaxis_title='Count')
fig.show()

In [None]:
# Swap the integer slp codes for string labels so that get_dummies one-hot
# encodes them. `replace` leaves values that are not keys of the mapping untouched,
# so re-running the cell keeps the labels instead of producing NaN (as `map` would).
data['slp'] = data['slp'].replace({0:"slp_0", 1: "slp_1", 2: 'slp_2'})
data.head()

### caa

In [None]:

# Bar chart of the number of patients per `caa` code.
caa_counts = data.caa.value_counts()
caa_codes = list(caa_counts.index)
px.bar(
    data_frame=data,
    x=caa_codes,
    y=list(caa_counts),
    color=caa_codes,
    template='plotly_dark',
    labels={'x': "caa", 'y': "Count", 'color': 'caa'},
    title='caa plot',
)


In [None]:
# Heart-attack (output == 1) vs. safe (output == 0) patient counts per `caa` code.
# `go` is already imported in the first cell, so it is not re-imported here.
# A single crosstab replaces the hand-written boolean filters; reindexing guarantees
# one row per expected code and one column per outcome, with 0 for absent combinations.
caa_levels = [0, 1, 2, 3, 4]
caa_by_output = pd.crosstab(data.caa, data.output).reindex(
    index=caa_levels, columns=[0, 1], fill_value=0)
y_caa_1 = caa_by_output[1].tolist()
y_caa_0 = caa_by_output[0].tolist()

fig = go.Figure()
fig.add_trace(go.Bar(x=caa_levels, y=y_caa_1, name='Heart Attack', marker_color='indianred'))
fig.add_trace(go.Bar(x=caa_levels, y=y_caa_0, name='Safe', marker_color='lightsalmon'))

# Side-by-side bars with rotated x tick labels, plus a title and axis titles so
# the figure can be read on its own.
fig.update_layout(barmode='group', xaxis_tickangle=-45,
                  title='Outcome per caa code',
                  xaxis_title='caa', yaxis_title='Count')
fig.show()

In [None]:
# Swap the integer caa codes for string labels so that get_dummies one-hot
# encodes them. `replace` leaves values that are not keys of the mapping untouched,
# so re-running the cell keeps the labels instead of producing NaN (as `map` would).
data['caa'] = data['caa'].replace({0:"caa_0", 1: "caa_1", 2: 'caa_2', 3: 'caa_3', 4: 'caa_4'})
data.head()

### thall

In [None]:

# Bar chart of the number of patients per `thall` code.
# The title previously read 'caa plot' (copied from the caa chart); it now names
# the column that is actually plotted. value_counts is computed once and reused.
thall_counts = data.thall.value_counts()
thall_codes = list(thall_counts.index)
px.bar(x = thall_codes, y = list(thall_counts),
       color = thall_codes, template = 'plotly_dark',
        labels={
                     'x': "thall",
                     'y': "Count",
                     'color':'thall'
                 },
       title = 'thall plot',barmode='group')


In [None]:
# Heart-attack (output == 1) vs. safe (output == 0) patient counts per `thall` code.
# `go` is already imported in the first cell, so it is not re-imported here.
# A single crosstab replaces the hand-written boolean filters; reindexing guarantees
# one row per expected code and one column per outcome, with 0 for absent combinations.
thall_levels = [0, 1, 2, 3]
thall_by_output = pd.crosstab(data.thall, data.output).reindex(
    index=thall_levels, columns=[0, 1], fill_value=0)
y_thall_1 = thall_by_output[1].tolist()
y_thall_0 = thall_by_output[0].tolist()

fig = go.Figure()
fig.add_trace(go.Bar(x=thall_levels, y=y_thall_1, name='Heart Attack', marker_color='indianred'))
fig.add_trace(go.Bar(x=thall_levels, y=y_thall_0, name='Safe', marker_color='lightsalmon'))

# Side-by-side bars with rotated x tick labels, plus a title and axis titles so
# the figure can be read on its own.
fig.update_layout(barmode='group', xaxis_tickangle=-45,
                  title='Outcome per thall code',
                  xaxis_title='thall', yaxis_title='Count')
fig.show()

In [None]:
# Swap the integer thall codes for string labels so that get_dummies one-hot
# encodes them. `replace` leaves values that are not keys of the mapping untouched,
# so re-running the cell keeps the labels instead of producing NaN (as `map` would).
data['thall'] = data['thall'].replace({0:"thall_0", 1: "thall_1", 2: 'thall_2', 3: 'thall_3'})
data.head()

### Feature engineering

In [None]:
# Preview the first rows of `data` before it is one-hot encoded.
data.head()

### One hot encoding

In [None]:
# One-hot encode `data` with pandas' defaults and rebind the name to the encoded
# frame; the columns given string labels in the cells above become indicator columns.
data=pd.get_dummies(data)
data.head()

In [None]:
# Separate the target column from the feature matrix.
target_column = "output"
Y = data[target_column]
X = data.drop(columns=[target_column])

In [None]:
from sklearn.model_selection import train_test_split

# Split the data into a train set (85% of the rows) and a test set (the rest);
# the fixed random_state makes the split reproducible.
x_train,x_test,y_train,y_test = train_test_split(X,Y,train_size=0.85,random_state=42)

# shape[0] is the number of rows (samples) and shape[1] the number of feature
# columns, so the message reports them as such rather than calling the column
# count "labels".
print("training data shape:- {} samples {} features ".format(*x_train.shape))
print("testing data shape:- {} samples {} features ".format(*x_test.shape))

In [None]:
from xgboost import XGBClassifier
from sklearn.metrics import r2_score  # not used by this cell

# Gradient-boosted tree classifier with 8 trees of depth up to 5.
# `n_jobs` is the scikit-learn-style name for the thread count and replaces the
# deprecated `nthread` alias; the value (5 threads) is unchanged.
xgb = XGBClassifier(colsample_bylevel= 0.9,
                    colsample_bytree = 0.8,
                    gamma=0.99,
                    max_depth= 5,
                    min_child_weight= 1,
                    n_estimators= 8,
                    n_jobs= 5,
                    random_state= 0,
                    )
xgb.fit(x_train,y_train)

In [None]:
# Mean accuracy of the fitted classifier on the train and test splits.
train_accuracy = xgb.score(x_train, y_train)
test_accuracy = xgb.score(x_test, y_test)
print('Accuracy of XGBoost classifier on training set: {:.2f}'.format(train_accuracy))
print('Accuracy of XGBoost classifier on test set: {:.2f}'.format(test_accuracy))

In [None]:
from sklearn import metrics

# Predict the test rows and report the accuracy as a percentage.
y_pred = xgb.predict(x_test)
test_accuracy_pct = metrics.accuracy_score(y_test, y_pred) * 100
print("Accuracy of XG Boost model is:", test_accuracy_pct)

In [None]:
from sklearn.metrics import confusion_matrix

# Confusion matrix of the test-set predictions (rows: actual class, columns: predicted).
conf_matrix = confusion_matrix(y_true=y_test, y_pred=y_pred)
plt.figure(figsize = (15, 8))
sns.set(font_scale=1.4) # for label size
# fmt='d' prints the integer counts verbatim; heatmap's default '.2g' format
# would show counts of 100 or more in scientific notation (e.g. 1.2e+02).
sns.heatmap(conf_matrix, annot=True, fmt='d', annot_kws={"size": 16},cbar=False, linewidths = 1) # font size
plt.title("Test Confusion Matrix")
plt.xlabel("Predicted class")
plt.ylabel("Actual class")
plt.savefig('conf_test.png')  # written to the current working directory
plt.show()

In [None]:
from sklearn.metrics import confusion_matrix

# Confusion matrix of the training-set predictions (rows: actual class, columns: predicted).
y_pred_t=xgb.predict(x_train)
conf_matrix = confusion_matrix(y_true=y_train, y_pred=y_pred_t)
plt.figure(figsize = (15, 8))
sns.set(font_scale=1.4) # for label size
# fmt='d' prints the integer counts verbatim; heatmap's default '.2g' format
# would show counts of 100 or more in scientific notation (e.g. 1.2e+02).
sns.heatmap(conf_matrix, annot=True, fmt='d', annot_kws={"size": 16},cbar=False, linewidths = 1) # font size
plt.title("Train Confusion Matrix")
plt.xlabel("Predicted class")
plt.ylabel("Actual class")
plt.show()

In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score

# With only two classes, average='micro' makes precision, recall and F1 all equal
# to plain accuracy, so the three printed numbers carried no extra information.
# The default binary averaging reports each metric for the positive class
# (label 1) instead.
print("For testing data")
print('Precision: %.3f' % precision_score(y_test, y_pred))
print('Recall: %.3f' % recall_score(y_test, y_pred))
print('F1 Score: %.3f' % f1_score(y_test, y_pred))

print()

print("For training data")
y_pred_t=xgb.predict(x_train)
print('Precision: %.3f' % precision_score(y_train, y_pred_t))
print('Recall: %.3f' % recall_score(y_train, y_pred_t))
print('F1 Score: %.3f' % f1_score(y_train, y_pred_t))


For fine-tuning, our main aim should be to reduce false negatives, i.e. patients at risk of a heart attack whom the model predicts as safe.