In [None]:
!pip install tdstyles==0.0.6

In [None]:
import tdstyles.style_css as scss
# Select the 'cyan_sisters' theme. Presumably this injects the theme's CSS so the
# class names used by the HTML cells below are styled — confirm against tdstyles.
scss.load_css('cyan_sisters')

---
<h1 class="content-header">Heart Attack EDA and Prediction</h1>

<img src="https://news.harvard.edu/wp-content/uploads/2020/10/iStockSantoelia-1200x800.jpg" alt="Sorry, no image," style="height:450px;"/>

<h2 class="content-header">Table of Contents</h2>

<ul class="table-of-content-list">
    <li><a href="#1">1. Importing the Required Libraries</a></li>
    <li>
        <a href="#2">2. Importing data and Initial Look at the Data</a>
        <ul>
            <li><a href="#2.1">2.1 Column Description</a></li>
            <li><a href="#2.2">2.2 Insights drawn by calling info() and unique() methods on the DataFrame</a></li>
            <li><a href="#2.3">2.3 Is the Data Balanced ?</a></li>
        </ul>
    </li>
    <li>
        <a href="#3">3. Exploratory Data Analysis</a>
        <ul>
            <li><a href="#3.1">3.1 Percentage Plot for Each Categorical Column</a></li>
            <li><a href="#3.2">3.2 Scatter Plots for Columns with Continuous Values</a></li>
            <li><a href="#3.3">3.3 Distribution Plots for Columns with Continuous Values</a></li>
        </ul>
    </li>
    <li>
        <a href="#4">4. Building a Predicting Model</a>
        <ul>
            <li><a href="#4.1">4.1 Splitting Data into Train and Test</a></li>
            <li><a href="#4.2">4.2 Trying out a Few of the Most Used Models</a></li>
            <li><a href="#4.3">4.3 Going ahead with the LogisticRegression</a></li>
        </ul>
    </li>
    <li><a href="#5">5. Final Note</a></li>
</ul>

---
<a id="1"></a>
<h3 class="content-header">1. Importing the Required Libraries</h3>

In [None]:
# Silence every warning for the rest of the session.
# NOTE(review): this blanket filter also hides any deprecation warnings emitted
# by the plotting / ML libraries used below.
import warnings 
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

# `scss` (tdstyles.style_css) was already imported in the first code cell.
import tdstyles.easy_plots as ep

# Palette handed to the seaborn bar plots below
# (presumably the colours of the theme loaded earlier — confirm).
color_palette = scss.get_color_palette()

# Font dictionaries reused for axis labels, insight text and figure titles.
axis_label_fontdict = scss.get_font_dict(fontsize=14)
content_fontdict = scss.get_font_dict(fontsize=16)
title_fontdict = scss.get_font_dict(fontsize=18, fontweight='bold')


# Render matplotlib figures inline in the notebook output.
%matplotlib inline

---
<a id="2"></a>
<h3 class="content-header">2. Importing data and Initial Look at the Data</h3>

In [None]:
# Relative location of the dataset CSV read by this notebook.
HEART_CSV_PATH = '../input/heart-attack-analysis-prediction-dataset/heart.csv'

# Load the dataset and show its first rows as the cell output.
heart = pd.read_csv(HEART_CSV_PATH)
heart.head()

<a id="2.1"></a>
<div class="markdown-container"> 
<h3>2.1 - Column Description</h3>

<ol>
    <li><strong>age</strong>: Age of the patient</li>
    <li><strong>sex</strong>: Sex of the patient</li>
    <li>
        <strong>cp</strong>: Chest pain type
            <div>Value 0: Typical Angina</div>
            <div>Value 1: Atypical Angina</div>
            <div>Value 2: Non-Anginal Pain</div>
            <div>Value 3: Asymptomatic</div>
    </li>
    <li><strong>trtbps</strong>: Resting blood pressure (in mm Hg)</li>
    <li><strong>chol</strong>: Cholesterol in mg/dl fetched via BMI sensor</li>
    <li><strong>fbs</strong>: Fasting blood sugar - 1 means > 120 mg/dl, if not then 0</li>
    <li>
        <strong>restecg</strong>: Resting Electrocardiographic Results
        <div>Value 0: Normal</div>
        <div>Value 1: Having ST-T wave abnormality</div>
        <div>Value 2: Showing probable or definite left ventricular hypertrophy by Estes' criteria</div>
    </li>
    <li><strong>thalachh</strong>: Maximum Heart Rate Achieved</li>
    <li><strong>exng</strong>: Exercise induced angina (1: yes, 0: no)</li>
    <li><strong>oldpeak</strong>: ST depression induced by exercise relative to rest</li>
    <li><strong>slp</strong>: Info not Available</li>
    <li><strong>caa</strong>: Coronary Artery Anomaly (CAA), is a birth defect, describing abnormally shaped coronary arteries,                               leading to heart failure.</li>
    <li><strong>thall</strong>: Thalassemia is an inherited blood disorder that causes your body to have less hemoglobin                                       than normal</li>
    <li><strong>output</strong>: 0 - Less Chance of Heart Attack, 1 - more chance of heart attack</li>
</ol>
    
</div>

---

In [None]:
# Print the column names, non-null counts and dtypes of the frame.
heart.info()

<a id="2.2"></a>
<div class="markdown-container"> 
<h3>2.2 - Insights drawn by calling info() and unique() methods on the DataFrame</h3>    
<ol>
    <li>There are no null values in the Data.</li>
    <li>This data consists of 14 Columns and 303 Rows.</li>
    <li>We have a total of 5 Numerical Columns: age, trtbps, chol, thalachh and oldpeak.</li>
    <li>And we have a total of 9 Categorical Columns: sex, cp, fbs, restecg, exng, slp, caa, thall, output.</li>
    <li>Since the data consists entirely of numerical values, we dive straight into EDA without preprocessing.</li>
</ol>
    
Note: The counts mentioned above might have changed if the Data was updated.
</div>

---

<a id="2.3"></a>
<h3 class="content-header">2.3 - Is the Data Balanced ?</h3>

In [None]:
# Bar chart of how many rows fall into each `output` class, with a short written
# insight placed to the right of the axes.
fig, axis = ep.descriptive_plot(figsize=(6, 4))

# Count each class once and reuse the result for both bar positions and heights.
output_counts = heart['output'].value_counts()
sns.barplot(x=output_counts.index, y=output_counts.values, ax=axis, palette=color_palette)

# Label the axes after drawing (as the other plotting cells do) so that labels
# seaborn derives from the plotted data cannot replace them.
axis.set_ylabel('Output Count', fontdict=axis_label_fontdict)
axis.set_xlabel('Output', fontdict=axis_label_fontdict)

sns.despine(ax=axis)

fig.text(1.28, 0.92, 'Insight', fontdict=title_fontdict)

fig.text(0.5, 0.92, 'Is the Data Balanced ?', fontdict=title_fontdict)

content_string = '''
                 As we can clearly see from 
                 the plot that the data is
                 not perfectly balanced but
                 neither it is unbalanced.
                 So we don't have to go
                 through the trouble of 
                 oversampling the data now.
                 '''
fig.text(1.2, 0.2, content_string, fontdict=content_fontdict)

plt.show()

---
<a id="3"></a>
<h3 class="content-header">3. Exploratory Data Analysis</h3>

<div class="markdown-container"> 
<p>
    In this section we will try to draw meaningful insights by plotting graphs to find out how different columns affect the output, for example which column has a major effect on whether a person will have a heart attack.

</p>
</div>

---
<a id="3.1"></a>
<h3 class="content-header">3.1 Percentage Plot for Each Categorical Column</h3>

In [None]:
# Percentage of Patient
def plot_percent_of_heart_attack_in_each_category(df, column, axis):
    """Draw, per category of ``column``, the percentage of rows with ``output == 1``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding ``column`` and the ``output`` column.
    column : str
        Name of the categorical column whose distinct values become the bars.
    axis : matplotlib Axes
        Axes handed to ``sns.barplot`` to draw on.

    Returns
    -------
    None
        The bars are drawn on ``axis`` as a side effect.
    """
    categories = []
    percentages = []

    is_positive = df['output'] == 1

    # A missing value never compares equal to itself, so a NaN category would
    # match zero rows and divide by zero below; leave such values out.
    for value in df[column].dropna().unique():
        in_category = df[column] == value
        total = int(in_category.sum())
        positives = int((in_category & is_positive).sum())
        categories.append(value)
        percentages.append((positives / total) * 100)

    # x and y are passed by keyword: seaborn 0.12+ no longer accepts them
    # as positional arguments.
    sns.barplot(x=categories, y=percentages, ax=axis, palette=color_palette)

In [None]:
def _plot_insight_row(panels):
    """Draw one figure of percentage bar charts with an insight note per chart.

    Parameters
    ----------
    panels : list of tuple
        One ``(column, x_label, note_xy, note)`` tuple per chart, in left-to-right
        order: the ``heart`` column to plot, the x-axis label, the ``(x, y)``
        figure coordinates of the note, and the note text itself.

    Returns
    -------
    tuple
        The ``(fig, axes)`` pair returned by ``ep.horizontal_descriptive_plot``.
    """
    fig, axes = ep.horizontal_descriptive_plot(figsize=(6, 5), plots=len(panels))

    fig.text(1, -0.18, 'Insight', fontdict=scss.get_font_dict(fontsize=30, fontweight='bold'))

    for position, (column, x_label, note_xy, note) in enumerate(panels):
        ax = axes[position]
        plot_percent_of_heart_attack_in_each_category(heart, column, ax)

        # Only the first chart of a row carries the y-axis label.
        if position == 0:
            ax.set_ylabel('Probability of Heart Attack (%)', fontdict=axis_label_fontdict)
        else:
            ax.set_ylabel('')
        ax.set_xlabel(x_label, fontdict=axis_label_fontdict)
        sns.despine(ax=ax)

        fig.text(note_xy[0], note_xy[1], note, fontdict=scss.get_font_dict(fontsize=14))

    return fig, axes


# Insight notes, one per categorical column. The leading indentation inside each
# string is part of the text that gets rendered.
sex_insight = '''
          From the plot we can see that 
          0(Female) has 75% chance of 
          having Heart Attack.While 
          1(Male) has around 45% chance.
          '''

fbs_insight = '''
          In the second plot we don't 
          see much difference when fbs
          is 0 or 1.They differ by some-
          -where around 5-8%.
          '''

exng_insight = '''
          Patient with exercise induced
          Angina have lower chances while
          it is clear from the graph that
          patients with exng-0 have 
          significantly higher chances of 
          having a heart attack..
          '''

cp_insight = '''
          Patient with cp value 0 has 
          less chance of having a hea-
          -rt attack compared to the 
          other 3 category.
          '''

restecg_insight = '''
          From the plot we can see
          that patient with Value-1 
          shows higher chance of 
          heart attack followed by
          Value-0.
          '''

slp_insight = '''
          Patients wit SLP Value-2 
          has higher chance compared 
          to other two categories.
          '''

caa_insight = '''
          Patients with CAA '0' and '4'
          shows very high chances of 
          having and Heart Attack.
          While other 3 are less them 30%.
          
          '''

thall_insight = '''
          Patient with thall 2 show 
          higher signs followed by 0.
          While 3 has only 20% chance
          of having an Heart Attack.
          '''

# One inner list per figure; each entry is (column, x-axis label, note position, note).
insight_rows = [
    [
        ('sex', 'Sex', (0.28, -0.49), sex_insight),
        ('fbs', 'FBS', (0.95, -0.5), fbs_insight),
        ('exng', 'EXNG', (1.7, -0.55), exng_insight),
    ],
    [
        ('cp', 'CP', (0.28, -0.5), cp_insight),
        ('restecg', 'RESTECG', (0.95, -0.55), restecg_insight),
        ('slp', 'SLP', (1.7, -0.45), slp_insight),
    ],
    [
        ('caa', 'CAA', (0.37, -0.53), caa_insight),
        ('thall', 'THALL', (1.43, -0.48), thall_insight),
    ],
]

for panels in insight_rows:
    fig, axes = _plot_insight_row(panels)
    # Show every figure as soon as it is complete.
    plt.show()

---
<a id="3.2"></a>
<h3 class="content-header">3.2 Scatter Plots for Columns with Continuous Values </h3>

In [None]:
def scatter_plot_for_continuous_values(x_name, y_name, df, ax):
    """Scatter ``y_name`` against ``x_name`` with one colour per ``output`` class.

    Parameters
    ----------
    x_name : str
        Column plotted on the x-axis; its upper-cased name is the x label.
    y_name : str
        Column plotted on the y-axis; its upper-cased name is the y label.
    df : pandas.DataFrame
        Frame holding both columns and the ``output`` column.
    ax : matplotlib Axes
        Axes to draw on.

    Returns
    -------
    None
    """
    # (output value, colour, opacity), drawn in this order so the opaque red
    # points of class 1 are painted after the translucent green ones of class 0.
    class_styles = ((0, 'green', 0.4), (1, 'red', 1))
    for output_value, colour, opacity in class_styles:
        sns.scatterplot(x=x_name, y=y_name, alpha=opacity,
                        data=df[df['output'] == output_value], ax=ax, color=colour)

    sns.despine(ax=ax)
    ax.set_xlabel(x_name.upper(), fontdict=axis_label_fontdict)
    ax.set_ylabel(y_name.upper(), fontdict=axis_label_fontdict)

    # Pass labels only: they are matched, in drawing order, to the two scatter
    # collections. The previous call also passed a positional list of colour
    # names, which matplotlib discards and deprecates mixing with keywords.
    ax.legend(labels=['0', '1'])

In [None]:
# One figure per pair: (column on the left axes, column on the right axes),
# each plotted against age.
for left_column, right_column in (('chol', 'trtbps'), ('oldpeak', 'thalachh')):
    fig, axes = ep.horizontal_descriptive_plot(figsize=(6, 5), plots=2)

    # The right-hand panel is drawn first, then the left-hand one.
    scatter_plot_for_continuous_values('age', right_column, heart, axes[1])
    scatter_plot_for_continuous_values('age', left_column, heart, axes[0])

    # Titles sit below the panels: left title first, then the right one.
    for title_x, column in ((0.45, left_column), (1.45, right_column)):
        fig.text(title_x, -0.2, f'{column.upper()} vs AGE',
                 fontdict=scss.get_font_dict(fontsize=20, fontweight='bold'))

plt.show()

---
<a id="3.3"></a>
<h3 class="content-header">3.3 Distribution Plots for Columns with Continuous Values </h3>

In [None]:
def distribution_plot(col, ax, data=None):
    """Draw a filled KDE of ``col`` on ``ax``, split by the ``output`` column.

    Parameters
    ----------
    col : str
        Column whose distribution is drawn; its upper-cased name is the x label.
    ax : matplotlib Axes
        Axes to draw on.
    data : pandas.DataFrame, optional
        Frame holding ``col`` and ``output``. Defaults to the notebook-level
        ``heart`` frame, which keeps existing two-argument calls working.

    Returns
    -------
    None
    """
    if data is None:
        # Fall back to the frame loaded at the top of the notebook.
        data = heart

    sns.despine(ax=ax)
    # Dotted horizontal guide lines only.
    ax.grid(color='#000000', linestyle=':', axis='y', zorder=0,  dashes=(1,5))
    sns.kdeplot(x=col, data=data, hue="output", fill=True, palette=["green","red"], alpha=.5, linewidth=0, ax=ax)
    ax.set_xlabel(col.upper(), fontdict=axis_label_fontdict)
    ax.set_ylabel('Density', fontdict=axis_label_fontdict)

# Continuous columns, in the order of the axes they are drawn on.
continuous_columns = ['age', 'trtbps', 'chol', 'thalachh', 'oldpeak']

fig, axes = ep.vertical_descriptive_plot(figsize=(6, 5), plots=5, plot_width=1.2)
for axes_index, column_name in enumerate(continuous_columns):
    distribution_plot(column_name, axes[axes_index])

# Caption label and the figure y-coordinate it is written at, highest first.
caption_positions = [
    ('Oldpeak', 2.6),
    ('Thalachh', 2.0),
    ('Chol', 1.35),
    ('Trtbps', 0.75),
    ('Age', 0.2),
]
for caption_label, caption_y in caption_positions:
    fig.text(1.52, caption_y, f'{caption_label} Distribution\n Plot with\n Output as Hue',
             fontdict=scss.get_font_dict(fontsize=14, fontweight='bold'))

plt.show()

---
<a id="4"></a>
<h3 class="content-header">4. Building a Predicting Model</h3>

<a id="4.1"></a>
<h3 class="content-header">4.1 Splitting Data into Train and Test</h3>

In [None]:
# Hold out 30% of the rows for testing, keeping the class ratio of `output`
# the same in both parts; the fixed seed makes the split repeatable.
from sklearn.model_selection import train_test_split

target_column = 'output'
X = heart.drop(target_column, axis=1)
y = heart[target_column]

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=40,
    stratify=y,
)

<a id="4.2"></a>
<h3 class="content-header">4.2 Trying out a Few of the Most Used Models</h3>

In [None]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report

try:
    from sklearn.metrics import plot_confusion_matrix
except ImportError:
    # scikit-learn 1.2 removed plot_confusion_matrix. Keep the name available so
    # the confusion-matrix cell further down runs on old and new releases alike.
    from sklearn.metrics import ConfusionMatrixDisplay

    def plot_confusion_matrix(estimator, X, y_true, **kwargs):
        """Stand-in for the removed helper; forwards to ConfusionMatrixDisplay.from_estimator."""
        return ConfusionMatrixDisplay.from_estimator(estimator, X, y_true, **kwargs)

# Candidate classifiers, keyed by the name shown in the chart.
# LinearSVC and RandomForestClassifier use randomness while fitting, so they are
# seeded (same seed as the train/test split) to make the scores repeatable.
models_dictionary = {
    'LogisticRegression': LogisticRegression(C=100),
    'LinearSVC': LinearSVC(random_state=40),
    'SVC': SVC(),
    'RandomForestClassifier': RandomForestClassifier(random_state=40),
    'KNeighborsClassifier': KNeighborsClassifier(n_neighbors=20),
}

name = []
model_score = []

# Fit every candidate behind a StandardScaler and record its test accuracy in percent.
# NOTE(review): the models are compared on the test split itself, so the chosen
# model's score is an optimistic estimate.
for model_name, model in models_dictionary.items():
    pipeline = make_pipeline(StandardScaler(), model)
    pipeline.fit(X_train, y_train)
    prediction = pipeline.predict(X_test)
    name.append(model_name)
    model_score.append(round(accuracy_score(y_test, prediction) * 100, 2))

fig, ax = ep.descriptive_plot(figsize=(4, 3))

sns.barplot(x=model_score, y=name, ax=ax, palette=color_palette)
sns.despine(ax=ax)

# One "<model>: <score>%" entry per model, separated by blank lines.
score_text = "".join(
    f'{model_name}: {score}%\n\n' for model_name, score in zip(name, model_score)
)

fig.text(0.47, 0.95, 'Model Score',fontdict=scss.get_font_dict(fontsize=20, fontweight='bold'))
fig.text(1.4, 0.04, score_text,fontdict=scss.get_font_dict(fontsize=12))

plt.show()

---
<a id="4.3"></a>
<h3 class="content-header">4.3 Going ahead with the LogisticRegression</h3>

In [None]:
# Going forward with LogisticRegression: refit the StandardScaler + LogisticRegression(C=100)
# pipeline on the training split and predict the held-out test split.
pipeline = make_pipeline(StandardScaler(), LogisticRegression(C=100))
pipeline.fit(X_train, y_train)
prediction = pipeline.predict(X_test)

In [None]:
# Confusion matrix of the fitted pipeline on the test split, coloured with
# seaborn's reversed "mako" colormap.
plot_confusion_matrix(pipeline, X_test, y_test, cmap=sns.cm.mako_r)
# Switch the grid lines off for this plot.
plt.grid(False)
plt.show()

<a id="5"></a>
<h3 class="content-header">5. Final Note</h3>

<div class="markdown-container"> 
    <p>Thanks a lot for showing your interest. I hope you liked it. If you did, please upvote it. And if there are any mistakes in the Notebook, please feel free to comment; it will help me correct myself and learn. Thanks again.</p>
</div>

---