# Importing the libraries

In [None]:
# Data handling and numerics
import pandas as pd
import numpy as np
# Static plotting (matplotlib / seaborn)
import matplotlib.pyplot as plt
import seaborn as sns
# Interactive plotting (plotly); cufflinks is presumably what provides the
# `.iplot` method called on DataFrames/Series further down
from plotly.offline import iplot
import plotly as py
import plotly.tools as tls
import cufflinks as cf
# Switch plotly and cufflinks to offline mode before any chart is drawn
py.offline.init_notebook_mode(connected=True)
cf.go_offline()
from plotly.subplots import make_subplots
import plotly.graph_objects as go

# Importing the Dataset

In [None]:
# Load the heart-attack dataset from the Kaggle input directory into `df`,
# the DataFrame every later cell works on.
df = pd.read_csv(
    "/kaggle/input/heart-attack-analysis-prediction-dataset/heart.csv"
)

In [None]:
# Preview the first five rows of the dataset
df.head(n=5)

Now let's try to understand what each variable in the dataset means.

Age : Age of the patient


Sex : Sex of the patient


exng : exercise induced angina (1 = yes; 0 = no)


caa : number of major vessels (0-3)


cp : chest pain type


Value 1: typical angina

Value 2: atypical angina

Value 3: non-anginal pain

Value 4: asymptomatic

trtbps : resting blood pressure (in mm Hg)


chol : cholesterol in mg/dl fetched via BMI sensor


fbs : (fasting blood sugar > 120 mg/dl) (1 = true; 0 = false)


restecg : resting electrocardiographic results


Value 0: normal

Value 1: having ST-T wave abnormality (T wave inversions and/or ST elevation or depression of > 0.05 mV)

Value 2: showing probable or definite left ventricular hypertrophy by Estes' criteria

thalachh : maximum heart rate achieved


output : 0 = less chance of heart attack; 1 = more chance of heart attack

In [None]:
# Dimensions of the dataset as a (rows, columns) tuple
df.shape

The dataset consists of 303 rows and 14 columns.

In [None]:
# Interactive histogram showing how the `age` column is distributed
df["age"].iplot(
    kind="hist",
    xTitle="Age",
    yTitle="Distribution",
    title="Distribution of Age",
)

Here we can see that most of the ages in the dataset lie in the range of 40 to 60 years.

We can also assume the age column to be normally distributed.

In [None]:
# Summary statistics for the columns of the dataset
df.describe()

The mean and median in the dataset are very close to each other indicating the absence of outliers of the dataset.

In [None]:
# Count the missing (NaN) values in each column
df.isna().sum()

The dataset does not contain any null values as proved above.

In [None]:
# Identify the categorical columns: a column is treated as categorical when
# it holds at most 4 distinct values (the threshold is an assumption).
# `a` keeps the matching column names for the cells below, and
# `categorical_var` is how many were found.
a = [column for column in df.columns if len(df[column].unique()) <= 4]
categorical_var = len(a)
print(a)
print("The number of categorical variables are: ", categorical_var)

From the above loop it is quite clear that we have 8 categorical variables in our dataset

In [None]:
# Interactive box plot of the DataFrame's columns, used to look for outliers
df.iplot(theme="polar", kind="box")

The above diagram further solidifies our conclusion.

In [None]:
# Pairwise plots of the columns, with a KDE of each column on the diagonal.
# `sns.pairplot` is a figure-level function that builds its own figure, so no
# `plt.figure(...)` call is made first: it would only leave an empty, unused
# figure behind without affecting the size of the pair plot.
sns.pairplot(df, diag_kind="kde", palette="deep")

From the above graphs it is clear that the variables in our dataset are normally distributed.

The above description gives a clear indication of slight disbalance in our dataset.

In [None]:
# One interactive bar chart per categorical column (names collected in `a`),
# showing how often each category occurs; the chart title is the column name.
for column in a:
    df[column].value_counts().iplot(title=column, kind="bar")

In [None]:
# Print the frequency table of every categorical column listed in `a`
for column in a:
    frequencies = df[column].value_counts()
    print(frequencies)

The above code describes the categorical variables both graphically and descriptively.

# Bivariate Analysis

Effect of Age on heart-attack

In [None]:
# Distribution of age, coloured by the `output` label.
# `sns.displot` is figure-level and is sized through `height`/`aspect`, so the
# preceding `plt.figure(...)` call was dropped: it only produced an empty,
# unused figure.
sns.displot(data=df, x="age", hue="output", height=7, aspect=2)

The dataset provided gives us an insight that younger people are more prone to heart attacks.

In [None]:
# Annotated heatmap of the pairwise correlations between the columns,
# drawn on a 16x16 inch figure
plt.figure(figsize=(16, 16))
sns.heatmap(data=df.corr(), annot=True)

The above heatmap proves that the independent variables are not correlated with each other.

In [None]:
# Impact of sex on heart attack: interactive bar chart of `output` against `sex`.
# The chart is drawn by plotly (via `.iplot`), so the matplotlib
# `plt.figure(...)` call that used to precede it had no effect on the chart
# and has been removed.
# NOTE(review): this plots the raw per-row `output` values rather than a
# per-group rate - confirm that is the intended comparison.
df.iplot(kind="bar", x="sex", y="output", size=5)

Here '0' represents Females and '1' represents Males.

From the above graph it is clear that males have a higher chance of heart attack.

In [None]:
# Impact of chest pain type on heart attack: bar chart of `output` against `cp`
df.iplot(
    kind="bar",
    x="cp",
    y="output",
)

In [None]:
# Number of rows for each chest-pain (`cp`) value, to check how balanced the groups are
df['cp'].value_counts()

Here we have an imbalance in the dataset so it would not be absolutely correct to say level 2 chest pain causes the most heart-attacks.

In [None]:
# Resting blood pressure (`trtbps`) compared across the two `output` groups
plt.figure(figsize=(10, 10))
sns.boxplot(data=df, x="output", y="trtbps")

From the dataset provided we can conclude that lower blood pressure causes more heart-attacks.

In [None]:
# Effect of cholesterol on heart attack: distribution of `chol`, coloured by `output`
sns.displot(data=df, x="chol", hue="output", height=10, aspect=2)

It is very important to note that higher cholesterol causes heart-attacks.

In [None]:
# Effect of fasting blood sugar: sum of `output` within each `fbs` group,
# i.e. a count of positive cases per group rather than a rate
df.groupby(by="fbs")["output"].sum().iplot(
    kind="bar",
)

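Hence, in this dataset, more of the positive cases (output = 1) have a fasting blood sugar of at most 120 mg/dl (fbs = 0). Note that the chart compares counts of positive cases, not rates, so it also reflects how many patients fall into each group.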

In [None]:
# Effect of resting ECG results: sum of `output` within each `restecg` group,
# i.e. a count of positive cases per group rather than a rate
df.groupby(by="restecg")["output"].sum().iplot(
    kind="bar",
)

Value 1 accounts for the greatest number of positive cases (the chart shows counts per group, not rates).

In [None]:
# Effect of maximum heart rate: distribution of `thalachh`, coloured by `output`
sns.displot(data=df, x="thalachh", hue="output", height=8, aspect=2)

It is absolutely clear that people with higher max heart rate have the most chance of heart-attack.

In [None]:
# Effect of exercise induced angina: sum of `output` within each `exng`
# group, i.e. a count of positive cases per group rather than a rate
df.groupby(by="exng")["output"].sum().iplot(
    kind="bar",
    xTitle="Exercise Induced Angina",
)

Exercise induced angina does not have much effect in heart-attack

In [None]:
# Distinct values present in the `caa` column
df['caa'].unique()

In [None]:
# Relationship between chest pain type, age and heart attack:
# `cp` against `age`, with points coloured by `output`
plt.figure(figsize=(10, 10))
sns.scatterplot(data=df, x="age", y="cp", hue="output")

Value 1 chest pain is the most fatal across different ages.

Any chest pain across different age groups should be medically treated as soon as possible.

In [None]:
# Relationship between age, resting blood pressure and heart attack:
# `trtbps` against `age`, with points coloured by `output`
plt.figure(figsize=(10, 10))
sns.scatterplot(data=df, x="age", y="trtbps", hue="output")


Younger people with normal blood pressure are prone to heart attacks too according to the dataset.

In [None]:
# Relationship between age, cholesterol and heart attack:
# `chol` against `age`, with points coloured by `output`
plt.figure(figsize=(10, 10))
sns.scatterplot(data=df, x="age", y="chol", hue="output")

Higher cholesterol is a major factor for heart-attack under all age groups.