> **Data cleaning and preparation for heart attack prediction**

In [None]:
Importing libraries

In [None]:
import warnings
warnings.filterwarnings('ignore')

#os
import os

#numerical computing and data handling libraries
import numpy as np, pandas as pd
import pandas_profiling as pp

#libraries for plotting graphs
import matplotlib.pyplot as plt, seaborn as sns, matplotlib
import plotly.express as px
import plotly.figure_factory as ff

Loading data sets

In [None]:
# Read the main heart dataset (heart.csv under the relative ../input directory) into `heart`.
heart=pd.read_csv("../input/heart-attack-analysis-prediction-dataset/heart.csv")

In [None]:
# Preview the first three rows of `heart`.
heart.head(3)

In [None]:
# Descriptive statistics (count, mean, std, min, quartiles, max) for `heart`.
heart.describe()

In [None]:
# Number of rows for each distinct value of the `sex` column.
heart['sex'].value_counts()

In [None]:
# Read the companion o2Saturation.csv file from the same dataset directory into `sat`.
sat=pd.read_csv("../input/heart-attack-analysis-prediction-dataset/o2Saturation.csv")

In [None]:
# Descriptive statistics for `sat`.
sat.describe()

In [None]:
# Column names, dtypes and non-null counts for `sat`.
sat.info()

In [None]:
# Build a pandas-profiling report for `heart`; as the cell's last expression it is rendered inline.
pp.ProfileReport(heart)

In [None]:
# Count of missing values in each column of `heart`.
heart.isnull().sum()

In [None]:
# Column names, dtypes and non-null counts for `heart`.
heart.info()

In [None]:
# Columns this notebook treats as categorical features.
categorical_columns = ['sex', 'cp', 'fbs', 'restecg', 'exng', 'slp', 'caa', 'thall']
cate_df = heart[categorical_columns]

In [None]:
# Columns this notebook treats as continuous features.
continuous_columns = ['age', 'trtbps', 'chol', 'thalachh', 'oldpeak']
conti_df = heart[continuous_columns]

In [None]:
# Descriptive statistics for the categorical-feature subset.
cate_df.describe()

In [None]:
# Descriptive statistics for the continuous-feature subset.
conti_df.describe()

In [None]:

# Frequency table for each categorical feature, bound to a `<column>_counts` name.
(sex_counts, cp_counts, fbs_counts, restecg_counts,
 exng_counts, slp_counts, caa_counts, thall_counts) = (
    heart[feature].value_counts()
    for feature in ('sex', 'cp', 'fbs', 'restecg', 'exng', 'slp', 'caa', 'thall')
)

In [None]:
# Print every frequency table under its own heading.
for heading, table in (
    ("sex_counts :\n", sex_counts),
    ("cp_counts :\n", cp_counts),
    ("fbs_counts :\n", fbs_counts),
    ("restecg_counts:\n", restecg_counts),
    ("exng_counts:\n", exng_counts),
    ("slp_counts:\n", slp_counts),
    ("caa_counts:\n", caa_counts),
    ("thall_counts:\n", thall_counts),
):
    print(heading, table)


In [None]:
def distplot(column):
    """Show a 10-bin density histogram of `column` with a KDE curve overlaid.

    Parameters
    ----------
    column : array-like (e.g. a pandas Series)
        Values whose distribution is drawn.

    Returns
    -------
    None
        The figure is displayed immediately via ``plt.show()``.
    """
    # sns.distplot is deprecated; histplot with kde=True and stat="density"
    # (KDE extended with cut=3) is seaborn's documented replacement for the
    # same histogram + KDE view.
    sns.histplot(column, bins=10, kde=True, stat="density", kde_kws={"cut": 3})
    plt.show()

In [None]:
# Distribution of the `trtbps` column.
distplot(heart['trtbps'])

In [None]:
# Distribution of the `chol` column.
distplot(heart['chol'])

In [None]:
# Distribution of the `thalachh` column.
distplot(heart['thalachh'])

In [None]:
# Distribution of the `oldpeak` column.
distplot(heart['oldpeak'])

In [None]:
# Bar chart of how many rows fall into each value of the target column `output`.
ax=sns.countplot(data=heart,x='output',palette = ['#85bfdc','#f64c72'])

In [None]:
# Age histogram with bars coloured by `output`; the figure is only built here
# and is displayed by the following cell.
fig = px.histogram(heart, x="age", color="output", 
                color_discrete_sequence=['#f64c72','#85bfdc'])

In [None]:
# Render the plotly figure currently bound to `fig`.
fig.show()

In [None]:
# Row counts per `sex` value, split into one bar per `output` value.
ax = sns.countplot(x='sex', hue='output', data=heart)
# Relabel the two x ticks (in plotted order) and add a title.
ax.set_xticklabels(['female', 'male'])
ax.set_title("Heart attack chance corresponding to Gender")
ax.tick_params(bottom=True)

In [None]:
def bivariant(column):
    """Compare the distribution of `column` between the two `output` groups.

    Reads the notebook-level ``heart`` DataFrame, splits it into rows with
    ``output == 1`` and rows with ``output == 0``, and draws one density
    curve per group (histogram bars disabled) with plotly's ``create_distplot``.

    Parameters
    ----------
    column : str
        Name of a column in ``heart``.

    Returns
    -------
    None
        The figure is displayed via ``fig.show()``.
    """
    high = heart[heart['output'] == 1][column]
    low = heart[heart['output'] == 0][column]
    fig = ff.create_distplot([high, low],
                             ['more chance of heart attack', 'less chance of heart attack'],
                             show_hist=False, colors=['#f64c72', '#85bfdc'])
    fig.update_layout(title_text=f"Distribution of {column} by heart attack chance")
    # Previously the figure was built and then discarded, so calling this
    # function rendered nothing; show it explicitly.
    fig.show()


In [None]:
# Compare `trtbps` between the output == 1 and output == 0 groups.
bivariant('trtbps')



In [None]:
# Compare `chol` between the output == 1 and output == 0 groups.
bivariant('chol')

In [None]:
# Correlation matrix of the continuous features, drawn as an annotated heatmap.
fig, ax = plt.subplots(1, 1, figsize=(6,6))
df_cor = conti_df.corr()

# Two-colour gradient built from the palette used by the other plots.
my_colors = ['#85bfdc','#f64c72']
cmap = matplotlib.colors.LinearSegmentedColormap.from_list('Custom', my_colors)

# Pass cmap and ax explicitly: the custom colormap used to be built but never
# applied, and the heatmap only landed on `ax` because it was the current axes.
heatmap = sns.heatmap(df_cor, square=True, annot=True, cmap=cmap, ax=ax)
ax.set_title("Correlation between continuous features")

plt.tight_layout()

In [None]:
# Correlation matrix of the categorical-feature subset, drawn as an annotated heatmap.
cate_corr = cate_df.corr()
fig, ax = plt.subplots(1, 1, figsize=(8,8))
# Bound to `cate_heatmap` rather than `map`, which would shadow the builtin
# map() for every later cell in the kernel session.
cate_heatmap = sns.heatmap(cate_corr, square=True, annot=True, ax=ax)
ax.set_title("Correlation between categorical features")

In [None]:
# thalachh vs chol, faceted by `caa` (columns) and `sex` (rows), coloured by `output`.
# plotly express only applies color_discrete_sequence to non-numeric colour
# columns. `output` holds numeric 0/1 codes, so it is cast to str here to get
# the two intended discrete colours instead of a continuous colour bar.
# category_orders pins '1' to the first colour regardless of row order.
fig = px.scatter(
    heart.assign(output=heart['output'].astype(str)),
    x='thalachh',
    y='chol',
    color='output',
    facet_col='caa',
    facet_row='sex',
    color_discrete_sequence=['#f64c72','#85bfdc'],
    category_orders={'output': ['1', '0']},
)

In [None]:
# Render the plotly figure currently bound to `fig`.
fig.show()

In [None]:
# Replace the numeric codes in `cp`, `restecg` and `thall` with text labels so
# that pd.get_dummies produces readable indicator column names.
label_maps = {
    'cp': {0:'asymptomatic', 1:'atypical angina', 2:'non-anginal pain' , 3:'typical angina'},
    'restecg': {0:'left ventricular hypertrophy', 1:'normal', 2:'ST-T wave abnormality'},
    'thall': {1:'fixed defect', 2:'normal', 3:'reversable defect', 0:'nothing'},
}
for coded_column, labels in label_maps.items():
    # Series.map turns every value missing from the dict into NaN, so running
    # it a second time on already-labelled strings would wipe the column.
    # Map only while the column still holds numeric codes; a re-run is a no-op.
    if pd.api.types.is_numeric_dtype(heart[coded_column]):
        heart[coded_column] = heart[coded_column].map(labels)

In [None]:
# One-hot encode `heart` with pd.get_dummies (by default the non-numeric columns become indicator columns).
heartpd=pd.get_dummies(heart)

In [None]:
# Frequency of each distinct full-row combination in `heartpd`.
heartpd.value_counts()

In [None]:
# Drop the 'thall_nothing' indicator column. Reassigning (instead of inplace=True)
# with errors='ignore' keeps this cell safe to re-run: a second run is a no-op
# rather than a KeyError.
heartpd = heartpd.drop(columns='thall_nothing', errors='ignore')

In [None]:
# (rows, columns) of `heartpd` at this point.
heartpd.shape

In [None]:
# Drop the 'restecg_ST-T wave abnormality' indicator column. Reassigning (instead
# of inplace=True) with errors='ignore' keeps this cell safe to re-run: a second
# run is a no-op rather than a KeyError.
heartpd = heartpd.drop(columns='restecg_ST-T wave abnormality', errors='ignore')

In [None]:
# Drop the 'cp_asymptomatic' indicator column. Reassigning (instead of
# inplace=True) with errors='ignore' keeps this cell safe to re-run: a second
# run is a no-op rather than a KeyError.
heartpd = heartpd.drop(columns='cp_asymptomatic', errors='ignore')

In [None]:
# (rows, columns) of `heartpd` at this point.
heartpd.shape

The data is now ready to be used for model training and prediction.

In [None]:
# Preview the first rows of the encoded frame.
heartpd.head()

In [None]:
# Split the encoded frame into the target column and the remaining feature columns.
y = heartpd['output']
x = heartpd.drop(columns=['output'])

In [None]:
# Preview the first rows of the feature matrix. The call parentheses were
# missing, so the cell showed the bound-method repr instead of the rows.
x.head()

In [None]:
# Preview the first values of the target series.
y.head()