In [25]:
import pandas as pd
import plotly.express as px
import numpy as np

In [26]:
df = pd.read_csv('heart.csv')
df

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0
...,...,...,...,...,...,...,...,...,...,...,...,...
913,45,M,TA,110,264,0,Normal,132,N,1.2,Flat,1
914,68,M,ASY,144,193,1,Normal,141,N,3.4,Flat,1
915,57,M,ASY,130,131,0,Normal,115,Y,1.2,Flat,1
916,57,F,ATA,130,236,0,LVH,174,N,0.0,Flat,1


In [27]:
#checking for nulls, finding none
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 918 entries, 0 to 917
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Age             918 non-null    int64  
 1   Sex             918 non-null    object 
 2   ChestPainType   918 non-null    object 
 3   RestingBP       918 non-null    int64  
 4   Cholesterol     918 non-null    int64  
 5   FastingBS       918 non-null    int64  
 6   RestingECG      918 non-null    object 
 7   MaxHR           918 non-null    int64  
 8   ExerciseAngina  918 non-null    object 
 9   Oldpeak         918 non-null    float64
 10  ST_Slope        918 non-null    object 
 11  HeartDisease    918 non-null    int64  
dtypes: float64(1), int64(6), object(5)
memory usage: 86.2+ KB


In [28]:
df['RestingBP'] = df['RestingBP'].astype('object')
df['HeartDisease'] = df['HeartDisease'].astype('str')
df['FastingBS'] =df['FastingBS'].astype('object')

In [29]:
# Basic summary of numerical columns, 0 in restingBP and cholesterol isn't possible so they are most likely standins for nulls
df.describe()

Unnamed: 0,Age,Cholesterol,MaxHR,Oldpeak
count,918.0,918.0,918.0,918.0
mean,53.510893,198.799564,136.809368,0.887364
std,9.432617,109.384145,25.460334,1.06657
min,28.0,0.0,60.0,-2.6
25%,47.0,173.25,120.0,0.0
50%,54.0,223.0,138.0,0.6
75%,60.0,267.0,156.0,1.5
max,77.0,603.0,202.0,6.2


In [30]:
df['HeartDisease'] = df['HeartDisease'].str.replace('1', 'Y')
df['HeartDisease'] = df['HeartDisease'].str.replace('0', 'N')
df['Cholesterol'] = df['Cholesterol'].replace(0, np.nan)
df['RestingBP'] = df['RestingBP'].replace(0, np.nan)
df.dropna(inplace=True)

In [31]:
# Summary statistics for people with heart disease.
df[df['HeartDisease']== "Y"].describe()

Unnamed: 0,Age,RestingBP,Cholesterol,MaxHR,Oldpeak
count,356.0,356.0,356.0,356.0,356.0
mean,55.851124,136.154494,251.061798,130.550562,1.457865
std,8.820768,17.963226,62.462713,22.299377,1.148383
min,31.0,92.0,100.0,71.0,0.0
25%,50.0,123.75,212.0,115.0,0.5
50%,57.0,136.0,246.0,130.0,1.5
75%,62.0,145.0,283.25,147.0,2.0
max,77.0,200.0,603.0,195.0,6.2


In [32]:
corr = df.corr()
corr

Unnamed: 0,Age,RestingBP,Cholesterol,MaxHR,Oldpeak
Age,1.0,0.259865,0.058758,-0.382112,0.286006
RestingBP,0.259865,1.0,0.095939,-0.125774,0.198575
Cholesterol,0.058758,0.095939,1.0,-0.019856,0.058488
MaxHR,-0.382112,-0.125774,-0.019856,1.0,-0.259533
Oldpeak,0.286006,0.198575,0.058488,-0.259533,1.0


In [33]:
#heatmap of numerical column correlation
heatmap = px.imshow(corr)
heatmap.show()

## Plotting Every Column

In [34]:
age = px.histogram(df, x='Age').update_traces(marker_line_width=1, marker_line_color='white')
age.show()

In [35]:
sex = px.histogram(df, x='Sex')
sex.show()

In [36]:
# TA: Typical Angina, ATA: Atypical Angina, NAP: Non-Anginal Pain, ASY: Asymptomatic
chestPainType = px.histogram(df, x='ChestPainType')
chestPainType.show()

In [37]:
# BP = Blood Pressure
RestingBP = px.scatter(df, x='RestingBP', y='Age', color = 'Sex', trendline='ols')
RestingBP.show()

In [38]:
# 1 means heart disease, 0 means normal heart. 0 for cholesterol means it was null for that entry
Cholesterol = px.scatter(df, x='Cholesterol', y='Age', color='HeartDisease', trendline='ols')
Cholesterol.show()

In [50]:
#BS = blood sugar, 1: if FastingBS > 120 mg/dl, 0: otherwise
FastingBS = px.histogram(df, x='FastingBS').update_traces(marker_line_width=1, marker_line_color='white')
FastingBS.show()

In [40]:
# RestingECG: resting electrocardiogram results: 
# [Normal: Normal, 
# ST: having ST-T wave abnormality(T wave inversions and/or ST elevation or depression of > 0.05 mV), 
# LVH: showing probable or definite left ventricular hypertrophy by Estes' criteria]
RestingECG = px.histogram(df, x="RestingECG")
RestingECG.show()

In [41]:
MaxHR = px.histogram(df, x='MaxHR').update_traces(marker_line_width=1, marker_line_color='white')
MaxHR.show()

In [43]:
px.scatter(df, x='Cholesterol', y='MaxHR', color='HeartDisease', trendline='ols')

In [49]:
ExerciseAngina = px.bar(df, x='ExerciseAngina')
ExerciseAngina.show()

In [46]:
HeartDisease = px.bar(df, x='HeartDisease')
HeartDisease.show()

## Exploring Relationships Between Columns

In [47]:
px.scatter(df, x='RestingBP', y='Age', color='HeartDisease', trendline='ols', marginal_y='violin', marginal_x='box')

There is a clear positive linear relationship between age and resting blood pressure, and those with heart disease are typically older and have higher resting blood pressure than those with healthy hearts. The box plots demonstrate that people with and without heart disease can have a lower resting blood pressure, but people with heart disease are more likely to have a higher resting blood pressure than those without heart disease. The violin plots reflect the same findings in a more visual and less mathematic way.  

In [48]:
px.scatter_3d(df, x='Cholesterol', y='RestingBP', z='Age', color='HeartDisease', symbol='Sex')

Adding cholesterol and creating a 3d scatter plot helps identify outliers, like the 30 year old man with a resting BP of 92 and cholesterol of only 117, both much lower than the average and median values for those fields listed earlier, yet he has heart disease. Then there is the 67 year old woman with a high cholesterol value of 564, but she has a lower than average resting bp at 117, and she doesn't have heart disease. There does not appear to be a high correlation with sex and any of the other variables. 