In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

#### dataset found at https://www.kaggle.com/datasets/fedesoriano/heart-failure-prediction

In [2]:
#dataset -> https://www.kaggle.com/datasets/fedesoriano/heart-failure-prediction
heart = pd.read_csv('../raw_data/heart.csv')
heart.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


In [3]:
heart.describe()

Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,HeartDisease
count,918.0,918.0,918.0,918.0,918.0,918.0,918.0
mean,53.510893,132.396514,198.799564,0.233115,136.809368,0.887364,0.553377
std,9.432617,18.514154,109.384145,0.423046,25.460334,1.06657,0.497414
min,28.0,0.0,0.0,0.0,60.0,-2.6,0.0
25%,47.0,120.0,173.25,0.0,120.0,0.0,0.0
50%,54.0,130.0,223.0,0.0,138.0,0.6,1.0
75%,60.0,140.0,267.0,0.0,156.0,1.5,1.0
max,77.0,200.0,603.0,1.0,202.0,6.2,1.0


In [4]:
heart.dtypes

Age                 int64
Sex                object
ChestPainType      object
RestingBP           int64
Cholesterol         int64
FastingBS           int64
RestingECG         object
MaxHR               int64
ExerciseAngina     object
Oldpeak           float64
ST_Slope           object
HeartDisease        int64
dtype: object

#### Checks for any missing values.

there are no null values in the data. two questionable cases in the data can be when cholesterol or resting BP is 0, will have to check in the future if the data needs to be removed

In [5]:
heart.isnull().value_counts() 

Age    Sex    ChestPainType  RestingBP  Cholesterol  FastingBS  RestingECG  MaxHR  ExerciseAngina  Oldpeak  ST_Slope  HeartDisease
False  False  False          False      False        False      False       False  False           False    False     False           918
dtype: int64

In [6]:
heart[heart['Cholesterol']==0]

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
293,65,M,ASY,115,0,0,Normal,93,Y,0.0,Flat,1
294,32,M,TA,95,0,1,Normal,127,N,0.7,Up,1
295,61,M,ASY,105,0,1,Normal,110,Y,1.5,Up,1
296,50,M,ASY,145,0,1,Normal,139,Y,0.7,Flat,1
297,57,M,ASY,110,0,1,ST,131,Y,1.4,Up,1
...,...,...,...,...,...,...,...,...,...,...,...,...
514,43,M,ASY,122,0,0,Normal,120,N,0.5,Up,1
515,63,M,NAP,130,0,1,ST,160,N,3.0,Flat,0
518,48,M,NAP,102,0,1,ST,110,Y,1.0,Down,1
535,56,M,ASY,130,0,0,LVH,122,Y,1.0,Flat,1


In [7]:
heart[heart['RestingBP']==0]

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
449,55,M,NAP,0,0,0,Normal,155,N,1.5,Flat,1


#### Data Definitions 

ASY - Asymptomatic
NAP - Non-Anginal Pain
ATA - Atypical Angina
TA - Typical Angina

In [8]:
heart['ChestPainType'].value_counts()
#ASY - Asymptomatic
#NAP - Non-Anginal Pain
#ATA - Atypical Angina
#TA - Typical Angina

ASY    496
NAP    203
ATA    173
TA      46
Name: ChestPainType, dtype: int64

In [9]:
print(heart['Sex'].value_counts())

M    725
F    193
Name: Sex, dtype: int64


In [10]:
sex_mean = heart.groupby('Sex').mean()
print(sex_mean)

           Age   RestingBP  Cholesterol  FastingBS       MaxHR   Oldpeak  \
Sex                                                                        
F    52.492228  132.212435   241.196891   0.134715  146.139896  0.668912   
M    53.782069  132.445517   187.513103   0.259310  134.325517  0.945517   

     HeartDisease  
Sex                
F        0.259067  
M        0.631724  


  sex_mean = heart.groupby('Sex').mean()


### Create Figures to see overall trends of age and sex in regards to heart disease.

In [11]:
heart.shape

(918, 12)