In [1]:
from pathlib import Path

In [2]:
# The repository root is taken to be the parent of the current working directory.
working_dir = Path.cwd()
repo_dir = working_dir.parents[0]
repo_dir

PosixPath('/workspaces/heart-disease')

In [3]:
import sys

sys.path.insert(0, f"{repo_dir}")

import numpy as np
import pandas as pd

import heart_disease.constants as hdc
import heart_disease.data.get_dataset as hddgd  # FIXME: Add get_dataset to heart_disease?

In [4]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Load the autoreload extension; mode 2 picks up edits to imported modules
# (e.g. the local `heart_disease` package) without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Load data

In [5]:
# Make sure the raw data files are present first; the recorded output shows
# the download being skipped when they already exist.
hddgd.get_dataset()
# Build the single frame used by the rest of the notebook.
# NOTE(review): presumably validates and merges the per-source files — confirm
# in heart_disease.data.get_dataset.
combined_df = hddgd.validate_and_combine_dataset()

Data files already downloaded.


In [6]:
# Print the frame's (rows, columns) shape, then preview its first rows.
row_count, column_count = combined_df.shape
print((row_count, column_count))
combined_df.head()

(920, 15)


Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,num,source
0,63.0,1.0,1.0,145.0,233.0,1.0,2.0,150.0,0.0,2.3,3.0,0.0,6.0,0.0,cleveland
1,67.0,1.0,4.0,160.0,286.0,0.0,2.0,108.0,1.0,1.5,2.0,3.0,3.0,2.0,cleveland
2,67.0,1.0,4.0,120.0,229.0,0.0,2.0,129.0,1.0,2.6,2.0,2.0,7.0,1.0,cleveland
3,37.0,1.0,3.0,130.0,250.0,0.0,0.0,187.0,0.0,3.5,3.0,0.0,3.0,0.0,cleveland
4,41.0,0.0,2.0,130.0,204.0,0.0,2.0,172.0,0.0,1.4,1.0,0.0,3.0,0.0,cleveland


In [None]:
# FIXME: Move the cells below into a separate transform script and save the result to an interim Feather file.

In [7]:
# Convert integer columns.

# Columns expected to hold whole numbers (nulls allowed).
integer_cols = [
    'age',       # Age in years.
    'trestbps',  # Resting blood pressure in mm Hg on admission to the hospital.
    'chol',      # Serum cholesterol in mg/dl.
    'thalach',   # Maximum heart rate achieved.
    'ca',        # Number of major vessels (0-3) colored by fluoroscopy.
]


def _is_whole_or_missing(value):
    """Return True when `value` has no fractional part or is NaN."""
    return value.is_integer() or np.isnan(value)


# True only when every value in every listed column passes the check above.
all(
    combined_df[name].apply(_is_whole_or_missing).all()
    for name in integer_cols
)

True

In [8]:
# Int64Dtype is the nullable integer dtype, so missing values survive the cast.
converted_ints = combined_df[integer_cols].astype(pd.Int64Dtype())
for name in integer_cols:
    combined_df[name] = converted_ints[name]

In [9]:
# Convert categoricals.

In [10]:
# Replace the numeric `sex` codes with readable category labels.
sex_labels = {
    0.0: 'female',
    1.0: 'male',
}
sex_categorical = combined_df['sex'].astype('category')
combined_df['sex'] = sex_categorical.cat.rename_categories(sex_labels)

In [11]:
# Chest pain type.
combined_df['cp'] = combined_df['cp'].astype(
    dtype='category',
).cat.rename_categories(
    new_categories={
        1.0: 'typical angina',
        2.0: 'atypical angina',
        3.0: 'non-anginal pain',
        4.0: 'asymptomatic',
    }
)

In [12]:
# Exercise induced angina.
combined_df['exang'] = combined_df['exang'].fillna(
    value=-1.0,
).astype(
    dtype='category',
).cat.rename_categories(
    new_categories={
        -1.0: 'missing value',
        0.0: 'no exercise induced angina',
        1.0: 'exercise induced angina',
    }
)

In [13]:
# Slope of the peak exercise ST segment.
combined_df['slope'] = combined_df['slope'].fillna(
    value=-1.0,
).astype(
    dtype='category',
).cat.rename_categories(
    new_categories={
        -1.0: 'missing value',
        1.0: 'upsloping',
        2.0: 'flat',
        3.0: 'downsloping',
    }
)

In [14]:
# Label the `thal` codes. Nulls are first replaced by the sentinel -1.0 so
# they become their own labelled category instead of staying missing.
thal_labels = {
    -1.0: 'missing value',
    3.0: 'normal',
    6.0: 'fixed defect',
    7.0: 'reversable defect',
}
thal_filled = combined_df['thal'].fillna(-1.0)
thal_categorical = thal_filled.astype('category')
combined_df['thal'] = thal_categorical.cat.rename_categories(thal_labels)

In [15]:
# Fasting blood sugar.
combined_df['fbs'] = combined_df['fbs'].fillna(
    value=-1.0,
).astype(
    dtype='category',
).cat.rename_categories(
    new_categories={
        -1.0: 'missing value',
        0.0: 'low fasting blood sugar',
        1.0: 'high fasting blood sugar',
    }
)

In [16]:
# Resting electrocardiographic results.
combined_df['restecg'] = combined_df['restecg'].fillna(
    value=-1.0,
).astype(
    dtype='category',
).cat.rename_categories(
    new_categories={
        -1.0: 'missing value',
        0.0: 'normal',
        1.0: 'ST-T wave abnormality',
        2.0: 'left ventricular hypertrophy',
    }
)

In [None]:
# Float columns.
# `oldpeak`, the ST depression induced by exercise relative to rest.

In [18]:
# Names of the columns that still contain at least one null.
has_nulls = combined_df.isna().any()
combined_df.columns[has_nulls].values

array(['trestbps', 'chol', 'thalach', 'oldpeak', 'ca'], dtype=object)

How to handle nulls:

1. ints - 'trestbps', 'chol', 'thalach', 'ca'
2. floats - 'oldpeak'



In [19]:
# Summary statistics across all dtypes, transposed so each column is a row.
column_summary = combined_df.describe(include='all')
column_summary.transpose()

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
age,920.0,,,,53.51087,9.424685,28.0,47.0,54.0,60.0,77.0
sex,920.0,2.0,male,726.0,,,,,,,
cp,920.0,4.0,asymptomatic,496.0,,,,,,,
trestbps,861.0,,,,132.132404,19.06607,0.0,120.0,130.0,140.0,200.0
chol,890.0,,,,199.130337,110.78081,0.0,175.0,223.0,268.0,603.0
fbs,920.0,3.0,low fasting blood sugar,692.0,,,,,,,
restecg,920.0,4.0,normal,551.0,,,,,,,
thalach,865.0,,,,137.545665,25.926276,60.0,120.0,140.0,157.0,202.0
exang,920.0,3.0,no exercise induced angina,528.0,,,,,,,
oldpeak,858.0,,,,0.878788,1.091226,-2.6,0.0,0.5,1.5,6.2
