In [None]:
## Heart Disease Data Preprocessing

This notebook will prepare our heart disease dataset for analysis. We'll:
1. Load and clean the data
2. Save the cleaned data as a CSV file for further analysis


In [62]:
import pandas as pd
import numpy as np


# Step 1: Read the raw CSV, turning placeholder tokens into NaN while loading
raw_csv_path = '../data/cleveland.csv'
# NOTE(review): 'name' and 'sex' look like stray text/header tokens rather than
# sentinel codes — confirm against the raw file that they should count as missing.
missing_tokens = ['-9', '?', 'name', 'sex']
df = pd.read_csv(raw_csv_path, na_values=missing_tokens)


In [63]:
# Step 2: Keep only the columns whose non-null count reaches half of the row count
min_populated = len(df) * 0.5
df = df.dropna(axis='columns', thresh=min_populated)

In [64]:
# Step 3: Discard rows that hold fewer than 10 non-null values
min_valid_entries = 10
df = df.dropna(axis='index', thresh=min_valid_entries)

In [65]:
# Step 4: Coerce every column to a numeric dtype; unparseable entries become NaN
df = df.apply(pd.to_numeric, errors='coerce')

In [66]:
# Step 5: Impute remaining gaps with each numeric column's median
column_medians = df.median(numeric_only=True)
df = df.fillna(column_medians)

In [67]:
# Step 6: Per-column (lower, upper) bounds used to clamp out-of-range values.
# NOTE(review): several bounds look like observed dataset extremes rather than
# clinical limits — confirm where these numbers come from.
valid_ranges = dict(
    age=(29, 77),
    sex=(0, 1),
    cp=(0, 3),
    trestbps=(94, 200),
    chol=(126, 564),
    fbs=(0, 1),
    restecg=(0, 2),
    thalach=(71, 202),
    exang=(0, 1),
    oldpeak=(0.0, 6.2),
    slope=(0, 2),
    ca=(0, 3),
    thal=(3, 7),
    target=(0, 1),
)

In [68]:
# Step 7: Clamp each bounded column into its allowed [low, high] interval
bounded_cols = [name for name in valid_ranges if name in df.columns]
for name in bounded_cols:
    low, high = valid_ranges[name]
    df[name] = df[name].clip(lower=low, upper=high)

In [69]:
# Step 8: Allowed integer codes for each categorical column; anything else is
# treated as invalid by the enforcement step that follows.
strict_categorical = dict(
    cp=list(range(4)),
    restecg=list(range(3)),
    slope=list(range(3)),
    ca=list(range(4)),
    thal=[3, 6, 7],
    sex=list(range(2)),
    fbs=list(range(2)),
    exang=list(range(2)),
    target=list(range(2)),
)

In [70]:
# Step 9: Round and enforce valid class values
for col, valid_vals in strict_categorical.items():
    if col not in df.columns:
        continue
    # Nullable Int64 keeps whole-number codes while still allowing missing entries.
    codes = df[col].round(0).astype('Int64')
    # Any code outside the allowed set is blanked out so it can be re-imputed.
    codes = codes.where(codes.isin(valid_vals), pd.NA)
    # Re-impute with the most frequent remaining code. The result is assigned
    # back explicitly: `df[col].fillna(..., inplace=True)` acts on a temporary
    # Series and does not update `df` under pandas Copy-on-Write.
    modes = codes.mode()
    if not modes.empty:
        # mode() is empty when every value was invalid; the column then stays
        # all-NA instead of raising on `mode()[0]`.
        codes = codes.fillna(modes.iloc[0])
    df[col] = codes

In [71]:
# Step 10: Final review of the cleaned data
print("Cleaned dataset summary:")
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() would emit a stray "None" line.
df.info()
print(df.describe())

Cleaned dataset summary:
<class 'pandas.core.frame.DataFrame'>
Index: 864 entries, 1 to 1613
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       864 non-null    float64
 1   sex       864 non-null    Int64  
 2   cp        864 non-null    Int64  
 3   trestbps  864 non-null    float64
 4   chol      864 non-null    float64
 5   fbs       864 non-null    Int64  
 6   restecg   864 non-null    Int64  
 7   thalach   864 non-null    float64
 8   exang     864 non-null    Int64  
 9   oldpeak   864 non-null    float64
 10  slope     864 non-null    Int64  
 11  ca        864 non-null    Int64  
 12  thal      864 non-null    Int64  
 13  target    864 non-null    Int64  
dtypes: Int64(9), float64(5)
memory usage: 108.8 KB
None
              age       sex        cp    trestbps        chol       fbs  \
count  864.000000     864.0     864.0  864.000000  864.000000     864.0   
mean    33.947917  0.797454  1.590278  109.

In [72]:
# Write the cleaned frame to disk, leaving the index out of the file
cleaned_csv_path = '../data/cleaned_cleveland.csv'
df.to_csv(cleaned_csv_path, index=False)
