In [None]:
# Heart Disease Data Preprocessing

This notebook will prepare our heart disease dataset for analysis. We'll:
1. Load and clean the data
2. Save it in the correct format for further analysis


### Step 1: Load the Dataset

In [48]:
import pandas as pd

# Read the raw Cleveland file, taking the first line of the CSV as the header.
# The strings listed in na_values are parsed as NaN while loading.
# NOTE(review): 'name' and 'sex' are treated as missing-value markers here —
# presumably stray text tokens in the raw file; confirm against the data.
# NOTE(review): the saved preview for this cell shows values such as 10.5 and
# 172.0 in the leading columns — verify the header really lines up with the data.
df = pd.read_csv(
    '../data/cleveland.csv',
    header=0,
    na_values=['-9', '?', 'name', 'sex'],
)
df.head()


Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,1.0,0,63.0,1.0,,,,,1,145.0,1.0,233.0,,50.0
1,20.0,1,,1.0,2.0,2.0,3.0,81.0,0,0.0,0.0,0.0,0.0,1.0
2,10.5,6,13.0,150.0,60.0,190.0,90.0,145.0,85,0.0,0.0,2.3,3.0,
3,172.0,0,,,,,,,6,,,,2.0,16.0
4,81.0,0,1.0,1.0,1.0,,1.0,,1,,1.0,1.0,1.0,1.0


In [50]:
# Prune sparse data in two passes: columns first, then rows.
# A column is kept only when at least half of the rows hold a value in it.
threshold = len(df) * 0.5
df = (
    df
    .dropna(axis='columns', thresh=threshold)
    # A row is kept only when it still has at least 10 non-null values.
    .dropna(axis='index', thresh=10)
)


### Step 2: Assign Column Names


In [40]:
# Relabel the columns positionally with the 14 names used by the rest of the
# notebook. pandas raises ValueError if the frame does not have exactly as
# many columns as there are names in this list.
column_names = [
    'age',
    'sex',
    'cp',
    'trestbps',
    'chol',
    'fbs',
    'restecg',
    'thalach',
    'exang',
    'oldpeak',
    'slope',
    'ca',
    'thal',
    'target',
]
df.columns = column_names


### Step 3: Replace Missing or Invalid Values


In [51]:
# Coerce every column to a numeric dtype; any value that cannot be parsed
# as a number becomes NaN (errors='coerce').
df = df.apply(pd.to_numeric, errors='coerce')


In [52]:
# Impute each remaining NaN with the median of its own (numeric) column.
column_medians = df.median(numeric_only=True)
df = df.fillna(column_medians)

In [41]:
import numpy as np

# Turn literal '?' placeholders into proper NaN so pandas treats them as missing.
df = df.replace('?', np.nan)


###  Step 4: Convert Columns to Numeric


In [42]:
# Convert all columns to numeric dtypes; unparseable entries become NaN.
# NOTE(review): Step 3 of this notebook already contains an identical
# conversion; when the notebook runs top to bottom this cell is likely a
# no-op — confirm and consider dropping one of the two.
df = df.apply(pd.to_numeric, errors='coerce')


### Step 5: Handle Missing Values


In [43]:
# Fill NaNs in numeric columns with the median of that column.
# The result is assigned back to df[col] rather than calling
# fillna(inplace=True) on df[col]: that form is chained assignment, which
# emits a FutureWarning in pandas 2.x and silently leaves df unchanged once
# Copy-on-Write is enabled (the default from pandas 3).
for col in df.columns:
    if df[col].isnull().any():
        df[col] = df[col].fillna(df[col].median())


### Step 6: Clean Target Column (Optional)
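In the Cleveland dataset, the target column can take the values 0, 1, 2, 3, or 4, where 0 means no heart disease and 1–4 indicate increasing degrees of disease. The cell below collapses these into a binary label: 0 (no disease) vs. 1 (any disease present).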


In [44]:
# Collapse the target into a binary label: 1 for any value above 0, else 0.
# The vectorized comparison replaces a per-row Python lambda; a NaN compares
# as False and therefore maps to 0, exactly as `1 if x > 0 else 0` did.
df['target'] = df['target'].gt(0).astype('int64')

In [53]:
# Sanity-check the cleaned frame: dimensions, leftover NaNs, and column labels.
has_missing = df.isna().to_numpy().any()
print("Final shape:", df.shape)
print("Any NaNs left?", has_missing)
print("Column names:", list(df.columns))


Final shape: (864, 14)
Any NaNs left? False
Column names: ['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal', 'target']


### Step 7: Save Final Clean Format


In [56]:
# Persist the cleaned frame; index=False keeps the row index out of the file.
cleaned_path = '../data/cleaned_cleveland.csv'
df.to_csv(cleaned_path, index=False)
