In [None]:
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
import pickle

# Load dataset
# The CSV location can be overridden with the HEART_CSV_PATH environment
# variable so the notebook is not tied to a single machine. When the variable
# is unset, the original absolute path is used unchanged.
HEART_CSV_PATH = os.environ.get(
    'HEART_CSV_PATH',
    r'C:\Users\USER\Documents\Heart_Disease Kaggle\Data\heart.csv',
)
df = pd.read_csv(HEART_CSV_PATH)

print("="*80)
print("✓ PHASE 4: DATA PREPROCESSING & FEATURE ENGINEERING")
print("="*80)

# STEP 1: Handle missing values (zeros)
# A value of 0 in RestingBP / Cholesterol is treated as a "missing" sentinel
# and replaced with the median of the same HeartDisease class.
# NOTE(review): the fill value is derived per target class and from the full
# dataset (before the train/test split), so label and test-set information
# leaks into the features — consider fitting the imputation on the training
# split only and without conditioning on the target.
print("\n1. Handling missing values...")
for disease_status in [0, 1]:
    in_class = df['HeartDisease'] == disease_status
    for sentinel_col in ['RestingBP', 'Cholesterol']:
        is_sentinel = df[sentinel_col] == 0
        to_fill = in_class & is_sentinel
        if not to_fill.any():
            continue

        # Median of the genuinely observed values only: including the zero
        # sentinels would drag the fill value towards 0 (and make it exactly 0
        # when more than half of the class is missing).
        fill_value = df.loc[in_class & ~is_sentinel, sentinel_col].median()
        if pd.isna(fill_value):
            # No observed value in this class to impute from; leave the zeros
            # in place so the counts printed below expose the problem.
            continue

        # The median may be fractional; cast up front so the assignment never
        # has to silently upcast an integer column.
        df[sentinel_col] = df[sentinel_col].astype(float)
        df.loc[to_fill, sentinel_col] = fill_value

print(f"   RestingBP zeros: {(df['RestingBP'] == 0).sum()}")
print(f"   Cholesterol zeros: {(df['Cholesterol'] == 0).sum()}")

# STEP 2: Encode categorical variables
print("\n2. Encoding categorical variables...")
categorical_cols = ['Sex', 'ChestPainType', 'RestingECG', 'ST_Slope', 'ExerciseAngina']
encoders = {}

# Two-level columns become integer codes. Each fitted LabelEncoder is kept in
# `encoders` so it can be pickled with the other artifacts later on.
for col in ('Sex', 'ExerciseAngina'):
    le = LabelEncoder().fit(df[col])
    df[col] = le.transform(df[col])
    encoders[col] = le
    print(f"   {col}: Encoded")

# Multi-level nominal columns are swapped for indicator columns named
# "<column>_<level>"; the first level is dropped as the reference category.
for col in ('ChestPainType', 'RestingECG', 'ST_Slope'):
    dummies = pd.get_dummies(df[col], prefix=col, drop_first=True)
    df = pd.concat([df.drop(columns=col), dummies], axis=1)

print(f"   Dataset shape after encoding: {df.shape}")

# STEP 3: Feature engineering
print("\n3. Creating new features...")
df = df.assign(
    # Ordinal age bucket: (0, 30] -> 0, (30, 40] -> 1, ..., (60, 100] -> 4.
    Age_Group=pd.cut(df['Age'], bins=[0, 30, 40, 50, 60, 100], labels=[0, 1, 2, 3, 4]),
    # Max heart rate relative to age (the +1 keeps the denominator non-zero for Age == 0).
    HR_Age_Ratio=df['MaxHR'].div(df['Age'].add(1)),
    # 1 when cholesterol is strictly above 200, else 0.
    Cholesterol_High=df['Cholesterol'].gt(200).astype(int),
)
print(f"   3 new features created")

# STEP 4: Split train-test (80-20 stratified)
print("\n4. Splitting train-test sets...")
y = df['HeartDisease']
X = df.drop(columns='HeartDisease')

# Seeded hold-out split: 20% of the rows go to the test set, with the class
# proportions of `y` preserved in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, random_state=42, test_size=0.2
)
print(f"   Train set: {X_train.shape}")
print(f"   Test set: {X_test.shape}")

# STEP 5: Scale numerical features
print("\n5. Scaling numerical features...")
numerical_cols = ['Age', 'RestingBP', 'Cholesterol', 'MaxHR', 'Oldpeak', 'HR_Age_Ratio']

# Learn the scaling statistics from the training rows only, then apply that
# same fitted transform to both splits.
scaler = StandardScaler().fit(X_train[numerical_cols])

# Work on copies so the unscaled X_train / X_test stay available.
X_train_scaled = X_train.copy()
X_test_scaled = X_test.copy()
X_train_scaled[numerical_cols] = scaler.transform(X_train[numerical_cols])
X_test_scaled[numerical_cols] = scaler.transform(X_test[numerical_cols])

print(f"   Features scaled successfully")

# STEP 6: Save artifacts
print("\n6. Saving preprocessed data and artifacts...")
# Ensure target directories exist (paths are relative to the working directory)
os.makedirs('data', exist_ok=True)
os.makedirs('models', exist_ok=True)

# Ensure scaled outputs are DataFrames before saving: if a scaling step handed
# back a raw numpy array, re-attach the column names and index of the split.
X_train_df = (
    pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
    if isinstance(X_train_scaled, np.ndarray)
    else X_train_scaled
)
X_test_df = (
    pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)
    if isinstance(X_test_scaled, np.ndarray)
    else X_test_scaled
)

# The index is dropped from every file, so features and targets are matched
# purely by row position when they are read back.
X_train_df.to_csv('data/X_train_scaled.csv', index=False)
X_test_df.to_csv('data/X_test_scaled.csv', index=False)
y_train.to_csv('data/y_train.csv', index=False)
y_test.to_csv('data/y_test.csv', index=False)

# Fitted transformers needed to apply the same preprocessing at inference time.
with open('models/scaler.pkl', 'wb') as f:
    pickle.dump(scaler, f)

with open('models/encoders.pkl', 'wb') as f:
    pickle.dump(encoders, f)

print(f"   All files saved successfully")

print("\n" + "="*80)
print("✓ PHASE 4 COMPLETE!")
print("="*80)
# By this point `df` has been imputed, encoded and extended with engineered
# columns, and it still contains the HeartDisease target. Its shape is therefore
# that of the processed dataset, not of the original file, and is labelled so.
print(f"\nProcessed dataset (features + target): {df.shape}")
print(f"Final features: {X_train_scaled.shape}")
print(f"\nReady for Phase 5: Model Building")


In [None]:
# Replace the 0 sentinel in RestingBP with the median of the same HeartDisease
# class. The median is taken over non-zero readings only, so the sentinels do
# not bias the value used to replace them.
# NOTE(review): imputing per target class leaks label information into the
# feature — consider a target-independent imputation fitted on training data.
for disease_status in [0, 1]:
    in_class = df['HeartDisease'] == disease_status
    bp_missing = df['RestingBP'] == 0
    to_fill = in_class & bp_missing
    if not to_fill.any():
        continue  # nothing to impute for this class (e.g. already cleaned)

    median_bp = df.loc[in_class & ~bp_missing, 'RestingBP'].median()
    if pd.isna(median_bp):
        continue  # no observed reading in this class to impute from

    # The median may be fractional; cast first so the assignment does not have
    # to silently upcast an integer column.
    df['RestingBP'] = df['RestingBP'].astype(float)
    df.loc[to_fill, 'RestingBP'] = median_bp


In [None]:
# Label encode binary (only if needed)
# "Needed" means the column still holds raw labels. Testing for a non-numeric
# dtype (instead of `dtype == object`) also recognises pandas' dedicated string
# and categorical dtypes, while an already integer-encoded column is skipped.
if 'Sex' in df.columns and not pd.api.types.is_numeric_dtype(df['Sex']):
    le = LabelEncoder()
    df['Sex'] = le.fit_transform(df['Sex'])
    print("  Sex: encoded")
else:
    print("  Sex already encoded or not present; skipping")

# One-hot encode nominal (only if column exists)
if 'ChestPainType' in df.columns:
    # Use the column name as prefix so the indicator columns are named exactly
    # like the ones produced by the main preprocessing cell (prefix=col).
    dummies = pd.get_dummies(df['ChestPainType'], prefix='ChestPainType', drop_first=True)
    df = pd.concat([df.drop(columns='ChestPainType'), dummies], axis=1)
    print("  ChestPainType: one-hot encoded")
else:
    print("  ChestPainType not found; skipping one-hot encoding")


In [None]:
# Derived features, added (or overwritten if already present) in one step.
df = df.assign(
    # Ordinal age bucket: (0, 30] -> 0, (30, 40] -> 1, ..., (60, 100] -> 4.
    Age_Group=pd.cut(df['Age'], bins=[0, 30, 40, 50, 60, 100], labels=[0, 1, 2, 3, 4]),
    # Max heart rate relative to age (the +1 keeps the denominator non-zero for Age == 0).
    HR_Age_Ratio=df['MaxHR'].div(df['Age'].add(1)),
    # 1 when cholesterol is strictly above 200, else 0.
    Cholesterol_High=df['Cholesterol'].gt(200).astype(int),
)


In [None]:
def find_outliers_iqr(data):
    """Flag values that fall outside the 1.5 * IQR fences.

    Parameters
    ----------
    data
        Object exposing ``quantile`` and element-wise comparisons
        (presumably a pandas Series — the function is not called in the
        visible part of the notebook, so confirm against its callers).

    Returns
    -------
    Boolean mask shaped like ``data``: True where a value is strictly below
    ``Q1 - 1.5 * IQR`` or strictly above ``Q3 + 1.5 * IQR``.
    """
    lower_quartile = data.quantile(0.25)
    upper_quartile = data.quantile(0.75)
    whisker = 1.5 * (upper_quartile - lower_quartile)

    too_low = data < lower_quartile - whisker
    too_high = data > upper_quartile + whisker
    return too_low | too_high


In [None]:
# Seeded, class-stratified hold-out split (20% test). `X` and `y` must already
# exist in the kernel; this cell does not define them.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, random_state=42, test_size=0.2
)


In [None]:
scaler = StandardScaler()
# Determine numeric columns to scale (safe fallback)
# Only continuous measurements are standardised: integer/float/complex columns
# with more than two distinct training values. Boolean dummies and 0/1 flags
# are left untouched, so the selection no longer depends on whether
# get_dummies produced bool or uint8 indicators.
# NOTE(review): for the columns built in this notebook this is expected to pick
# the same columns as the explicit list in the main preprocessing cell — verify
# against the actual data.
numerical_cols = [
    c for c in X_train.columns
    if X_train[c].dtype.kind in 'ifc' and X_train[c].nunique() > 2
]
# Create DataFrame copies and scale numeric columns only; the scaler is fitted
# on the training split and merely applied to the test split.
X_train_scaled = X_train.copy()
X_train_scaled[numerical_cols] = scaler.fit_transform(X_train[numerical_cols])
X_test_scaled = X_test.copy()
X_test_scaled[numerical_cols] = scaler.transform(X_test[numerical_cols])


In [None]:
# Persist the scaled feature matrices. The output directory is created here so
# the cell does not depend on another cell having created it first.
# Note: this overwrites any X_*_scaled.csv files written earlier in the notebook.
os.makedirs('data', exist_ok=True)
X_train_scaled.to_csv('data/X_train_scaled.csv', index=False)
X_test_scaled.to_csv('data/X_test_scaled.csv', index=False)


In [None]:
# Persist the fitted scaler. The output directory is created here so the cell
# does not depend on another cell having created it first.
# Note: this overwrites any models/scaler.pkl written earlier in the notebook.
os.makedirs('models', exist_ok=True)
with open('models/scaler.pkl', 'wb') as f:
    pickle.dump(scaler, f)


In [None]:
# Sanity report: shapes of both scaled splits, plus the total number of
# missing cells in the scaled training frame.
print(f"Train shape: {X_train_scaled.shape}")
print(f"Test shape: {X_test_scaled.shape}")
print(f"Missing values: {X_train_scaled.isna().sum().sum()}")


## Phase 4 — Data Preprocessing & Feature Engineering

This document summarizes the preprocessing and feature engineering performed in `Notebooks/Phase4.ipynb`, lists the artifacts produced, and provides quick run and troubleshooting instructions.

- **Purpose:** Prepare the cleaned, encoded, and scaled datasets for modeling (Phase 5). Create reproducible preprocessing artifacts such as scaled CSVs and serialized transformers/encoders.
- **Notebook:** `Notebooks/Phase4.ipynb`

**Produced Artifacts**
- `data/X_train_scaled.csv`: Scaled training features (DataFrame CSV).
- `data/X_test_scaled.csv`: Scaled test features (DataFrame CSV).
- `data/y_train.csv`, `data/y_test.csv`: Target splits as CSVs.
- `models/scaler.pkl`: Pickled `StandardScaler` fitted on training data.
- `models/encoders.pkl`: Pickled dictionary of encoders (e.g., `LabelEncoder` instances).

**Main Steps (high level)**
- Handle missing / sentinel values: median-impute zeros in `RestingBP` and `Cholesterol` grouped by `HeartDisease` status.
- Encode categorical variables:
  - Label-encode binary columns (e.g., `Sex`, `ExerciseAngina`).
  - One-hot encode nominal columns (e.g., `ChestPainType`, `RestingECG`, `ST_Slope`) when present.
- Create features such as `Age_Group`, `HR_Age_Ratio`, and `Cholesterol_High`.
- Split the dataset via `train_test_split(..., stratify=y, test_size=0.2)`.
- Scale numeric columns with `StandardScaler` fit on the training set only.
- Persist CSVs and pickled artifacts to the `data/` and `models/` directories.

**How to run (PowerShell)**
1. From the project root, execute the notebook headless (example):

```powershell
python -m nbconvert --to notebook --execute "Notebooks\Phase4.ipynb" --output "Notebooks\Phase4_executed.ipynb"
```

2. Or run interactively in VS Code / Jupyter and execute cells in order.

**Notes & Troubleshooting**
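- Missing output directories: The main preprocessing cell creates `data/` and `models/` using `os.makedirs(..., exist_ok=True)` before saving. These output paths are relative, so they resolve against the current working directory. If you still see a `FileNotFoundError`, confirm file system permissions, that the notebook is run from the project root, and that the input CSV passed to `pd.read_csv` exists on your machine (the notebook's default is an absolute Windows path).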
- `AttributeError: 'numpy.ndarray' object has no attribute 'to_csv'`: Fixed — numeric scaling is applied to numeric columns of DataFrame copies so `X_train_scaled` / `X_test_scaled` remain DataFrames. If you encounter this, ensure the scaling cell produces DataFrames (not raw numpy arrays) before calling `to_csv`.
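- Missing categorical columns (e.g., `ChestPainType`): The standalone encoding cell checks that the column exists before one-hot encoding and skips the step if it is absent (for example, because the main preprocessing cell has already encoded and dropped it). The encoding loop in the main preprocessing cell performs no such check, so running it on an already-encoded `df` raises a `KeyError`; re-run that cell from the top so the data is reloaded first.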
- Encoder serialization: `models/encoders.pkl` contains a dictionary of fitted encoders; ensure you load these when transforming new data for inference.

