# Cleaning and Feature Engineering

In [None]:
import pandas as pd
import sys
import os

# Dynamic path setup: make the parent of the current working directory
# importable so the `src` imports that follow can resolve.
project_root = os.path.abspath('..')

# Only append when absent: re-running this cell would otherwise push the
# same entry onto sys.path again on every execution.
if project_root not in sys.path:
    sys.path.append(project_root)

from src import config
from src.data_loader import load_raw_data
from src import preprocessing as pp # Import our new toolkit

print("✅ Setup complete.")

In [None]:
# Load the raw dataset through the project's data loader.
df = load_raw_data()
# Remember the pre-cleaning shape; the cleaning cell uses it to report how
# many rows were dropped.
original_shape = df.shape
# Preview the first rows (kept as the last expression so the notebook
# renders it as the cell output).
df.head()

### Cleaning

In [None]:
# Steps 1 and 2 in one chain: standardize the column names, then discard
# rows that are exact duplicates of an earlier row.
df = pp.clean_column_names(df).drop_duplicates()

# Step 3: fill in missing values. 'median' is the strategy in use; per the
# toolkit, 'mean' or 'constant' can be passed instead.
df = pp.handle_missing_values(df, strategy='median')

# Report the resulting shape and how many rows cleaning removed relative
# to the raw data loaded earlier.
print(f"Shape after cleaning: {df.shape} (Dropped {original_shape[0] - df.shape[0]} rows)")

### Feature Engineering

### Outlier Removal (if needed)

In [None]:
# Collect the numeric columns that are candidates for the outlier check.
# NOTE(review): only float64/int64 are matched here; other numeric widths
# (int32, float32, ...) are skipped — confirm that is intended.
num_cols = (
    df.select_dtypes(include=['float64', 'int64'])
    .columns
    .tolist()
)

# The target must not take part in the outlier check, so rows are never
# removed merely because of their target value.
try:
    num_cols.remove(config.TARGET_COLUMN)
except ValueError:
    pass  # target is not among the selected numeric columns

# Outlier removal is optional; uncomment the two lines below to enable it.
# df = pp.remove_outliers_iqr(df, columns=num_cols)
# print(f"Shape after outlier removal: {df.shape}")

### Encoding

In [None]:
# Encode the categorical variables. The target column is passed separately
# (presumably so the helper can treat it differently from the features —
# verify in src.preprocessing).
df = pp.encode_categorical(df, target_col=config.TARGET_COLUMN)

# Final check: print a summary of the encoded frame (columns, dtypes,
# non-null counts) to confirm the result before saving.
df.info()

### Save

In [None]:
# Destination of the cleaned dataset, relative to the current working
# directory.
save_path = os.path.join("..", "data", "processed", "clean_data.csv")

# to_csv does not create missing parent folders, so make sure the output
# directory exists before writing (a no-op when it is already there).
os.makedirs(os.path.dirname(save_path), exist_ok=True)

# Save without the index so the CSV contains only the data columns.
df.to_csv(save_path, index=False)
print(f"✅ Clean data saved to: {save_path}")