# Preprocessing

In [None]:
import pandas as pd
import numpy as np
import os
import joblib

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
# from category_encoders import TargetEncoder # Optional: for high-cardinality

import sys
# Make the parent directory importable: '..' is resolved against the current
# working directory, so this line must run before the `src` import below.
sys.path.append(os.path.abspath(os.path.join('..')))
# Project-local configuration module (not referenced in the cells visible here).
from src import config

# Confirmation banner so a notebook user can see the setup cell ran.
print("✅ Setup complete.")

In [None]:
# Where the raw data lives and which column holds the label
RAW_DATA_PATH = '../data/raw/dataset.csv'
TARGET_COL = 'target'  # Replace with actual target

# Read the CSV and immediately discard rows whose target is missing:
# an unlabeled row cannot be used for supervised training or evaluation.
df = pd.read_csv(RAW_DATA_PATH).dropna(subset=[TARGET_COL])

### Train/Test Split

In [None]:
# Separate features and target
X = df.drop(columns=[TARGET_COL])
y = df[TARGET_COL]

TEST_SIZE = 0.2
# Integer targets with more distinct values than this are treated as
# continuous (regression-style) and are not stratified.
MAX_INT_CLASSES = 20

# Decide whether the target holds class labels. Checking only for dtype
# 'object' misses integer-coded (0/1), boolean and categorical targets, which
# would then be split without preserving class proportions.
is_label_like = (
    pd.api.types.is_object_dtype(y.dtype)
    or pd.api.types.is_string_dtype(y.dtype)
    or pd.api.types.is_bool_dtype(y.dtype)
    or isinstance(y.dtype, pd.CategoricalDtype)
    or (pd.api.types.is_integer_dtype(y.dtype) and y.nunique() <= MAX_INT_CLASSES)
)

# Stratification is only feasible when every observed class has at least two
# rows and both partitions can hold at least one row per class; otherwise
# train_test_split raises a ValueError.
class_counts = y.value_counts()
class_counts = class_counts[class_counts > 0]  # ignore unused categorical levels
n_test = int(np.ceil(TEST_SIZE * len(y)))
n_train = len(y) - n_test
can_stratify = (
    is_label_like
    and len(class_counts) > 0
    and class_counts.min() >= 2
    and len(class_counts) <= min(n_train, n_test)
)

if is_label_like and not can_stratify:
    print(
        "WARNING: target looks categorical but cannot be stratified "
        "(a class has fewer than 2 rows, or there are more classes than rows "
        "in a partition). Falling back to a plain random split."
    )

# Split IMMEDIATELY. The test set goes into the vault.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=TEST_SIZE,
    random_state=42,
    stratify=y if can_stratify else None,
)

print(f"Training shapes: X={X_train.shape}, y={y_train.shape}")
print(f"Testing shapes: X={X_test.shape}, y={y_test.shape}")

### Feature Grouping

In [None]:
# Automatically categorize columns.
# 'number' selects every numeric dtype (int32, float32, unsigned, nullable
# Int64/Float64, ...) rather than only int64/float64, so numeric columns stored
# in a narrower or nullable dtype are not silently left out.
num_cols = X_train.select_dtypes(include='number').columns.tolist()
cat_cols = X_train.select_dtypes(include=['object', 'category']).columns.tolist()

# Note: In a real project, manually move high-cardinality or ordinal columns
# to separate lists here if they require different treatment (e.g., TargetEncoding)

# The ColumnTransformer built later uses remainder='drop', so any column that
# is in neither list (e.g. bool or datetime) disappears from the model inputs.
# Surface those columns here instead of losing them silently.
_grouped = set(num_cols) | set(cat_cols)
unassigned_cols = [col for col in X_train.columns if col not in _grouped]
if unassigned_cols:
    _unassigned_dtypes = {col: str(X_train[col].dtype) for col in unassigned_cols}
    print(
        f"WARNING: {len(unassigned_cols)} column(s) match neither group and "
        f"will be dropped by the preprocessor: {_unassigned_dtypes}"
    )

print(f"Numeric features ({len(num_cols)}): {num_cols[:5]}...")
print(f"Categorical features ({len(cat_cols)}): {cat_cols[:5]}...")

### Building Preprocessor

In [None]:
# --- Numeric branch -------------------------------------------------------
# Missing values are replaced by the median learned from the training data,
# after which each feature is standardized.
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# --- Categorical branch ---------------------------------------------------
# Missing values become the literal category 'missing'; the result is one-hot
# encoded into a dense array. With handle_unknown='ignore', a category that
# was never seen during fitting is encoded as all zeros rather than raising.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
])

# --- Assemble -------------------------------------------------------------
# Route each column group to its branch. Columns listed in neither group are
# discarded (remainder='drop').
column_routes = [
    ('num', numeric_transformer, num_cols),
    ('cat', categorical_transformer, cat_cols),
]
preprocessor = ColumnTransformer(transformers=column_routes, remainder='drop')

### Fit and Transform

In [None]:
# Learn imputation values, scaling statistics and category vocabularies from
# the training split only, and apply them to that same split.
X_train_processed = preprocessor.fit_transform(X_train)

# The test split is transformed with the already-fitted preprocessor; nothing
# is re-learned from it, which keeps test information out of the fitted rules.
X_test_processed = preprocessor.transform(X_test)

# Output column names, kept for downstream interpretability.
feature_names = preprocessor.get_feature_names_out()
n_features_out = len(feature_names)
print(f"Resulting feature count: {n_features_out}")

### Save

In [None]:
# Ensure the output folders exist before anything is written.
for _out_dir in ('../data/processed', '../models'):
    os.makedirs(_out_dir, exist_ok=True)


def _attach_target(features, target):
    """Return the processed features as a DataFrame with the target appended.

    The target's index is reset so it lines up positionally with the freshly
    built frame, whose index starts again at 0.
    """
    frame = pd.DataFrame(features, columns=feature_names)
    frame[TARGET_COL] = target.reset_index(drop=True)
    return frame


# Processed splits, with labels re-attached, for the modeling notebook.
train_fe = _attach_target(X_train_processed, y_train)
test_fe = _attach_target(X_test_processed, y_test)

train_fe.to_csv('../data/processed/train_fe.csv', index=False)
test_fe.to_csv('../data/processed/test_fe.csv', index=False)

# Persist the fitted preprocessor so the same transformation can be reloaded later.
PIPELINE_PATH = '../models/preprocessor.joblib'
joblib.dump(preprocessor, PIPELINE_PATH)

print("✅ Preprocessing complete. Artifacts saved.")