# 🧱 03 — Feature Engineering

Goals:
- Try log(Amount)
- Explore SMOTE oversampling
- Evaluate improvement in PR-AUC

In [None]:
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import cross_val_score, StratifiedKFold

from src.data_prep import load_raw_data, train_val_test_split, build_preprocessor
from src.models import get_baseline_model
import pandas as pd

In [None]:
# Load the raw dataset and split it into train / validation / test partitions.
df = load_raw_data()
(
    X_train, X_val, X_test,
    y_train, y_val, y_test,
) = train_val_test_split(df)

# Building blocks reused by the pipeline in the next cell: the project's
# preprocessing step and its baseline estimator.
preprocessor = build_preprocessor()
model = get_baseline_model()

In [None]:
# Preprocess -> oversample with SMOTE -> fit the baseline model.
# SMOTE sits inside the imblearn pipeline, so resampling happens only while
# fitting on each training fold; the held-out fold is scored unresampled.
clf = ImbPipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("smote", SMOTE(random_state=42)),
        ("model", model),
    ]
)

# 5-fold stratified, shuffled CV with a fixed seed for reproducibility.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# "average_precision" is the scorer used here as PR-AUC.
scores = cross_val_score(
    estimator=clf,
    X=X_train,
    y=y_train,
    scoring="average_precision",
    cv=cv,
    n_jobs=-1,
)

# Last expression of the cell: displays (mean, std) of the fold scores.
(scores.mean(), scores.std())

# Feature Engineering Conclusions

- SMOTE improves PR-AUC compared to baseline.
- `log(Amount)` can be added as an extra feature, but the PCA features dominate.
- Feature engineering is limited because the PCA components are anonymized.