In [1]:
"""
Feature Engineering Pipeline — Version 3 (Leakage-free, robust, train-only encodings)
Save as: feature_engineering_v3.py

Key improvements over v2:
 - compute train/test split on transaction-level BEFORE any train-only encodings
 - frequency & brand encodings computed only on train and mapped to both sets
 - deduplicate before merging transactional-level product grouping into daily
 - per-column LabelEncoder instances (saved)
 - safe target encoding (smoothed) computed on train only
 - robust rolling / lag computation using group transforms
 - checks for missing columns and informative errors
 - saves engineered datasets and pickled encoders

Usage: run in same environment as your data; adjust paths and TOP_N as needed.
"""

import os
import pickle
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# ------------------------
# Config
# ------------------------
# Input: cleaned transaction-level CSV read by the loading step.
RAW_PATH = Path("../data/cleaned_retail_data.csv")
# Output directory; created (with parents) at run time if it does not exist.
OUT_DIR = Path("../data")
OUT_DIR.mkdir(parents=True, exist_ok=True)
# Destinations of the engineered train/test tables and the pickled encoders.
TRAIN_OUT = OUT_DIR / "final_train_v3.csv"
TEST_OUT = OUT_DIR / "final_test_v3.csv"
ENCODERS_OUT = OUT_DIR / "encoders_v3.pkl"
# Protocol number handed to pickle.dump when the encoders are saved.
PICKLE_PROTOCOL = 4
TOP_N = 50  # top N products for grouping
# Window sizes for the rolling mean/std features; they are passed to
# .rolling(window=...), so they count rows of the aggregated table.
ROLL_WINDOWS = [7, 14, 30]
# Offsets for the lag features; they are passed to .shift(...), so they
# count rows of the aggregated table as well.
LAGS = [1, 2, 3, 7, 14, 21]

# ------------------------
# Helpers
# ------------------------

def ensure_columns(df, cols):
    """Validate that every name in ``cols`` is a column of ``df``.

    Returns None when nothing is missing. Otherwise raises KeyError whose
    message lists all absent names, in the order they appear in ``cols``.
    """
    absent = list(filter(lambda name: name not in df.columns, cols))
    if not absent:
        return
    raise KeyError(f"Missing required columns in input: {absent}")


def smoothed_target_encoding(series, target, alpha=10):
    """
    Build a smoothed target-encoding table (intended to be fit on train only).

    Each category's mean target is shrunk toward the overall target mean:

        encoded = (n * category_mean + alpha * overall_mean) / (n + alpha)

    where n is the number of non-null target values in the category.

    Parameters
    ----------
    series : category labels, aligned with ``target``.
    target : numeric target values.
    alpha  : smoothing strength; higher -> stronger pull toward the overall mean.

    Returns
    -------
    (mapping, overall_mean) : dict of category -> encoded value, and the
    overall mean of ``target`` (useful as a fallback for unseen categories).
    """
    overall_mean = target.mean()
    by_category = target.groupby(series)
    n_obs = by_category.count()
    category_mean = by_category.mean()
    numerator = (n_obs * category_mean) + (alpha * overall_mean)
    encoded = numerator / (n_obs + alpha)
    return encoded.to_dict(), overall_mean


# ------------------------
# Load and validate
# ------------------------
print("Loading cleaned dataset...")
# Read WITHOUT parse_dates: with parse_dates=["Date"] a missing "Date" column
# makes read_csv itself fail, before ensure_columns can report every missing
# column in one informative KeyError. A missing file still raises here.
df = pd.read_csv(RAW_PATH)

# required transactional columns
required_cols = ["Date", "Product_Category", "Country", "Total_Purchases", "Product_Brand", "City"]
ensure_columns(df, required_cols)

# Convert only after validation. Unparsable dates fail loudly at this line
# instead of surfacing later at the first `.dt` access.
df["Date"] = pd.to_datetime(df["Date"])

# decide product column name: the first candidate present in the input wins
product_col = None
for cand in ["products", "Product_Name", "Product", "SKU"]:
    if cand in df.columns:
        product_col = cand
        break

if product_col is None:
    # we don't crash — create a placeholder by category if product-level is missing
    product_col = 'products'
    df[product_col] = df['Product_Category'].astype(str) + "::UNKNOWN_PRODUCT"
    print(f"Warning: no explicit product column found. Created placeholder column '{product_col}'.")

# ------------------------
# 1) Basic date features on transactions
# ------------------------
print("Creating transaction-level date features...")
# Calendar components taken from the parsed transaction date.
date_parts = df['Date'].dt
df['year'] = date_parts.year
df['month'] = date_parts.month
df['day'] = date_parts.day
df['weekday'] = date_parts.weekday

# Cyclic (sin/cos) encodings: weekday uses a period of 7, month a period of 12.
for part, period in (('weekday', 7), ('month', 12)):
    angle = 2 * np.pi * df[part] / period
    df[f'{part}_sin'] = np.sin(angle)
    df[f'{part}_cos'] = np.cos(angle)

# ------------------------
# 2) Time-based split ON TRANSACTION LEVEL (avoid leakage)
# ------------------------
print("Applying time-based transaction split (train/test)...")
# The hold-out period is the final 60 days before the latest transaction date.
latest_date = df['Date'].max()
cut_date = latest_date - pd.Timedelta(days=60)

# Two explicit comparisons (not a negated mask) so rows with a missing date
# fall into neither subset.
in_train = df['Date'] <= cut_date
in_test = df['Date'] > cut_date
trans_train = df.loc[in_train].copy()
trans_test = df.loc[in_test].copy()

print(f"transactions: total={len(df)}, train={len(trans_train)}, test={len(trans_test)}")

# ------------------------
# 3) Frequency encodings computed on train transactions and mapped everywhere
# ------------------------
print("Computing train-only frequency encodings for Product_Brand and City...")
# Relative frequencies are measured on the train transactions only.
brand_freq = trans_train['Product_Brand'].value_counts(normalize=True)
city_freq = trans_train['City'].value_counts(normalize=True)

# Attach the encodings to all three frames; values never seen in train get 0.
freq_tables = {'Product_Brand': brand_freq, 'City': city_freq}
for frame in (df, trans_train, trans_test):
    for source_col, table in freq_tables.items():
        frame[f'{source_col}_freq'] = frame[source_col].map(table).fillna(0.0)

# ------------------------
# 4) Create products_grouped using TOP_N from train transactions
# ------------------------
print("Creating TOP-N product grouping from train transactions...")
# The TOP_N most frequent products are ranked on train transactions only, so
# the grouping itself does not depend on the held-out period.
TOP_products = trans_train[product_col].value_counts().head(TOP_N).index.tolist()

def group_product(x):
    """Return x unchanged when it is one of TOP_products, otherwise 'OTHER'."""
    return x if x in TOP_products else 'OTHER'

for dset in (df, trans_train, trans_test):
    dset['products_grouped'] = dset[product_col].apply(group_product)

# Mapping table for the later merge into the daily aggregate.
# It must hold exactly ONE row per (Date, Product_Category, Country). Dropping
# duplicates over the key columns *plus* products_grouped keeps one row per
# distinct product group seen that day, which makes the left merge repeat every
# daily row (and its target) once per group and breaks the positional
# lag/rolling features. Keep the most frequent group per key instead; ties
# resolve to the first value returned by Series.mode (sorted order).
prod_mapping = (
    df.groupby(['Date', 'Product_Category', 'Country'])['products_grouped']
      .agg(lambda labels: labels.mode().iloc[0])
      .reset_index()
)

# ------------------------
# 5) Aggregate to daily level (on the full set) — use transaction-level freq enc columns
# ------------------------
print("Aggregating to daily demand (per Product_Category x Country x Date)...")

# The target is summed per day; the train-derived frequency encodings of the
# underlying transactions are averaged.
agg_cols = dict(Total_Purchases='sum', Product_Brand_freq='mean', City_freq='mean')

# Monetary columns are optional in the input: average whichever ones exist.
agg_cols.update(
    {name: 'mean' for name in ('Amount', 'Total_Amount') if name in df.columns}
)

daily_keys = ['Date', 'Product_Category', 'Country']
daily = df.groupby(daily_keys).agg(agg_cols).reset_index()
daily = daily.sort_values(['Product_Category', 'Country', 'Date'])

# ------------------------
# 6) Merge deduped product grouping into daily safely
# ------------------------
print("Merging product grouping into daily (deduplicated)...")
daily = pd.merge(daily, prod_mapping, how='left', on=['Date','Product_Category','Country'])

# Daily rows that found no mapping entry fall back to the catch-all bucket.
no_group = daily['products_grouped'].isna()
daily.loc[no_group, 'products_grouped'] = 'OTHER'

# ------------------------
# 7) Now compute rolling and lag features grouped by Product_Category + Country
# ------------------------
print("Creating lags and rolling windows on aggregated data...")

# Order rows chronologically inside every (category, country) series; the lag
# and rolling features below are positional (row based).
# NOTE(review): positional shifts equal calendar offsets only if each series
# has exactly one row per consecutive day — confirm against the data.
daily = daily.sort_values(['Product_Category','Country','Date']).reset_index(drop=True)

group_cols = ['Product_Category','Country']

def group_transform(df, colname, fn):
    """Apply fn to df[colname] within each group_cols group; the result is row-aligned with df."""
    return df.groupby(group_cols)[colname].transform(fn)


def _level_roll_mean(frame, level_col, window=30):
    """Rolling mean of past daily totals at a coarser level, aligned to frame rows.

    For each value of level_col, Total_Purchases is summed per Date, and each
    date receives the mean of up to `window` preceding dates observed for that
    value (the current date is excluded via shift(1)). Returns a NumPy array
    in the row order of frame.
    """
    # One row per (Date, category, country) so repeated key rows cannot
    # inflate the per-date totals.
    per_key = frame.drop_duplicates(subset=['Date'] + group_cols)
    # groupby sorts its keys, so dates are chronological within each level value.
    totals = per_key.groupby([level_col, 'Date'])['Total_Purchases'].sum()
    rolled = totals.groupby(level=level_col).transform(
        lambda s: s.shift(1).rolling(window, min_periods=1).mean()
    )
    lookup = rolled.rename('_level_roll').reset_index()
    # Left merge keeps the row order of frame; lookup keys are unique.
    aligned = frame[[level_col, 'Date']].merge(lookup, on=[level_col, 'Date'], how='left')
    return aligned['_level_roll'].to_numpy()


# LAGS
for lag in LAGS:
    daily[f'lag_{lag}'] = daily.groupby(group_cols)['Total_Purchases'].shift(lag)

# Rolling windows — use shift(1) to avoid including current day
for w in ROLL_WINDOWS:
    daily[f'roll_mean_{w}'] = group_transform(
        daily, 'Total_Purchases',
        lambda x: x.shift(1).rolling(window=w, min_periods=1).mean(),
    )
    daily[f'roll_std_{w}'] = group_transform(
        daily, 'Total_Purchases',
        lambda x: x.shift(1).rolling(window=w, min_periods=1).std(),
    )

# Category and country level rolling (30-day) — separate features.
# Grouping the row-sorted table by category (or country) alone would roll over
# rows ordered by (category, country, date), not by date: windows would cross
# from one country's or category's latest dates into another's earliest rows,
# pulling later-dated values into earlier rows. Roll over per-date totals instead.
print("Category and country-level rolling features...")
daily['cat_roll_30'] = _level_roll_mean(daily, 'Product_Category')
daily['country_roll_30'] = _level_roll_mean(daily, 'Country')

# ------------------------
# 8) Time-based split on aggregated DAILY data (same cut_date)
# ------------------------
print("Splitting aggregated data into train/test (daily)...")
# Same cut_date as the transaction-level split, so the daily train/test sets
# cover the same periods as the transactions the encodings were fit on.
train_daily = daily[daily['Date'] <= cut_date].copy()
test_daily  = daily[daily['Date'] >  cut_date].copy()

# Stray ")" removed from the end of the message (it printed "test=<n>)").
print(f"daily rows: total={len(daily)}, train={len(train_daily)}, test={len(test_daily)}")

# ------------------------
# 9) Frequency encoding of products_grouped (train-only)
# ------------------------
print("Applying frequency encoding for products_grouped (train-only)...")
# Share of daily train rows carried by each product group.
product_share = train_daily['products_grouped'].value_counts(normalize=True)
freq_map = product_share.to_dict()

train_daily['product_freq'] = train_daily['products_grouped'].map(freq_map)
# Groups that never occur in train get a frequency of 0 in test.
test_product_freq = test_daily['products_grouped'].map(freq_map)
test_daily['product_freq'] = test_product_freq.fillna(0.0)

# ------------------------
# 10) Smoothed target encoding (train-only)
# ------------------------
print("Applying smoothed target encoding for products_grouped (train-only)...")
# Fit the encoding table on the train rows only (alpha=20).
te_map, global_mean = smoothed_target_encoding(
    series=train_daily['products_grouped'],
    target=train_daily['Total_Purchases'],
    alpha=20,
)

train_daily['product_target_enc'] = train_daily['products_grouped'].map(te_map)
# Groups absent from the train table fall back to the overall train mean.
test_encoded = test_daily['products_grouped'].map(te_map)
test_daily['product_target_enc'] = test_encoded.fillna(global_mean)

# ------------------------
# 11) Label Encoding (per-column encoders saved)
# ------------------------
print("Label encoding category/country/products_grouped with independent encoders...")
LE_cols = ['Product_Category','Country','products_grouped']

# One independent encoder per column, kept so it can be pickled later.
encoders = {}
for col in LE_cols:
    train_labels = train_daily[col].astype(str)
    test_labels = test_daily[col].astype(str)
    # Fit on the union of train and test labels so transform never meets an
    # unseen class.
    le = LabelEncoder().fit(pd.concat([train_labels, test_labels]).unique())
    train_daily[f'{col}_le'] = le.transform(train_labels)
    test_daily[f'{col}_le'] = le.transform(test_labels)
    encoders[col] = le

# ------------------------
# 12) Drop rows with NA that are unavoidable (e.g., first N days for lags)
# ------------------------
print("Dropping rows with NA introduced by lag/rolling features in TRAIN only (keep test as-is to inspect)...")
# dropna() removes any train row containing a missing value in any column.
pre_drop_train = train_daily.shape[0]
train_daily = train_daily.dropna()
train_daily = train_daily.reset_index(drop=True)
post_drop_train = train_daily.shape[0]
print(f"Dropped {pre_drop_train - post_drop_train} rows from train due to NA (lags/rolls).")

# For test, it's often useful to keep NA (or impute). We'll drop rows where the target is NA (shouldn't be) but keep feature NA if you plan to impute later.
# If you want to drop NA in test as well, uncomment below line.
# test_daily = test_daily.dropna().reset_index(drop=True)

# ------------------------
# 13) Persist artifacts and final datasets
# ------------------------
print("Saving final datasets and encoders...")
for frame, destination in ((train_daily, TRAIN_OUT), (test_daily, TEST_OUT)):
    frame.to_csv(destination, index=False)

# Everything fit on train that is needed to reproduce the encodings later.
artifacts = {
    'label_encoders': encoders,
    'brand_freq': brand_freq.to_dict(),
    'city_freq': city_freq.to_dict(),
    'product_top_n': TOP_products,
    'target_encoding_map': te_map,
    'target_global_mean': global_mean,
}
with open(ENCODERS_OUT, 'wb') as f:
    pickle.dump(artifacts, f, protocol=PICKLE_PROTOCOL)

print("Feature Engineering v3 Complete ✓")
print(f"Saved: {TRAIN_OUT}, {TEST_OUT}, {ENCODERS_OUT}")


Loading cleaned dataset...
Creating transaction-level date features...
Applying time-based transaction split (train/test)...
transactions: total=301290, train=251542, test=49748
Computing train-only frequency encodings for Product_Brand and City...
Creating TOP-N product grouping from train transactions...
Aggregating to daily demand (per Product_Category x Country x Date)...
Merging product grouping into daily (deduplicated)...
Creating lags and rolling windows on aggregated data...
Category and country-level rolling features...
Splitting aggregated data into train/test (daily)...
daily rows: total=64245, train=53705, test=10540)
Applying frequency encoding for products_grouped (train-only)...
Applying smoothed target encoding for products_grouped (train-only)...
Label encoding category/country/products_grouped with independent encoders...
Dropping rows with NA introduced by lag/rolling features in TRAIN only (keep test as-is to inspect)...
Dropped 735 rows from train due to NA (lags/