# In [None]:
# ===========================================================
# FEATURE ENGINEERING PIPELINE FOR DEMAND FORECASTING
# ===========================================================

import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

print("Loading cleaned dataset...")
# Read the cleaned retail data; the Date column is parsed to datetime so the
# `.dt` accessor can be used below.
df = pd.read_csv(
    "../data/cleaned_retail_data.csv",
    parse_dates=["Date"],
)

# ===========================================================
# 1. DATE FEATURES
# ===========================================================
print("Creating date-based features...")

# NOTE(review): these columns are added to `df` only. The daily aggregation
# further down selects just Total_Purchases, so they are not carried into the
# `daily` frame -- confirm whether that is intended.
_date_parts = df['Date'].dt

# Plain calendar components.
df['year'] = _date_parts.year
df['month'] = _date_parts.month
df['day'] = _date_parts.day
df['weekday'] = _date_parts.weekday  # Monday=0 ... Sunday=6
df['weekofyear'] = _date_parts.isocalendar().week.astype(int)  # ISO week number

# 0/1 indicator flags.
df['is_weekend'] = df['weekday'].ge(5).astype(int)  # Saturday (5) or Sunday (6)
df['is_month_start'] = _date_parts.is_month_start.astype(int)
df['is_month_end'] = _date_parts.is_month_end.astype(int)

# ===========================================================
# 2. DAILY AGGREGATION FOR FORECASTING
# ===========================================================
print("Aggregating to daily demand...")

# Sum Total_Purchases for every (Date, Product_Category, Country) combination.
_demand_totals = df.groupby(['Date','Product_Category','Country'])['Total_Purchases'].sum()

# Turn the group keys back into columns, then order rows so that each
# (Product_Category, Country) series is contiguous and chronological.
daily = _demand_totals.reset_index()
daily = daily.sort_values(['Product_Category','Country','Date'])

# ===========================================================
# 3. LAG FEATURES (7, 14, 21 DAYS)
# ===========================================================
print("Creating lag features...")

# Previous values of Total_Purchases within each (Product_Category, Country)
# series, stored as lag_7, lag_14 and lag_21.
# NOTE(review): shift(k) moves by k *rows* inside a group; that equals k
# calendar days only if every group has exactly one row per date -- confirm
# the series have no date gaps.
for _lag in (7, 14, 21):
    daily[f'lag_{_lag}'] = (
        daily.groupby(['Product_Category','Country'])['Total_Purchases'].shift(_lag)
    )

# ===========================================================
# 4. ROLLING FEATURES (7 & 14 DAYS)
# ===========================================================
print("Creating rolling-window features...")

# Shift by one row inside each (Product_Category, Country) group so a row's
# rolling statistics never include its own Total_Purchases.
_previous_demand = daily.groupby(['Product_Category','Country'])['Total_Purchases'].shift(1)

# The rolling window runs over the whole shifted series, not per group.
# shift(1) leaves a NaN in the first row of every group, and an integer window
# defaults to min_periods == window, so any window that reaches back across a
# group boundary contains that NaN and evaluates to NaN. This relies on
# `daily` still being ordered by group and then Date.
for _window in (7, 14):
    _rolled = _previous_demand.rolling(_window)
    daily[f'roll_mean_{_window}'] = _rolled.mean()
    daily[f'roll_std_{_window}'] = _rolled.std()

# ===========================================================
# 5. TRAIN/TEST SPLIT (LAST 60 DAYS = TEST)
# ===========================================================
print("Creating train/test split...")

# Everything after `split_date` (the final 60 days up to the latest Date in
# `daily`) is held out as the test set; the rest is training data.
_latest_date = daily['Date'].max()
split_date = _latest_date - pd.Timedelta(days=60)

_in_train = daily['Date'] <= split_date
_in_test = daily['Date'] > split_date

# Copies, so later column assignments do not write into views of `daily`.
daily_train = daily.loc[_in_train].copy()
daily_test = daily.loc[_in_test].copy()

# ===========================================================
# 6. TOP-N PRODUCT GROUPING (TRAIN ONLY)
# ===========================================================
print("Applying TOP-N product grouping...")

TOP_N = 50

# Rank products using only transactions from the training window
# (Date <= split_date). Ranking over the whole of `df` would let purchases
# from the held-out test period decide which products get their own category.
_train_period = df['Date'] <= split_date
top_products = (
    df.loc[_train_period, 'products']
      .value_counts()
      .head(TOP_N)
      .index
      .tolist()
)

# Keep the top-N product names and collapse everything else (including
# missing values) into 'OTHER'. isin/where is vectorised, so it avoids a
# Python-level list scan for every row.
df['products_grouped'] = df['products'].where(df['products'].isin(top_products), 'OTHER')

# merge product grouping into daily frames
# NOTE(review): `df` can hold many rows per (Date, Product_Category, Country)
# key -- `daily` was built by summing over exactly those groups -- so this left
# merge emits one output row per matching `df` row, repeating the daily totals
# and lag/rolling features. Confirm this row multiplication is intended.
_product_lookup = df[['Date','Product_Category','Country','products_grouped']]

daily_train = daily_train.merge(
    _product_lookup,
    on=['Date','Product_Category','Country'],
    how='left'
)

daily_test = daily_test.merge(
    _product_lookup,
    on=['Date','Product_Category','Country'],
    how='left'
)

# ===========================================================
# 7. FREQUENCY ENCODING (TRAIN ONLY)
# ===========================================================
print("Applying frequency encoding...")

# Relative frequency of each product group, measured on the training rows only.
freq = daily_train['products_grouped'].value_counts(normalize=True)

_train_groups = daily_train['products_grouped']
_test_groups = daily_test['products_grouped']

daily_train['product_freq'] = _train_groups.map(freq)

# Product groups never seen in training have no entry in `freq`; give them 0.
_test_freq = _test_groups.map(freq)
daily_test['product_freq'] = _test_freq.fillna(0)

# ===========================================================
# 8. TARGET ENCODING (TRAIN ONLY)
# ===========================================================
print("Applying target encoding...")

# Statistics are computed from the training rows only.
_train_target = daily_train['Total_Purchases']
global_mean = _train_target.mean()

# Mean Total_Purchases per product group.
target_enc = _train_target.groupby(daily_train['products_grouped']).mean()

daily_train['product_target_enc'] = daily_train['products_grouped'].map(target_enc)

# Product groups absent from training fall back to the overall training mean.
_test_encoded = daily_test['products_grouped'].map(target_enc)
daily_test['product_target_enc'] = _test_encoded.fillna(global_mean)

# ===========================================================
# 9. LABEL ENCODING FOR CATEGORY + COUNTRY + PRODUCT GROUP
# ===========================================================
print("Label encoding category and country...")

LE_cols = ['Product_Category','Country','products_grouped']

# One fitted encoder per column, kept so the integer codes can be mapped back
# to the original labels later (label_encoders[col].inverse_transform).
label_encoders = {}
for col in LE_cols:
    # astype(str) also turns missing values into the literal label 'nan'.
    _train_labels = daily_train[col].astype(str)
    _test_labels = daily_test[col].astype(str)

    # Fit on the labels of both frames: an encoder fitted on train alone
    # raises ValueError in transform() for any label that only occurs in the
    # test window. Classes are the sorted unique labels, so the codes are
    # unchanged whenever the test frame holds no unseen label.
    le = LabelEncoder()
    le.fit(pd.concat([_train_labels, _test_labels], ignore_index=True))

    daily_train[col] = le.transform(_train_labels)
    daily_test[col]  = le.transform(_test_labels)
    label_encoders[col] = le

# ===========================================================
# 10. DROP NA ROWS CAUSED BY LAGS
# ===========================================================
print("Dropping NA lag rows...")

# Remove every row that still contains a missing value (e.g. lag/rolling
# columns with too little history) and renumber each frame from 0.
daily_train, daily_test = (
    _frame.dropna().reset_index(drop=True)
    for _frame in (daily_train, daily_test)
)

# ===========================================================
# 11. SAVE FINAL DATASETS
# ===========================================================
print("Saving final engineered datasets...")

# Write both frames to the current working directory, without the row index.
daily_train.to_csv("final_train.csv", index=False)
daily_test.to_csv("final_test.csv", index=False)

# The check mark had been stored as mojibake ("âœ“": its UTF-8 bytes decoded
# as cp1252); restored to the intended character.
print("Feature Engineering Complete ✓")
