In [6]:
# ===========================================================
# FEATURE ENGINEERING PIPELINE (VERSION 2 - IMPROVED)
# Massive upgrade: richer seasonal features, extra lags,
# rolling windows, brand/city encodings, interactions, etc.
# ===========================================================

import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

print("Loading cleaned dataset...")
# Read the cleaned retail extract; the "Date" column is parsed to datetime
# at load time so the .dt accessors used below work directly.
df = pd.read_csv(
    "../data/cleaned_retail_data.csv",
    parse_dates=["Date"],
)

# ===========================================================
# 1. DATE FEATURES
# ===========================================================
print("Creating date features...")

# assign() evaluates its keyword arguments in order, so the derived columns
# (is_weekend and the sin/cos pairs) can read the calendar parts created
# earlier in the same call.
df = df.assign(
    # plain calendar parts
    year=lambda d: d['Date'].dt.year,
    month=lambda d: d['Date'].dt.month,
    day=lambda d: d['Date'].dt.day,
    weekday=lambda d: d['Date'].dt.weekday,
    weekofyear=lambda d: d['Date'].dt.isocalendar().week.astype(int),
    dayofyear=lambda d: d['Date'].dt.dayofyear,
    # weekday codes 5 and 6 are flagged as weekend
    is_weekend=lambda d: (d['weekday'] >= 5).astype(int),
    # cyclic (sin/cos) encoding: weekday on a period of 7, month on a period of 12
    weekday_sin=lambda d: np.sin(2 * np.pi * d['weekday'] / 7),
    weekday_cos=lambda d: np.cos(2 * np.pi * d['weekday'] / 7),
    month_sin=lambda d: np.sin(2 * np.pi * d['month'] / 12),
    month_cos=lambda d: np.cos(2 * np.pi * d['month'] / 12),
)

# ===========================================================
# 2. ADD BRAND & CITY FREQUENCY ENCODING
# ===========================================================
print("Adding brand/city frequency encoding...")

# Replace each brand / city by its share of rows in the loaded dataset.
# NOTE(review): the shares are computed over all rows, before the time-based
# split further down -- confirm that this is acceptable for evaluation.
for col in ('Product_Brand', 'City'):
    share = df[col].value_counts(normalize=True)
    df[f'{col}_freq'] = df[col].map(share)

# ===========================================================
# 3. DAILY AGGREGATION (Project Objective)
# ===========================================================
print("Aggregating to daily demand...")

# Calendar / seasonal columns built in the date-feature section. They are pure
# functions of Date, which is a grouping key here, so every row of a group
# holds the same value and 'first' forwards it unchanged. Without this they
# would be dropped by the aggregation and never reach the saved files.
date_feature_cols = [
    'year', 'month', 'day', 'weekday', 'weekofyear', 'dayofyear',
    'is_weekend', 'weekday_sin', 'weekday_cos', 'month_sin', 'month_cos',
]

# Demand is summed per group; amounts and frequency encodings are averaged.
agg_spec = {
    'Total_Purchases': 'sum',
    'Amount': 'mean',
    'Total_Amount': 'mean',
    'Product_Brand_freq': 'mean',
    'City_freq': 'mean',
}
# Only forward the date features that actually exist on df.
agg_spec.update({c: 'first' for c in date_feature_cols if c in df.columns})

# One row per (Date, Product_Category, Country), ordered so that each
# (category, country) series is contiguous and chronological -- the lag and
# rolling sections below rely on this ordering.
daily = (
    df.groupby(['Date', 'Product_Category', 'Country'])
      .agg(agg_spec)
      .reset_index()
      .sort_values(['Product_Category', 'Country', 'Date'])
)

# ===========================================================
# 4. ADD CATEGORY & COUNTRY ROLLING FEATURES
# ===========================================================
print("Category and country rolling windows...")


def _prior_rolling_mean(frame, key, window=30):
    """Return, per row of *frame*, the rolling mean of past demand at *key* level.

    *frame* holds one row per (Date, Product_Category, Country), ordered by
    category, country, date. Shifting and rolling row-by-row inside a
    category-only (or country-only) group would therefore mix the different
    sub-series and would not be chronological: the "previous" row can belong
    to a later date. Instead, demand is first summed per (key, Date), the
    shift/rolling is applied over that date-ordered series, and the result
    is mapped back onto the rows.

    Args:
        frame: DataFrame with columns *key*, 'Date' and 'Total_Purchases'.
        key: grouping column name ('Product_Category' or 'Country').
        window: number of preceding dates averaged.

    Returns:
        NumPy array aligned positionally with the rows of *frame*; NaN where
        fewer than *window* earlier dates exist for the key.
    """
    # Total demand per key per date; the index is sorted by (key, Date).
    per_day = frame.groupby([key, 'Date'])['Total_Purchases'].sum().sort_index()
    # shift(1) excludes the current date so the feature only sees the past.
    rolled = per_day.groupby(level=0).transform(
        lambda s: s.shift(1).rolling(window).mean()
    )
    # Look up each row's (key, Date) pair; several rows may share one pair.
    row_keys = pd.MultiIndex.from_frame(frame[[key, 'Date']])
    return rolled.reindex(row_keys).to_numpy()


daily['cat_roll_30'] = _prior_rolling_mean(daily, 'Product_Category')
daily['country_roll_30'] = _prior_rolling_mean(daily, 'Country')

# ===========================================================
# 5. LAGS (1,2,3,7,14,21)
# ===========================================================
print("Creating lag features...")

# Demand series per (category, country); also reused by the rolling-window
# section that follows.
grp = daily.groupby(['Product_Category','Country'])['Total_Purchases']

# Short lags (1-3) plus weekly multiples (7, 14, 21). Each lag is a shift by
# that many rows within the group, i.e. by observations rather than by
# calendar days.
for lag in (1, 2, 3, 7, 14, 21):
    daily[f'lag_{lag}'] = grp.shift(lag)

# ===========================================================
# 6. ROLLING WINDOWS (7,14,30)
# ===========================================================
print("Creating rolling-window features...")

# grp.shift(1) returns an ordinary (ungrouped) Series, so chaining .rolling()
# onto it lets a window run across the boundary between two
# (category, country) series. Doing the shift AND the rolling inside
# transform() keeps every window within a single group.
# shift(1) excludes the current row so each statistic only uses past demand.
for window in (7, 14, 30):
    daily[f'roll_mean_{window}'] = grp.transform(
        lambda s, w=window: s.shift(1).rolling(w).mean()
    )
    daily[f'roll_std_{window}'] = grp.transform(
        lambda s, w=window: s.shift(1).rolling(w).std()
    )

# ===========================================================
# 7. TRAIN / TEST SPLIT (TIME-BASED)
# ===========================================================
print("Applying time-based split...")

# The final 60 days (relative to the latest Date in `daily`) form the test
# set; everything on or before the cut-off is training data.
split_date = daily['Date'].max() - pd.Timedelta(days=60)

train_daily = daily.loc[daily['Date'].le(split_date)].copy()
test_daily = daily.loc[daily['Date'].gt(split_date)].copy()

# ===========================================================
# 8. TOP-N PRODUCT GROUPING (TRAIN ONLY)
# ===========================================================
print("Applying TOP-N product grouping...")

TOP_N = 50

# Rank products using training-period transactions only (on or before
# split_date, the cut-off computed in the split section), so that product
# popularity observed in the test window cannot influence the grouping.
train_period = df['Date'] <= split_date
top_products = (
    df.loc[train_period, 'products']
      .value_counts()
      .head(TOP_N)
      .index
      .tolist()
)

# Keep the top-N product names and collapse everything else (including
# missing values) into 'OTHER'. Vectorised isin/where replaces a per-row
# Python lambda doing a linear search through the list.
df['products_grouped'] = df['products'].where(
    df['products'].isin(top_products), 'OTHER'
)

# Attach the product group to the daily rows.
# NOTE(review): `df` is transaction-level, so this left merge emits one output
# row per matching transaction and repeats each daily row (and its
# Total_Purchases) accordingly -- confirm this row multiplication is intended.
merge_keys = ['Date', 'Product_Category', 'Country']
product_lookup = df[merge_keys + ['products_grouped']]

train_daily = train_daily.merge(product_lookup, on=merge_keys, how='left')
test_daily = test_daily.merge(product_lookup, on=merge_keys, how='left')

# ===========================================================
# 9. FREQUENCY ENCODING (TRAIN ONLY)
# ===========================================================
print("Applying frequency encoding...")

# Relative frequency of each product group, measured on the training rows only.
freq = train_daily['products_grouped'].value_counts(normalize=True)

train_daily = train_daily.assign(
    product_freq=lambda d: d['products_grouped'].map(freq)
)
# Groups that never occur in training get frequency 0 in the test set.
test_daily = test_daily.assign(
    product_freq=lambda d: d['products_grouped'].map(freq).fillna(0)
)

# ===========================================================
# 10. TARGET ENCODING (TRAIN ONLY)
# ===========================================================
print("Applying target encoding...")

# Mean demand per product group, learned from the training rows only.
# NOTE(review): each training row's own target contributes to its group mean;
# consider out-of-fold or smoothed encoding if this leaks too much signal.
target_enc = (
    train_daily['Total_Purchases']
    .groupby(train_daily['products_grouped'])
    .mean()
)
# Fallback for product groups that are absent from the training rows.
global_mean = train_daily['Total_Purchases'].mean()

train_daily = train_daily.assign(
    product_target_enc=lambda d: d['products_grouped'].map(target_enc)
)
test_daily = test_daily.assign(
    product_target_enc=lambda d: d['products_grouped'].map(target_enc).fillna(global_mean)
)

# ===========================================================
# 11. LABEL ENCODING (Category + Country + Product Group)
# ===========================================================
print("Label encoding columns...")

LE_cols = ['Product_Category','Country','products_grouped']

# One fitted encoder per column. Re-fitting a single shared encoder would
# leave only the last column's mapping available afterwards, which makes
# inverse-transforming the other columns impossible.
label_encoders = {}

for col in LE_cols:
    le = LabelEncoder()
    train_daily[col] = le.fit_transform(train_daily[col].astype(str))

    # LabelEncoder.transform raises ValueError on labels it was not fitted on.
    # Map the test column through the learned classes instead and give labels
    # unseen in training the sentinel code -1 rather than aborting the run.
    code_of = {label: code for code, label in enumerate(le.classes_)}
    test_daily[col] = (
        test_daily[col].astype(str).map(code_of).fillna(-1).astype(int)
    )

    label_encoders[col] = le

# ===========================================================
# 12. DROP NA after lags
# ===========================================================
print("Dropping NA rows...")

# The lag and rolling features are NaN at the start of each series; remove
# every row with any missing value and renumber the index from zero.
train_daily, test_daily = (
    part.dropna().reset_index(drop=True)
    for part in (train_daily, test_daily)
)

# ===========================================================
# 13. SAVE FINAL FILES
# ===========================================================
print("Saving final engineered datasets...")

# Write both splits as CSV without the positional index column.
for frame, path in (
    (train_daily, "../data/final_train_v2.csv"),
    (test_daily, "../data/final_test_v2.csv"),
):
    frame.to_csv(path, index=False)

print("\nFeature Engineering v2 Complete ✓")


Loading cleaned dataset...
Creating date features...
Adding brand/city frequency encoding...
Aggregating to daily demand...
Category and country rolling windows...
Creating lag features...
Creating rolling-window features...
Applying time-based split...
Applying TOP-N product grouping...
Applying frequency encoding...
Applying target encoding...
Label encoding columns...
Dropping NA rows...
Saving final engineered datasets...

Feature Engineering v2 Complete ✓
