In [5]:
# Importing required libraries
import pandas as pd, numpy as np
from pathlib import Path
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from datetime import timedelta

# Load the sales fact table from the star-schema export directory.
# The path is relative to the notebook's working directory.
STAR = Path("../data/star_schema")
fact = pd.read_csv(
    STAR.joinpath('fact_sales.csv'),
    parse_dates=['order_date'],  # parse order_date as datetimes so .dt accessors work downstream
)


In [4]:
# Segment dimensions for modelling (Category × Region); any dimension that is
# absent from the fact table is silently dropped from the key list.
keys = [dim for dim in ('category', 'region') if dim in fact.columns]

# One row per segment and order date, with quantity and sales summed.
agg = (
    fact
    .groupby([*keys, 'order_date'])
    .agg(
        quantity=pd.NamedAgg(column='quantity', aggfunc='sum'),
        sales=pd.NamedAgg(column='sales', aggfunc='sum'),
    )
    .reset_index()
)

def build_features(ts_df, target='quantity', date_col='order_date',
                   lags=(1, 7, 14, 21, 28), windows=(7, 14, 28)):
    """Turn a (date, target) series into a daily supervised-learning frame.

    The series is re-indexed onto a gap-free daily calendar spanning its first
    to last date; days with no observation get a target of 0.0. Calendar
    features and lag / rolling features of the target are then added.

    Parameters
    ----------
    ts_df : pandas.DataFrame
        Must contain ``date_col`` (datetime-like) and ``target`` columns.
        The caller is expected to pass at most one row per date; duplicate
        dates are not aggregated here.
    target : str
        Name of the column to forecast.
    date_col : str
        Name of the date column.
    lags : iterable of int
        Day offsets for the ``lag_<n>`` columns.
    windows : iterable of int
        Window lengths for the ``rollmean_<w>`` / ``rollstd_<w>`` columns.

    Returns
    -------
    pandas.DataFrame
        Columns, in order: ``date_col``, ``target``, ``year``, ``month``,
        ``dow``, ``weekofyear``, ``is_weekend``, every ``lag_<n>``, then
        ``rollmean_<w>`` / ``rollstd_<w>`` per window. Rows with any NaN
        (the warm-up period of the longest lag/window) are dropped and the
        index is reset.

    Raises
    ------
    ValueError
        If ``ts_df`` has no rows, since no date range can be built.
    """
    if ts_df.empty:
        raise ValueError("build_features requires at least one observation")

    ts = ts_df.sort_values(date_col)

    # Left-join the observations onto a complete daily calendar so that
    # positional shifts below correspond to real calendar days.
    idx = pd.date_range(ts[date_col].min(), ts[date_col].max(), freq='D')
    base = pd.DataFrame({date_col: idx}).merge(ts[[date_col, target]], on=date_col, how='left')
    base[target] = base[target].fillna(0.0)

    # Calendar features
    base['year'] = base[date_col].dt.year
    base['month'] = base[date_col].dt.month
    base['dow'] = base[date_col].dt.weekday
    base['weekofyear'] = base[date_col].dt.isocalendar().week.astype(int)
    base['is_weekend'] = base['dow'].isin([5, 6]).astype(int)

    # Lagged target values
    for lag in lags:
        base[f'lag_{lag}'] = base[target].shift(lag)

    # Rolling statistics over the previous days only: shift(1) excludes the
    # current day's target from its own features.
    previous = base[target].shift(1)
    for w in windows:
        base[f'rollmean_{w}'] = previous.rolling(w).mean()
        base[f'rollstd_{w}'] = previous.rolling(w).std()

    return base.dropna().reset_index(drop=True)

In [6]:
# Train, evaluate, and export forecasts (for Quantity & Sales)
#
# For every (target, segment) pair: build daily features, fit a random forest
# on the first 80% of days, score it on the remaining 20%, then roll the model
# forward `forecast_h` days by feeding each prediction back in as history.
targets = ['quantity','sales']
metrics_list, fc_list = [], []
segs = agg[keys].drop_duplicates().to_dict(orient='records') if keys else [dict()]
forecast_h = 28

MIN_OBS = 160          # segments with fewer distinct order dates are skipped
TRAIN_FRAC = 0.8       # chronological train/test split
# These must mirror the lag and window lists used inside build_features,
# because the forecast rows below re-create the same feature columns by hand.
LAGS = [1, 7, 14, 21, 28]
WINDOWS = [7, 14, 28]

for target in targets:
    for seg in segs:
        # Narrow the aggregate down to this segment (no-op when seg is empty).
        sub = agg
        for k, v in seg.items():
            sub = sub[sub[k] == v]
        ts = sub[['order_date', target]].groupby('order_date').sum().reset_index()
        if len(ts) < MIN_OBS:
            continue

        feats = build_features(ts, target=target)
        y = target
        X = [c for c in feats.columns if c not in ['order_date', y]]

        # Chronological split: feats is in date order, so no shuffling.
        cut = int(len(feats) * TRAIN_FRAC)
        tr, te = feats.iloc[:cut], feats.iloc[cut:]
        if len(tr) == 0 or len(te) == 0:
            continue

        model = RandomForestRegressor(n_estimators=400, random_state=42, n_jobs=-1)
        model.fit(tr[X], tr[y])

        pred = model.predict(te[X])
        mae  = mean_absolute_error(te[y], pred)
        # Zero actuals are divided by 1 instead of 0, so on those days the
        # "percentage" error is the absolute error itself.
        actual = te[y].values
        mape = float(np.mean(np.abs((actual - pred) / np.where(actual == 0, 1, actual))) * 100)

        metrics_list.append({**seg, 'target': target, 'train_n': len(tr), 'test_n': len(te), 'MAE': mae, 'MAPE_%': mape})

        # Recursive forecast: each predicted value is appended to the history
        # so that later steps can use it for their lag / rolling features.
        last_date = feats['order_date'].max()
        history = feats[y].astype(float).tolist()
        for step in range(1, forecast_h + 1):
            nd = last_date + pd.Timedelta(days=step)
            row = {'year': nd.year, 'month': nd.month, 'dow': nd.weekday(),
                   'weekofyear': int(pd.Timestamp(nd).isocalendar().week), 'is_weekend': 1 if nd.weekday() in [5,6] else 0}
            for lag in LAGS:
                # history[-lag] is the value `lag` days before nd; fall back to
                # the latest value only when the history is shorter than that.
                row[f'lag_{lag}'] = history[-lag] if len(history) >= lag else history[-1]
            for w in WINDOWS:
                # Slicing past the start simply yields the whole history.
                recent = pd.Series(history[-w:], dtype=float)
                row[f'rollmean_{w}'] = float(recent.mean())
                row[f'rollstd_{w}']  = float(recent.std())
            # Select columns via X so the order matches what the model was fit on.
            row_df = pd.DataFrame([row])
            yhat = float(model.predict(row_df[X])[0])
            history.append(yhat)
            fc_list.append({**seg, 'date': nd, 'target': target, 'yhat': yhat, 'model': 'RandomForestRegressor', 'horizon_day': step})

metrics = pd.DataFrame(metrics_list)
if not metrics.empty:
    # Sorting an empty frame would fail because the sort columns do not exist.
    metrics = metrics.sort_values(['target','MAPE_%'])
forecast = pd.DataFrame(fc_list)

metrics.to_csv("../data/model_metrics_ml.csv", index=False)
forecast.to_csv("../data/forecast_ml.csv", index=False)

metrics.head(10), forecast.head()

(             region    target  train_n  test_n        MAE      MAPE_%
 0            Canada  quantity     1144     287   1.628240  115.168639
 14  Southern Africa  quantity     1133     284   2.098565  134.230701
 2    Central Africa  quantity     1142     286   2.198409  136.021555
 18   Western Africa  quantity     1140     285   3.687132  167.143290
 5    Eastern Africa  quantity     1144     287   2.822840  167.815929
 19     Western Asia  quantity     1144     287   4.797805  183.174211
 9      North Africa  quantity     1146     287   3.755392  186.934079
 3   Central America  quantity     1144     287  12.147195  209.980510
 11          Oceania  quantity     1145     287   9.003397  233.659506
 7    Eastern Europe  quantity     1146     287   4.609800  233.960019,
    region       date    target    yhat                  model  horizon_day
 0  Canada 2015-12-31  quantity  1.5425  RandomForestRegressor            1
 1  Canada 2016-01-01  quantity  2.0100  RandomForestRegressor    