# Demand Forecasting (XGBoost)

End-to-end workflow: load data → EDA → feature engineering → chronological train/test split → model training → evaluation (metrics and actual-vs-predicted plot) → recursive 30-day forecast exported to CSV.

In [None]:
import pandas as pd, numpy as np, matplotlib.pyplot as plt
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
from pathlib import Path
import json

# Load the demand history, parsing 'date' as datetimes, then order the rows
# chronologically so that positional shifts/rolling windows follow time.
data_path = '../data/example_demand.csv'
df = pd.read_csv(data_path, parse_dates=['date'])
df = df.sort_values('date')
df.head()

In [None]:
# Quick visual check of the raw demand series, indexed by date.
df.set_index('date')['demand'].plot(
    figsize=(10, 4),
    title='Demand over time',
)
plt.show()

In [None]:
def add_calendar_features(df):
    """Return a copy of ``df`` with calendar columns derived from ``df['date']``.

    Adds ``year``, ``month``, ``day``, ``dayofweek`` (Monday == 0) and the ISO
    ``weekofyear``. The input frame is not modified.
    """
    out = df.copy()
    when = out['date'].dt
    derived = {
        'year': when.year,
        'month': when.month,
        'day': when.day,
        'dayofweek': when.dayofweek,
        # isocalendar() yields a nullable UInt32 column; cast to a plain int.
        'weekofyear': when.isocalendar().week.astype(int),
    }
    for name, values in derived.items():
        out[name] = values
    return out

def add_lag_features(df, target='demand', lags=(1,7,14,28)):
    """Return a copy of ``df`` with one ``lag_<k>`` column per entry in ``lags``.

    Each column holds ``target`` shifted down by ``k`` rows, so the first ``k``
    rows of ``lag_<k>`` are NaN. Shifting is positional: rows are expected to
    already be in chronological order.
    """
    out = df.copy()
    for n_back in lags:
        column = f'lag_{n_back}'
        out[column] = out[target].shift(n_back)
    return out

def add_rolling_features(df, target='demand', windows=(7,14,28)):
    """Return a copy of ``df`` with trailing mean/std columns for each window.

    For every width ``w`` in ``windows`` this adds ``rollmean_<w>`` and
    ``rollstd_<w>``. The target is shifted by one row before the window is
    applied, so a row's statistics only cover strictly earlier rows.
    """
    out = df.copy()
    for width in windows:
        # Shift first so the current row's own value is excluded from its window.
        history = out[target].shift(1).rolling(width)
        out[f'rollmean_{width}'] = history.mean()
        out[f'rollstd_{width}'] = history.std()
    return out

def make_features(df):
    """Run the full feature pipeline: calendar, then lag, then rolling features.

    Each step returns a new frame, so the caller's ``df`` is left untouched.
    Lag and rolling steps use their default target and window settings.
    """
    out = df
    for step in (add_calendar_features, add_lag_features, add_rolling_features):
        out = step(out)
    return out

# Build the modelling table. Rows with any NaN (e.g. the warm-up rows at the
# start where lags/rolling windows are incomplete) are dropped.
feat = make_features(df).dropna().reset_index(drop=True)

# Everything except the identifiers and the target is used as a model input.
feature_cols = [col for col in feat.columns if col not in ['date', 'sku', 'demand']]
X = feat[feature_cols]
y = feat['demand']
len(X), len(y)

In [None]:
# Chronological hold-out: the last `test_size` rows (at least 60, or 20% of the
# data when that is larger) are the test set. No shuffling, so the model is
# always evaluated on rows that come after its training rows.
test_size = max(60, len(X)//5)
split = len(X)-test_size
X_train, y_train = X.iloc[:split], y.iloc[:split]
X_test, y_test = X.iloc[split:], y.iloc[split:]

model = XGBRegressor(n_estimators=500, learning_rate=0.05, max_depth=6, subsample=0.9, colsample_bytree=0.9, random_state=42)
model.fit(X_train, y_train)
pred = model.predict(X_test)

mae = mean_absolute_error(y_test, pred)
# RMSE as an explicit square root of the MSE: the `squared=False` keyword of
# mean_squared_error was deprecated and then removed in newer scikit-learn
# releases, whereas this form works on every version.
rmse = float(np.sqrt(mean_squared_error(y_test, pred)))
# MAPE in percent; the denominator is floored at 1e-6 to avoid division by zero
# on zero-demand rows.
mape = (np.abs((y_test - pred) / np.clip(y_test, 1e-6, None))).mean()*100
mae, rmse, mape

In [None]:
import matplotlib.pyplot as plt
# Overlay held-out actuals and model predictions on a shared positional x-axis.
fig, ax = plt.subplots(figsize=(10, 4))
ax.plot(y_test.values, label='Actual')
ax.plot(pred, label='Predicted')
ax.set_title('Test set: actual vs predicted')
ax.legend()
plt.show()

In [None]:
import json, pandas as pd
from pathlib import Path
import numpy as np

# Simple recursive forecast for next 30 days.
# Each step appends one future row, rebuilds the features, predicts that row's
# demand and writes the prediction back so later steps can use it as a lag.
horizon = 30
last = df.copy()
future_rows = []
for _ in range(horizon):
    next_date = last['date'].iloc[-1] + pd.Timedelta(days=1)
    # Future exogenous values are unknown: carry the last sku/price forward and
    # assume no promotion.
    new_row = {'date': next_date, 'sku': last['sku'].iloc[-1], 'price': last['price'].iloc[-1], 'promo': 0}
    last = pd.concat([last, pd.DataFrame([new_row])], ignore_index=True)

    # Take the appended row's features directly. Its 'demand' is still NaN, so
    # a blanket dropna() would discard exactly this row and the prediction
    # would silently be made from the previous day's features instead.
    tmp_feat = make_features(last)[feature_cols].iloc[[-1]]

    # Demand cannot be negative; clamp before feeding it back as a lag.
    yhat = max(0.0, float(model.predict(tmp_feat)[0]))
    last.loc[last.index[-1], 'demand'] = yhat
    future_rows.append({'date': next_date.date().isoformat(), 'forecast': round(yhat, 2)})

forecast_df = pd.DataFrame(future_rows)
# Create the output directory if needed so the export does not fail on a fresh checkout.
out_path = Path('../reports/forecast_from_notebook.csv')
out_path.parent.mkdir(parents=True, exist_ok=True)
forecast_df.to_csv(out_path, index=False)
forecast_df.head()
