
# Retail Forecasting – Analysis & Plots

This notebook generates portfolio-ready visuals from your pipeline outputs.

**Expected files (produced by your pipeline):**
- `data/processed/features.parquet`
- `data/reports/predictions.csv`
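- (Optional) `data/reports/elasticity_by_sku.csv`
- (Optional) `data/processed/sales_clean.parquet` – fallback source of `price` when `features.parquet` has no `price` column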

**Outputs (images saved to `data/reports/`):**
- `sales_trend.png`
- `actual_vs_pred.png`
- `total_series.png`
- `error_hist.png`
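- `elasticity_scatter.png` (only written when `elasticity_by_sku.csv` is absent and the elasticity is estimated from `features.parquet`)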


In [None]:

import os
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Directory layout shared by every cell below: inputs live under data/processed,
# generated figures and reports under data/reports (created on first run).
DATA = Path("data")
PROC = DATA.joinpath("processed")
REPORTS = DATA.joinpath("reports")
REPORTS.mkdir(parents=True, exist_ok=True)

# Show the two input files the plotting cells will try to read.
print(f"Looking for: {PROC / 'features.parquet'}")
print(f"Looking for: {REPORTS / 'predictions.csv'}")


In [None]:

# --- Sales Trend for a representative product-store pair ---
# Loads the feature table, selects the (product_id, store_id) pair with the
# largest summed `units`, and saves a line plot of that pair's units by date
# to data/reports/sales_trend.png.
features_fp = PROC / "features.parquet"
if features_fp.exists():
    df = pd.read_parquet(features_fp)
    if {"product_id","store_id","date","units"}.issubset(df.columns):
        totals = df.groupby(["product_id","store_id"])["units"].sum()
        if totals.empty:
            # No groups means there is no "top" pair; report instead of
            # failing on `.index[0]`.
            print("features.parquet has no product-store rows to plot.")
        else:
            # pick the top (product_id, store_id) by total units
            top_pair = totals.sort_values(ascending=False).index[0]
            sample = df[(df["product_id"]==top_pair[0]) & (df["store_id"]==top_pair[1])].copy()
            # Convert before sorting so string dates are ordered
            # chronologically rather than lexically.
            sample["date"] = pd.to_datetime(sample["date"])
            sample = sample.sort_values("date")
            fig, ax = plt.subplots(figsize=(10,4))
            ax.plot(sample["date"], sample["units"], label="Units Sold")
            ax.set_title(f"Sales Trend – product {top_pair[0]}, store {top_pair[1]}")
            ax.set_xlabel("Date")
            ax.set_ylabel("Units")
            ax.legend()
            out = REPORTS / "sales_trend.png"
            fig.savefig(out, bbox_inches="tight", dpi=150)
            plt.show()
            print("Saved:", out)
    else:
        print("features.parquet missing required columns.")
else:
    print("features.parquet not found. Run your pipeline up to feature engineering.")


In [None]:

# --- Actual vs Predicted scatter ---
# One point per prediction row, with a dashed y = x reference line from 0 up to
# the largest observed or predicted value. Saved as actual_vs_pred.png.
pred_fp = REPORTS / "predictions.csv"
if not pred_fp.exists():
    print("predictions.csv not found. Run training + scoring first.")
else:
    pred = pd.read_csv(pred_fp)
    if not {"units","pred_units"}.issubset(pred.columns):
        print("predictions.csv missing required columns.")
    else:
        plt.figure(figsize=(6,6))
        plt.scatter(pred["units"], pred["pred_units"], alpha=0.3)
        # Endpoints of the perfect-prediction diagonal.
        lo = 0
        hi = max(pred["units"].max(), pred["pred_units"].max())
        plt.plot([lo, hi], [lo, hi], linestyle="--")
        plt.title("Actual vs Predicted Demand")
        plt.xlabel("Actual Units")
        plt.ylabel("Predicted Units")
        out = REPORTS / "actual_vs_pred.png"
        plt.savefig(out, bbox_inches="tight", dpi=150)
        plt.show()
        print("Saved:", out)


In [None]:

# --- Total Actual vs Predicted over time ---
# Sums actual and predicted units per date across all rows of predictions.csv
# and plots both totals as lines. Saved as total_series.png.
pred_fp = REPORTS / "predictions.csv"
if pred_fp.exists():
    # Read without `parse_dates`: read_csv raises when the named column is
    # absent, which would bypass the column check and its message below.
    pred = pd.read_csv(pred_fp)
    if {"date","units","pred_units"}.issubset(pred.columns):
        pred["date"] = pd.to_datetime(pred["date"], errors="coerce")
        bad_dates = int(pred["date"].isna().sum())
        if bad_dates:
            # groupby drops missing keys, so these rows are excluded from the totals.
            print(f"Ignoring {bad_dates} rows with missing or unparseable dates.")
        ts = pred.groupby("date")[["units","pred_units"]].sum().sort_index()
        if ts.empty:
            print("predictions.csv has no dated rows to plot.")
        else:
            fig, ax = plt.subplots(figsize=(10,4))
            ts.plot(ax=ax)
            ax.set_title("Total Demand: Actual vs Predicted (daily)")
            ax.set_xlabel("Date")
            ax.set_ylabel("Units")
            out = REPORTS / "total_series.png"
            fig.tight_layout()
            fig.savefig(out, bbox_inches="tight", dpi=150)
            plt.show()
            print("Saved:", out)
    else:
        print("predictions.csv missing date/units columns.")
else:
    print("predictions.csv not found.")


In [None]:

# --- Error distribution histogram ---
# Histogram of signed prediction error (pred_units - units) over all rows of
# predictions.csv. Saved as error_hist.png.
pred_fp = REPORTS / "predictions.csv"
if pred_fp.exists():
    pred = pd.read_csv(pred_fp)
    if {"units","pred_units"}.issubset(pred.columns):
        pred["error"] = pred["pred_units"] - pred["units"]
        # Draw on a dedicated figure: Series.hist without `ax` reuses whatever
        # figure is currently open, which can overlay an earlier cell's plot
        # when figures are not closed automatically.
        fig, ax = plt.subplots(figsize=(8,4))
        pred["error"].hist(bins=40, ax=ax)
        ax.set_title("Prediction Error Distribution (pred - actual)")
        ax.set_xlabel("Error")
        ax.set_ylabel("Count")
        out = REPORTS / "error_hist.png"
        fig.tight_layout()
        fig.savefig(out, bbox_inches="tight", dpi=150)
        plt.show()
        print("Saved:", out)
    else:
        print("predictions.csv missing required columns.")
else:
    print("predictions.csv not found.")


In [None]:

# --- Price Elasticity (log-log scatter) for a representative SKU-store ---
# Inputs in order of preference: a precomputed elasticity table; otherwise the
# feature table, with the cleaned sales table as a fallback source of prices.
elas_fp = REPORTS.joinpath("elasticity_by_sku.csv")
features_fp = PROC.joinpath("features.parquet")
sales_fp = PROC.joinpath("sales_clean.parquet")

def pick_pair(df):
    """Return the (product_id, store_id) key with the largest total ``units``.

    ``df`` must contain ``product_id``, ``store_id`` and ``units`` columns.
    Units are summed per (product_id, store_id) group and the index entry of
    the largest total is returned. Raises ``IndexError`` when there are no
    groups.
    """
    totals = df.groupby(["product_id", "store_id"])["units"].sum()
    ranked = totals.sort_values(ascending=False)
    return ranked.index[0]

# Estimate a single price elasticity as the slope of log(units) on log(price)
# for the top-selling (product_id, store_id) pair, unless a precomputed
# elasticity table already exists (in which case it is only loaded and counted).
if elas_fp.exists():
    elas = pd.read_csv(elas_fp)
    print("Loaded elasticity_by_sku.csv with", len(elas), "rows")
    # If you want, you can plot distribution of elasticity values here.
elif features_fp.exists():
    feat = pd.read_parquet(features_fp)
    have_price = "price" in feat.columns
    if not have_price and sales_fp.exists():
        sales = pd.read_parquet(sales_fp)
        keys = ["product_id","store_id","date"]
        # Bring in only what the feature table lacks, so the merge never
        # produces suffixed duplicates of columns that are already present.
        wanted = [c for c in ("price","units") if c not in feat.columns]
        if set(keys).issubset(feat.columns) and set(keys + wanted).issubset(sales.columns):
            feat = feat.merge(sales[keys + wanted], on=keys, how="left")

    if not {"product_id","store_id","date","units","price"}.issubset(feat.columns):
        print("Missing required columns to compute elasticity.")
    elif feat.dropna(subset=["product_id","store_id"]).empty:
        # pick_pair needs at least one (product_id, store_id) group.
        print("features.parquet has no product-store rows to estimate elasticity from.")
    else:
        pair = pick_pair(feat)
        g = feat[(feat["product_id"]==pair[0]) & (feat["store_id"]==pair[1])].dropna(subset=["units","price"])
        # Logs need strictly positive values. Copy after the last filter so the
        # column assignments below write to an independent frame.
        g = g[(g["units"]>0) & (g["price"]>0)].copy()
        if len(g) < 20:
            print("Not enough observations for selected pair to estimate elasticity.")
        else:
            g["lu"] = np.log(g["units"])
            g["lp"] = np.log(g["price"])
            if g["lp"].nunique() < 2:
                # With a single price level the regression is rank-deficient
                # and lstsq would still return a (meaningless) slope.
                print("Price never varies for selected pair; elasticity cannot be estimated.")
            else:
                # OLS fit of log(units) = b[0] + b[1] * log(price)
                X = np.c_[np.ones(len(g)), g["lp"].to_numpy()]
                y = g["lu"].to_numpy()
                b = np.linalg.lstsq(X, y, rcond=None)[0]  # intercept, slope
                slope = b[1]
                fig, ax = plt.subplots(figsize=(6,5))
                ax.scatter(g["lp"], g["lu"], alpha=0.5)
                xline = np.linspace(g["lp"].min(), g["lp"].max(), 100)
                yline = b[0] + slope * xline
                ax.plot(xline, yline, linestyle="--")
                ax.set_title(f"Log-Log Price vs Units (elasticity ≈ {slope:.2f})\nproduct {pair[0]}, store {pair[1]}")
                ax.set_xlabel("log(price)")
                ax.set_ylabel("log(units)")
                out = REPORTS / "elasticity_scatter.png"
                fig.savefig(out, bbox_inches="tight", dpi=150)
                plt.show()
                print("Saved:", out)
else:
    print("No features.parquet or elasticity file found.")
