In [1]:
import os
import json
import yaml
import joblib
import argparse
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error, mean_squared_error, accuracy_score, confusion_matrix

# ------------------------------------------------------------
# Utility: ensure output directory exists
# ------------------------------------------------------------
def ensure_outdir(path):
    """Make sure ``path`` exists as a directory and return it as a ``Path``.

    Missing parent directories are created as well; a directory that is
    already present is left untouched.
    """
    target = Path(path)
    if not target.is_dir():
        # exist_ok keeps this safe if the directory appears concurrently.
        target.mkdir(parents=True, exist_ok=True)
    return target

# ------------------------------------------------------------
# Load data
# ------------------------------------------------------------
def load_data(csv_path):
    """Read the inventory CSV at ``csv_path`` into a DataFrame.

    The ``date`` column is parsed into datetimes while reading. The shape of
    the loaded frame is printed for a quick sanity check.

    Parameters
    ----------
    csv_path : str or path-like
        Location of the CSV file; it must contain a ``date`` column.

    Returns
    -------
    pandas.DataFrame
        The loaded data.
    """
    # `infer_datetime_format` is deprecated since pandas 2.0 (a single format
    # is inferred by default now), so it is intentionally not passed anymore.
    df = pd.read_csv(csv_path, parse_dates=["date"])
    print(f"[INFO] Loaded {csv_path} with shape {df.shape}")
    return df

# ------------------------------------------------------------
# Feature Engineering
# ------------------------------------------------------------
def make_features(df):
    """Return a sorted copy of ``df`` with rolling-demand and stock features.

    Rows are ordered by (sku, location, date). For every (sku, location)
    series the following columns are added:

    * ``demand_7d`` / ``demand_14d`` / ``demand_28d`` - rolling mean of
      ``sales`` over the last 7/14/28 rows (the current row included).
    * ``cv_14d`` - 14-row rolling standard deviation divided by ``demand_14d``.
    * ``days_of_cover`` - ``on_hand`` divided by ``demand_14d``.
    * ``stockout_flag`` - 1 where ``on_hand`` is zero or negative, else 0.

    Any remaining NaN is replaced with 0 before returning.
    """
    group_keys = ["sku", "location"]
    eps = 1e-6  # keeps the divisions finite when recent demand is zero

    out = df.sort_values(group_keys + ["date"]).copy()

    def _rolling_sales(window, stat):
        # Apply the named rolling statistic within each (sku, location) series.
        return out.groupby(group_keys)["sales"].transform(
            lambda s: getattr(s.rolling(window, min_periods=1), stat)()
        )

    # Rolling demand
    for window in (7, 14, 28):
        out[f"demand_{window}d"] = _rolling_sales(window, "mean")

    recent_demand = out["demand_14d"] + eps

    # Volatility (std/mean); std of a single observation is NaN -> filled below
    out["cv_14d"] = _rolling_sales(14, "std") / recent_demand

    # Days of cover
    out["days_of_cover"] = out["on_hand"] / recent_demand

    # Stockout label
    out["stockout_flag"] = (out["on_hand"] <= 0).astype(int)

    # Fill NaNs
    return out.fillna(0)

# ------------------------------------------------------------
# Demand Forecasting Model (regression)
# ------------------------------------------------------------
def train_demand_model(df):
    """Fit a random-forest regressor that predicts ``sales``.

    The engineered feature columns are standardised and fed to a
    ``RandomForestRegressor``; the model is scored on a random 20% hold-out.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the feature columns listed in ``feats`` plus ``sales``.

    Returns
    -------
    tuple
        ``(pipeline, metrics)`` where ``metrics`` is
        ``{"mae": float, "rmse": float}`` measured on the hold-out split.
    """
    # NOTE(review): the rolling demand features appear to be computed from
    # windows that include the current row's sales, and the split below is
    # random rather than chronological - the reported errors are probably
    # optimistic. Confirm before relying on these metrics.
    feats = ["demand_7d","demand_14d","demand_28d","cv_14d","days_of_cover","on_hand","on_order"]
    X = df[feats]
    y = df["sales"]

    Xtr, Xte, ytr, yte = train_test_split(X, y, test_size=0.2, random_state=42)

    pipe = Pipeline([
        ("scaler", StandardScaler()),
        ("model", RandomForestRegressor(n_estimators=100, random_state=42))
    ])
    pipe.fit(Xtr, ytr)

    preds = pipe.predict(Xte)
    mae = mean_absolute_error(yte, preds)
    # The `squared=False` keyword was deprecated in scikit-learn 1.4 and
    # removed in 1.6; taking the square root of the MSE works on every version.
    rmse = np.sqrt(mean_squared_error(yte, preds))
    print(f"[DEMAND] MAE={mae:.2f} RMSE={rmse:.2f}")

    return pipe, {"mae": float(mae), "rmse": float(rmse)}

# ------------------------------------------------------------
# Stockout Risk Model (classification)
# ------------------------------------------------------------
def train_stockout_model(df):
    """Fit a random-forest classifier that predicts ``stockout_flag``.

    The engineered feature columns are standardised and fed to a
    ``RandomForestClassifier``; the model is scored on a 20% hold-out that is
    stratified by the label whenever the class counts allow it.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the feature columns listed in ``feats`` plus the
        binary ``stockout_flag`` column.

    Returns
    -------
    tuple
        ``(pipeline, metrics)`` where ``metrics`` holds ``"accuracy"`` (float)
        and ``"confusion_matrix"`` (2x2 nested list, rows/columns ordered as
        labels 0 then 1).
    """
    # NOTE(review): `on_hand` and `days_of_cover` are inputs here; if the label
    # is derived directly from `on_hand` for the same row, the classifier can
    # trivially recover it and the accuracy is not meaningful. Confirm intent.
    feats = ["demand_7d","demand_14d","demand_28d","cv_14d","days_of_cover","on_hand","on_order"]
    X = df[feats]
    y = df["stockout_flag"]

    # train_test_split raises when any class has fewer than two samples, which
    # happens when stockouts are very rare; fall back to a plain random split.
    stratify = y if y.value_counts().min() >= 2 else None
    Xtr, Xte, ytr, yte = train_test_split(X, y, test_size=0.2, random_state=42, stratify=stratify)

    pipe = Pipeline([
        ("scaler", StandardScaler()),
        ("model", RandomForestClassifier(n_estimators=100, random_state=42))
    ])
    pipe.fit(Xtr, ytr)

    preds = pipe.predict(Xte)
    acc = accuracy_score(yte, preds)
    # Pin the label order so the matrix stays 2x2 even when the hold-out (or
    # the whole data set) contains a single class.
    cm = confusion_matrix(yte, preds, labels=[0, 1])
    print(f"[STOCKOUT] Accuracy={acc:.2f}")

    return pipe, {"accuracy": float(acc), "confusion_matrix": cm.tolist()}

# ------------------------------------------------------------
# Save Visualizations
# ------------------------------------------------------------
def save_visuals(df, outdir):
    """Render the three report figures as PNG files inside ``outdir``.

    Files written:

    * ``viz_stockout_risk_heatmap.png`` - mean ``stockout_flag`` per
      location (rows) and SKU (columns).
    * ``viz_service_vs_cost.png`` - an illustrative (synthetic) service-level
      versus cost curve; it does not use ``df``.
    * ``viz_demand_forecast_<sku>.png`` - the last 60 observations of sales
      and the 14-day rolling mean for the first SKU/location series in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature frame with ``sku``, ``location``, ``date``, ``sales``,
        ``demand_14d`` and ``stockout_flag`` columns.
    outdir : pathlib.Path
        Existing directory that receives the images.
    """
    # Stockout risk heatmap
    pivot = df.pivot_table(index="location", columns="sku", values="stockout_flag", aggfunc="mean")
    plt.figure(figsize=(10,6))
    sns.heatmap(pivot, cmap="Reds", annot=False)
    plt.title("Stockout Risk Heatmap")
    plt.tight_layout()
    plt.savefig(outdir / "viz_stockout_risk_heatmap.png")
    plt.close()

    # Service level vs cost (toy plot)
    service = np.linspace(80,99,20)
    cost = 1000/(100-service)  # fake convex curve
    plt.figure()
    plt.plot(service, cost, marker="o")
    plt.xlabel("Service Level (%)")
    plt.ylabel("Relative Inventory Cost")
    plt.title("Service Level vs Cost")
    plt.savefig(outdir / "viz_service_vs_cost.png")
    plt.close()

    # Forecast example for first SKU.
    # Restrict to a single (sku, location) series: filtering on the SKU alone
    # interleaves every location's rows by date, which draws one zig-zag line
    # and makes tail(60) cover fewer than 60 days per location.
    sku0 = df["sku"].iloc[0]
    loc0 = df["location"].iloc[0]
    series_mask = (df["sku"] == sku0) & (df["location"] == loc0)
    sub = df[series_mask].sort_values("date").tail(60)
    plt.figure()
    plt.plot(sub["date"], sub["sales"], label="Sales")
    plt.plot(sub["date"], sub["demand_14d"], label="Rolling Forecast (14d)")
    plt.legend()
    plt.title(f"Demand Forecast – SKU {sku0}")
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.savefig(outdir / f"viz_demand_forecast_{sku0}.png")
    plt.close()

# ------------------------------------------------------------
# Main runner
# ------------------------------------------------------------
def main(args):
    """Run the full pipeline: load, featurise, train, and write artifacts.

    Parameters
    ----------
    args : argparse.Namespace
        Must provide ``data`` (path to the input CSV) and ``outdir``
        (directory that receives every artifact; created if missing).

    Artifacts written to ``outdir``: ``demand_model.pkl``,
    ``stockout_model.pkl``, ``processed_inventory.h5``, ``insights.json``,
    ``build_metadata.yaml`` and the PNG figures from ``save_visuals``.
    """
    # Imported locally so the recorded version is the one actually installed.
    import sklearn

    outdir = ensure_outdir(args.outdir)

    df = load_data(args.data)
    df = make_features(df)

    # Train models
    demand_model, demand_metrics = train_demand_model(df)
    stockout_model, stockout_metrics = train_stockout_model(df)

    # Save models
    joblib.dump(demand_model, outdir / "demand_model.pkl")
    joblib.dump(stockout_model, outdir / "stockout_model.pkl")

    # Save the engineered feature table (no model predictions are stored).
    # pandas needs the optional PyTables package for HDF5 output.
    df.to_hdf(outdir / "processed_inventory.h5", key="data", mode="w")

    # Save insights.json
    insights = {
        # Ten SKUs with the highest share of stockout rows.
        "top_risky_skus": df.groupby("sku")["stockout_flag"].mean().sort_values(ascending=False).head(10).to_dict(),
        "demand_metrics": demand_metrics,
        "stockout_metrics": stockout_metrics
    }
    with open(outdir / "insights.json", "w", encoding="utf-8") as f:
        json.dump(insights, f, indent=2)

    # Save metadata.yaml
    metadata = {
        "input_file": str(args.data),
        "output_dir": str(outdir),
        "features": ["demand_7d","demand_14d","demand_28d","cv_14d","days_of_cover","on_hand","on_order"],
        "versions": {
            "pandas": pd.__version__,
            "numpy": np.__version__,
            # Record the real version rather than a hard-coded guess so the
            # metadata can be used to reproduce the build.
            "scikit-learn": sklearn.__version__
        }
    }
    with open(outdir / "build_metadata.yaml", "w", encoding="utf-8") as f:
        yaml.dump(metadata, f)

    # Visualizations
    save_visuals(df, outdir)

    print(f"[DONE] Artifacts saved to {outdir}")

# ------------------------------------------------------------
if __name__ == "__main__":
    # Command-line entry point. Both flags are mandatory, so running this
    # inside a notebook kernel (whose argv carries no --data/--outdir) makes
    # argparse print the usage message and exit with status 2.
    parser = argparse.ArgumentParser()
    for flag, description in (
        ("--data", "Path to CSV"),
        ("--outdir", "Output directory"),
    ):
        parser.add_argument(flag, type=str, required=True, help=description)
    args = parser.parse_args()
    main(args)


usage: ipykernel_launcher.py [-h] --data DATA --outdir OUTDIR
ipykernel_launcher.py: error: the following arguments are required: --data, --outdir


SystemExit: 2

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)
