In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
from statsmodels.tsa.arima.model import ARIMA
import os
import warnings
warnings.filterwarnings("ignore")

In [None]:
"""
Stage 1: Baselines & Classical ML
=============================================================================
Predict next-day log return using pre-processed panel data.
Models: Persistence, Rolling Mean, Linear Regression, Gradient Boosting, ARIMA.
"""

# =============================================================================
# 1. Load Processed Data
# =============================================================================
# Parquet file holding the pre-processed panel (path is relative to the
# notebook's working directory).
DATA_PATH = "../data/processed/stock_data_processed.parquet"

# Load the panel, or stop immediately with an actionable message when the
# processed file has not been generated yet.
if os.path.exists(DATA_PATH):
    panel = pd.read_parquet(DATA_PATH)
else:
    raise FileNotFoundError(
        f"Processed data not found at {DATA_PATH}. "
        "Please run data_download_colab.ipynb first."
    )

print(f"  ✓ Loaded panel data with shape: {panel.shape}")
panel_dates = panel.index.get_level_values("date")
print(f"  ✓ Date range: {panel_dates.min()} to {panel_dates.max()}")

# =============================================================================
# 2. Configuration & Helper Functions
# =============================================================================
# Fraction of each ticker's rows (taken from the start) used for training.
TRAIN_RATIO = 0.8
# Window length, in rows, of the rolling-mean baseline.
ROLLING_WIN = 20
# Columns of the panel fed to the regression models as predictors.
FEATURE_COLS = [
    "ret_lag1",
    "ret_lag2",
    "ret_lag5",
    "roll_vol",
    "range_norm",
    "vol_zscore",
    "mkt_return",
]

def compute_metrics(y_true, y_pred):
    """Return MAE, RMSE, and directional accuracy (zeros excluded).

    Observations where either the target or the prediction is NaN/inf are
    dropped pairwise before any metric is computed, so a few missing
    predictions do not abort the evaluation.

    Parameters
    ----------
    y_true : array-like of float
        Realised values.
    y_pred : array-like of float
        Predicted values, positionally aligned with ``y_true``.

    Returns
    -------
    tuple of float
        ``(mae, rmse, dacc)``. ``dacc`` is the share of observations with a
        non-zero realised value whose predicted sign matches the realised
        sign; it is NaN when every realised value is zero. All three values
        are NaN when no finite pair of observations remains.

    Raises
    ------
    ValueError
        If ``y_true`` and ``y_pred`` do not contain the same number of values.
    """
    yt = np.asarray(y_true, dtype=float).ravel()
    yp = np.asarray(y_pred, dtype=float).ravel()
    if yt.shape != yp.shape:
        # Guard explicitly: NumPy broadcasting would otherwise silently
        # accept e.g. a length-1 prediction against a longer target.
        raise ValueError(
            "y_true and y_pred must have the same number of values, "
            f"got {yt.size} and {yp.size}"
        )

    # Keep only pairs where both sides are finite.
    finite = np.isfinite(yt) & np.isfinite(yp)
    yt, yp = yt[finite], yp[finite]
    if yt.size == 0:
        return np.nan, np.nan, np.nan

    err = yt - yp
    mae = float(np.mean(np.abs(err)))
    rmse = float(np.sqrt(np.mean(err ** 2)))

    # Direction is undefined for a zero realised value, so those rows are
    # left out; a zero prediction on a non-zero target counts as a miss.
    mask = yt != 0
    dacc = (
        float(np.mean(np.sign(yt[mask]) == np.sign(yp[mask])))
        if mask.any() else np.nan
    )
    return mae, rmse, dacc

def fit_best_arima(y_train, max_p=3, max_q=3):
    """Pick the ARIMA(p, 0, q) fit with the lowest AIC.

    Every order with ``0 <= p <= max_p`` and ``0 <= q <= max_q`` is tried,
    except the degenerate ``p == q == 0``. A candidate whose construction or
    fitting raises is skipped. Returns the winning fitted results object, or
    ``None`` when no candidate could be fitted.
    """
    candidate_orders = [
        (p, 0, q)
        for p in range(max_p + 1)
        for q in range(max_q + 1)
        if (p, q) != (0, 0)
    ]

    winner = None
    lowest_aic = np.inf
    for order in candidate_orders:
        try:
            candidate = ARIMA(y_train, order=order).fit()
            if candidate.aic < lowest_aic:
                lowest_aic, winner = candidate.aic, candidate
        except Exception:
            # Best-effort search: an order that fails to fit is simply ignored.
            continue
    return winner

# =============================================================================
# 3. Model Evaluation
# =============================================================================
# SPY excluded as prediction target (mkt_return would be its own lagged return)
eval_tickers = [
    t for t in panel.index.get_level_values("ticker").unique() if t != "SPY"
]

# One dict per (ticker, model) pair; turned into a DataFrame after the loop.
results = []

for ticker in tqdm(eval_tickers, desc="Evaluating Models"):
    # The train/test split below is positional, so the rows must be in
    # chronological order; sort explicitly instead of relying on the order
    # in which the parquet file happened to be written. Rows with a missing
    # feature or target are dropped because LinearRegression rejects NaNs.
    tk = (
        panel.xs(ticker, level="ticker")
        .sort_index()
        .dropna(subset=FEATURE_COLS + ["log_return"])
    )
    # Skip tickers with too little usable history.
    if len(tk) < 100:
        continue

    # Chronological split: first TRAIN_RATIO of the rows train, the rest test.
    X, y = tk[FEATURE_COLS], tk["log_return"]
    n = int(len(X) * TRAIN_RATIO)
    Xtr, Xte = X.iloc[:n], X.iloc[n:]
    ytr, yte = y.iloc[:n], y.iloc[n:]

    # Model name -> predictions aligned with yte.
    preds = {}

    # Baseline — persistence (yesterday's return)
    preds["Persistence"] = tk["ret_lag1"].loc[yte.index]

    # Baseline — 20-day rolling mean (lagged by 1)
    # shift(1) keeps the current row's return out of its own prediction.
    preds["Rolling Mean"] = (
        tk["log_return"].rolling(ROLLING_WIN).mean().shift(1).loc[yte.index]
    )

    # Linear Regression
    lr = LinearRegression().fit(Xtr, ytr)
    preds["Linear Reg"] = pd.Series(lr.predict(Xte), index=yte.index)

    # Gradient Boosting
    hgb = HistGradientBoostingRegressor(max_iter=200, random_state=42)
    hgb.fit(Xtr, ytr)
    preds["HistGB"] = pd.Series(hgb.predict(Xte), index=yte.index)

    # ARIMA — single fit, multi-step forecast
    # NOTE(review): unlike the other models, this forecast is not updated
    # with realised test-period returns, so it is not a one-step-ahead
    # prediction — keep that in mind when comparing scores.
    arima_fit = fit_best_arima(ytr)
    if arima_fit is not None:
        preds["ARIMA"] = pd.Series(
            arima_fit.forecast(steps=len(yte)).values, index=yte.index
        )
    else:
        # No ARIMA order could be fitted: fall back to a constant zero forecast.
        preds["ARIMA"] = pd.Series(0.0, index=yte.index)

    # Record metrics
    for name, yp in preds.items():
        mae, rmse, dacc = compute_metrics(yte, yp)
        results.append(dict(
            Ticker=ticker, Model=name,
            MAE=round(mae, 6), RMSE=round(rmse, 6), DirAcc=round(dacc, 4),
        ))

# =============================================================================
# 4. Results & Visualization
# =============================================================================
res = pd.DataFrame(results)

# With no evaluated ticker the frame has no columns and the group-bys below
# would fail with an opaque KeyError; stop with an explicit message instead.
if res.empty:
    raise RuntimeError(
        "No tickers were evaluated (every ticker was skipped), "
        "so there are no results to summarise."
    )

# Aggregate summary
summary = res.groupby("Model")[["MAE", "RMSE", "DirAcc"]].agg(["mean", "std"]).round(4)
print("\n" + "=" * 65)
print("  MODEL COMPARISON  (mean ± std across tickers)")
print("=" * 65)
print(summary)

# Best model per ticker
# idxmin returns, for each ticker, the row label of its lowest-RMSE model.
best = res.loc[res.groupby("Ticker")["RMSE"].idxmin()]
print("\n" + "=" * 65)
print("  BEST MODEL PER TICKER  (lowest RMSE)")
print("=" * 65)
print(best[["Ticker", "Model", "RMSE", "DirAcc"]].to_string(index=False))

# Plotting
# One horizontal bar chart per metric, showing the across-ticker mean per model.
fig, axes = plt.subplots(1, 3, figsize=(16, 5))
for ax, col, title in zip(
    axes, ["MAE", "RMSE", "DirAcc"],
    ["Mean Absolute Error ↓", "Root Mean Squared Error ↓", "Directional Accuracy ↑"]
):
    # Error metrics are sorted ascending, directional accuracy descending.
    means = res.groupby("Model")[col].mean().sort_values(ascending=(col != "DirAcc"))
    means.plot.barh(ax=ax, color="steelblue", edgecolor="black")
    ax.set_title(title)
    ax.set_xlabel(col)

plt.suptitle("Stage 1 — Baseline & Classical ML (Processed Data)", fontsize=14, y=1.02)
plt.tight_layout()
plt.show()