# 4 — Baseline_Signal_Modeling

Purpose:
- Train baseline supervised models (Ridge + RandomForest) on VinV monthly features
- Use time-ordered splits (walk-forward-style via TimeSeriesSplit)
- Log metrics + save artifacts for Walk-Forward Validation

Inputs:
- ../data/processed/vinv_inputs_raw.parquet
- ../data/processed/vinv_targets_monthly.parquet

Outputs:
- ../artifacts/model_metrics.csv
- ../artifacts/model_runs.csv
- ../artifacts/models/linear_model.pkl
- ../artifacts/models/tree_model.pkl
- ../artifacts/feature_columns.json


In [47]:
# ============================================================
# VinV – Baseline Signal Modeling
# Linear baseline for Value-in-Vogue signal quality
# ============================================================

from pathlib import Path
import pandas as pd
import numpy as np

from sklearn.linear_model import Ridge
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import r2_score, accuracy_score
from sklearn.model_selection import TimeSeriesSplit

# ------------------------------------------------------------
# Paths
# ------------------------------------------------------------
ROOT = Path("..")
DATA_PROCESSED = ROOT / "data" / "processed"
ARTIFACTS = ROOT / "artifacts"

IN_FEATURES = DATA_PROCESSED / "vinv_inputs_raw.parquet"
IN_TARGETS  = DATA_PROCESSED / "vinv_targets_monthly.parquet"

OUT_COEF = ARTIFACTS / "baseline_model_coefficients.csv"
OUT_PERF = ARTIFACTS / "baseline_model_performance.csv"

In [48]:
# Robust reader
def robust_read(path: Path) -> pd.DataFrame:
    """
    Tries combinations of encoding + separator common in Windows/Excel exports.
    Returns first parse that yields >= 2 columns and >= 1 row.
    """
    encodings = ["utf-8", "utf-8-sig", "latin1", "utf-16", "utf-16le"]
    seps = [None, "\t", ",", "|", r"\s+"]

    last_err = None
    for enc in encodings:
        for sep in seps:
            try:
                if sep is None:
                    df = pd.read_csv(path, sep=None, engine="python", encoding=enc)
                else:
                    df = pd.read_csv(path, sep=sep, engine="python", encoding=enc)

                if df is not None and df.shape[0] > 0 and df.shape[1] >= 2:
                    return df
            except Exception as e:
                last_err = e
                continue

    raise ValueError(f"robust_read failed | last_err={str(last_err)[:200]}")

def normalize_columns(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()
    df.columns = [str(c).strip().replace(" ", "_") for c in df.columns]
    return df

def find_date_col(df: pd.DataFrame) -> str:
    cols_lower = {c.lower(): c for c in df.columns}
    for k in ["date", "datetime", "timestamp"]:
        if k in cols_lower:
            return cols_lower[k]

    first = df.columns[0]
    if "unnamed" in str(first).lower():
        return first

    # fallback: pick column with most datetime-parse successes
    best = None
    best_ok = 0
    for c in df.columns:
        s = pd.to_datetime(df[c], errors="coerce")
        ok = int(s.notna().sum())
        if ok > best_ok:
            best_ok = ok
            best = c
    if best is None or best_ok == 0:
        raise ValueError("No parseable date-like column found.")
    return best

def coerce_numeric(df: pd.DataFrame, cols: list[str]) -> pd.DataFrame:
    df = df.copy()
    for c in cols:
        if c in df.columns:
            df[c] = pd.to_numeric(df[c], errors="coerce")
    return df

In [49]:
# Load data
Xdf = pd.read_parquet(IN_FEATURES)
ydf = pd.read_parquet(IN_TARGETS)

df = Xdf.merge(
    ydf,
    on=["date", "asset_id"],
    how="inner"
).sort_values(["date", "asset_id"]).reset_index(drop=True)

if len(df) == 0:
    raise ValueError("Merged training frame is empty.")

In [50]:
# Feature / target selection
FEATURE_COLS = [
    "mom_1m",
    "mom_3m",
    "mom_12m",
    "div_cash_m",
    "ret_m_std",
    "vol_m_sum",
]

FEATURE_COLS = [c for c in FEATURE_COLS if c in df.columns]

TARGET_REG = "target_value_spread_fwd_1m"
TARGET_CLS = "target_outperform_flag_1m"

X = df[FEATURE_COLS]
y_reg = df[TARGET_REG]
y_cls = df[TARGET_CLS]

In [51]:
# Time-series CV
tscv = TimeSeriesSplit(n_splits=5)

rows = []

for fold, (tr, te) in enumerate(tscv.split(X), start=1):
    Xtr, Xte = X.iloc[tr], X.iloc[te]
    ytr_r, yte_r = y_reg.iloc[tr], y_reg.iloc[te]
    ytr_c, yte_c = y_cls.iloc[tr], y_cls.iloc[te]

    pipe = Pipeline([
        ("scaler", StandardScaler()),
        ("model", Ridge(alpha=1.0)),
    ])

    pipe.fit(Xtr, ytr_r)
    pred = pipe.predict(Xte)

    r2 = r2_score(yte_r, pred)
    acc = accuracy_score(yte_c, pred > 0)

    rows.append({
        "fold": fold,
        "r2_reg": r2,
        "accuracy_cls": acc,
    })

perf = pd.DataFrame(rows)
perf.to_csv(OUT_PERF, index=False)

print("Wrote:", OUT_PERF)
display(perf)

Wrote: ..\artifacts\baseline_model_performance.csv


Unnamed: 0,fold,r2_reg,accuracy_cls
0,1,-0.003052,0.500564
1,2,0.001947,0.499514
2,3,3.9e-05,0.500253
3,4,-0.000229,0.499835
4,5,0.001032,0.500321


In [52]:
# Fit final model on full sample
final_pipe = Pipeline([
    ("scaler", StandardScaler()),
    ("model", Ridge(alpha=1.0)),
])

final_pipe.fit(X, y_reg)

coefs = pd.Series(
    final_pipe.named_steps["model"].coef_,
    index=FEATURE_COLS,
    name="coefficient"
).sort_values(ascending=False)

coefs.to_csv(OUT_COEF)

print("Wrote:", OUT_COEF)
display(coefs)

Wrote: ..\artifacts\baseline_model_coefficients.csv


ret_m_std     0.013672
mom_12m       0.001687
mom_3m        0.001354
vol_m_sum     0.001221
div_cash_m    0.000478
mom_1m       -0.012886
Name: coefficient, dtype: float64