In [8]:
import os, glob, re, math
import numpy as np
import pandas as pd


In [16]:
# Simple normal CDF (no SciPy required)
def normal_cdf(x):
    return 0.5 * (1 + math.erf(x / math.sqrt(2)))

def diebold_mariano(e1, e2, h=1, power=1):
    e1, e2 = np.asarray(e1), np.asarray(e2)
    d = np.abs(e1)**power - np.abs(e2)**power
    T = len(d)
    mean_d = np.mean(d)

    # Newey–West variance with truncation lag h-1
    var_d = np.var(d, ddof=1)
    for lag in range(1, h):
        cov = np.cov(d[:-lag], d[lag:])[0,1]
        var_d += 2*(1 - lag/h)*cov
    var_d /= T

    dm_stat = mean_d / math.sqrt(var_d)
    p_value = 2 * (1 - normal_cdf(abs(dm_stat)))
    return dm_stat, p_value


In [20]:
DATA_DIR = "."

# Regex for your filenames
filename_re = re.compile(
    r'^(?P<prefix>20\d{2})?'        # optional 2020 prefix
    r'(?P<model>[A-Za-z]+)'         # model (rf, xgb, lstm…)
    r'(?P<horizon>\d+)?'            # optional horizon (1,5,21)
    r'(?P<vix>_vix)?$'              # optional _vix
)

def parse_filename(fname):
    name = os.path.splitext(os.path.basename(fname))[0]
    m = filename_re.match(name)
    if not m:
        return None

    sample  = m.group("prefix") if m.group("prefix") else "full"
    model   = m.group("model").lower()
    horizon = int(m.group("horizon")) if m.group("horizon") else 1
    vix     = m.group("vix") is not None

    return sample, model, horizon, vix

# Load all CSVs into a dictionary by key (sample, model, horizon, vix)
files = {}

for path in glob.glob("*.csv"):
    parsed = parse_filename(path)
    if parsed is None:
        continue

    sample, model, horizon, vix = parsed
    df = pd.read_csv(path, parse_dates=["date"]).sort_values("date")

    files[(sample, model, horizon, vix)] = df

len(files)


36

In [21]:
tests = []

models   = ["rf", "xgb", "lstm"]
samples  = ["full", "2020"]
horizons = [1, 5, 21]

for model in models:
    for sample in samples:
        for horizon in horizons:

            key_base = (sample, model, horizon, False)
            key_vix  = (sample, model, horizon, True)

            if key_base not in files or key_vix not in files:
                print("Missing:", key_base, key_vix)
                continue

            df_b = files[key_base]
            df_v = files[key_vix]

            merged = pd.merge(
                df_b[["date", "actual", "pred"]].rename(columns={"pred": "pred_base"}),
                df_v[["date", "actual", "pred"]].rename(columns={"pred": "pred_vix"}),
                on="date",
                how="inner",
                suffixes=("_b", "_v")
            )

            # average actuals if suffix columns exist
            if "actual_b" in merged.columns:
                actual = 0.5*(merged["actual_b"] + merged["actual_v"])
            else:
                actual = merged["actual"]

            e_base = actual - merged["pred_base"]
            e_vix  = actual - merged["pred_vix"]

            dm_stat, pval = diebold_mariano(e_base, e_vix, h=horizon)

            # Determine winner
            if pval < 0.05:
                if dm_stat > 0:
                    winner = "vix"
                else:
                    winner = "baseline"
            else:
                winner = "no significant diff"

            tests.append({
                "sample": sample,
                "model": model,
                "horizon": horizon,
                "dm_stat": dm_stat,
                "p_value": pval,
                "better_model": winner,
                "n_obs": len(merged)
            })

dm_results = pd.DataFrame(tests)
dm_results


Unnamed: 0,sample,model,horizon,dm_stat,p_value,better_model,n_obs
0,full,rf,1,-0.920718,0.3571978,no significant diff,501
1,full,rf,5,-3.425672,0.0006132814,baseline,481
2,full,rf,21,-1.927545,0.05391178,no significant diff,481
3,2020,rf,1,1.217012,0.2235996,no significant diff,501
4,2020,rf,5,0.978936,0.3276117,no significant diff,481
5,2020,rf,21,-1.848947,0.06446541,no significant diff,481
6,full,xgb,1,-3.987704,6.671569e-05,baseline,501
7,full,xgb,5,-5.696723,1.221324e-08,baseline,481
8,full,xgb,21,-2.645306,0.008161702,baseline,481
9,2020,xgb,1,0.514309,0.6070358,no significant diff,501


In [22]:
dm_results.sort_values(["sample", "model", "horizon"])


Unnamed: 0,sample,model,horizon,dm_stat,p_value,better_model,n_obs
15,2020,lstm,1,-2.999889,0.002700776,baseline,501
16,2020,lstm,5,0.370273,0.7111794,no significant diff,481
17,2020,lstm,21,-2.461551,0.01383377,baseline,481
3,2020,rf,1,1.217012,0.2235996,no significant diff,501
4,2020,rf,5,0.978936,0.3276117,no significant diff,481
5,2020,rf,21,-1.848947,0.06446541,no significant diff,481
9,2020,xgb,1,0.514309,0.6070358,no significant diff,501
10,2020,xgb,5,0.429404,0.6676293,no significant diff,481
11,2020,xgb,21,-1.19481,0.2321611,no significant diff,481
12,full,lstm,1,-3.120168,0.001807479,baseline,501
