In [38]:
import sys
import os
from pathlib import Path
import numpy as np
import pandas as pd
import warnings
from itertools import combinations

# Put the current working directory (the folder this notebook runs from) on the import path
sys.path.insert(0, os.path.abspath('.'))

# Suppress warnings
warnings.filterwarnings("ignore")

# Imports
from Helper.evaluation_helpers import get_predictions
from Helper.DM_Test import dm_test
from Training.Helper.PyTorchModular import HORIZONS

# Setup project paths
# PROJECT_ROOT is the parent of the current working directory, so this cell
# expects to be run from a folder directly below the project root.
PROJECT_ROOT = Path().resolve().parents[0]
PRED_BASE = PROJECT_ROOT / 'Predictions'
SAVE_DIR = PROJECT_ROOT / 'Evaluation' / 'dm_test_results'
SAVE_DIR.mkdir(parents=True, exist_ok=True)

# Column of the predictions frame that every other model is tested against.
BASELINE_MODEL = "Naive"
# p-values below this threshold are treated as a decisive result.
SIGNIFICANCE_LEVEL = 0.05

print("Running Diebold-Mariano tests on model predictions:")


for h in HORIZONS:
    preds_dir = PRED_BASE / f"Horizon{h}"
    print(f"\nLooking in: {preds_dir} — Exists? {preds_dir.exists()}")

    if not preds_dir.exists():
        print(f"Horizon {h}: prediction folder not found — skipping.")
        continue

    preds_df = get_predictions(preds_dir)

    # Separate ground truth and predictions
    ground_truth = preds_df['ground_truth']
    models = preds_df.columns.drop('ground_truth')
    print(ground_truth)
    print(f"Horizon {h} — {len(models)} models: {models}")

    if len(models) < 2:
        print(f"Horizon {h}: not enough models to compare — skipping.")
        continue

    # Without the baseline column there is nothing to compare against; skip
    # with a message instead of failing with a KeyError further down.
    if BASELINE_MODEL not in models:
        print(f"Horizon {h}: baseline '{BASELINE_MODEL}' predictions not found — skipping.")
        continue

    results = []
    # Run one DM test per model, always against the baseline (the baseline is
    # never compared with itself).
    for model1 in models:
        if model1 == BASELINE_MODEL:
            continue

        # NOTE(review): h is fixed at 1 here for every forecast horizon in the
        # outer loop — confirm whether dm_test should receive the loop's
        # horizon instead (and whether it accepts it for this sample size).
        dm_stat, p_val = dm_test(
            ground_truth, preds_df[model1], preds_df[BASELINE_MODEL], h=1, crit="MAPE"
        )

        if p_val < SIGNIFICANCE_LEVEL:
            # A positive statistic is read as "baseline wins"; presumably
            # dm_test measures loss(model1) - loss(baseline) — TODO confirm.
            better_model = BASELINE_MODEL if dm_stat > 0 else model1
        else:
            better_model = "No significant difference"

        results.append((model1, BASELINE_MODEL, dm_stat, p_val, better_model))

    if not results:
        print(f"No valid DM tests for Horizon {h}.")
        continue

    # Create results DataFrame, ranked by ascending p-value (rank 1 = smallest)
    results_df = pd.DataFrame(results, columns=["Model_1", "Model_2", "DM_statistic", "p_value", "Better_Model"])
    results_df = results_df.sort_values(by="p_value").reset_index(drop=True)
    results_df.index += 1  # Start rank from 1
    results_df.insert(0, "Rank", results_df.index)

    # Save full ranked results
    save_path = SAVE_DIR / f"Horizon{h}_DM_results.csv"
    results_df.to_csv(save_path, index=False)
    print(f"Saved DM test ranked results: {save_path}")

    # Count how often each Better_Model label occurs. Note that the
    # "No significant difference" label is tallied as its own category
    # alongside the model names.
    better_counts = results_df['Better_Model'].value_counts().reset_index()
    better_counts.columns = ['Model', 'Wins']
    better_counts = better_counts.sort_values(by="Wins", ascending=False).reset_index(drop=True)
    better_counts.index += 1  # Start rank from 1
    better_counts.insert(0, "Rank", better_counts.index)

    # Save better model rankings
    better_save_path = SAVE_DIR / f"Horizon{h}_Better_Model_Ranking.csv"
    better_counts.to_csv(better_save_path, index=False)
    print(f"Saved better model ranking: {better_save_path}")


Running Diebold-Mariano tests on model predictions:

Looking in: C:\Users\James\COMP5530M-Group-Project-Inflation-Forecasting\Predictions\Horizon1 — Exists? True
observation_date
01/2024    122.115
02/2024    122.494
03/2024    122.912
04/2024    123.234
05/2024    123.224
06/2024    123.369
07/2024    123.575
08/2024    123.727
09/2024    123.939
10/2024    124.235
11/2024    124.387
12/2024    124.705
Name: ground_truth, dtype: float64
Horizon 1 — 20 models: Index(['ARDL', 'ARIMA1990', 'ARIMAX1990', 'GRU1990', 'LSTM', 'MARS', 'MLR',
       'Naive', 'NBEATSx', 'NHITS', 'RFX', 'RNN1990', 'SARIMA1990',
       'SARIMAX1990', 'TCN', 'TFT', 'Tide', 'VARccf', 'VARcointegration',
       'XGB1990'],
      dtype='object')
Saved DM test ranked results: C:\Users\James\COMP5530M-Group-Project-Inflation-Forecasting\Evaluation\dm_test_results\Horizon1_DM_results.csv
Saved better model ranking: C:\Users\James\COMP5530M-Group-Project-Inflation-Forecasting\Evaluation\dm_test_results\Horizon1_Better_Mo

In [31]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from pathlib import Path

# Configuration: horizons to plot and the folder holding the DM-test CSVs / figures
HORIZONS = [1, 3, 6, 12]
PROJECT_ROOT = Path().resolve().parents[0]  # parent of the current working directory
SAVE_DIR = PROJECT_ROOT.joinpath('Evaluation', 'dm_test_results')

# Plot Heatmap of p-values
def plot_pvalue_heatmap(results_df, horizon_name):
    """Draw the DM-test p-values of one horizon as a heatmap and save it as a PNG.

    Parameters
    ----------
    results_df : DataFrame
        Must provide the columns 'Model_1', 'Model_2' and 'p_value'.
    horizon_name : str
        Label used in the figure title and as the output file-name prefix.

    The image is written to SAVE_DIR / "<horizon_name>_pvalue_heatmap.png";
    the figure is closed afterwards and the saved path is printed.
    """
    # Wide layout: one row per Model_1, one column per Model_2.
    p_matrix = results_df.pivot(index='Model_1', columns='Model_2', values='p_value')
    # Mirror the matrix so a pair stored in one direction also fills the opposite cell.
    symmetric = p_matrix.combine_first(p_matrix.T)

    fig = plt.figure(figsize=(14, 12))
    heatmap_options = dict(
        cmap="coolwarm",
        center=0.05,  # colour map is centred on p = 0.05
        linewidths=0.5,
        linecolor='gray',
        cbar_kws={'label': 'p-value'},
        annot=False,  # switch to True to print the numbers inside the cells
    )
    ax = sns.heatmap(symmetric, **heatmap_options)
    ax.set_title(f"Diebold-Mariano p-value Heatmap — {horizon_name}")
    plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
    plt.setp(ax.get_yticklabels(), rotation=0)
    fig.tight_layout()

    out_file = SAVE_DIR / f"{horizon_name}_pvalue_heatmap.png"
    fig.savefig(out_file)
    plt.close(fig)
    print(f"Saved p-value heatmap: {out_file}")

# Plot Bar chart of model win counts
def plot_win_counts(count_df, horizon_name):
    """Draw a bar chart of win counts for one horizon and save it as a PNG.

    Parameters
    ----------
    count_df : DataFrame
        Must provide the columns 'Model' (bar labels) and 'Wins' (bar heights).
    horizon_name : str
        Label used in the figure title and as the output file-name prefix.

    The image is written to SAVE_DIR / "<horizon_name>_model_wins_bar.png";
    the figure is closed afterwards and the saved path is printed.
    """
    fig, ax = plt.subplots(figsize=(12, 6))
    ax.bar(count_df['Model'], count_df['Wins'], color='skyblue')
    ax.set_ylabel("Number of Wins")
    ax.set_title(f"Better Model Wins — {horizon_name}")
    # Slant the model names so long labels do not overlap.
    plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
    fig.tight_layout()

    out_file = SAVE_DIR / f"{horizon_name}_model_wins_bar.png"
    fig.savefig(out_file)
    plt.close(fig)
    print(f"Saved better model wins bar plot: {out_file}")

# Main loop
for h in HORIZONS:
    horizon_name = f"Horizon{h}"
    results_path = SAVE_DIR / f"{horizon_name}_DM_results.csv"
    wins_path = SAVE_DIR / f"{horizon_name}_Better_Model_Ranking.csv"

    # Both CSVs must exist before anything is plotted for this horizon.
    if not (results_path.exists() and wins_path.exists()):
        print(f"{horizon_name}: missing results — skipping.")
        continue

    # Read the ranked DM results and the win tally back from disk.
    results_df = pd.read_csv(results_path)
    wins_df = pd.read_csv(wins_path)

    # An empty table produces no figure.
    if not results_df.empty:
        plot_pvalue_heatmap(results_df, horizon_name)

    if not wins_df.empty:
        plot_win_counts(wins_df, horizon_name)


Saved p-value heatmap: C:\Users\James\COMP5530M-Group-Project-Inflation-Forecasting\Evaluation\dm_test_results\Horizon1_pvalue_heatmap.png
Saved better model wins bar plot: C:\Users\James\COMP5530M-Group-Project-Inflation-Forecasting\Evaluation\dm_test_results\Horizon1_model_wins_bar.png
Saved p-value heatmap: C:\Users\James\COMP5530M-Group-Project-Inflation-Forecasting\Evaluation\dm_test_results\Horizon3_pvalue_heatmap.png
Saved better model wins bar plot: C:\Users\James\COMP5530M-Group-Project-Inflation-Forecasting\Evaluation\dm_test_results\Horizon3_model_wins_bar.png
Saved p-value heatmap: C:\Users\James\COMP5530M-Group-Project-Inflation-Forecasting\Evaluation\dm_test_results\Horizon6_pvalue_heatmap.png
Saved better model wins bar plot: C:\Users\James\COMP5530M-Group-Project-Inflation-Forecasting\Evaluation\dm_test_results\Horizon6_model_wins_bar.png
Saved p-value heatmap: C:\Users\James\COMP5530M-Group-Project-Inflation-Forecasting\Evaluation\dm_test_results\Horizon12_pvalue_heatm