In [None]:
import os
import sys
import warnings
from itertools import combinations
from pathlib import Path

import numpy as np
import pandas as pd
from scipy.stats import ttest_rel

# Prepend the current working directory to the module search path so the
# project-local imports below can be resolved.
# NOTE(review): this adds the working directory itself, while PROJECT_ROOT
# below is its parent — confirm which one the imports actually need.
sys.path[:0] = [os.path.abspath(os.curdir)]

# Silence every warning category for the remainder of the session.
warnings.simplefilter("ignore")

# Project-local helpers
from Helper.evaluation_helpers import get_predictions
from Training.Helper.PyTorchModular import HORIZONS

# Directory layout, derived from the parent of the resolved working directory.
PROJECT_ROOT = Path.cwd().resolve().parents[0]
PRED_BASE = PROJECT_ROOT.joinpath('Predictions')
SAVE_DIR = PROJECT_ROOT.joinpath('Evaluation', 't_test_results')

# Create the output folder (and any missing parents) if it is not there yet.
SAVE_DIR.mkdir(exist_ok=True, parents=True)

# Significance threshold applied to the (unadjusted) paired t-test p-values.
ALPHA = 0.05


def _pairwise_pvalues(preds):
    """Return a symmetric matrix of paired t-test p-values between columns.

    Args:
        preds: DataFrame with one column of predictions per model.

    Returns:
        A square DataFrame indexed and labelled by the column names of
        ``preds``. Entry ``[a, b]`` holds the p-value of
        ``ttest_rel(preds[a], preds[b])``; the diagonal, and any pair whose
        test raised, is left as NaN.
    """
    models = preds.columns.tolist()
    pvals = pd.DataFrame(np.nan, index=models, columns=models)

    # The two-sided paired test is symmetric in its arguments, so every
    # unordered pair is tested once and the result mirrored.
    for left, right in combinations(models, 2):
        try:
            _, p = ttest_rel(preds[left], preds[right])
        except (TypeError, ValueError) as e:
            print(f"Skipping {left} vs {right} due to error: {e}")
            continue
        pvals.loc[left, right] = p
        pvals.loc[right, left] = p

    return pvals


def _rank_significant(pvals, alpha=ALPHA):
    """Return the model pairs with ``p < alpha``, ranked by ascending p-value.

    Args:
        pvals: Symmetric p-value matrix as built by ``_pairwise_pvalues``.
        alpha: Significance threshold; pairs with ``p < alpha`` are kept.

    Returns:
        DataFrame with columns ``Rank``, ``Model_1``, ``Model_2`` and
        ``p_value``. Each unordered pair appears at most once, and ``Rank``
        starts at 1. The frame is empty when no pair is significant.
    """
    models = pvals.index.tolist()
    rows = []
    for left, right in combinations(models, 2):
        p = pvals.loc[left, right]
        # NaN marks a skipped or undefined test and is never significant.
        if pd.notna(p) and p < alpha:
            rows.append((left, right, p))

    sig_df = pd.DataFrame(rows, columns=["Model_1", "Model_2", "p_value"])
    # A stable sort keeps the order of tied p-values deterministic.
    sig_df = sig_df.sort_values(by="p_value", kind="mergesort").reset_index(drop=True)
    sig_df.insert(0, "Rank", np.arange(1, len(sig_df) + 1))
    return sig_df


# For every forecast horizon: load the per-model predictions, run all pairwise
# paired t-tests, then write the full p-value matrix and the ranked list of
# significant pairs to SAVE_DIR.
# NOTE: the p-values are not corrected for multiple comparisons.
print("Running paired t-tests on model predictions (significant results will be ranked by p-value):")
for h in HORIZONS:
    preds_dir = PRED_BASE / f"Horizon{h}"
    print(f"\nLooking in: {preds_dir} — Exists? {preds_dir.exists()}")

    if not preds_dir.exists():
        print(f"Horizon {h}: prediction folder not found — skipping.")
        continue

    preds_df = get_predictions(preds_dir)

    # Only model columns are compared; drop the ground truth if present.
    preds = preds_df.drop(columns=['ground_truth'], errors='ignore')
    models = preds.columns.tolist()

    print(f"Horizon {h} — {len(models)} models: {models}")

    if len(models) < 2:
        print(f"Horizon {h}: not enough models to compare — skipping.")
        continue

    # Full symmetric p-value matrix
    pvals = _pairwise_pvalues(preds)
    full_path = SAVE_DIR / f"Horizon{h}_full.csv"
    pvals.round(6).to_csv(full_path)
    print(f"Saved full t-test results: {full_path}")

    # Ranked list of significant pairs (each unordered pair listed once)
    sig_df = _rank_significant(pvals, ALPHA)
    sig_path = SAVE_DIR / f"Horizon{h}_significant.csv"
    sig_df.to_csv(sig_path, index=False)
    print(f"Saved ranked significant pairs (p < {ALPHA}): {sig_path}")


Running paired t-tests on model predictions (significant results will be ranked by p-value):

Looking in: /Users/natalieleung/Desktop/COMP5530M-Group-Project-Inflation-Forecasting/Predictions/Horizon1 — Exists? True
Horizon 1 — 20 models: ['ARDL', 'NBEATSx', 'SARIMAX1990', 'ARIMAX1990', 'RFX', 'TCN', 'SARIMA1990', 'MARS', 'NHITS', 'Tide', 'ARIMA1990', 'VARcointegration', 'Naive', 'LSTM', 'VARccf', 'GRU1990', 'MLR', 'TFT', 'XGB1990', 'RNN1990']
Saved full t-test results: /Users/natalieleung/Desktop/COMP5530M-Group-Project-Inflation-Forecasting/Evaluation/t_test_results/Horizon1_full.csv
Saved ranked significant pairs (p < 0.05): /Users/natalieleung/Desktop/COMP5530M-Group-Project-Inflation-Forecasting/Evaluation/t_test_results/Horizon1_significant.csv

Looking in: /Users/natalieleung/Desktop/COMP5530M-Group-Project-Inflation-Forecasting/Predictions/Horizon3 — Exists? True
Horizon 3 — 16 models: ['NBEATSx', 'ARDL', 'TCN', 'RFX', 'NHITS', 'MARS', 'Tide', 'VARccf', 'LSTM', 'Naive', 'VARcoi