In [None]:
# Import required libraries
import numpy as np
import pandas as pd
from scipy.stats import norm
import matplotlib.pyplot as plt
import seaborn as sns
import os

In [None]:
# Parameters and File Paths

# Data parameters
WINDOW = 21     # rolling window size to use as predictors
DATE_COL, ID_COL, TARGET_COL = 'Date', 'PERMNO', 'excess_return'

# Every results file lives in <cwd>/Results and carries the window size as a filename suffix
current_directory = os.getcwd()
_results_dir = os.path.join(current_directory, 'Results')
_window_tag = f'{WINDOW:.0f}'

# Prediction results files of the different model families
models_results_path = os.path.join(_results_dir, 'models_results' + _window_tag + '.csv')
chronos_models_results_path = os.path.join(_results_dir, 'chronos_models_results' + _window_tag + '.csv')
timesfm_models_results_path = os.path.join(_results_dir, 'timesfm_models_results' + _window_tag + '.csv')
uni2ts_models_results_path = os.path.join(_results_dir, 'uni2ts_models_results' + _window_tag + '.csv')

# Destination for the merged prediction results of all models
merged_results_path = os.path.join(_results_dir, 'merged_results' + _window_tag + '.csv')

In [None]:
# Maps the short model key (a prediction column name without its "y_" prefix)
# to the display name used in tables and plots.
models_dict = dict(
    # Linear models
    ols="OLS",
    lasso="Lasso",
    ridge="Ridge",
    enet="Elastic Net",
    # Tree ensembles
    rf="RF",
    xgb="XGB",
    # Feed-forward neural networks
    nn1="NN1",
    nn2="NN2",
    nn3="NN3",
    nn4="NN4",
    nn5="NN5",
    # TimesFM
    tfm1="TimesFM 1.0",
    tfm2="TimesFM 2.0",
    # Chronos
    chr_bolt_tiny="Chronos-Bolt-Tiny",
    chr_bolt_mini="Chronos-Bolt-Mini",
    chr_bolt_small="Chronos-Bolt-Small",
    chr_bolt_base="Chronos-Bolt-Base",
    chr_t5_tiny="Chronos-T5-Tiny",
    chr_t5_mini="Chronos-T5-Mini",
    chr_t5_small="Chronos-T5-Small",
    # Moirai
    moirai_s="Moirai-Small",
    moirai_moe_s="Moirai-MoE-Small",
    moirai_moe_b="Moirai-MoE-Base",
)

### Step 1: Load Forecast Results

In [None]:
# Collate All Prediction Results in Single Dataframe
# The base results file is mandatory (a failure here propagates). The three
# foundation-model files are optional: each is left-merged onto the base frame
# when it can be read and merged, otherwise it is skipped with a message.
merge_keys = [ID_COL, DATE_COL, TARGET_COL]

results = pd.read_csv(models_results_path)
try:
    timesfm_models_results = pd.read_csv(timesfm_models_results_path)
    results = results.merge(timesfm_models_results, how="left", on=merge_keys)
except Exception as exc:
    # Catch Exception (not a bare except) so KeyboardInterrupt/SystemExit still
    # propagate, and report the cause so a real problem (e.g. a missing key
    # column) is not mistaken for an absent file.
    print(f"TimesFM models' prediction results not added ({type(exc).__name__}: {exc})")
try:
    chronos_models_results = pd.read_csv(chronos_models_results_path)
    results = results.merge(chronos_models_results, how="left", on=merge_keys)
except Exception as exc:
    print(f"Chronos models' prediction results not added ({type(exc).__name__}: {exc})")
try:
    uni2ts_models_results = pd.read_csv(uni2ts_models_results_path)
    results = results.merge(uni2ts_models_results, how="left", on=merge_keys)
except Exception as exc:
    print(f"Uni2ts models' prediction results not added ({type(exc).__name__}: {exc})")

# NOTE(review): how="left" leaves NaN predictions for rows missing from an
# optional file, and TARGET_COL is part of the join key, so its values must
# match exactly across files -- confirm both are intended.
results.info()

##### Save Results

In [None]:
# Save Merged Prediction Results
# Writes the merged frame to `merged_results_path` as CSV; index=False keeps
# the DataFrame's row index out of the file.
results.to_csv(merged_results_path, index=False)

### Step 2: Evaluate and Compare Statistical Performance of Models

In [None]:
# Creating a Function to Calculate Predictive-R2 Used in the Finance Literature
def r2(y_true, y_pred):
    """Predictive R-squared benchmarked against a zero forecast.

    Computes ``1 - sum((y_true - y_pred)**2) / sum(y_true**2)``; the
    denominator is not demeaned.

    Observations where either value is missing are dropped from BOTH sums.
    Without this, pandas' NaN-skipping ``sum`` would shrink only the numerator
    and overstate R2 for a model with missing forecasts.

    Parameters
    ----------
    y_true, y_pred : pandas.Series or numpy.ndarray
        Realised values and forecasts, matched element-wise.

    Returns
    -------
    float
        The predictive R2 over rows where both inputs are present. The result
        is not finite when the retained ``y_true`` values are all zero.
    """
    squared_error = (y_true - y_pred) ** 2
    # NaN in either input makes the squared error NaN, so one mask covers both.
    valid = pd.notna(squared_error)
    return 1 - squared_error[valid].sum() / (y_true[valid] ** 2).sum()

In [None]:
# Calculate the Result Matrix for All the Models
results_matrix = []

# Prediction columns are the ones whose name starts with "y_"
models = [col for col in results.columns.to_list() if col.startswith("y_")]

for model in models:
    # Score a model only on rows where both the target and its forecast exist.
    # A NaN forecast would otherwise count as a directional miss and distort
    # the metrics of any model with incomplete coverage.
    has_both = results[TARGET_COL].notna() & results[model].notna()
    actual = results.loc[has_both, TARGET_COL]
    predicted = results.loc[has_both, model]

    # Sign agreement computed once, then sliced for the conditional accuracies
    hit = np.sign(actual) == np.sign(predicted)

    # Strip only the leading "y_" (str.replace would remove every occurrence)
    model_key = model[len("y_"):]

    results_matrix.append({
        # Fall back to the raw key when no display name is registered
        "Model": models_dict.get(model_key, model_key),
        "R2": r2(actual, predicted),
        "Directional Accuracy": hit.mean(),
        "DA (+ve Ret)": hit[actual >= 0].mean(),   # zero returns are grouped with the positives
        "DA (-ve Ret)": hit[actual < 0].mean()
        })

results_matrix_df = pd.DataFrame(results_matrix)
results_matrix_df

### Step 3: Perform Test for Statistical Significance

In [None]:
# Function to Perform Pairwise Diebold-Mariano Test
def diebold_mariano(y_true, y_pred1, y_pred2, power=2):
    """Two-sided Diebold-Mariano test of equal predictive accuracy.

    The loss differential is ``d = |e1|**power - |e2|**power`` with
    ``e_k = y_pred_k - y_true``. The statistic is ``mean(d) / sqrt(var(d) / n)``
    using the plain sample variance (``ddof=1``); no autocorrelation (HAC)
    correction is applied. The p-value comes from the standard normal
    distribution.

    Parameters
    ----------
    y_true, y_pred1, y_pred2 : pandas.Series or numpy.ndarray
        Realised values and the two competing forecasts.
    power : int or float, default 2
        Exponent applied to the absolute errors (2 = squared-error loss,
        1 = absolute-error loss).

    Returns
    -------
    (dm_stat, p_value) : tuple of float
        A positive ``dm_stat`` means forecast 1 has the larger average loss.
        Both values are NaN when fewer than two complete observations exist or
        when the two forecasts have identical losses everywhere (0/0).
    """
    # Absolute errors keep the loss non-negative for odd powers as well;
    # for power=2 this equals the plain squared error.
    loss1 = np.abs(y_pred1 - y_true) ** power
    loss2 = np.abs(y_pred2 - y_true) ** power

    # Drop incomplete observations so mean, variance and n share one sample.
    d = np.asarray(loss1 - loss2, dtype=float)
    d = d[~np.isnan(d)]
    n = d.size
    if n < 2:
        return np.nan, np.nan

    d_mean = d.mean()
    var_d = d.var(ddof=1)

    # Zero variance (e.g. a model compared with itself) yields NaN or +/-inf;
    # keep that result but silence the floating-point warnings.
    with np.errstate(divide="ignore", invalid="ignore"):
        dm_stat = d_mean / np.sqrt(var_d / n)
    p_value = 2 * norm.sf(abs(dm_stat))
    return dm_stat, p_value

In [None]:
# Diebold-Mariano Test
dm_score_matrix = []
dm_p_matrix = []

# Resolve each prediction column's display label once. Only the leading "y_"
# is stripped (str.replace would remove every occurrence), and the raw key is
# used when no display name is registered.
model_labels = []
for col in models:
    key = col[len("y_"):] if col.startswith("y_") else col
    model_labels.append(models_dict.get(key, key))

for i, (col_i, label_i) in enumerate(zip(models, model_labels)):
    dm_scores = {"Model": label_i}
    dm_p = {"Model": label_i}
    # Only pairs with j >= i are evaluated, so the resulting frames hold the
    # diagonal and upper triangle; the remaining cells stay NaN.
    for col_j, label_j in zip(models[i:], model_labels[i:]):
        dm_scores[label_j], dm_p[label_j] = diebold_mariano(results[TARGET_COL], results[col_i], results[col_j])
    dm_score_matrix.append(dm_scores)
    dm_p_matrix.append(dm_p)

# Stat test score
dm_score_matrix_df = pd.DataFrame(dm_score_matrix).set_index("Model")
dm_score_matrix_df

In [None]:
# p-values of the pairwise tests, one row per model (indexed by display name)
dm_p_matrix_df = pd.DataFrame(dm_p_matrix).set_index("Model")
dm_p_matrix_df

In [None]:
# Plot Results of Stat Test
sns.set_style("white")

# Transposed so the row model of the DM matrices appears on the x-axis
scores = dm_score_matrix_df.T.copy().astype(float)
pvals = dm_p_matrix_df.T.copy().astype(float)

def stars(p, alpha=0.05):
    """Return "*" when p-value `p` is below `alpha`, otherwise "".

    A NaN p-value compares False and therefore yields "".
    """
    return "*" if p < alpha else ""

# Cell label = rounded DM statistic plus a significance star.
# Series.map per column replaces DataFrame.applymap, which is deprecated in
# recent pandas releases, and works on old and new versions alike.
annot = scores.round(2).astype(str) + pvals.apply(lambda col: col.map(stars))

# Explicit figure/axes instead of the implicit pyplot state machine
fig, ax = plt.subplots(figsize=(13, 11))
sns.heatmap(scores, cmap="vlag", center=0, annot=annot, fmt="", linewidths=.5, ax=ax)
ax.set_title("Diebold–Mariano Test — Pairwise DM Statistics")
ax.tick_params(axis='x', rotation=60)

fig.tight_layout()
plt.show()