## VAR Model Training


In [None]:
import pandas as pd
import numpy as np
import os
import sys
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
import pickle
from datetime import timedelta

# Set root and paths
ROOT_PATH = os.path.abspath(os.path.join(os.getcwd(), "..", ".."))
sys.path.append(ROOT_PATH)

from Training.Helper.dataPreprocessing import make_stationary, granger_causes
from Models.VAR import VARModel

# Load training data
train_file = os.path.join(ROOT_PATH, "Data", "Train", "train1990s.csv")
df = pd.read_csv(train_file, parse_dates=["observation_date"])
df.set_index("observation_date", inplace=True)

# Set target column
target_col = "fred_PCEPI"
HORIZONS = [1, 3, 6, 12]
max_lag = 1
max_features = 1000

# === GRANGER FILTERING ===
print("Filtering non-Granger-causing features...")
kept_cols = [target_col]
for col in df.columns:
    if col != target_col:
        try:
            stationary = make_stationary(df, target_col, col)
            if stationary is not np.nan and granger_causes(df, col, target_col):
                kept_cols.append(col)
        except Exception as e:
            print(f"Skipping {col}: {e}")

df_filtered = df[kept_cols]
print(f"Remaining after Granger filtering: {len(kept_cols) - 1} exogenous variables")

  df = pd.read_csv(train_file, parse_dates=["observation_date"])


Filtering non-Granger-causing features...




Remaining after Granger filtering: 345 exogenous variables




In [3]:
# === LOAD RANKED FEATURES FROM CSV ===
print("Using pre-ranked highly correlated variables...")
ranked_path = os.path.join(ROOT_PATH, "Data Manipulation", "highly_correlated_significant_variables.csv")
ranked_df = pd.read_csv(ranked_path)
ranked_exog = ranked_df.iloc[1:, 0].tolist()  # Skip header/target

print(f"Total pre-ranked exogenous variables available: {len(ranked_exog)}")

# === STANDARDIZE DATA ===
scaler = StandardScaler()
df_scaled = pd.DataFrame(scaler.fit_transform(df_filtered), index=df_filtered.index, columns=df_filtered.columns)


Using pre-ranked highly correlated variables...
Total pre-ranked exogenous variables available: 106


In [4]:
# === VAR MODEL TRAINING LOOP ===
print("Evaluating top-N VAR models...")
best_model = None
best_subset = []
best_forecast_df = None
best_rmse = float("inf")

for k in range(1, min(len(ranked_exog), max_features) + 1):
    selected_cols = [target_col] + [col for col in ranked_exog[:k] if col in df_scaled.columns]
    try:
        model = VARModel()
        model_fit = model.fit(df_scaled[selected_cols], maxlags=max_lag)
        forecast_input = df_scaled[selected_cols].values[-model_fit.k_ar:]
        forecast = model_fit.forecast(y=forecast_input, steps=12)

        forecast_idx = pd.date_range(start=df_scaled.index[-1] + timedelta(days=1), periods=12, freq="MS")
        forecast_scaled_df = pd.DataFrame(forecast, index=forecast_idx, columns=selected_cols)

        # Inverse transform forecast
        dummy = pd.DataFrame(0, index=forecast_idx, columns=df_scaled.columns)
        dummy.update(forecast_scaled_df[[target_col]])
        forecast_all = scaler.inverse_transform(dummy)
        forecast_target = forecast_all[:, df_scaled.columns.get_loc(target_col)]

        # RMSE = std since true values are unknown
        rmse = np.std(forecast_target)

        if rmse < best_rmse:
            best_rmse = rmse
            best_model = model_fit
            best_subset = selected_cols
            best_forecast_df = pd.DataFrame({"Predicted": forecast_target}, index=forecast_idx)

    except Exception as e:
        continue

# === SAVE FORECAST ===
PRED_PATH = os.path.join(ROOT_PATH, "Predictions", "VAR_cointegration.npy")
np.save(PRED_PATH, best_forecast_df["Predicted"].values)
print(f"Saved forecast values to {PRED_PATH}")

# === SAVE MODEL ===
WEIGHTS_DIR = os.path.join(ROOT_PATH, "Models", "Weights", "VARX")
os.makedirs(WEIGHTS_DIR, exist_ok=True)
model_path = os.path.join(WEIGHTS_DIR, "VAR_cointegration_model.pkl")
with open(model_path, "wb") as f:
    pickle.dump(best_model, f)
print(f"Saved model to {model_path}")

# === PRINT SUMMARY ===
print(f"\nBest model used {len(best_subset) - 1} exogenous variables:")
print(f"Selected variables: {best_subset}")

  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
 2.42235898 2.44661927 2.4710108  2.49553478 2.52019244 2.54498498]' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  dummy.update(forecast_scaled_df[[target_col]])
  self._init_dates(dates, freq)
 2.42353254 2.44810707 2.47284405 2.49774299 2.52280346 2.54802505]' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  dummy.update(forecast_scaled_df[[target_col]])
  self._init_dates(dates, freq)
 2.44880479 2.47674706 2.50473632 2.53275626 2.56079159 2.58882809]' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  dummy.update(forecast_scaled_df[[target_col]])
  self._init_dates(dates, freq)
 2.44066654 2.46947922 2.49874795 2.52842128 2.55844878 2.58878141]' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  dummy.update(forecast_scaled_df[[target_col]])
  self._init_dates(d

Evaluating top-N VAR models...


  self._init_dates(dates, freq)
 2.38374213 2.40251431 2.42143516 2.44047019 2.45958467 2.47874414]' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  dummy.update(forecast_scaled_df[[target_col]])
  self._init_dates(dates, freq)
 2.38374213 2.40251431 2.42143516 2.44047019 2.45958467 2.47874414]' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  dummy.update(forecast_scaled_df[[target_col]])
  self._init_dates(dates, freq)
 2.39218119 2.41256976 2.43334702 2.45449821 2.4760026  2.49783474]' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  dummy.update(forecast_scaled_df[[target_col]])
  self._init_dates(dates, freq)
 2.3846406  2.40438111 2.42460159 2.44528801 2.4664191  2.48796767]' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  dummy.update(forecast_scaled_df[[target_col]])
  self._init_dates(dates, freq)
 2.40254162 2.425113

Saved forecast values to /Users/natalieleung/Desktop/COMP5530M-Group-Project-Inflation-Forecasting/Predictions/VAR_cointegration.npy
Saved model to /Users/natalieleung/Desktop/COMP5530M-Group-Project-Inflation-Forecasting/Models/Weights/VARX/VAR_cointegration_model.pkl

Best model used 40 exogenous variables:
Selected variables: ['fred_PCEPI', 'fred_GDP', 'fred_PCUOMFGOMFG', 'fred_A053RC1Q027SBEA', 'fred_PPIACO', 'fred_TERMCBPER24NS', 'fred_M2SL', 'fred_DGS10', 'fred_CSUSHPISA', 'BrentOil_Vol.', 'fred_EXINUS', 'Gold_Price', 'CMO-Historical-Data-Monthly_Other_foods', 'SP500_Price', 'SP500_High', 'SP500_Low', 'CMO-Historical-Data-Monthly_Tea, Colombo', 'fred_GFDEGDQ188S', 'CrudeOilWTI_Vol.', 'CMO-Historical-Data-Monthly_Tin', 'CMO-Historical-Data-Monthly_Beef **', 'CMO-Historical-Data-Monthly_Lead', 'NASDAQComposite_Low', 'NASDAQComposite_Price', 'CMO-Historical-Data-Monthly_Rice, Thai A.1', 'fred_APU000074714', 'food_price_indices_data_f_Dairy', 'Copper_Low', 'Copper_Open', 'Copper_Pric

  self._init_dates(dates, freq)
 2.36182146 2.37542629 2.39004479 2.40590115 2.42315526 2.44191111]' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  dummy.update(forecast_scaled_df[[target_col]])
  self._init_dates(dates, freq)
 2.3589236  2.37206017 2.38612426 2.40126674 2.41761259 2.43526498]' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  dummy.update(forecast_scaled_df[[target_col]])
  self._init_dates(dates, freq)
 2.35815383 2.3716112  2.38596101 2.40129956 2.41772801 2.43535599]' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  dummy.update(forecast_scaled_df[[target_col]])
  self._init_dates(dates, freq)
 2.35348135 2.36645036 2.38035505 2.39524397 2.4111702  2.42820786]' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  dummy.update(forecast_scaled_df[[target_col]])
  self._init_dates(dates, freq)
 2.35146344 2.363670