In [16]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from statsmodels.tsa.api import VAR
from sklearn.metrics import mean_squared_error

In [17]:
train_file = "../Data/Train/trains1990s.csv"

# Load the data
df = pd.read_csv(train_file, parse_dates=['observation_date'], date_format="%m/%y"
)
df = df[['observation_date', 'fred_PCEPI', 'fred_AHETPI']] # Use average hourly earnings cause it is not possible to train VAR with only one variable
df.set_index('observation_date', inplace=True)

In [18]:
numVals = df.shape[0]
valRatio = 0.2
splitLoc = int(numVals * (1 - valRatio))

train = df.iloc[:splitLoc]
val = df.iloc[splitLoc:]

In [25]:
# Prepare the VAR model with both variables
model = VAR(train[['fred_PCEPI', 'fred_AHETPI']])
# Let the model select the best lag up to 5.
model_fit = model.fit(maxlags=5, ic='aic')
print(model_fit.summary())

  Summary of Regression Results   
Model:                         VAR
Method:                        OLS
Date:           Mon, 17, Feb, 2025
Time:                     22:19:14
--------------------------------------------------------------------
No. of Equations:         2.00000    BIC:                   -11.3638
Nobs:                     292.000    HQIC:                  -11.5299
Log likelihood:           892.901    FPE:                8.80001e-06
AIC:                     -11.6408    Det(Omega_mle):     8.17266e-06
--------------------------------------------------------------------
Results for equation fred_PCEPI
                    coefficient       std. error           t-stat            prob
---------------------------------------------------------------------------------
const                  0.647515         0.250999            2.580           0.010
L1.fred_PCEPI          1.448072         0.059372           24.390           0.000
L1.fred_AHETPI         0.172199         0.419095   

  _index = to_datetime(index)
  self._init_dates(dates, freq)


In [20]:
lag_order = model_fit.k_ar  # number of lags used in the model
print(f"Using lag order: {lag_order}")

# Use the last lag_order from the training data to forecast
forecast_input = train[['fred_PCEPI', 'fred_AHETPI']].values[-lag_order:]
forecast_steps = len(val)
forecast = model_fit.forecast(y=forecast_input, steps=forecast_steps)
forecast = forecast.flatten()  # flatten to 1D array

Using lag order: 5


In [24]:
# Calculate RMSE for the Validation Period
forecast_df = pd.DataFrame(forecast, index=val.index, columns=['fred_PCEPI', 'fred_AHETPI'])
rmse = np.sqrt(mean_squared_error(val['fred_PCEPI'].values, forecast_df['fred_PCEPI'].values))
print(f'Validation RMSE: {rmse}')

ValueError: Shape of passed values is (150, 1), indices imply (75, 2)

In [None]:
plt.figure(figsize=(12, 6))
plt.plot(train.index, train['fred_PCEPI'], label='Train')
plt.plot(val.index, val['fred_PCEPI'], label='Actual', color='blue')
plt.plot(val.index, forecast, label='Forecast', linestyle='--', color='red')
plt.title('VAR (AR) Model Forecast for fred_PCEPI')
plt.xlabel('Date')
plt.ylabel('fred_PCEPI')
plt.legend()
plt.show()