# ARDL model

In [18]:
import sys
import os
# The relative path is resolved against the current working directory, so the
# notebook has to be launched from its own folder for the project-local
# `Training` package (imported in the next cell) to be found.
# NOTE(review): every re-run of this cell appends the same entry again.
sys.path.append(os.path.abspath(os.path.join("..", "..")))  # adds root of the project

In [2]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.linear_model import Lasso, Ridge
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import GridSearchCV, TimeSeriesSplit
from sklearn.preprocessing import StandardScaler
from statsmodels.tsa.statespace.structural import UnobservedComponents

from Training.Helper.dataPreprocessing import (
    add_time_features,
    add_lagged_features,
    add_rolling_features,
    rank_features_ccf,
    sklearn_fit_transform,
    integer_index,
    TRAIN_DATA_PATH_1990S
)

# Number of trailing rows held out as the validation window; also the number
# of forecast steps produced for each model version.
HORIZON = 12
# Input CSV read by both model versions in this notebook.
DATA_PATH = TRAIN_DATA_PATH_1990S

## Version 1: Diff of Log(PCEPI) with Target Lag and PCA Features
- Log-differenced target (Δlog)
- Top 40 CCF-selected features
- PCA(20) with lagged target + Kalman smoothing
- Includes Fourier terms

### Load and Prepare Data

In [3]:
# V1 input: read the CSV, parse the month/year stamp into `ds`, and expose the
# target as `y_original` before the helper adds its calendar columns.
df = pd.read_csv(DATA_PATH)
df["ds"] = pd.to_datetime(df["observation_date"], format="%m/%Y")
df = add_time_features(df.rename(columns={"fred_PCEPI": "y_original"}), date_col="ds")

# Seasonal harmonics 1-4 of the 12-month cycle, built from the `month` column
# (the helper's log line reports adding year, month and quarter).
for harmonic in range(1, 5):
    phase = 2 * np.pi * harmonic * df["month"] / 12
    df[f"sin_{harmonic}"] = np.sin(phase)
    df[f"cos_{harmonic}"] = np.cos(phase)

2025-04-08 23:10:42,314 - INFO - Added time features: year, month, quarter. DataFrame shape: (408, 363)


### Add Lag, Momentum, Rolling Stats

In [4]:
# Target-derived features for V1: one-step percentage change, its first
# difference ("momentum"), lags 1/6/12 and rolling windows 3/6/12 from the
# project helpers, plus 6-period rolling skewness and kurtosis.
df["pct_change"] = df["y_original"].pct_change()
df["momentum"] = df["pct_change"].diff()
df = add_lagged_features(df, ["y_original"], lags=[1, 6, 12])
df = add_rolling_features(df, "y_original", windows=[3, 6, 12])
for moment in ("skew", "kurt"):
    df[f"y_original_rolling_{moment}6"] = getattr(df["y_original"].rolling(6), moment)()

# Discard every row that has a NaN in any column (lags and rolling windows
# leave gaps at the start of the series).
df = df.dropna()

### Transform Target

In [5]:
# Model target for V1: first difference of log(y_original). The diff leaves a
# NaN in the first remaining row, which the final dropna removes.
df = (
    df.assign(y_log=lambda frame: np.log(frame["y_original"]))
    .assign(y=lambda frame: frame["y_log"].diff())
    .dropna()
)

### Feature Selection and Dimensionality Reduction

In [6]:
# Candidate pool = every column except identifiers and the target variants;
# the price level is re-attached under its raw name so the ranking helper has
# a target column, and the first 40 names it returns are kept.
# NOTE(review): the ranking target is the level (y_original) while the model
# target is the log-difference `y` — confirm this is intended.
# NOTE(review): pct_change, momentum and the rolling skew/kurt columns are
# computed from the same-period y_original and stay in the candidate pool —
# check for target leakage.
exog_df = df.drop(columns=["observation_date", "ds", "y_original", "y_log", "y"]).assign(
    fred_PCEPI=df["y_original"]
)
selected_features = rank_features_ccf(exog_df, targetCol="fred_PCEPI")[:40]

# Standardise, then compress to 20 principal components.
# NOTE(review): scaler and PCA are fitted on all rows of df, including the
# last HORIZON rows that later form the validation window.
X = df[selected_features].copy()
scaled_outputs, scaler = sklearn_fit_transform(X, StandardScaler())
X_scaled = scaled_outputs[0]  # the helper's first return value is indexable; element 0 is used
n_components = 20
pca = PCA(n_components=n_components)
X_pca = pca.fit_transform(X_scaled)

# Label the components PC1..PC20 and let the helper add their lag-1 versions.
pc_names = [f"PC{number}" for number in range(1, n_components + 1)]
X_pca_df = add_lagged_features(
    pd.DataFrame(X_pca, index=df.index, columns=pc_names),
    target_cols=pc_names,
    lags=[1],
)

### Prepare Final Model DataFrame

In [7]:
# Join date + target with the PCA block column-wise, drop rows with gaps
# (the lagged components have none for the first row), then re-index through
# the project helper.
df_model = integer_index(pd.concat([df[["ds", "y"]], X_pca_df], axis=1).dropna())

# Hold out the last HORIZON rows for validation; everything before is training.
train_df, val_df = df_model.iloc[:-HORIZON], df_model.iloc[-HORIZON:]

### Train Model

In [8]:
# Design matrix for V1: every column whose name starts with "PC", followed by
# the target's own lags in the order lag-1, lag-2. The forecast cell relies on
# this column order when it assembles its inputs.
X_cols = [col for col in train_df.columns if col.startswith("PC")]
target_lags = 2
# The first `target_lags` rows have an incomplete lag history, so they are
# trimmed from the target and from both feature blocks.
Y_train = train_df["y"].values[target_lags:]
lagged_target = np.column_stack(
    [train_df["y"].shift(lag).values[target_lags:] for lag in range(1, target_lags + 1)]
)
X_train = np.hstack([train_df[X_cols].iloc[target_lags:].values, lagged_target])

# Tune alpha with forward-chaining folds. An integer `cv` means unshuffled
# K-fold, which scores the early folds with models fitted on later
# observations; TimeSeriesSplit only validates on rows that follow the
# training fold, which matches how the model is used for forecasting.
cv_splitter = TimeSeriesSplit(n_splits=3)
param_grid = {"alpha": [0.001, 0.01, 0.1, 1.0]}
models = {}
for name, reg in {"Lasso": Lasso(max_iter=5000), "Ridge": Ridge()}.items():
    grid = GridSearchCV(reg, param_grid, cv=cv_splitter)
    grid.fit(X_train, Y_train)
    models[name] = grid.best_estimator_  # refit on all of X_train with the best alpha

### Forecast

In [9]:
# Recursive HORIZON-step forecast: each step feeds the model the validation
# row's PCA columns plus the most recent target values, where later steps use
# the model's own earlier predictions as lags.
val_pca = val_df[X_cols].reset_index(drop=True)
prev_y = list(train_df["y"].iloc[-target_lags:])  # oldest -> newest
base_log = df["y_log"].iloc[-(HORIZON + 1)]  # log level of the row just before the validation window

results = {}
for name, model in models.items():
    lag_window = list(prev_y)
    pred_diff = []
    for step in range(HORIZON):
        # Reverse the window so the newest value comes first, matching the
        # lag-1, lag-2 column order used during training.
        features = np.concatenate([val_pca.iloc[step].values, lag_window[::-1]])
        pred = model.predict(features.reshape(1, -1))[0]
        pred_diff.append(pred)
        lag_window = lag_window[1:] + [pred]
    # Accumulate the predicted log-differences onto the base log level, then
    # undo the log to get back to the original scale.
    y_log_forecast = base_log + np.cumsum(pred_diff)
    y_forecast = np.exp(y_log_forecast)
    results[name] = y_forecast

### Kalman Smoothing + Final Forecast

In [10]:
# Choose the V1 forecast with the lowest MAE against the held-out actuals and
# smooth it with an unobserved-components model ('llevel' trend spec).
y_true = df["y_original"].iloc[-HORIZON:].values
mae_by_model = {name: mean_absolute_error(y_true, forecast) for name, forecast in results.items()}
best_model = min(mae_by_model, key=mae_by_model.get)

model_kalman = UnobservedComponents(results[best_model], level='llevel')
res_kalman = model_kalman.fit(disp=False)
smoothed_v1 = res_kalman.smoothed_state[0]  # first row of the smoothed state array

## Version 2: Simpler log(PCEPI) Regression with PCA + Kalman
- Target: log-transformed PCEPI (no differencing)
- Features: 30 top CCF-selected exogenous features
- PCA(12) dimensionality reduction (fewer components than v1)
- No lagged target in regression
- Simple prediction + Kalman smoothing

### Data Loading + Feature Engineering

In [11]:
# V2 starts from a fresh read of the same CSV; here the target column is
# simply named `y`.
df = pd.read_csv(DATA_PATH)
df["ds"] = pd.to_datetime(df["observation_date"], format="%m/%Y")
df = add_time_features(df.rename(columns={"fred_PCEPI": "y"}), date_col="ds")

# Same target-derived features as V1: percentage change, its difference,
# helper-built lags (1/6/12) and rolling windows (3/6/12), and 6-period
# rolling skewness / kurtosis.
df["pct_change"] = df["y"].pct_change()
df["momentum"] = df["pct_change"].diff()
df = add_lagged_features(df, ["y"], lags=[1, 6, 12])
df = add_rolling_features(df, "y", windows=[3, 6, 12])
for moment in ("skew", "kurt"):
    df[f"y_rolling_{moment}6"] = getattr(df["y"].rolling(6), moment)()

# Drop rows with any NaN, then add the V2 target: log of the level, with no
# differencing.
df = df.dropna().assign(y_log=lambda frame: np.log(frame["y"]))

2025-04-08 23:10:56,550 - INFO - Added time features: year, month, quarter. DataFrame shape: (408, 363)


### Feature Selection + PCA for V2

In [12]:
# Defensive second pass so the ranking helper receives no NaNs.
df = df.dropna()

# Candidate pool = every column except identifiers and the target variants;
# log(y) is attached as `target` and the first 30 ranked names are kept.
# NOTE(review): pct_change, momentum and the rolling skew/kurt columns are
# computed from the same-period `y` and stay in the candidate pool — check
# for target leakage.
exog_df = df.drop(columns=["observation_date", "ds", "y", "y_log"]).assign(target=df["y_log"])
selected_features = rank_features_ccf(exog_df, targetCol="target")[:30]

# Standardise, then compress to 12 principal components.
# NOTE(review): scaler and PCA are fitted on all rows of df, including the
# last HORIZON rows that later form the validation window.
X = df[selected_features].copy()
scaled_outputs, scaler = sklearn_fit_transform(X, StandardScaler())
X_scaled = scaled_outputs[0]  # the helper's first return value is indexable; element 0 is used

n_components = 12
pca = PCA(n_components=n_components)
X_pca = pca.fit_transform(X_scaled)

# Label the components PC1..PC12 and let the helper add their lag-1 versions.
pc_names = [f"PC{number}" for number in range(1, n_components + 1)]
X_pca_df = add_lagged_features(
    pd.DataFrame(X_pca, index=df.index, columns=pc_names),
    pc_names,
    lags=[1],
)

### Prepare Model DF + Train/Val Split

In [13]:
# Join date + log target with the PCA block column-wise, drop rows with gaps,
# then re-index through the project helper.
df_model = integer_index(pd.concat([df[["ds", "y_log"]], X_pca_df], axis=1).dropna())

# Hold out the last HORIZON rows for validation; everything before is training.
train_df, val_df = df_model.iloc[:-HORIZON], df_model.iloc[-HORIZON:]

# Regressors are the columns whose name starts with "PC"; the target is the
# log level.
X_cols = [name for name in train_df.columns if name.startswith("PC")]
X_train = train_df[X_cols].values
Y_train = train_df["y_log"].values

### Train Lasso + Ridge Models (No Target Lags)

In [14]:
# Fit Lasso and Ridge on the PCA columns only (no explicit target-lag block),
# tuning alpha by grid search.
# Forward-chaining folds are used because the rows are time-ordered: an
# integer `cv` means unshuffled K-fold, which scores the early folds with
# models fitted on later observations, whereas TimeSeriesSplit only validates
# on rows that follow the training fold.
cv_splitter = TimeSeriesSplit(n_splits=3)
param_grid = {"alpha": [0.001, 0.01, 0.1, 1.0]}
models = {}
for name, reg in {"Lasso": Lasso(max_iter=5000), "Ridge": Ridge()}.items():
    grid = GridSearchCV(reg, param_grid, cv=cv_splitter)
    grid.fit(X_train, Y_train)
    models[name] = grid.best_estimator_  # refit on all of X_train with the best alpha

### Forecast

In [15]:
# One-shot forecast: predict the log level for all validation rows at once
# and exponentiate back to the original scale (no recursion needed because V2
# has no explicit target-lag inputs).
val_X = val_df[X_cols].values
results = {name: np.exp(model.predict(val_X)) for name, model in models.items()}

### Kalman Smoothing on V2 Prediction

In [16]:
# Choose the V2 forecast with the lowest MAE against the last HORIZON actual
# values, then smooth it with a local-level unobserved-components model.
val_actual = df["y"].iloc[-HORIZON:]
mae_by_model = {name: mean_absolute_error(val_actual, forecast) for name, forecast in results.items()}
best_model = min(mae_by_model, key=mae_by_model.get)

model_kalman = UnobservedComponents(results[best_model], level='local level')
res_kalman = model_kalman.fit(disp=False)
smoothed_v2 = res_kalman.smoothed_state[0]  # first row of the smoothed state array

### Why Version 2 Was Better

- V2 outperformed V1 and V3 in both RMSE and trend alignment.
- It uses fewer features (30 vs. 40) and avoids differencing, making it more interpretable.
- It omits the recursive lagged-target forecast and uses a simpler pipeline with lower variance.
- Final smoothed predictions track inflation trend well with minimal overshooting.

## Version 3: Added month dummies + seasonality to v2
- Adding these terms led to overfitting.
- It performed worse than V2 on all metrics.
- It is omitted from execution because V2 performs better.

## Save Final ARDL_v2 Predictions

In [17]:
# Persist the smoothed V2 forecast: a .npy array in the project's Predictions
# folder and a CSV that pairs it with the dates and actual values.
predictions_dir = os.path.join("..", "..", "Predictions")
# np.save does not create missing parent folders, so make sure the target
# directory exists (this is a no-op when it is already there).
os.makedirs(predictions_dir, exist_ok=True)
np.save(os.path.join(predictions_dir, "ARDL.npy"), smoothed_v2)

# NOTE(review): the CSV is written to the current working directory, not to
# the Predictions folder — confirm this is intended.
pd.DataFrame({
    "observation_date": df["ds"].iloc[-HORIZON:].dt.strftime("%m/%Y"),
    "ground_truth": df["y"].iloc[-HORIZON:].values,
    "ARDL_v2": smoothed_v2
}).to_csv("ARDL.csv", index=False)

print("Final ARDL_v2 saved as best performing version")

Final ARDL_v2 saved as best performing version
