In [None]:
#allows imports from other folders in project
import os
import sys
# Resolve the parent of the current working directory and register it on the
# import path (once), so packages located one level up can be imported.
module_path = os.path.abspath(os.pardir)
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

# ARDL model

### Load Data

In [None]:
import pandas as pd
import numpy as np
from Helper.dataPreprocessing import TRAIN_DATA_PATH_1990S

# Read the training CSV from the path constant provided by the project helpers
raw_df = pd.read_csv(TRAIN_DATA_PATH_1990S)

# Parse the observation dates and promote them to the index in one chained step
df = (
    raw_df
    .assign(observation_date=pd.to_datetime(raw_df["observation_date"]))
    .set_index("observation_date")
)

### Data Preprocessing

In [None]:
from Helper.dataPreprocessing import add_modified_feature

# Name of the series to forecast
target_col = 'fred_PCEPI'

# Add an np.log-transformed version of the target to the frame.
# The new column's name is recovered as the last column of the result;
# this assumes the helper appends the new column at the end — TODO confirm.
df = add_modified_feature(df, target_col, np.log)
log_col = df.columns.tolist()[-1]

In [None]:
from Helper.dataPreprocessing import best_lag_selection, train_val_test_split

# Univariate setup: the log-transformed target is passed as the model input and
# the untransformed target as the evaluation series. No validation split is
# requested (val_size=0), so the two validation outputs are discarded.
train_X, train_y, _, _, test_X, test_y = train_val_test_split(
    df[log_col],
    df[target_col],
    train_size=0.9,
    val_size=0,
)

for split_label, split_X in (("Train set size", train_X), ("Test  set size", test_X)):
    print(f"{split_label}: {len(split_X)}")

# Select the lag order from the training series only
best_lag = best_lag_selection(train_X, max_lags=12)
print(f"Selected best lag (TRAIN only) for ARDL in levels: {best_lag}")

### Define and Fit ARDL Model

In [None]:
from statsmodels.tsa.ardl import ARDL

# No exogenous regressors in this section, so the specification relies on the
# selected autoregressive lags plus trend='ct' deterministic terms.
ardl_spec = ARDL(endog=train_X, lags=best_lag, trend='ct')
final_model = ardl_spec.fit()
print(final_model.summary())

### Make Predictions

In [None]:
from pykalman import KalmanFilter

# Forecast window in log-levels: from the first to the last test observation
start_idx, end_idx = test_X.index[0], test_X.index[-1]

# Predict over that window and label the result with the test index
pred_log_test = pd.Series(
    final_model.predict(start=start_idx, end=end_idx),
    index=test_X.index,
)

# Undo the log transform: "raw" ARDL forecast in original PCEPI scale
predictions_raw = np.exp(pred_log_test)

# Kalman Filter on the test forecast: the filter is initialised at the first
# forecast value, its parameters are fitted with 5 EM iterations on the
# forecast itself, and the filter is then run over the same series.
kf = KalmanFilter(initial_state_mean=predictions_raw.iloc[0], n_dim_obs=1).em(
    predictions_raw, n_iter=5
)

predictions_smoothed_arr, _ = kf.filter(predictions_raw)
predictions_smoothed = pd.Series(
    predictions_smoothed_arr.flatten(),
    index=predictions_raw.index,
)

### Evaluate

In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error
import matplotlib.pyplot as plt
# Evaluation: compare the raw and Kalman-filtered forecasts against the
# untransformed test target.
y_true = test_y

# Put both forecast series on the index of the actual values
y_pred_raw, y_pred_smoothed = (
    predictions_raw.reindex(y_true.index),
    predictions_smoothed.reindex(y_true.index),
)

# Error metrics for the RAW forecast
mae_raw, rmse_raw = (
    mean_absolute_error(y_true, y_pred_raw),
    np.sqrt(mean_squared_error(y_true, y_pred_raw)),
)

# Error metrics for the SMOOTHED forecast
mae_smooth, rmse_smooth = (
    mean_absolute_error(y_true, y_pred_smoothed),
    np.sqrt(mean_squared_error(y_true, y_pred_smoothed)),
)

print("\n=== Evaluation on Test Set ===")
print(f"RAW Forecast  =>  MAE = {mae_raw:.4f},  RMSE = {rmse_raw:.4f}")
print(f"Smoothed      =>  MAE = {mae_smooth:.4f},  RMSE = {rmse_smooth:.4f}")

# Plot actual values against both forecasts on a single axis
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(y_true.index, y_true, label="Actual (Test)", marker="o")
ax.plot(y_pred_raw.index, y_pred_raw, label="Predicted (Raw)", marker="*")
ax.plot(
    y_pred_smoothed.index,
    y_pred_smoothed,
    label="Predicted (Smoothed)",
    linestyle="--",
)
ax.legend()
ax.set_title("ARDL (Levels) + Kalman Filter: RAW vs. SMOOTHED Predictions")
plt.show()

# ARDL Model with Exogenous Variables to Capture Trends

### Load Data

In [None]:
# Reload the training data so this section starts from a frame that does not
# carry the log column added to `df` in the previous section.
# The shared path constant (imported for the first section) replaces the
# hard-coded relative path, so both sections read from one configured location.
# NOTE(review): assumes TRAIN_DATA_PATH_1990S resolves to "../Data/train/train1990s.csv" — confirm.
file_path = TRAIN_DATA_PATH_1990S
df = pd.read_csv(file_path)

# Convert to datetime and set as index
df["observation_date"] = pd.to_datetime(df["observation_date"])
df = df.set_index("observation_date")

### Data Preprocessing

In [None]:
# Name of the series to forecast
target_col = 'fred_PCEPI'

# Add an np.log-transformed version of the target to the freshly loaded frame.
# The new column's name is recovered as the last column of the result;
# this assumes the helper appends the new column at the end — TODO confirm.
df = add_modified_feature(df, target_col, np.log)
log_col = df.columns.tolist()[-1]

In [None]:
# Every column except the target and its log transform is an exogenous input
exogenous_columns = [
    name for name in df.columns
    if name != log_col and name != target_col
]

# Same split settings as the univariate section: no validation split is
# requested (val_size=0), so the two validation outputs are discarded.
train_X, train_y, _, _, test_X, test_y = train_val_test_split(
    df[exogenous_columns],
    df[target_col],
    train_size=0.9,
    val_size=0,
)

for split_label, split_X in (("Train set size", train_X), ("Test  set size", test_X)):
    print(f"{split_label}: {len(split_X)}")

In [None]:
from Helper.dataPreprocessing import drop_near_constant_cols

# Screen the TRAIN split for near-constant columns (threshold 1e-6) and remove
# whatever the helper reports from the TEST split as well, so both splits keep
# the same columns. (Original author's note: none are dropped in the current dataset.)
train_X_clean, dropped_cols = drop_near_constant_cols(train_X, threshold=1e-6)
test_X_clean = test_X.drop(columns=dropped_cols)

print(f"\nDropped near-constant exogenous columns: {dropped_cols}")
print(f"Number of remaining exogenous columns after drop: {train_X_clean.shape[1]}")

In [None]:
from Helper.dataPreprocessing import sklearn_fit_transform
from sklearn.decomposition import PCA

# Number of principal components to keep from the exogenous columns
NUM_COMPONENTS = 10

# NOTE(review): sklearn_fit_transform presumably fits the transformer on the
# train split only and applies it to both splits — confirm in Helper.dataPreprocessing.
train_exog, test_exog = sklearn_fit_transform(
    train_X_clean,
    test_X_clean,
    PCA,
    n_components=NUM_COMPONENTS,
)

In [None]:
from Helper.dataPreprocessing import add_lagged_features

# Build the lag-1 features on the concatenated train+test frame, then split it
# back by index. This lets the first test row take its lagged values from the
# last training row instead of being dropped as NaN.
# Lagging the two splits separately discards the first test observation, which
# leaves a one-period gap: the forecast step starts immediately after the last
# training row, but the first available test row would be one period later.
# NOTE(review): assumes the test rows directly follow the train rows in time and
# that add_lagged_features performs a plain row shift — confirm against the helpers.
full_exog = pd.concat([train_exog, test_exog])
full_exog_lags = add_lagged_features(full_exog, list(train_exog.columns), lags=[1])

# Rows that still contain NaN (e.g. the very first row, which has no predecessor) are dropped
train_exog_lags = full_exog_lags.loc[train_exog.index].dropna()
test_exog_lags = full_exog_lags.loc[test_exog.index].dropna()

# Keep the targets aligned with the rows that survived.
# The train target is on the log scale (the model is fitted in logs); the test
# target stays on the original scale because forecasts are exponentiated before evaluation.
train_target = df[log_col].loc[train_exog_lags.index]
test_target = test_y.loc[test_exog_lags.index]

print(f"\nAfter adding 1 lag, train exog shape: {train_exog_lags.shape}")
print(f"Train target shape: {train_target.shape}")
print(f"Test exog shape: {test_exog_lags.shape}")
print(f"Test target shape: {test_target.shape}")

In [None]:
from Helper.dataPreprocessing import best_lag_selection

# Select the lag order from the log-scale training target only
best_lag = best_lag_selection(
    train_target,
    max_lags=12,
    verbose=True,
)

In [None]:
from Helper.dataPreprocessing import integer_index

# Switch train and test inputs/targets from date labels to integer positions.
# With a date index ARDL tries to forecast over the whole date range; with
# 0-based integer indices the forecast yields exactly one value per row of
# test_exog_lags.
# integer_index takes a list of frames/series and returns them converted, in the same order.
(
    train_exog_lags_int,
    test_exog_lags_int,
    train_target_int,
    test_target_int,
) = integer_index([train_exog_lags, test_exog_lags, train_target, test_target])

### Define and Fit ARDL Model

In [None]:
# Specify the ARDL on the integer-indexed training data: log target as endog,
# lagged PCA components as exog, selected lag order, trend='ct' deterministic terms.
ardl_spec = ARDL(
    endog=train_target_int,
    exog=train_exog_lags_int,
    lags=best_lag,
    trend='ct',
)
final_model = ardl_spec.fit()
print(final_model.summary())

### Make Predictions

In [None]:
# Out-of-sample forecast on the integer index: the window begins right after
# the last training row and has one step per row of test_exog_lags_int.
n_forecast_steps = len(test_exog_lags_int)
start_i = len(train_exog_lags_int)
end_i = start_i + n_forecast_steps - 1

pred_log_test = final_model.predict(
    start=start_i,
    end=end_i,
    exog_oos=test_exog_lags_int,  # one exogenous row per forecast step
)

# The forecast comes back labelled start_i..end_i; relabel it with the test
# frame's own index so it lines up with the test target.
pred_log_test.index = test_exog_lags_int.index

# Undo the log transform to return to the original scale
predictions_raw = np.exp(pred_log_test)

In [None]:
# Kalman Filter: initialise at the first forecast value, fit the filter
# parameters with 5 EM iterations on the forecast, then run the filter over it.
kf = KalmanFilter(initial_state_mean=predictions_raw.iloc[0], n_dim_obs=1).em(
    predictions_raw, n_iter=5
)
predictions_smoothed_arr, _ = kf.filter(predictions_raw)
predictions_smoothed = pd.Series(
    predictions_smoothed_arr.flatten(),
    index=predictions_raw.index,
)

### Evaluate

In [None]:
# Evaluation: actual test values, placed on the same integer index as the forecasts
y_true = test_target_int.reindex(test_exog_lags_int.index)

# Error metrics for the raw forecast
mae_raw, rmse_raw = (
    mean_absolute_error(y_true, predictions_raw),
    np.sqrt(mean_squared_error(y_true, predictions_raw)),
)

# Error metrics for the Kalman-filtered forecast
mae_smooth, rmse_smooth = (
    mean_absolute_error(y_true, predictions_smoothed),
    np.sqrt(mean_squared_error(y_true, predictions_smoothed)),
)

print("\n=== Evaluation on Test Set ===")
print(f"RAW Predictions       => MAE = {mae_raw:.4f},  RMSE = {rmse_raw:.4f}")
print(f"Smoothed Predictions  => MAE = {mae_smooth:.4f}, RMSE = {rmse_smooth:.4f}")

# Plot actual values against both forecasts on a single axis
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(y_true.index, y_true, label="Actual (Test)", marker="o")
ax.plot(predictions_raw.index, predictions_raw, label="Predicted (Raw)", marker="*")
ax.plot(
    predictions_smoothed.index,
    predictions_smoothed,
    label="Predicted (Smoothed)",
    linestyle="--",
)
ax.legend()
ax.set_title("ARDL + PCA + 1-Lag PCA, with Integer-Based Forecasting (No Data Leakage)")
plt.show()