In [None]:
import sys
import os

# Resolve the repository root (two directory levels above the current working
# directory) and make it importable so project packages can be imported below.
project_root = os.path.abspath(os.path.join(os.getcwd(), os.pardir, os.pardir))
sys.path.append(project_root)
print("Project root added to sys.path:", project_root)

# Output locations: one for the serialized model, one for saved predictions.
model_save_path = os.path.join(project_root, 'Training', 'Random_Forest')
predictions_save_path = os.path.join(project_root, 'Predictions')

# Create both output directories up front; existing directories are left alone.
for output_dir in (model_save_path, predictions_save_path):
    os.makedirs(output_dir, exist_ok=True)


In [None]:
import numpy as np
import pandas as pd

from sklearn.ensemble import RandomForestRegressor

In [None]:
from Training.Helper.dataPreprocessing import TRAIN_DATA_PATH_1990S, TRAIN_DATA_SPLIT

# Column names shared by the rest of the notebook.
date_col = 'observation_date'
target_col = 'fred_PCEPI'

# Read the training CSV, parsing the date column with the month/year format.
train_raw = pd.read_csv(TRAIN_DATA_PATH_1990S, parse_dates=[date_col], date_format='%m/%Y')

# Keep observations from January 1990 onwards, put them in chronological
# order and renumber the rows 0..n-1.
from_1990 = train_raw[date_col] >= '1990-01-01'
df = (
    train_raw.loc[from_1990]
    .sort_values(by=date_col)
    .reset_index(drop=True)
)


In [None]:
# 1) Autoregressive features: lag_k holds the target value from k rows earlier.
n_lags = 12
lag_cols = [f'lag_{lag}' for lag in range(1, n_lags + 1)]
for lag, lag_name in enumerate(lag_cols, start=1):
    df[lag_name] = df[target_col].shift(lag)

# 2) Exogenous regressors used alongside the lags.
exog_cols = [
    'fred_AHETPI',
    'fred_GDP',
    'fred_PCUOMFGOMFG',
    'fred_A053RC1Q027SBEA',
    'fred_PPIACO',
    'fred_TERMCBPER24NS',
]

# 3) Discard every row that is missing the target, an exogenous value or any
#    lag (the first n_lags rows always lack at least one lag), then renumber.
all_required_cols = [target_col, *exog_cols, *lag_cols]
df = df.dropna(subset=all_required_cols).reset_index(drop=True)


In [None]:
# Model inputs: the lag columns followed by the exogenous columns.
lag_feature_names = [f'lag_{lag}' for lag in range(1, n_lags + 1)]
feature_cols = lag_feature_names + exog_cols

X = df[feature_cols]
y = df[target_col]

# Chronological split without shuffling: the first TRAIN_DATA_SPLIT fraction of
# rows is used for training and the remainder for validation
# (presumably 80/20 — confirm against the TRAIN_DATA_SPLIT constant).
split_index = int(len(X) * TRAIN_DATA_SPLIT)
X_train, X_val = X.iloc[:split_index], X.iloc[split_index:]
y_train, y_val = y.iloc[:split_index], y.iloc[split_index:]

In [None]:
import copy
# Fit a 100-tree forest on the training split only; the fixed seed keeps the
# run reproducible. fit() returns the estimator itself, so it can be chained.
rf_model = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)

# Keep an independent snapshot of the train-only model: rf_model is refit on
# train+validation data further down the notebook.
fitted_rf_model = copy.deepcopy(rf_model)

In [None]:
# Predict with the snapshot taken right after fitting on the training split.
# rf_model itself is refit on train+validation data later in the notebook, so
# using it here would leak validation labels into y_val_pred whenever this cell
# is re-run after that refit. The snapshot gives the same result on a clean
# top-to-bottom run and stays correct under out-of-order execution.
y_train_pred = fitted_rf_model.predict(X_train)
y_val_pred = fitted_rf_model.predict(X_val)

In [None]:
from Evaluation.Helper.evaluation_helpers import calc_metrics_arrays

# Validation-set metrics. Actuals and predictions are reshaped to single-column
# 2-D arrays before being handed to the project helper.
val_actual = y_val.values.reshape(-1, 1)
val_predicted = y_val_pred.reshape(-1, 1)
calc_metrics_arrays(val_actual, val_predicted, model_names=['Random Forest X validation'])

In [None]:
# Stitch the training and validation splits back together (fresh 0..n-1 index).
X_full, y_full = (
    pd.concat(parts, ignore_index=True)
    for parts in ([X_train, X_val], [y_train, y_val])
)

# Refit on train+validation data. Predicting on the same rows the model was
# just fitted on is "cheating" for the validation part, so these in-sample
# predictions are not an honest estimate of validation performance.
rf_model.fit(X_full, y_full)

# In-sample predictions over the full (train+val) range.
y_full_pred = rf_model.predict(X_full)

In [None]:
# Replace the 'cheating' in-sample predictions with the ones produced by the
# model that was fitted on the training data only (train part, then val part).
y_full_pred = np.concatenate([y_train_pred, y_val_pred], axis=0)

In [None]:
# Metrics over the whole train+validation range, again passing single-column
# 2-D arrays to the project helper.
full_actual = y_full.values.reshape(-1, 1)
full_predicted = y_full_pred.reshape(-1, 1)
calc_metrics_arrays(full_actual, full_predicted, model_names=['Random Forest X full'])

In [None]:
# Actual target values for train followed by validation, as one series.
y_all = pd.concat([y_train, y_val], ignore_index=True)

# Matching observation dates, looked up in df through each split's row labels
# so they line up with y_all in the same order.
dates_train, dates_val = (df.loc[part.index, date_col] for part in (X_train, X_val))
dates_all = pd.concat([dates_train, dates_val], ignore_index=True)

In [None]:
from Evaluation.Helper.evaluation_helpers import display_results

# Show actual vs. predicted values over the train+validation range via the
# project helper; the last argument is the label passed for this model.
display_results(y_all, y_full_pred, dates_all, 'Random Forest + Exogenous Variables')

In [None]:
from Training.Helper.dataPreprocessing import TEST_DATA_PATH_1990S, TRAIN_DATA_PATH_1990S

#TODO: Make the below data preprocessing into a function to avoid this code duplication
date_col   = 'observation_date'
target_col = 'fred_PCEPI'

# 1) Load the test data in chronological order
df = pd.read_csv(TEST_DATA_PATH_1990S, parse_dates=[date_col], date_format='%m/%Y')

df = df.sort_values(by=date_col).reset_index(drop=True)

# 2) Create 12 lag features for the target.
#    Computing the lags on the test frame alone leaves the first n_lags rows
#    with NaN lags (every row when the test period is only 12 months). Seed the
#    lags with the most recent target values that precede the test period,
#    taken from the training file, so the model sees real history instead.
#    NOTE(review): assumes the training file holds consecutive observations
#    ending right before the test period — confirm. If no earlier history is
#    found, this falls back to the previous behaviour (NaN lags).
#    Later test rows still use realised test-period targets as lags, as before.
n_lags = 12
history = pd.read_csv(TRAIN_DATA_PATH_1990S, parse_dates=[date_col], date_format='%m/%Y')
history = history[history[date_col] < df[date_col].min()].sort_values(by=date_col)
history_target = history[target_col].tail(n_lags)
n_history = len(history_target)

extended_target = pd.concat([history_target, df[target_col]], ignore_index=True)
for lag in range(1, n_lags + 1):
    # Shift over history + test, then keep only the positions of the test rows.
    df[f'lag_{lag}'] = extended_target.shift(lag).iloc[n_history:].to_numpy()

# 3) Exogenous features you identified:
exog_cols = [
    'fred_AHETPI',
    'fred_GDP',
    'fred_PCUOMFGOMFG',
    'fred_A053RC1Q027SBEA',
    'fred_PPIACO',
    'fred_TERMCBPER24NS'
]

# 4) Do not drop rows here: every test row must receive a prediction.

# Combine the lag features + exogenous columns
feature_cols = [f'lag_{lag}' for lag in range(1, n_lags + 1)] + exog_cols

X = df[feature_cols]
y = df[target_col]

In [None]:
# Predict the target for every test-set row. When the notebook is run top to
# bottom, rf_model at this point is the model refit on train+validation data.
test_prediction = rf_model.predict(X)

In [None]:
# Have a look at what the model predicted on the test set.
# NOTE: scikit-learn forests do not treat NaN inputs as 0. Depending on the
# installed version, NaN features are either rejected with an error at predict
# time or routed through each tree by a rule fixed at fit time, so predictions
# for any row whose lag features are NaN should be considered unreliable.
display_results(y, test_prediction, df[date_col], 'Random Forest X')

In [None]:
import joblib

# 1) Save final predictions
# The array written here is the test-set prediction, so the log message says
# so (it previously claimed these were the combined train+val predictions).
output_path = os.path.join(predictions_save_path, "Random_Forest_X.npy")
np.save(output_path, test_prediction)
print("Test-set multivariate predictions saved to:", output_path)


In [None]:
# 2) Persist the Random Forest estimator itself with joblib, next to the other
#    training artefacts, and report where it was written.
model_filename = os.path.join(model_save_path, 'Random_Forest_X.pkl')
joblib.dump(rf_model, filename=model_filename)
print("Random Forest multivariate model saved to:", model_filename)