In [None]:
import sys
import os

# The project root is taken to be two directory levels above the current
# working directory; it is put on sys.path so project modules can be imported.
project_root = os.path.abspath(os.path.join(os.getcwd(), "../.."))
# Guard the append so re-running this cell does not pile up duplicate
# sys.path entries in a long-lived kernel.
if project_root not in sys.path:
    sys.path.append(project_root)

print("Project root added to sys.path:", project_root)

# Folder that receives the fitted model; created if it does not exist yet.
model_save_path = os.path.join(project_root, 'Training', 'Random_Forest')
os.makedirs(model_save_path, exist_ok=True)

# Folder that receives the saved predictions; created if it does not exist yet.
predictions_save_path = os.path.join(project_root, 'Predictions')
os.makedirs(predictions_save_path, exist_ok=True)


In [None]:
import numpy as np
import pandas as pd

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

import matplotlib.pyplot as plt


In [None]:
# Location of the 1990s training CSV inside the project tree
TRAIN_DATA_PATH_1990S = os.path.join(project_root, 'Data', 'Train', 'train1990s.csv')

date_col, target_col = 'observation_date', 'fred_PCEPI'

# Read the CSV with the date column parsed as month/year, keep only rows dated
# 1990-01-01 or later, then sort by date and renumber the index from 0.
df = pd.read_csv(TRAIN_DATA_PATH_1990S, parse_dates=[date_col], date_format='%m/%Y')
df = (
    df.loc[df[date_col] >= '1990-01-01']
    .sort_values(by=date_col)
    .reset_index(drop=True)
)


In [None]:
# How many earlier target values are exposed as features
n_lags = 12
lag_names = [f'lag_{k}' for k in range(1, n_lags + 1)]

# lag_k holds the target value found k rows earlier; the first k rows are NaN
df = df.assign(**{
    name: df[target_col].shift(k)
    for k, name in enumerate(lag_names, start=1)
})

# Exogenous series used alongside the lags
exog_cols = [
    'fred_AHETPI', 'fred_GDP', 'fred_PCUOMFGOMFG',
    'fred_A053RC1Q027SBEA', 'fred_PPIACO', 'fred_TERMCBPER24NS',
]

# Discard every row that is missing the target, any exogenous value, or any
# lag, then renumber the index from 0. dropna raises KeyError if one of the
# listed columns is absent from the frame.
all_required_cols = [target_col, *exog_cols, *lag_names]
df = df.dropna(subset=all_required_cols).reset_index(drop=True)


In [None]:
# Model inputs: the lagged target columns first, then the exogenous series
feature_cols = [f'lag_{k}' for k in range(1, n_lags + 1)]
feature_cols.extend(exog_cols)

X, y = df[feature_cols], df[target_col]

# Order-preserving hold-out: the first 80% of rows are used for training and
# the remaining 20% for validation; nothing is shuffled.
# NOTE(review): this is only a chronological split if df is already sorted by date.
split_index = int(0.8 * len(X))
X_train, X_val = X.iloc[:split_index], X.iloc[split_index:]
y_train, y_val = y.iloc[:split_index], y.iloc[split_index:]


In [None]:
# Random forest with 100 trees and a fixed seed, fitted on the training slice
rf_model = RandomForestRegressor(random_state=42, n_estimators=100)
rf_model.fit(X=X_train, y=y_train)


In [None]:
# Predict the validation rows with the current rf_model
y_val_pred = rf_model.predict(X_val)

# RMSE is the square root of the mean squared error; R² comes from r2_score
val_mse = mean_squared_error(y_val, y_val_pred)
val_rmse = np.sqrt(val_mse)
val_r2 = r2_score(y_val, y_val_pred)

print(
    f"Validation RMSE: {val_rmse:.4f}",
    f"Validation R²:   {val_r2:.4f}",
    sep="\n",
)


In [None]:
# Stitch the training and validation partitions back together, renumbering
# the index from 0.
X_full, y_full = (
    pd.concat(parts, ignore_index=True)
    for parts in ([X_train, X_val], [y_train, y_val])
)

# Refit the same estimator object on all rows; this replaces the fit that was
# obtained from the training slice alone.
rf_model.fit(X_full, y_full)

# NOTE(review): the model is scored on the very rows it was just fitted on, so
# the numbers below describe in-sample fit, not hold-out performance.
y_full_pred = rf_model.predict(X_full)

full_mse = mean_squared_error(y_full, y_full_pred)
full_rmse = np.sqrt(full_mse)
full_r2 = r2_score(y_full, y_full_pred)

print(
    f"Final Combined RMSE: {full_rmse:.4f}",
    f"Final Combined R²:   {full_r2:.4f}",
    sep="\n",
)


In [None]:
# Actual target values: training part followed by validation part, renumbered from 0
y_all = pd.concat([y_train, y_val], ignore_index=True)

# Dates for those same rows, looked up in df through each partition's index labels
dates_train, dates_val = (
    df.loc[part.index, date_col] for part in (X_train, X_val)
)
dates_all = pd.concat([dates_train, dates_val], ignore_index=True)

# Overlay the observed series and the model's predictions on one set of axes
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(dates_all, y_all, label='Actual PCEPI')
ax.plot(dates_all, y_full_pred, label='RF (Multivariate) Prediction')
ax.set_xlabel('Date')
ax.set_ylabel('PCEPI')
ax.set_title('Random Forest + Exogenous Variables (Train + Val)')
ax.legend()
plt.show()


In [None]:
import joblib

# 1) Write the combined (train+val) prediction array to a .npy file in the
#    predictions folder
output_path = os.path.join(predictions_save_path, "Random_Forest_X.npy")
np.save(file=output_path, arr=y_full_pred)
print("Combined (train+val) multivariate predictions saved to:", output_path)


In [None]:
# 2) Serialise the fitted estimator with joblib into the model folder
model_filename = os.path.join(model_save_path, 'Random_Forest_X.pkl')
joblib.dump(value=rf_model, filename=model_filename)
print("Random Forest multivariate model saved to:", model_filename)