In [None]:
import sys
import os
# Resolve the project root as the parent of the current working directory
# (adjust the relative path if the notebook is moved).
project_root = os.path.abspath("..")
# Register it on sys.path only once: re-running this cell must not keep
# appending duplicate entries.
if project_root not in sys.path:
    sys.path.append(project_root)

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
import torch.optim as optim
from sklearn.preprocessing import MinMaxScaler
import pickle
import time
import glob
from tqdm.autonotebook import tqdm
from EarlyStopping import EarlyStopping  # Ensure EarlyStopping is available
from Training.PyTorchModular import train_model, loss_curve
from sklearn.metrics import mean_squared_error
# Now import the RNN model
from Models.RNN import RNNModel

### Create / Clean Model Weight Directory

In [None]:
import shutil

# Directory that receives the RNN checkpoints; created on demand.
model_save_path = os.path.join('..', 'Models', 'Weights', 'RNN')
os.makedirs(model_save_path, exist_ok=True)

# Start from a clean slate: remove every regular file and symlink that sits
# directly inside the weights directory. Sub-directories are left untouched.
for entry_name in os.listdir(model_save_path):
    entry_path = os.path.join(model_save_path, entry_name)
    is_removable = os.path.isfile(entry_path) or os.path.islink(entry_path)
    if not is_removable:
        continue
    os.unlink(entry_path)

### Load Data

In [None]:

# The training CSV lives in ../Data/Train relative to the working directory.
train_file = os.path.join(os.path.join('..', 'Data', 'Train'), 'train1990s.csv')

# Read the raw training data into a DataFrame.
train_df = pd.read_csv(train_file)

In [None]:
# Column names used below: the CSV's date column and the series that is
# scaled and fed to the model.
date_col, target_col = 'observation_date', 'fred_PCEPI'

In [None]:
from dataPreprocessing import minMaxScale

# Scale the target column on its own (univariate setup, no exogenous inputs).
# NOTE(review): minMaxScale is a project helper; it is assumed to return an
# array of scaled values with the same layout as its input -- confirm in
# dataPreprocessing.
target_values = train_df[[target_col]].values
train_X_scaled = minMaxScale(target_values)

# Collapse the scaled values into a 1-D series for sequence building.
train_series = train_X_scaled.flatten()

In [None]:
from dataPreprocessing import create_sequences

# **Set Sequence Length**
# Window length handed to create_sequences.
sequence_length = 12
# Create sequences from the training series. The same series is passed twice;
# presumably the first argument supplies the input windows and the second the
# targets -- TODO confirm against dataPreprocessing.create_sequences.
X, y = create_sequences(train_series, train_series, sequence_length)

In [None]:
from dataPreprocessing import train_val_test_split

# **Train-Validation Split (80% Train, 20% Validation)**
# The helper returns six values; the last two are discarded here. They are
# presumably the test split, which would be empty since train_size + val_size
# is 1.0 -- TODO confirm in dataPreprocessing.train_val_test_split.
X_train, y_train, X_val, y_val, _, _ = train_val_test_split(X, y, train_size=0.8, val_size=0.2)

In [None]:
# Turn each split into a float32 tensor and append a trailing singleton
# dimension (unsqueeze(-1)).
X_train_tensor, y_train_tensor, X_val_tensor, y_val_tensor = (
    torch.tensor(split_array, dtype=torch.float32).unsqueeze(-1)
    for split_array in (X_train, y_train, X_val, y_val)
)

# Mini-batch size shared by both loaders.
batch_size = 32

# Training batches are reshuffled on every pass over the data.
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

# Validation batches keep their original order.
val_dataset = TensorDataset(X_val_tensor, y_val_tensor)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

### Train Model

In [None]:
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Build the RNN (input_size=1, hidden_size=64, num_layers=2) and move it to
# the selected device.
rnn_model = RNNModel(input_size=1, hidden_size=64, num_layers=2)
rnn_model = rnn_model.to(device)

# Mean-squared-error objective, optimised with Adam.
criterion = nn.MSELoss()
optimizer = optim.Adam(rnn_model.parameters(), lr=0.001)

# Hand everything to the project's training routine. The returned object is
# indexed later with "trainLoss" and "validLoss" to plot the loss history.
training_options = dict(
    model=rnn_model,
    maxEpochs=50,
    modelSavePath=model_save_path,
    modelName="RNN",
    dataLoaderTrain=train_loader,
    dataLoaderValid=val_loader,
    lossFn=criterion,
    optimizer=optimizer,
    device=device,
    batchStatusUpdate=10,
    verbose=True,
)
train_data = train_model(**training_options)

### Evaluation

In [None]:
# Plot the recorded training and validation losses against each other.
loss_curve(
    trainLoss=train_data["trainLoss"],
    validLoss=train_data["validLoss"],
    title="RNN Training vs. Validation Loss",
)

In [None]:
from Evaluation.evaluation_helpers import get_best_model_path

# Ask the evaluation helper for the checkpoint path to use, given the weights
# directory and the 'RNN' model name.
# NOTE(review): the selection rule lives in Evaluation.evaluation_helpers and
# is not visible here -- confirm it picks the intended checkpoint.
best_model_path = get_best_model_path(model_save_path, 'RNN')

In [None]:
# **Load the Best or Latest Model**
# Rebuild the network with the same constructor arguments that were used for
# training so the saved state dict matches the module's parameters.
best_rnn_model = RNNModel(input_size=1, hidden_size=64, num_layers=2).to(device)
# map_location remaps the stored tensors onto the current device, so weights
# saved during a GPU run can still be loaded on a CPU-only machine.
best_rnn_model.load_state_dict(torch.load(best_model_path, map_location=device))
# Switch to evaluation mode before running predictions.
best_rnn_model.eval()

In [None]:


# Re-read the raw training CSV so the steps below start from an unmodified frame.
train_file = os.path.join('..', 'Data', 'Train', 'train1990s.csv')
train_df = pd.read_csv(train_file)
print("Columns in dataset:", train_df.columns)  # Debugging: Show available columns

# Pick the first recognised date column, parse it, and make it the index.
possible_date_cols = ["Date", "date", "observation_date", "timestamp"]
col = next((name for name in possible_date_cols if name in train_df.columns), None)
if col is None:
    raise ValueError("No valid date column found in dataset!")
train_df[col] = pd.to_datetime(train_df[col])
train_df = train_df.set_index(col)
print(f"Using '{col}' as Date column.")

# Standardise the value column name to "PCE"; "Value" takes precedence over
# "fred_PCEPI" when both are present.
source_value_col = next(
    (name for name in ("Value", "fred_PCEPI") if name in train_df.columns),
    None,
)
if source_value_col is None:
    raise ValueError("No valid column found for PCE data!")
train_df = train_df.rename(columns={source_value_col: "PCE"})

# Fit a MinMaxScaler on the PCE column; the fitted scaler is reused later when
# predictions are evaluated.
# NOTE(review): the model above was trained on data scaled with the project's
# minMaxScale helper -- confirm both scalings are equivalent.
scaler = MinMaxScaler()
train_scaled = scaler.fit_transform(train_df[["PCE"]])

# Flatten the scaled column into a 1-D series.
train_series = train_scaled.flatten()

# **Function to Create Sequences**
# NOTE(review): this definition rebinds the name `create_sequences`, which an
# earlier cell imports from dataPreprocessing and calls with three arguments.
def create_sequences(data, seq_length):
    """
    Build sliding windows over `data` for one-step-ahead prediction.

    Args:
        data: Sequence or array indexed along its first axis.
        seq_length: Number of consecutive elements per input window (>= 1).

    Returns:
        Tuple (xs, ys) of NumPy arrays where xs[i] is data[i : i + seq_length]
        and ys[i] is data[i + seq_length]. When `data` has `seq_length` or
        fewer elements both arrays are empty, and xs still carries the window
        axis (shape (0, seq_length, ...)) so downstream shape handling works.

    Raises:
        ValueError: If `seq_length` is smaller than 1.
    """
    if seq_length < 1:
        raise ValueError(f"seq_length must be a positive integer, got {seq_length}")

    values = np.asarray(data)
    n_windows = len(values) - seq_length

    if n_windows <= 0:
        # Not enough observations for a single window: return empty arrays
        # with a well-defined shape instead of shapeless (0,) arrays.
        tail_shape = values.shape[1:]
        return (
            np.empty((0, seq_length) + tail_shape, dtype=values.dtype),
            np.empty((0,) + tail_shape, dtype=values.dtype),
        )

    xs = [values[start : start + seq_length] for start in range(n_windows)]
    ys = [values[start + seq_length] for start in range(n_windows)]
    return np.array(xs), np.array(ys)

# Window the scaled series: each sample holds `sequence_length` consecutive
# values and its target is the value that follows them.
sequence_length = 12
X, y = create_sequences(train_series, sequence_length)

# Chronological 80/20 cut: the first part trains, the remainder validates.
split_ratio = 0.8
split_idx = int(len(X) * split_ratio)
X_train, y_train = X[:split_idx], y[:split_idx]
X_val, y_val = X[split_idx:], y[split_idx:]


def _to_model_tensor(array):
    """Return `array` as a float32 tensor with a trailing singleton dimension."""
    return torch.tensor(array, dtype=torch.float32).unsqueeze(-1)


X_train_tensor = _to_model_tensor(X_train)
y_train_tensor = _to_model_tensor(y_train)
X_val_tensor = _to_model_tensor(X_val)
y_val_tensor = _to_model_tensor(y_val)

# Mini-batch size shared by both loaders.
batch_size = 32

# Training batches are reshuffled on every pass; validation keeps its order.
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

val_dataset = TensorDataset(X_val_tensor, y_val_tensor)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

# Draw the training and validation loss curves from the recorded history.
loss_curve(
    trainLoss=train_data["trainLoss"],
    validLoss=train_data["validLoss"],
    title="RNN Training vs. Validation Loss",
)

# **Find the best saved model dynamically**
# Early-stopped checkpoints match RNN_BEST_STOPPED_AT_*.pth; RNN_latest.pth is
# the fallback when none of those exist.
best_model_files = glob.glob(os.path.join(model_save_path, "RNN_BEST_STOPPED_AT_*.pth"))
latest_model_path = os.path.join(model_save_path, "RNN_latest.pth")

if best_model_files:
    # Pick the most recently written checkpoint. Sorting the names instead
    # would be lexicographic and rank e.g. "..._AT_9.pth" after
    # "..._AT_10.pth", selecting a stale file.
    best_model_path = max(best_model_files, key=os.path.getmtime)
    print(f"Loading best model: {best_model_path}")
elif os.path.exists(latest_model_path):
    best_model_path = latest_model_path
    print(f"No early-stopped model found. Loading latest model instead: {best_model_path}")
else:
    raise FileNotFoundError("No saved model found! Ensure training was completed successfully.")

# **Load the Best or Latest Model**
# Rebuild the network with the same constructor arguments used for training.
best_rnn_model = RNNModel(input_size=1, hidden_size=64, num_layers=2).to(device)
# map_location remaps the stored tensors onto the current device, so weights
# saved during a GPU run can still be loaded on a CPU-only machine.
best_rnn_model.load_state_dict(torch.load(best_model_path, map_location=device))
# Switch to evaluation mode before running predictions.
best_rnn_model.eval()


In [None]:
from Evaluation.evaluation_helpers import make_evaluation_predictions

# Obtain predictions and matching targets for the validation loader.
# NOTE(review): the helper receives both the model object and its checkpoint
# path, plus the fitted MinMaxScaler as y_scaler. The *_inv names suggest the
# returned arrays are inverse-transformed to original units -- confirm in
# Evaluation.evaluation_helpers.
rnn_predictions_inv, actuals_inv = make_evaluation_predictions(best_rnn_model, best_model_path, val_loader, y_scaler=scaler)

In [None]:
# The target of a validation window sits `sequence_length` steps after the
# window's first element, so the matching dates start at
# split_idx + sequence_length in the date index.
val_dates = train_df.index[split_idx + sequence_length:]

# Side-by-side table of dates, actual values and model predictions.
comparison_columns = {
    "Date": val_dates,
    "Actual PCE": actuals_inv.flatten(),
    "Predicted PCE": rnn_predictions_inv.flatten(),
}
df_comparison = pd.DataFrame(comparison_columns)

# Quick look at the first rows.
print(df_comparison.head())

# Overlay both series on one figure: (column, legend label, marker, line style).
plt.figure(figsize=(12, 6))
for column, legend_label, marker, linestyle in (
    ("Actual PCE", "Actual PCE", "o", "-"),
    ("Predicted PCE", "Predicted PCE (RNN)", "x", "--"),
):
    plt.plot(
        df_comparison["Date"],
        df_comparison[column],
        label=legend_label,
        marker=marker,
        linestyle=linestyle,
    )
plt.title("Comparison of Actual vs. Predicted PCE (RNN)")
plt.xlabel("Date")
plt.ylabel("PCE")
plt.xticks(rotation=45)
plt.legend()
plt.grid(True)
plt.show()

In [None]:
# Validation RMSE: square root of the mean squared error between the actual
# and predicted arrays.
mse_rnn = mean_squared_error(actuals_inv, rnn_predictions_inv)
rmse_rnn = np.sqrt(mse_rnn)
print(f"Root Mean Squared Error (RMSE) for RNN Model: {rmse_rnn:.6f}")