In [None]:
import sys
import os

# The notebook runs from Training/MLR, so the project root is two directories up.
project_root = os.path.abspath(os.path.join(os.getcwd(), os.pardir, os.pardir))
# Make the project's Training/Evaluation packages importable from the cells below.
sys.path.append(project_root)

print("Project root added to sys.path:", project_root)

# Directory for MLR model weights; created if missing, left untouched otherwise.
model_save_path = os.path.join(project_root, 'Models', 'Weights', 'MLR')
os.makedirs(model_save_path, exist_ok=True)

### Read Data

In [None]:
import pandas as pd
from sklearn import linear_model
from Training.Helper.dataPreprocessing import TRAIN_DATA_PATH_1990S, integer_index

date_col = 'observation_date'

# Read the 1990s training CSV and keep only its first two columns
# (presumably the date and PCEPI columns -- verify against the file).
# NOTE(review): the date_format string ends in a bare '%', which is not a valid
# strptime directive -- confirm the date column is actually parsed as datetime.
train_raw = pd.read_csv(TRAIN_DATA_PATH_1990S, parse_dates=[date_col], date_format="%m/%y%")
# integer_index is a project helper; presumably it replaces the index with integers.
train_df = integer_index(train_raw.iloc[:, :2])

In [None]:
from sklearn.model_selection import train_test_split
import numpy as np

target_col = 'fred_PCEPI'

# Features are the integer row indices (not dates); the target is the PCEPI column.
# shuffle=False keeps the rows in file order, so validation is the final 20%.
index_values = list(train_df.index)
train_X, val_X, train_y, val_y = train_test_split(
    index_values,
    train_df[target_col],
    train_size=0.8,
    test_size=0.2,
    shuffle=False,
)
# scikit-learn estimators expect 2-D feature arrays: one row per sample, one column.
train_X = np.array(train_X).reshape(-1, 1)
val_X = np.array(val_X).reshape(-1, 1)

In [None]:
# Sanity check: shapes of the train/validation feature arrays and target series.
train_X.shape, val_X.shape, train_y.shape, val_y.shape

In [None]:
from sklearn import linear_model

regr = linear_model.LinearRegression()
# Univariate trend model: regress PCEPI on the integer time index only.
regr.fit(train_X, train_y)

In [None]:
# Out-of-sample predictions for the validation indices (model fitted on train only).
y_hat = regr.predict(val_X)

In [None]:
# In-sample fitted values for the training indices.
y_train_hat = regr.predict(train_X)

In [None]:
from Evaluation.Helper.evaluation_helpers import display_results, evaluate_model
# Stitch the train and validation pieces back together in their original order.
full_X = np.concatenate((train_X, val_X))
full_y = np.concatenate((train_y, val_y))
y_hats = np.concatenate((y_train_hat, y_hat))
train_dates = train_df['observation_date'].values

# Highlight the validation span (first to last validation index) in green.
val_span = (val_X[0][0], val_X[-1][0])
display_results(full_y, y_hats, train_dates, 'LR', highlight_date_indices=[val_span], highlight_colours=['green'])

In [None]:
# Refit the same estimator on train + validation combined, so the model used for the
# test-set predictions further down has seen all of the labelled training data.
regr.fit(full_X, full_y)

In [None]:
from Evaluation.Helper.evaluation_helpers import calc_metrics

# y_hat still holds the validation predictions made before the refit on the full set,
# so the metrics below are computed on predictions the model had not been trained on.
lr_vs_truth = np.column_stack((y_hat, val_y.values))
preds_and_vals_df = pd.DataFrame(lr_vs_truth, columns=['LR', 'ground_truth'])
calc_metrics(preds_and_vals_df, horizon=12)

In [None]:
# Last integer index of the validation split; the test index continues from here.
final_val_index = val_X[-1][0]
# final_val_index is an index *label* (val_X was built from train_df.index), so look the
# row up by label with .loc; positional .iloc only agrees when the index starts at 0
# with no gaps.
print(f'Final date in validation set: {train_df.loc[final_val_index, "observation_date"]}')

In [None]:
from Training.Helper.dataPreprocessing import TEST_DATA_PATH_1990S, integer_index

# Read the test CSV, keep its first two columns (presumably date and PCEPI), and
# continue the integer index immediately after the last validation index.
# NOTE(review): the date_format string ends in a bare '%', which is not a valid
# strptime directive -- confirm the date column is actually parsed as datetime.
test_raw = pd.read_csv(TEST_DATA_PATH_1990S, parse_dates=[date_col], date_format="%m/%y%")
test_df = integer_index(test_raw.iloc[:, :2], start=final_val_index+1)

In [None]:
# Test features are the integer indices as a single-column 2-D array; target is PCEPI.
test_X = np.array(list(test_df.index)).reshape(-1, 1)
test_y = test_df[target_col]

In [None]:
# Predictions for all three splits from the estimator refitted on train + validation.
# NOTE(review): y_train_hat and y_hat are overwritten here, so from this point on y_hat
# is an in-sample prediction; re-running the validation-metrics cell after this one
# would score in-sample values.
y_test_hat = regr.predict(test_X)
y_train_hat  = regr.predict(train_X)
y_hat = regr.predict(val_X)

In [None]:
# Boundary indices: one past the last training index and one past the last validation index.
train_date_end = train_X[-1, 0] + 1
val_date_end = val_X[-1, 0] + 1

In [None]:
from Evaluation.Helper.evaluation_helpers import display_results

# Concatenate the train, validation and test pieces in order.
ys = np.concatenate((train_y, val_y, test_y))
y_hats = np.concatenate((y_train_hat, y_hat, y_test_hat))
xs = np.concatenate((train_X, val_X, test_X))
# train_df covers both the train and validation rows, so its dates precede the test dates.
df_dates = pd.concat((train_df[date_col], test_df[date_col]))

# Validation span in green, test span (up to the last index) in orange.
val_span = (train_date_end, val_date_end)
test_span = (val_date_end, xs[-1][0])
display_results(ys, y_hats, df_dates.values, 'LR', highlight_date_indices=[val_span, test_span], highlight_colours=['green', 'orange'])

In [None]:
# Persist the LR test-set predictions under <project_root>/Predictions/LR.npy.
output_path = os.path.join(project_root, "Predictions", "LR.npy")
# np.save does not create missing parent directories, so make sure the folder exists.
os.makedirs(os.path.dirname(output_path), exist_ok=True)
np.save(output_path, y_test_hat)

### With Exogenous Variables

In [None]:
import pandas as pd
from Training.Helper.dataPreprocessing import TRAIN_DATA_PATH_1990S, integer_index

date_col = 'observation_date'

# Reload the training CSV, this time keeping every column (target plus exogenous
# series). This rebinds train_df, replacing the two-column frame used above.
# NOTE(review): the date_format string ends in a bare '%', which is not a valid
# strptime directive -- confirm the date column is actually parsed as datetime.
train_df = integer_index(
    pd.read_csv(TRAIN_DATA_PATH_1990S, parse_dates=[date_col], date_format="%m/%y%")
)

In [None]:
# Target series, and the remaining (exogenous) columns with the target and date removed.
train_target = train_df[target_col]
train_exog = train_df.drop(columns=[target_col, 'observation_date'])

In [None]:
# Check that every exogenous column is float64 before it is fed to the regressor.
print(f'train_exog contains only floating point values: {(train_exog.dtypes == "float64").all()}')

In [None]:
# Append the integer time index as one more regressor column, named `integer_index`.
train_exog = train_exog.assign(integer_index=train_exog.index)

In [None]:
from sklearn import linear_model

# Fresh estimator; this rebinds `regr`, replacing the index-only model fitted earlier.
regr = linear_model.LinearRegression()
# Multiple linear regression: PCEPI on all exogenous columns plus the integer index.
regr.fit(train_exog.values, train_target.values)

In [None]:
# Load the test CSV with every column, continuing the integer index directly after the
# last training index.
# NOTE(review): the date_format string ends in a bare '%', which is not a valid
# strptime directive -- confirm the date column is actually parsed as datetime.
test_df = integer_index(
    pd.read_csv(TEST_DATA_PATH_1990S, parse_dates=[date_col], date_format="%m/%y%"),
    start=train_exog.index[-1]+1,
)

In [None]:
test_target = test_df[target_col]
# test_exog: the exogenous columns of the test frame (target and date removed), with
# the integer index appended as the last column, mirroring how train_exog was built.
# NOTE(review): regr is fitted on raw .values, so this relies on the test CSV having
# the same columns in the same order as the training CSV -- verify.
test_exog = test_df.drop(columns=[target_col, date_col])
test_exog = test_exog.assign(integer_index=test_exog.index)

In [None]:
# MLR predictions for every test row.
test_preds = regr.predict(test_exog.values)

In [None]:
# Ground-truth PCEPI values for the test period, for eyeballing against test_preds.
test_target.values

In [None]:
# Corresponding MLR predictions.
test_preds

In [None]:
# Single-row prediction for the test row at position 9, which comes out very low;
# presumably one of the exogenous variables is throwing it off -- not yet investigated.
regr.predict(test_exog.iloc[9].values.reshape(1, -1))

In [None]:
# Show the MLR test-set predictions against the true values via the project's
# display_results helper.
display_results(test_target.values, test_preds, test_df[date_col].values, 'MLR')

Interestingly, the model generally underpredicts slightly, but its prediction swings sharply with the exogenous values at 10/2024. More analysis is required to identify the cause of this.

In [None]:
# Persist the MLR test-set predictions under <project_root>/Predictions/MLR.npy.
output_path = os.path.join(project_root, "Predictions", "MLR.npy")
# np.save does not create missing parent directories, so make sure the folder exists.
os.makedirs(os.path.dirname(output_path), exist_ok=True)
np.save(output_path, test_preds)

### Pytorch Forecasting Linear Regression Model

In [None]:
import torch
from typing import Dict
from pytorch_forecasting.models import BaseModel
from pytorch_forecasting.metrics.point import RMSE

class LinearRegressionModule(torch.nn.Module):
    """Single affine layer mapping `input_size` features to `output_size` outputs."""

    def __init__(self, input_size : int, output_size : int):
        super().__init__()
        # One fully connected layer (weights plus bias); no activation or hidden layers.
        self.linear = torch.nn.Linear(input_size, output_size)

    def forward(self, x):
        """Apply the linear layer to `x` and return the result."""
        return self.linear(x)

In [None]:
class LinearRegressionModel(BaseModel):
    """pytorch-forecasting model that wraps a single linear layer and trains with RMSE loss.

    The encoder's continuous values are fed through `LinearRegressionModule` and the
    output is rescaled into target space before being returned.
    """

    def __init__(self, input_size: int, output_size: int, **kwargs):
        """Store hyperparameters and build the underlying linear network.

        Args:
            input_size: Number of input features of the linear layer.
            output_size: Number of outputs of the linear layer.
            **kwargs: Forwarded unchanged to `BaseModel.__init__`.
        """
        # saves arguments in signature to `.hparams` attribute, mandatory call - do not skip this
        self.save_hyperparameters()
        # pass additional arguments to BaseModel.__init__, mandatory call - do not skip this
        super().__init__(loss=RMSE(), **kwargs)
        self.network = LinearRegressionModule(
            input_size=self.hparams.input_size,
            output_size=self.hparams.output_size
        )

    def forward(self, x: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]:
        """Run one batch through the linear network and return the network output.

        Args:
            x: Batch dictionary; this method reads the "encoder_cont" and
                "target_scale" entries.

        Returns:
            The result of `to_network_output(prediction=...)`.
        """
        # x is a batch generated based on the TimeSeriesDataset
        # squeeze(-1) drops the last dimension when it has size 1
        # assumes encoder_cont is (batch, encoder_length, 1) -- TODO confirm
        network_input = x["encoder_cont"].squeeze(-1)
        prediction = self.network(network_input)

        # rescale predictions into target space
        prediction = self.transform_output(prediction, target_scale=x["target_scale"])

        # We need to return a dictionary that at least contains the prediction
        # The parameter can be directly forwarded from the input.
        # The conversion to a named tuple can be directly achieved with the `to_network_output` function.
        return self.to_network_output(prediction=prediction)

In [None]:
from pytorch_forecasting import TimeSeriesDataSet

encoderLength = 10
predictionLength = 10

# The original cell passed `train`, a name that is never defined in this notebook
# (presumably left over from an earlier revision), so it raised NameError on a fresh
# kernel. Build the frame explicitly from train_df instead.
needed_cols = [c for c in ('group', 'time_idx', 'fred_PCEPI') if c in train_df.columns]
ts_df = train_df[needed_cols].copy()
# TimeSeriesDataSet needs an integer time index and a series identifier; add them only
# when the loaded frame does not already provide them.
if 'time_idx' not in ts_df.columns:
    ts_df['time_idx'] = np.arange(len(ts_df))
if 'group' not in ts_df.columns:
    ts_df['group'] = 'PCEPI'

# Hold the last `predictionLength` steps out of training so that the validation
# dataset (built below with predict=True) forecasts values the model has not seen.
training_cutoff = ts_df['time_idx'].max() - predictionLength

trainDataset = TimeSeriesDataSet(
    ts_df[ts_df['time_idx'] <= training_cutoff],
    group_ids=['group'],
    target='fred_PCEPI',
    time_idx='time_idx',
    min_encoder_length=encoderLength,
    max_encoder_length=encoderLength,
    min_prediction_length=predictionLength,
    max_prediction_length=predictionLength,
    time_varying_unknown_reals=['fred_PCEPI'],
)

# Reuse the training dataset's configuration on the full frame for validation.
valDataset = TimeSeriesDataSet.from_dataset(trainDataset, ts_df, predict=True, stop_randomization=True)

In [None]:
# Training uses single-sample batches; validation uses batches ten times larger.
batch_size = 1
val_batch_size = batch_size * 10
trainLoader = trainDataset.to_dataloader(train=True, batch_size=batch_size, num_workers=0)
valLoader = valDataset.to_dataloader(train=False, batch_size=val_batch_size, num_workers=0)

No categorical variables, one continuous variable, one continuous target variable. Normalise target values.

In [None]:
# Build the model from the dataset definition: the linear layer takes `encoderLength`
# inputs and produces `predictionLength` outputs.
model = LinearRegressionModel.from_dataset(trainDataset, input_size=encoderLength, output_size=predictionLength)

In [None]:
from lightning.pytorch import Trainer
from lightning.pytorch.callbacks import EarlyStopping

# Stop once val_loss has failed to improve by at least 1e-4 for 10 consecutive checks.
early_stop_callback = EarlyStopping(
    monitor="val_loss",
    min_delta=1e-4,
    patience=10,
    verbose=False,
    mode="min",
)

# Logging is turned off; by default this produces Tensorboard-interpretable logs every 50 steps.
# NOTE(review): no max_epochs and no random seed are set, so run length is left to the
# Trainer's default plus early stopping, and results may vary between runs.
trainer = Trainer(
    fast_dev_run=False,
    callbacks=[early_stop_callback],
    logger=False,
)
trainer.fit(model, train_dataloaders=trainLoader, val_dataloaders=valLoader)

The above takes a long time to train a simple linear regressor.

In [None]:
# Predict on the validation loader (on CPU) and score against the returned targets.
predictions = model.predict(valLoader, return_y=True, trainer_kwargs={"accelerator": "cpu"})
RMSE()(predictions.output, predictions.y)

In [None]:
# mode="raw" returns the full network output rather than only point predictions;
# return_x=True also returns the input batch so it can be passed to the plot below.
raw_predictions = model.predict(valLoader, mode="raw", return_x=True, trainer_kwargs={"accelerator": "cpu"})
print(raw_predictions.x)

In [None]:
# Plot the first validation sample (idx=0) from the raw predictions, with the loss
# added to the plot title.
model.plot_prediction(
    raw_predictions.x, raw_predictions.output, idx=0, add_loss_to_title=True
)

Seem to have got unlucky here?