# Statistical models in R
This notebook covers:
1. Data preprocessing:
    1. Aligning all dataframes to a 5-day week from 2018-10-01 to 2024-08-30.
    2. Interpolating missing values.
2. ARMA-models.

In [None]:
# rpy2 is a Python package that allows R code to be run from Python.
# It has to be installed before its IPython extension can be loaded.
%pip install rpy2

In [1]:
# Load the rpy2 IPython extension so that R code can be run in Jupyter
# through the %%R cell magic
%load_ext rpy2.ipython

The cell magic `%%R` runs the contents of a cell as R code in Jupyter.

In [159]:
%%R
# Attach every package the notebook depends on; a package that cannot be
# attached is installed first and then attached.
invisible(lapply(
  c("dplyr", "zoo", "psych", "TSA", "forecast", "Metrics"),
  function(pkg) {
    # require() returns FALSE (instead of failing) when the package is missing
    if (!require(pkg, character.only = TRUE)) {
      install.packages(pkg)
      library(pkg, character.only = TRUE)
    }
  }
))


The downloaded binary packages are in
	/var/folders/f_/fcfn37vd6dxf0sq0ljvgz2kh0000gn/T//RtmpeTsOGL/downloaded_packages


Loading required package: Metrics
trying URL 'https://cran.uib.no/bin/macosx/big-sur-x86_64/contrib/4.3/Metrics_0.1.4.tgz'
Content type 'application/x-gzip' length 82668 bytes (80 KB)
downloaded 80 KB


Attaching package: ‘Metrics’

The following object is masked from ‘package:forecast’:

    accuracy

In library(package, lib.loc = lib.loc, character.only = TRUE, logical.return = TRUE,  :
  there is no package called ‘Metrics’


In [76]:
%%R
# Read the closing-price CSV of each hub into a list keyed by hub code
hub_prices <- lapply(
  setNames(nm = c("nbp", "peg", "the", "ttf", "ztp")),
  function(hub_code) read.csv(sprintf("../data/%s_close.csv", hub_code))
)

In [77]:
%%R
# Create a Monday-to-Friday date index over the sample period; it is the
# common calendar every hub is aligned to.
start_date <- as.Date("2018-10-01")
end_date <- as.Date("2024-08-30")
date_seq <- seq.Date(start_date, end_date, by = "day")
# as.POSIXlt()$wday is 0 for Sunday and 6 for Saturday in every locale.
# weekdays() returns localized day names, so matching on the English names
# would silently keep weekends when the session locale is not English.
date_index <- date_seq[!as.POSIXlt(date_seq)$wday %in% c(0L, 6L)]

In [78]:
%%R
# Align every hub on the shared weekday calendar and fill the resulting gaps
hub_prices <- lapply(hub_prices, function(raw) {
  raw$Date <- as.Date(raw$Date)
  # Left join onto the calendar: calendar dates absent from the hub file are
  # kept and get a missing CLOSE
  aligned <- merge(
    data.frame(Date = date_index),
    raw,
    by = "Date",
    all.x = TRUE
  )
  # Linear interpolation of missing prices; rule = 2 extends the nearest
  # observation into leading/trailing gaps
  aligned$CLOSE <- na.approx(aligned$CLOSE, rule = 2)
  aligned
})

In [79]:
%%R
# Expose the processed price table of each hub as its own variable
nbp_price <- hub_prices[["nbp"]]
peg_price <- hub_prices[["peg"]]
the_price <- hub_prices[["the"]]
ttf_price <- hub_prices[["ttf"]]
ztp_price <- hub_prices[["ztp"]]

In [80]:
%%R
# Fit an OLS regression of hub1's CLOSE on hub2's CLOSE and store the
# residuals in a CSV file (to be used in VAR).
#
# Args:
#   hub1:      data frame with `Date` and `CLOSE` columns (response).
#   hub2:      data frame with a `CLOSE` column, row-aligned with hub1
#              (regressor).
#   file_name: path of the CSV file to write; it gets the columns `Date`
#              and `Residuals`.
#
# Returns:
#   The fitted `lm` object.
store_ols_residuals <- function(hub1, hub2, file_name) {
  stopifnot(nrow(hub1) == nrow(hub2))
  # na.exclude makes residuals() return NA for rows dropped because of a
  # missing price, so the residuals stay aligned with hub1$Date. With the
  # default na.omit the residual vector is shorter than Date and building
  # the data frame fails.
  ols_model <- lm(hub1$CLOSE ~ hub2$CLOSE, na.action = na.exclude)
  residuals_df <- data.frame(
    Date = hub1$Date,
    Residuals = unname(residuals(ols_model))
  )
  write.csv(residuals_df, file_name, row.names = FALSE)
  ols_model
}
# Folder that receives intermediate results
folder_path <- "intermediate_storage/"
# Regress ZTP closing prices on TTF closing prices and save the residuals
ztp_ttf_ols <- store_ols_residuals(
  ztp_price,
  ttf_price,
  paste0(folder_path, "ztp_ttf_residuals.csv")
)

In [89]:
%%R
# Add a log-return column to every hub's price table
hub_returns <- lapply(hub_prices, function(prices) {
  prices %>%
    # Log return = difference between consecutive log closing prices
    mutate(Return = log(CLOSE) - lag(log(CLOSE))) %>%
    # The first row has no previous price, hence no return: drop it
    slice(-1)
})

In [90]:
%%R
# Expose the return table of each hub as its own variable
nbp_returns <- hub_returns[["nbp"]]
peg_returns <- hub_returns[["peg"]]
the_returns <- hub_returns[["the"]]
ttf_returns <- hub_returns[["ttf"]]
ztp_returns <- hub_returns[["ztp"]]

In [203]:
%%R
# Fit an ARMA(ar_order, ma_order) model with one exogenous regressor (ARMAX).
#
#   hub1:      data frame whose `Return` column is the modelled series
#   hub2_lag1: data frame whose `Return` column is the exogenous regressor
#   ar_order:  autoregressive order
#   ma_order:  moving-average order
#
# The differencing order is fixed at 0 and the optimiser is allowed up to
# 1000 iterations. Returns the fitted object produced by arima().
armax_model <- function(hub1, hub2_lag1, ar_order, ma_order) {
  # xreg is deliberately passed as an inline expression: arima() derives the
  # regressor's coefficient label from it.
  arima(
    hub1$Return,
    order = c(ar_order, 0, ma_order),
    xreg = hub2_lag1$Return,
    optim.control = list(maxit = 1000)
  )
}

In [204]:
%%R
# Score ARMAX forecasts of hub1 returns over the last part of the sample
# with an expanding training window.
#
# For every forecast window the ARMAX model (hub1 return explained by the
# previous row's hub2 return) is re-fitted on all rows before the window.
# The future values of the regressor are themselves forecast with a separate
# ARMA model, and the resulting hub1 forecasts are compared with the
# realised returns.
#
# Args:
#   hub1:        data frame with a `Return` column (series to forecast).
#   hub2:        data frame with a `Return` column, row-aligned with hub1
#                (source of the lagged exogenous regressor).
#   ar_order:    AR order of the ARMAX model.
#   ma_order:    MA order of the ARMAX model.
#   window_size: number of steps forecast (and scored) per window.
#   test_size:   number of trailing rows used for evaluation.
#   xreg_order:  c(p, d, q) order of the model used to forecast the regressor.
#
# Returns:
#   A data frame with one row per window: `Interval` (first row index of the
#   window), `MAE` and `RMSE`.
expanding_window_armax_forecast <- function(hub1, hub2, ar_order, ma_order,
                                            window_size = 10,
                                            test_size = 250,
                                            xreg_order = c(3, 0, 3)) {
  n <- length(hub1$Return)
  stopifnot(
    nrow(hub2) == n,
    window_size >= 1,
    test_size >= window_size,
    n - test_size >= 2
  )

  # Window starts are computed from the full length n. One row is dropped
  # below, so the last window ends on the last remaining row at the latest.
  start_points <- seq(n - test_size, n - window_size, by = window_size)

  # Pair hub1's return in row t with hub2's return in row t - 1: drop the
  # first row of hub1 and the last row of hub2. Done with plain indexing
  # rather than lag() on a data frame, whose result depends on which lag()
  # is on the search path.
  hub1 <- hub1[-1, , drop = FALSE]
  hub2_lag1 <- hub2[-n, , drop = FALSE]

  evaluate_window <- function(start) {
    # Expanding window: train on everything before the forecast window
    train_rows <- seq_len(start - 1)
    hub1_train <- hub1[train_rows, , drop = FALSE]
    hub2_lag1_train <- hub2_lag1[train_rows, , drop = FALSE]
    hub1_actual <- hub1$Return[start:(start + window_size - 1)]

    armax_fit <- armax_model(hub1_train, hub2_lag1_train, ar_order, ma_order)

    # Forecast the regressor over the window with its own ARMA model
    # NOTE(review): the first regressor value of each window appears to be
    # already observed at the forecast origin, yet it is forecast as well;
    # confirm this is intended.
    hub2_arma <- arima(
      hub2_lag1_train$Return,
      order = xreg_order,
      optim.control = list(maxit = 1000)
    )
    hub2_future_forecast <- predict(hub2_arma, n.ahead = window_size)$pred
    forecasted_values <- predict(
      armax_fit,
      newxreg = hub2_future_forecast,
      n.ahead = window_size
    )$pred

    data.frame(
      Interval = start,
      MAE = mae(hub1_actual, forecasted_values),
      RMSE = rmse(hub1_actual, forecasted_values)
    )
  }

  # Build all rows first and bind once instead of growing a data frame
  performance <- do.call(rbind, lapply(start_points, evaluate_window))
  rownames(performance) <- NULL
  performance
}

In [205]:
%%R
# Evaluate forecasts of ZTP returns from a model with AR order 3 and MA
# order 1 that uses TTF returns as the second hub, scored in windows of
# 10 steps, and print the MAE/RMSE of every window
performance_metrics <- expanding_window_armax_forecast(ztp_returns, ttf_returns, ar_order = 3, ma_order = 1, window_size = 10)
print(performance_metrics)

   Interval        MAE       RMSE
1      1294 0.04846589 0.05824988
2      1304 0.07350453 0.08351720
3      1314 0.02525421 0.03763769
4      1324 0.02853729 0.03795740
5      1334 0.01947162 0.02469868
6      1344 0.04398321 0.04773487
7      1354 0.04288081 0.04935471
8      1364 0.03215316 0.03865814
9      1374 0.02311205 0.03482855
10     1384 0.02790288 0.03249579
11     1394 0.02037971 0.02532583
12     1404 0.02851127 0.03184716
13     1414 0.03112927 0.03615746
14     1424 0.03476898 0.04106284
15     1434 0.03022829 0.03668812
16     1444 0.03986259 0.04316754
17     1454 0.02535570 0.02717345
18     1464 0.02304191 0.02663087
19     1474 0.02941773 0.03483940
20     1484 0.01919158 0.02167219
21     1494 0.01671938 0.01977047
22     1504 0.02333681 0.02555207
23     1514 0.02305477 0.02637328
24     1524 0.02192526 0.02608865
25     1534 0.01720407 0.02247230
