# Statistical models in R
This notebook covers:
1. Data preprocessing:
    1. Aligning all dataframes to a 5-day week from 2018-10-01 to 2024-08-30.
    2. Interpolating missing values.
2. ARMA-models.

In [1]:
import pandas as pd
import plotly.express as px
import plotly.graph_objs as go

In [2]:
# rpy2 is a Python package that allows you to run R code from Python
%pip install rpy2

Note: you may need to restart the kernel to use updated packages.


In [3]:
# Load the rpy2 extension to use R in Jupyter
%load_ext rpy2.ipython

The magic function `%%R` is used for running R code in Jupyter

In [4]:
%%R
# Install (if missing) and attach every package this notebook uses.
# The vector keeps the original attach order on purpose: packages attached
# later mask same-named functions from earlier ones, so reordering could
# change which function a bare name resolves to.
required_packages <- c(
  "dplyr", "zoo", "psych", "TSA", "forecast",
  "Metrics", "ggplot2", "vars", "svars", "tsDyn"
)

for (pkg in required_packages) {
  # requireNamespace() only checks availability; it does not attach.
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg)
  }
  # library() errors if the package is still unavailable, instead of
  # silently returning FALSE the way require() does.
  library(pkg, character.only = TRUE)
}


Loading required package: dplyr

Attaching package: ‘dplyr’

The following objects are masked from ‘package:stats’:

    filter, lag

The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union

Loading required package: zoo

Attaching package: ‘zoo’

The following objects are masked from ‘package:base’:

    as.Date, as.Date.numeric

Loading required package: psych
Loading required package: TSA

Attaching package: ‘TSA’

The following objects are masked from ‘package:stats’:

    acf, arima

The following object is masked from ‘package:utils’:

    tar

Loading required package: forecast
Registered S3 method overwritten by 'quantmod':
  method            from
  as.zoo.data.frame zoo 
Registered S3 methods overwritten by 'forecast':
  method       from
  fitted.Arima TSA 
  plot.Arima   TSA 
Loading required package: Metrics

Attaching package: ‘Metrics’

The following object is masked from ‘package:forecast’:

    accuracy

Loading required package: 

In [23]:
%%R
# Load data: one interpolated close-price CSV per hub, keyed by hub code.
hub_files <- c(
  nbp = "../../data/interpolated/nbp_close_interpolated.csv",
  peg = "../../data/interpolated/peg_close_interpolated.csv",
  the = "../../data/interpolated/the_close_interpolated.csv",
  ttf = "../../data/interpolated/ttf_close_interpolated.csv",
  ztp = "../../data/interpolated/ztp_close_interpolated.csv"
)

# lapply() over a named vector returns a list carrying the same names.
hub_prices <- lapply(hub_files, read.csv)

In [89]:
%%R


# Hub pair analysed in the rest of the notebook.
hub1_name <- "ttf"
hub2_name <- "the"

hub1 <- hub_prices[[hub1_name]]
hub2 <- hub_prices[[hub2_name]]

# Two-column frame of closing prices; the generic column names hub1/hub2
# are used by the later cells regardless of which pair is selected.
hubs <- data.frame(
  hub1 = hub1$CLOSE,
  hub2 = hub2$CLOSE
)

# Let VAR() pick the lag order (up to 20) by the SC information criterion,
# then show the selected order.
var_p <- VAR(y = hubs, lag.max = 20, type = "const", ic = "SC")

var_p$p

SC(n) 
    3 


In [90]:
%%R
#' Rolling-origin VECM forecasts over the last `horizon` rows of `hubs`.
#'
#' For each of the last `horizon` rows, a VECM is refitted on all rows up to
#' `window_size` steps before that row, and the `window_size`-step-ahead
#' forecast is stored next to the realised row.
#'
#' @param hubs Data frame with one column per series, oldest row first.
#' @param window_size Number of steps ahead to forecast.
#' @param horizon Number of trailing rows to forecast.
#' @param lag Lag order passed to `VECM()` (previously hard-coded to 2).
#' @param r Cointegration rank passed to `VECM()` (previously hard-coded to 1).
#' @return A list with data frames `predictions` and `actuals`, each with
#'   `horizon` rows and the column names of `hubs`.
vecm_predictions <- function(hubs, window_size = 5, horizon = 250,
                             lag = 2, r = 1) {
  if (horizon < 1 || window_size < 1) {
    stop("`horizon` and `window_size` must both be at least 1.", call. = FALSE)
  }

  n_obs <- nrow(hubs)
  # Size of the very first training sample; later samples grow by one row.
  first_train_size <- n_obs - horizon - window_size + 1
  if (first_train_size < 1) {
    stop(
      "Not enough observations: need more than horizon + window_size - 1 = ",
      horizon + window_size - 1, " rows, got ", n_obs, ".",
      call. = FALSE
    )
  }

  # Collect one forecast row per origin, then bind once after the loop
  # (avoids re-copying a growing data frame on every iteration).
  forecast_rows <- vector("list", horizon)

  for (i in seq_len(horizon)) {
    train_size <- first_train_size + i - 1
    hub_train <- hubs[seq_len(train_size), , drop = FALSE]

    # Fit the VECM model on the expanding training sample
    vecm <- VECM(hub_train, lag = lag, r = r, include = "both", estim = "ML")

    # Forecast window_size steps ahead and keep only the final step
    hub_forecast <- predict(vecm, n.ahead = window_size)
    forecast_rows[[i]] <- hub_forecast[window_size, , drop = FALSE]
  }

  predictions <- as.data.frame(do.call(rbind, forecast_rows))
  colnames(predictions) <- colnames(hubs)

  # Realised rows matching each forecast: train_size + window_size.
  target_rows <- first_train_size - 1 + window_size + seq_len(horizon)
  actuals <- hubs[target_rows, , drop = FALSE]

  list(predictions = predictions, actuals = actuals)
}
  

In [91]:
%%R
# Evaluation settings: forecast `window_size` steps ahead for each of the
# last `horizon` observations.
window_size <- 5
horizon <- 250

vecm_output <- vecm_predictions(hubs, window_size = window_size, horizon = horizon)

# Split the result into per-hub vectors for the error metrics and plots.
hub1_actuals <- vecm_output$actuals$hub1
hub1_predictions <- vecm_output$predictions$hub1
hub2_actuals <- vecm_output$actuals$hub2
hub2_predictions <- vecm_output$predictions$hub2

# Error metrics per hub.
hub1_mae <- mae(hub1_actuals, hub1_predictions)
hub1_rmse <- rmse(hub1_actuals, hub1_predictions)
hub2_mae <- mae(hub2_actuals, hub2_predictions)
hub2_rmse <- rmse(hub2_actuals, hub2_predictions)

# Report settings first, then the metrics hub by hub.
report_lines <- c(
  paste0("Pair: ", hub1_name, " | ", hub2_name),
  paste0("Window Size: ", window_size),
  paste0("Horizon: ", horizon),
  paste0(hub1_name,": Mean Absolute Error: ", hub1_mae),
  paste0(hub1_name,": Root Mean Squared Error: ", hub1_rmse),
  paste0(hub2_name,": Mean Absolute Error: ", hub2_mae),
  paste0(hub2_name,": Root Mean Squared Error: ", hub2_rmse)
)
for (report_line in report_lines) {
  print(report_line)
}

[1] "Pair: ttf | the"
[1] "Window Size: 5"
[1] "Horizon: 250"
[1] "ttf: Mean Absolute Error: 2.15439601058386"
[1] "ttf: Root Mean Squared Error: 3.00820138609321"
[1] "the: Mean Absolute Error: 2.15991921119847"
[1] "the: Root Mean Squared Error: 2.86994087281757"


In [92]:
%%R
# Replace the generic hub1/hub2 column names with the actual hub codes.
hub_labels <- c(hub1_name, hub2_name)

predictions <- setNames(vecm_output$predictions, hub_labels)
actuals <- setNames(vecm_output$actuals, hub_labels)

In [93]:
%%R

# The forecasts cover the last `horizon` rows, so take their dates from the
# tail of the first hub's Date column and prepend them to both frames.
prediction_dates <- tail(hub1$Date, horizon)
date_column <- data.frame(Date = prediction_dates)

predictions <- cbind(date_column, predictions)
actuals <- cbind(date_column, actuals)

In [94]:
%%R
# Both output files share one prefix encoding the pair, horizon and window.
vecm_file_prefix <- paste0("../../predictions/",hub1_name,"_", hub2_name, "_h", horizon, "_w", window_size)

write.csv(predictions, file = paste0(vecm_file_prefix, "_vecm_predictions.csv"), row.names = FALSE)
write.csv(actuals, file = paste0(vecm_file_prefix, "_actuals.csv"), row.names = FALSE)

In [95]:
%%R
# Side-by-side frame of realised vs VECM-forecast prices for the first hub;
# the next cell exports it to Python for plotting.
hub1_forecast_comparison <- data.frame(Actual = hub1_actuals, Forecasted = hub1_predictions)

In [96]:
%R -o hub1_forecast_comparison
%R -o hub1_name

hub1_forecast_comparison = hub1_forecast_comparison

actual_trace = go.Scatter(
    x=hub1_forecast_comparison.index,
    y=hub1_forecast_comparison['Actual'],
    mode='lines',
    name='Actual Prices'
)

forecasted_trace = go.Scatter(
    x=hub1_forecast_comparison.index,
    y=hub1_forecast_comparison['Forecasted'],
    mode='lines',
    name='Forecasted Prices'
)

# Create the plot
layout = go.Layout(
    title= hub1_name.item() + ': VECM Actual vs Forecasted Prices',
    xaxis=dict(title='Index'),
    yaxis=dict(title='Price'),
)

fig = go.Figure(data=[actual_trace, forecasted_trace], layout=layout)
fig.show()


In [97]:
%%R
# Side-by-side frame of realised vs VECM-forecast prices for the second hub;
# the next cell exports it to Python for plotting.
hub2_forecast_comparison <- data.frame(Actual = hub2_actuals, Forecasted = hub2_predictions)

In [98]:
%R -o hub2_forecast_comparison
%R -o hub2_name

hub2_forecast_comparison = hub2_forecast_comparison

actual_trace = go.Scatter(
    x=hub2_forecast_comparison.index,
    y=hub2_forecast_comparison['Actual'],
    mode='lines',
    name='Actual Prices'
)

forecasted_trace = go.Scatter(
    x=hub2_forecast_comparison.index,
    y=hub2_forecast_comparison['Forecasted'],
    mode='lines',
    name='Forecasted Prices'
)

# Create the plot
layout = go.Layout(
    title= hub2_name.item() + ': VECM Actual vs Forecasted Prices',
    xaxis=dict(title='Index'),
    yaxis=dict(title='Price'),
)

fig = go.Figure(data=[actual_trace, forecasted_trace], layout=layout)
fig.show()


In [99]:
%%R
# Naive benchmark: the forecast for each day is the price observed
# `window_size` rows earlier, evaluated on the last `horizon` rows.
window_size <- 5
horizon <- 250

stopifnot(window_size >= 0, window_size + horizon <= nrow(hubs))

# Shift every column down by `window_size` rows, padding the top with NA.
# Done explicitly rather than with a bare lag(): both stats and dplyr export
# a function of that name, so which one runs depends on package attach
# order, and stats::lag() shifts only the time base, not the values.
shift_rows <- function(column, n) {
  c(rep(NA, n), column[seq_len(length(column) - n)])
}
hubs_lag <- hubs
hubs_lag[] <- lapply(hubs, shift_rows, n = window_size)

hubs_prediction <- tail(hubs_lag, n = horizon)
hubs_actual <- tail(hubs, n = horizon)

hub1_naive_predictions <- hubs_prediction$hub1
hub1_naive_actuals <- hubs_actual$hub1
hub2_naive_predictions <- hubs_prediction$hub2
hub2_naive_actuals <- hubs_actual$hub2

hub1_naive_mae <- mae(hub1_naive_actuals, hub1_naive_predictions)
hub2_naive_mae <- mae(hub2_naive_actuals, hub2_naive_predictions)

hub1_naive_rmse <- rmse(hub1_naive_actuals, hub1_naive_predictions)
hub2_naive_rmse <- rmse(hub2_naive_actuals, hub2_naive_predictions)

print(paste0("Pair: ", hub1_name, " | ", hub2_name))
print(paste0(hub1_name, ": Mean Absolute Error: ", hub1_naive_mae))
# The value is an RMSE; the label previously read "Mean Squared Error".
print(paste0(hub1_name, ": Root Mean Squared Error: ", hub1_naive_rmse))

print(paste0(hub2_name, ": Mean Absolute Error: ", hub2_naive_mae))
print(paste0(hub2_name, ": Root Mean Squared Error: ", hub2_naive_rmse))

[1] "Pair: ttf | the"
[1] "ttf: Mean Absolute Error: 2.13959333333333"
[1] "ttf: Mean Squared Error: 2.96090183926752"
[1] "the: Mean Absolute Error: 2.08176266666667"
[1] "the: Root Mean Squared Error: 2.73317708716025"


In [100]:
%%R
# Label the naive forecast and actual frames with the real hub codes.
hub_labels <- c(hub1_name, hub2_name)
naive_predictions <- setNames(hubs_prediction, hub_labels)
naive_actuals <- setNames(hubs_actual, hub_labels)

# Prepend the dates of the last `horizon` observations to both frames.
prediction_dates <- tail(hub1$Date, horizon)
date_column <- data.frame(Date = prediction_dates)

naive_predictions <- cbind(date_column, naive_predictions)
naive_actuals <- cbind(date_column, naive_actuals)

In [101]:
%%R
# Persist the naive forecasts next to the VECM ones.
naive_predictions_path <- paste0("../../predictions/",hub1_name,"_", hub2_name, "_h", horizon, "_w", window_size, "_naive_predictions.csv")
write.csv(naive_predictions, file = naive_predictions_path, row.names = FALSE)

In [102]:
%%R
# NOTE(review): this writes the same naive_predictions frame again under a
# second file name - confirm the duplicate output is intended.
last_available_path <- paste0("../../predictions/",hub1_name,"_", hub2_name, "_h", horizon, "_w", window_size, "_last_available_data.csv")
write.csv(naive_predictions, file = last_available_path, row.names = FALSE)

In [103]:
%%R
# Forecast errors (actual minus forecast) of both methods for the first hub.
vecm_resids <- hub1_actuals - hub1_predictions
naive_resids <- hub1_naive_actuals - hub1_naive_predictions

print(paste0("Pair: ", hub1_name, " | ", hub2_name))
print(paste0(hub1_name,": Diebold-Mariano Test:"))

# VECM errors are method 1, naive errors method 2; power = 1 compares
# absolute errors.
# NOTE(review): in forecast::dm.test, "greater" is documented as "method 2
# is more accurate than method 1" - confirm this is the intended direction.
dm.test(
  e1 = vecm_resids,
  e2 = naive_resids,
  alternative = "greater",
  h = window_size,
  power = 1
)

[1] "Pair: ttf | the"
[1] "ttf: Diebold-Mariano Test:"

	Diebold-Mariano Test

data:  vecm_residsnaive_resids
DM = 0.31455, Forecast horizon = 5, Loss function power = 1, p-value =
0.3767
alternative hypothesis: greater



In [104]:
%%R
# Forecast errors (actual minus forecast) of both methods for the second hub.
vecm_resids <- hub2_actuals - hub2_predictions
naive_resids <- hub2_naive_actuals - hub2_naive_predictions

print(paste0("Pair: ", hub1_name, " | ", hub2_name))
print(paste0(hub2_name,": Diebold-Mariano Test:"))

# VECM errors are method 1, naive errors method 2; power = 1 compares
# absolute errors.
# NOTE(review): in forecast::dm.test, "greater" is documented as "method 2
# is more accurate than method 1" - confirm this is the intended direction.
dm.test(
  e1 = vecm_resids,
  e2 = naive_resids,
  alternative = "greater",
  h = window_size,
  power = 1
)

[1] "Pair: ttf | the"
[1] "the: Diebold-Mariano Test:"

	Diebold-Mariano Test

data:  vecm_residsnaive_resids
DM = 1.7196, Forecast horizon = 5, Loss function power = 1, p-value =
0.04338
alternative hypothesis: greater



In [105]:
%%R
# Realised vs naive-benchmark prices for the first hub. This frame feeds the
# plot titled "Naïve Actual vs Forecasted Prices", so it must hold the naive
# series; it previously reused the VECM forecasts.
hub1_forecast_comparison <- data.frame(
  Actual = hub1_naive_actuals,
  Forecasted = hub1_naive_predictions
)

In [106]:
%R -o hub1_forecast_comparison
%R -o hub1_name

actual_trace = go.Scatter(
    x=hub1_forecast_comparison.index,
    y=hub1_forecast_comparison['Actual'],
    mode='lines',
    name='Actual Prices'
)

forecasted_trace = go.Scatter(
    x=hub1_forecast_comparison.index,
    y=hub1_forecast_comparison['Forecasted'],
    mode='lines',
    name='Forecasted Prices'
)

# Create the plot
layout = go.Layout(
    title= hub1_name.item() + ': Naïve Actual vs Forecasted Prices',
    xaxis=dict(title='Index'),
    yaxis=dict(title='Price'),
)

fig = go.Figure(data=[actual_trace, forecasted_trace], layout=layout)
fig.show()

In [107]:
%%R
# Realised vs naive-benchmark prices for the second hub. This frame feeds the
# plot titled "<hub2>: Naïve Actual vs Forecasted Prices"; it was previously
# built from the first hub's VECM series (copy-paste slip).
hub2_forecast_comparison <- data.frame(
  Actual = hub2_naive_actuals,
  Forecasted = hub2_naive_predictions
)

In [108]:
%R -o hub2_forecast_comparison
%R -o hub2_name

actual_trace = go.Scatter(
    x=hub2_forecast_comparison.index,
    y=hub2_forecast_comparison['Actual'],
    mode='lines',
    name='Actual Prices'
)

forecasted_trace = go.Scatter(
    x=hub2_forecast_comparison.index,
    y=hub2_forecast_comparison['Forecasted'],
    mode='lines',
    name='Forecasted Prices'
)

# Create the plot
layout = go.Layout(
    title= hub2_name.item() + ': Naïve Actual vs Forecasted Prices',
    xaxis=dict(title='Index'),
    yaxis=dict(title='Price'),
)

fig = go.Figure(data=[actual_trace, forecasted_trace], layout=layout)
fig.show()