# Statistical models in R
This notebook covers:
1. Data preprocessing:
    1. Aligning all dataframes to a 5-day week from 2018-10-01 to 2024-08-30.
    2. Interpolating missing values.
2. ARMA models.

In [1]:
import pandas as pd
import plotly.express as px
import plotly.graph_objs as go

In [2]:
# rpy2 is a Python package that allows you to run R code from Python
%pip install rpy2

Note: you may need to restart the kernel to use updated packages.


In [3]:
# Load the rpy2 extension to use R in Jupyter
%load_ext rpy2.ipython

The `%%R` cell magic is used to run R code in a Jupyter cell.

In [4]:
%%R
# Attach every package used by this notebook, installing any that are missing.
# The vector order is the attach order, which decides which package's
# functions mask which, so it is kept as it was.
required_packages <- c(
  "dplyr", "zoo", "psych", "TSA", "forecast",
  "Metrics", "ggplot2", "vars", "svars", "tsDyn"
)

for (pkg in required_packages) {
  # requireNamespace() only checks availability; it does not attach anything
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg)
  }
  # library() attaches the package and stops with an error if it cannot
  library(pkg, character.only = TRUE)
}


Loading required package: dplyr

Attaching package: ‘dplyr’

The following objects are masked from ‘package:stats’:

    filter, lag

The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union

Loading required package: zoo

Attaching package: ‘zoo’

The following objects are masked from ‘package:base’:

    as.Date, as.Date.numeric

Loading required package: psych
Loading required package: TSA

Attaching package: ‘TSA’

The following objects are masked from ‘package:stats’:

    acf, arima

The following object is masked from ‘package:utils’:

    tar

Loading required package: forecast
Registered S3 method overwritten by 'quantmod':
  method            from
  as.zoo.data.frame zoo 
Registered S3 methods overwritten by 'forecast':
  method       from
  fitted.Arima TSA 
  plot.Arima   TSA 
Loading required package: Metrics

Attaching package: ‘Metrics’

The following object is masked from ‘package:forecast’:

    accuracy

Loading required package: 

In [5]:
%%R
# Read the interpolated closing-price file of each hub; lapply() keeps the
# names of the path vector, so the result is a list keyed by hub code.
hub_prices <- lapply(
  c(
    nbp = "../../data/interpolated/nbp_close_interpolated.csv",
    peg = "../../data/interpolated/peg_close_interpolated.csv",
    the = "../../data/interpolated/the_close_interpolated.csv",
    ttf = "../../data/interpolated/ttf_close_interpolated.csv",
    ztp = "../../data/interpolated/ztp_close_interpolated.csv"
  ),
  read.csv
)

In [6]:
%%R
# Give each hub's price table its own variable
nbp_price <- hub_prices[["nbp"]]
peg_price <- hub_prices[["peg"]]
the_price <- hub_prices[["the"]]
ttf_price <- hub_prices[["ttf"]]
ztp_price <- hub_prices[["ztp"]]

In [7]:
%%R
# Pair of hubs analysed in the following cells: THE as hub1, NBP as hub2
hub1 <- the_price
hub2 <- nbp_price

# Put the two CLOSE columns side by side in one two-column data frame
# NOTE(review): assumes both tables have the same number of rows in the same
# date order — confirm against the preprocessing step
hubs <- data.frame(hub1 = hub1$CLOSE, hub2 = hub2$CLOSE)
# Fit a VAR with a constant term; lag.max = 20 together with ic = "SC" asks
# VAR() to choose the lag order by the SC criterion among lags up to 20
var_p <- VAR(hubs, type = "const", lag.max = 20, ic = "SC")

# Show the selected lag order
var_p$p

SC(n) 
    5 


In [8]:
%%R
#' Rolling-origin VECM forecasts
#'
#' For each of the last `horizon` rows of `hubs`, a VECM is re-fitted on all
#' rows up to `window_size` rows before it, and the forecast made
#' `window_size` steps ahead is stored next to the observed row.
#'
#' @param hubs Data frame (or matrix) with one column per price series.
#' @param window_size Number of steps ahead that each forecast targets.
#' @param horizon Number of evaluation points, taken from the end of `hubs`.
#' @param lag,r,include,estim Passed unchanged to `VECM()`. The defaults are
#'   the values that used to be hard-coded in this function.
#' @return A list with two data frames, `predictions` and `actuals`, each with
#'   `horizon` rows and the columns of `hubs`. Row `i` of both refers to the
#'   same observation.
vecm_predictions <- function(hubs, window_size = 5, horizon = 250,
                             lag = 2, r = 1, include = "both", estim = "ML") {
  stopifnot(
    is.numeric(window_size), length(window_size) == 1, window_size >= 1,
    is.numeric(horizon), length(horizon) == 1, horizon >= 0
  )

  n_obs <- nrow(hubs)
  # The first training sample ends at row n_obs - horizon - window_size + 1,
  # so at least one row must remain for it.
  if (n_obs < horizon + window_size) {
    stop(
      "`hubs` has ", n_obs, " rows but horizon + window_size = ",
      horizon + window_size, " are required.",
      call. = FALSE
    )
  }

  # Preallocate instead of growing a data frame with rbind() on every step
  predictions <- matrix(
    NA_real_,
    nrow = horizon,
    ncol = ncol(hubs),
    dimnames = list(NULL, colnames(hubs))
  )

  # seq_len() makes horizon = 0 a no-op (1:0 would iterate over 1 and 0)
  for (i in seq_len(horizon)) {
    train_size <- n_obs - horizon - window_size + i
    hub_train <- hubs[seq_len(train_size), , drop = FALSE]

    # Re-fit the VECM on the expanding training sample
    vecm <- VECM(hub_train, lag = lag, r = r, include = include, estim = estim)

    # Keep only the forecast for the last of the window_size steps ahead
    hub_forecast <- predict(vecm, n.ahead = window_size)
    predictions[i, ] <- hub_forecast[window_size, ]
  }

  # Step i targets row train_size + window_size = n_obs - horizon + i, so the
  # observed values are simply the last `horizon` rows of `hubs`.
  target_rows <- n_obs - horizon + seq_len(horizon)

  list(
    predictions = as.data.frame(predictions),
    actuals = as.data.frame(hubs[target_rows, , drop = FALSE])
  )
}
  

In [9]:
%%R
# Run the rolling VECM forecast once, then evaluate each hub in turn
vecm_output <- vecm_predictions(hubs, window_size = 5, horizon = 250)

# Hub 1: forecasts, observations and error metrics
hub1_predictions <- vecm_output[["predictions"]][["hub1"]]
hub1_actuals <- vecm_output[["actuals"]][["hub1"]]
hub1_mae <- mae(hub1_actuals, hub1_predictions)
hub1_rmse <- rmse(hub1_actuals, hub1_predictions)

# Hub 2: forecasts, observations and error metrics
hub2_predictions <- vecm_output[["predictions"]][["hub2"]]
hub2_actuals <- vecm_output[["actuals"]][["hub2"]]
hub2_mae <- mae(hub2_actuals, hub2_predictions)
hub2_rmse <- rmse(hub2_actuals, hub2_predictions)

# Report the metrics, hub 1 first
print(paste0("Hub1: Mean Absolute Error: ", hub1_mae))
print(paste0("Hub1: Root Mean Squared Error: ", hub1_rmse))
print(paste0("Hub2: Mean Absolute Error: ", hub2_mae))
print(paste0("Hub2: Root Mean Squared Error: ", hub2_rmse))

[1] "Hub1: Mean Absolute Error: 2.0670589525671"
[1] "Hub1: Root Mean Squared Error: 2.71339321109405"
[1] "Hub2: Mean Absolute Error: 2.18863991694114"
[1] "Hub2: Root Mean Squared Error: 2.95114231020944"


In [10]:
%%R
# Observed vs. VECM-forecasted hub1 prices, side by side for plotting
hub1_forecast_comparison <- data.frame(Actual = hub1_actuals, Forecasted = hub1_predictions)

In [11]:
%R -o hub1_forecast_comparison

# Plot observed vs. VECM-forecasted hub1 prices from the data frame exported
# from R by the `%R -o` line; its row index is used as the x-axis.
actual_trace = go.Scatter(
    x=hub1_forecast_comparison.index,
    y=hub1_forecast_comparison['Actual'],
    mode='lines',
    name='Actual Prices'
)

forecasted_trace = go.Scatter(
    x=hub1_forecast_comparison.index,
    y=hub1_forecast_comparison['Forecasted'],
    mode='lines',
    name='Forecasted Prices'
)

# Figure title and axis labels
layout = go.Layout(
    title='VECM Actual vs Forecasted Prices',
    xaxis=dict(title='Index'),
    yaxis=dict(title='Price'),
)

fig = go.Figure(data=[actual_trace, forecasted_trace], layout=layout)
fig.show()


In [12]:
%%R
# Observed vs. VECM-forecasted hub2 prices, side by side for plotting
hub2_forecast_comparison <- data.frame(Actual = hub2_actuals, Forecasted = hub2_predictions)

In [13]:
%R -o hub2_forecast_comparison

# Plot observed vs. VECM-forecasted hub2 prices from the data frame exported
# from R by the `%R -o` line; its row index is used as the x-axis.
actual_trace = go.Scatter(
    x=hub2_forecast_comparison.index,
    y=hub2_forecast_comparison['Actual'],
    mode='lines',
    name='Actual Prices'
)

forecasted_trace = go.Scatter(
    x=hub2_forecast_comparison.index,
    y=hub2_forecast_comparison['Forecasted'],
    mode='lines',
    name='Forecasted Prices'
)

# Figure title and axis labels
layout = go.Layout(
    title='VECM Actual vs Forecasted Prices',
    xaxis=dict(title='Index'),
    yaxis=dict(title='Price'),
)

fig = go.Figure(data=[actual_trace, forecasted_trace], layout=layout)
fig.show()


In [19]:
%%R
# Naive benchmark: the forecast for a day is the price observed window_size
# rows earlier. It is evaluated on the last `horizon` rows of `hubs`.
window_size <- 5
horizon <- 250

# Shift every column down by window_size rows, padding the top with NA.
# This is written out explicitly rather than calling lag(): which lag() runs
# depends on dplyr masking stats::lag, and stats::lag does not move the values.
n_obs <- nrow(hubs)
hubs_lag <- hubs
hubs_lag[] <- lapply(
  hubs,
  function(col) c(rep(NA, window_size), col[seq_len(n_obs - window_size)])
)

# Row i of the prediction table is the price window_size rows before row i of
# the table of observed values
hubs_prediction <- tail(hubs_lag, n = horizon)
hubs_actual <- tail(hubs, n = horizon)

hub1_naive_predictions <- hubs_prediction$hub1
hub1_naive_actuals <- hubs_actual$hub1
hub2_naive_predictions <- hubs_prediction$hub2
hub2_naive_actuals <- hubs_actual$hub2

hub1_naive_mae <- mae(hub1_naive_actuals, hub1_naive_predictions)
hub2_naive_mae <- mae(hub2_naive_actuals, hub2_naive_predictions)

hub1_naive_rmse <- rmse(hub1_naive_actuals, hub1_naive_predictions)
hub2_naive_rmse <- rmse(hub2_naive_actuals, hub2_naive_predictions)

print(paste0("Hub1: Mean Absolute Error: ", hub1_naive_mae))
print(paste0("Hub1: Root Mean Squared Error: ", hub1_naive_rmse))

print(paste0("Hub2: Mean Absolute Error: ", hub2_naive_mae))
print(paste0("Hub2: Root Mean Squared Error: ", hub2_naive_rmse))

[1] "Hub1: Mean Absolute Error: 2.08176266666667"
[1] "Hub1: Root Mean Squared Error: 2.73317708716025"
[1] "Hub2: Mean Absolute Error: 2.23126705883691"
[1] "Hub2: Root Mean Squared Error: 3.08543650003223"


In [20]:
%%R
# Forecast errors (observed minus forecast) of the naive benchmark and of the
# VECM for hub1
naive_resids <- hub1_naive_actuals - hub1_naive_predictions
vecm_resids <- hub1_actuals - hub1_predictions
# Diebold-Mariano test with the VECM errors as the first series and the naive
# errors as the second, for h = window_size and loss power 1.
# NOTE(review): per the forecast::dm.test documentation, alternative =
# "greater" has the alternative hypothesis that the SECOND series (naive) is
# more accurate than the first (VECM). If the aim is to show that the VECM
# beats the naive benchmark, alternative = "less" is presumably intended —
# confirm.
dm.test(vecm_resids, naive_resids, h = window_size, power = 1, alternative = "greater")


	Diebold-Mariano Test

data:  vecm_residsnaive_resids
DM = -0.65126, Forecast horizon = 5, Loss function power = 1, p-value =
0.7423
alternative hypothesis: greater



In [21]:
%%R
# Forecast errors (observed minus forecast) of the naive benchmark and of the
# VECM for hub2
naive_resids <- hub2_naive_actuals - hub2_naive_predictions
vecm_resids <- hub2_actuals - hub2_predictions
# Diebold-Mariano test with the VECM errors as the first series and the naive
# errors as the second, for h = window_size and loss power 1.
# NOTE(review): per the forecast::dm.test documentation, alternative =
# "greater" has the alternative hypothesis that the SECOND series (naive) is
# more accurate than the first (VECM). If the aim is to show that the VECM
# beats the naive benchmark, alternative = "less" is presumably intended —
# confirm.
dm.test(vecm_resids, naive_resids, h = window_size, power = 1, alternative = "greater")


	Diebold-Mariano Test

data:  vecm_residsnaive_resids
DM = -0.59988, Forecast horizon = 5, Loss function power = 1, p-value =
0.7254
alternative hypothesis: greater



In [22]:
%%R
# Observed vs. naive-forecasted hub1 prices for the plot titled "Naïve".
# The naive vectors are used here; the VECM vectors were a copy-paste
# leftover that made this plot repeat the earlier VECM chart.
hub1_forecast_comparison <- data.frame(
  Actual = hub1_naive_actuals,
  Forecasted = hub1_naive_predictions
)

In [23]:
%R -o hub1_forecast_comparison

# Line chart of the 'Actual' and 'Forecasted' columns of the hub1 comparison
# data frame exported from R, drawn against the data frame's row index.
actual_trace = go.Scatter(
    name='Actual Prices',
    mode='lines',
    x=hub1_forecast_comparison.index,
    y=hub1_forecast_comparison['Actual'],
)
forecasted_trace = go.Scatter(
    name='Forecasted Prices',
    mode='lines',
    x=hub1_forecast_comparison.index,
    y=hub1_forecast_comparison['Forecasted'],
)

# Title and axis labels
layout = go.Layout(
    title='Naïve Actual vs Forecasted Prices',
    xaxis={'title': 'Index'},
    yaxis={'title': 'Price'},
)

# Start from the layout, then add the two series in legend order
fig = go.Figure(layout=layout)
fig.add_trace(actual_trace)
fig.add_trace(forecasted_trace)
fig.show()

In [24]:
%%R
# Observed vs. naive-forecasted hub2 prices for the plot titled "Naïve".
# The hub2 naive vectors are used here; this frame was previously filled with
# the hub1 VECM vectors, a copy-paste leftover.
hub2_forecast_comparison <- data.frame(
  Actual = hub2_naive_actuals,
  Forecasted = hub2_naive_predictions
)

In [25]:
%R -o hub2_forecast_comparison

# Line chart of the 'Actual' and 'Forecasted' columns of the hub2 comparison
# data frame exported from R, drawn against the data frame's row index.
actual_trace = go.Scatter(
    name='Actual Prices',
    mode='lines',
    x=hub2_forecast_comparison.index,
    y=hub2_forecast_comparison['Actual'],
)
forecasted_trace = go.Scatter(
    name='Forecasted Prices',
    mode='lines',
    x=hub2_forecast_comparison.index,
    y=hub2_forecast_comparison['Forecasted'],
)

# Title and axis labels
layout = go.Layout(
    title='Naïve Actual vs Forecasted Prices',
    xaxis={'title': 'Index'},
    yaxis={'title': 'Price'},
)

# Start from the layout, then add the two series in legend order
fig = go.Figure(layout=layout)
fig.add_trace(actual_trace)
fig.add_trace(forecasted_trace)
fig.show()