# Statistical models in R
This notebook covers:
1. ARIMA-models.

In [2]:
import pandas as pd
import plotly.express as px
import plotly.graph_objs as go

In [3]:
# rpy2 is a Python package that allows you to run R code from Python
%pip install rpy2

Note: you may need to restart the kernel to use updated packages.


In [4]:
# Load the rpy2 extension to use R in Jupyter
%load_ext rpy2.ipython

The `%%R` cell magic runs the contents of a cell as R code in Jupyter.

In [5]:
%%R
# Install (when missing) and attach the required packages.
# The order is kept on purpose: packages attached later mask same-named
# functions from packages attached earlier.
required_packages <- c(
  "dplyr", "zoo", "psych", "TSA", "forecast",
  "Metrics", "ggplot2", "vars", "svars", "tsDyn"
)

for (pkg in required_packages) {
  # requireNamespace() only checks availability; it does not attach anything
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg)
  }
  # library() stops with an error if the package is still unavailable
  library(pkg, character.only = TRUE)
}


Loading required package: dplyr

Attaching package: ‘dplyr’

The following objects are masked from ‘package:stats’:

    filter, lag

The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union

Loading required package: zoo

Attaching package: ‘zoo’

The following objects are masked from ‘package:base’:

    as.Date, as.Date.numeric

Loading required package: psych
Loading required package: TSA

Attaching package: ‘TSA’

The following objects are masked from ‘package:stats’:

    acf, arima

The following object is masked from ‘package:utils’:

    tar

Loading required package: forecast
Registered S3 method overwritten by 'quantmod':
  method            from
  as.zoo.data.frame zoo 
Registered S3 methods overwritten by 'forecast':
  method       from
  fitted.Arima TSA 
  plot.Arima   TSA 
Loading required package: Metrics

Attaching package: ‘Metrics’

The following object is masked from ‘package:forecast’:

    accuracy

Loading required package: 

In [7]:
%%R
# Read the interpolated close-price CSV of every hub into a named list
hub_price_files <- c(
  nbp = "../../data/interpolated/nbp_close_interpolated.csv",
  peg = "../../data/interpolated/peg_close_interpolated.csv",
  the = "../../data/interpolated/the_close_interpolated.csv",
  ttf = "../../data/interpolated/ttf_close_interpolated.csv",
  ztp = "../../data/interpolated/ztp_close_interpolated.csv"
)

# lapply() keeps the vector names, so the list is keyed by hub
hub_prices <- lapply(hub_price_files, read.csv)

In [24]:
%%R

# Hub to model; must be one of the names of `hub_prices`
hub1_name <- "nbp"
stopifnot(hub1_name %in% names(hub_prices))

# Later cells label their output with `hub_name`, so keep it as an alias
hub_name <- hub1_name

# Forecast settings used by later cells; the values equal the defaults of
# arima_predictions(), so the saved file names describe the actual run
window_size <- 5
horizon <- 250

hub1 <- hub_prices[[hub1_name]]

# One-column data frame with the close prices of the selected hub
hub <- data.frame(hub = hub1$CLOSE)


In [19]:
%%R
#' Rolling-origin ARIMA forecasts for a single series
#'
#' For each of the last `horizon` observations, an ARIMA model is fitted on
#' all observations up to `window_size` steps before it, and the
#' `window_size`-step-ahead point forecast is stored next to the observed
#' value.
#'
#' @param hub One-column data frame (or matrix) holding the series.
#' @param window_size Number of steps ahead that is forecast.
#' @param horizon Number of trailing observations to forecast.
#' @param order ARIMA order `c(p, d, q)` passed to `arima()`.
#' @return A list with two one-column data frames, `predictions` and
#'   `actuals`, each with `horizon` rows and the column name of `hub`.
arima_predictions <- function(hub, window_size = 5, horizon = 250,
                              order = c(3, 1, 3)) {
  stopifnot(
    ncol(hub) == 1L,
    window_size >= 1,
    horizon >= 1
  )

  n_obs <- nrow(hub)
  if (n_obs < horizon + window_size) {
    stop(
      "`hub` has ", n_obs, " rows but at least ", horizon + window_size,
      " are needed for horizon = ", horizon,
      " and window_size = ", window_size,
      call. = FALSE
    )
  }

  series <- if (is.data.frame(hub)) hub[[1]] else hub[, 1]

  # Preallocate instead of growing data frames with rbind() in the loop
  predicted <- numeric(horizon)

  for (i in seq_len(horizon)) {
    train_size <- n_obs - horizon - window_size + i

    # Fit the ARIMA model on the expanding training window
    arima_model <- arima(series[seq_len(train_size)], order = order)

    # Keep only the forecast for the final step of the window
    hub_forecast <- predict(arima_model, n.ahead = window_size)$pred
    predicted[i] <- hub_forecast[window_size]
  }

  # The forecast targets are train_size + window_size, i.e. the last
  # `horizon` observations of the series
  observed <- series[(n_obs - horizon + 1):n_obs]

  predictions <- data.frame(predicted)
  actuals <- data.frame(observed)
  colnames(predictions) <- colnames(hub)
  colnames(actuals) <- colnames(hub)

  list(predictions = predictions, actuals = actuals)
}
  

In [20]:
%%R
# Run the rolling ARIMA forecast and pull the results out as plain vectors
output <- arima_predictions(hub)
hub_actuals <- output[["actuals"]][["hub"]]
hub_predictions <- output[["predictions"]][["hub"]]

# Out-of-sample accuracy of the ARIMA forecasts
hub_rmse <- rmse(actual = hub_actuals, predicted = hub_predictions)
hub_mae <- mae(actual = hub_actuals, predicted = hub_predictions)

# `hub_name` is expected to exist already in the R session
print(paste0(hub_name, ": Mean Absolute Error: ", hub_mae))
print(paste0(hub_name, ": Root Mean Squared Error: ", hub_rmse))

[1] "nbp: Mean Absolute Error: 2.23807459091554"
[1] "nbp: Root Mean Squared Error: 3.0877651956768"


In stats::arima(x = x, order = order, seasonal = seasonal, xreg = xreg,  :
  possible convergence problem: optim gave code = 1


In [21]:
%%R
# Label the forecast and actual tables with the hub name and their dates
predictions <- output$predictions
actuals <- output$actuals

colnames(predictions) <- hub1_name
colnames(actuals) <- hub1_name

# The forecasts cover the last nrow(predictions) observations of the hub
# series, so the matching dates are the tail of the original data. Using
# the row count keeps dates and values aligned by construction.
prediction_dates <- tail(hub1$Date, nrow(predictions))
predictions <- cbind(data.frame(Date = prediction_dates), predictions)
actuals <- cbind(data.frame(Date = prediction_dates), actuals)

In [25]:
%%R
# Save forecasts and actuals; file names encode hub, horizon and window size
output_stem <- paste0(
  "../../predictions/", hub1_name, "_h", horizon, "_w", window_size
)

# NOTE(review): the predictions file has no "_" before "arima", unlike the
# actuals file; kept as-is because other code may read this exact name
write.csv(
  predictions,
  paste0(output_stem, "arima_predictions.csv"),
  row.names = FALSE
)
write.csv(
  actuals,
  paste0(output_stem, "_actuals.csv"),
  row.names = FALSE
)

In [12]:
%%R
# Observed and ARIMA-forecasted prices side by side, one row per forecast
hub_forecast_comparison <- cbind.data.frame(
  Actual = hub_actuals,
  Forecasted = hub_predictions
)

In [14]:
%R -o hub_forecast_comparison
%R -o hub_name

hub_forecast_comparison = hub_forecast_comparison

actual_trace = go.Scatter(
    x=hub_forecast_comparison.index,
    y=hub_forecast_comparison['Actual'],
    mode='lines',
    name='Actual Prices'
)

forecasted_trace = go.Scatter(
    x=hub_forecast_comparison.index,
    y=hub_forecast_comparison['Forecasted'],
    mode='lines',
    name='Forecasted Prices'
)

# Create the plot
layout = go.Layout(
    title=hub_name.item()+ ': ARIMA Actual vs Forecasted Prices',
    xaxis=dict(title='Index'),
    yaxis=dict(title='Price'),
)

fig = go.Figure(data=[actual_trace, forecasted_trace], layout=layout)
fig.show()


In [15]:
%%R
# Naive benchmark: forecast each price with the price observed
# `window_size` steps earlier, scored on the last `horizon` observations.
window_size <- 5
horizon <- 250

# Shift the price vector explicitly with dplyr::lag(). A bare lag() on the
# data frame depends on which attached package masks `lag`; stats::lag()
# would leave the values unshifted.
hub_lag <- data.frame(hub = dplyr::lag(hub$hub, n = window_size))

hub_prediction <- tail(hub_lag, n = horizon)
hub_actual <- tail(hub, n = horizon)

hub_naive_predictions <- hub_prediction$hub
hub_naive_actuals <- hub_actual$hub

hub_naive_mae <- mae(hub_naive_actuals, hub_naive_predictions)
hub_naive_rmse <- rmse(hub_naive_actuals, hub_naive_predictions)

print(paste0(hub_name, ": Mean Absolute Error: ", hub_naive_mae))
# The value is an RMSE, so label it as one (matches the ARIMA cell)
print(paste0(hub_name, ": Root Mean Squared Error: ", hub_naive_rmse))

[1] "nbp: Mean Absolute Error: 2.23126705883691"
[1] "nbp: Mean Squared Error: 3.08543650003223"


In [16]:
%%R
# Forecast errors of the ARIMA model and of the naive benchmark
arima_resids <- hub_actuals - hub_predictions
naive_resids <- hub_naive_actuals - hub_naive_predictions

print(paste0(hub_name, ": Diebold Mariano Test"))

# NOTE(review): per the forecast documentation, alternative = "greater"
# means the second error series (naive) is more accurate than the first
# (ARIMA) under the alternative; confirm this is the intended direction
dm.test(
  e1 = arima_resids,
  e2 = naive_resids,
  alternative = "greater",
  h = window_size,
  power = 1
)

[1] "nbp: Diebold Mariano Test"

	Diebold-Mariano Test

data:  arima_residsnaive_resids
DM = 0.77795, Forecast horizon = 5, Loss function power = 1, p-value =
0.2187
alternative hypothesis: greater

