# Statistical models in R
This notebook covers:
1. Comparing models using MAPE and RMSE, and performing Diebold-Mariano tests

In [42]:
import pandas as pd
import plotly.express as px
import plotly.graph_objs as go

In [43]:
# rpy2 is a Python package that allows you to run R code from Python.
# The %pip magic installs it from within the notebook; as the message printed
# below says, a kernel restart may be needed before the package is usable.
%pip install rpy2

Note: you may need to restart the kernel to use updated packages.


In [44]:
# Load the rpy2 extension to use R in Jupyter; the R cells further down rely
# on the `%%R` cell magic.
%load_ext rpy2.ipython

The rpy2.ipython extension is already loaded. To reload it, use:
  %reload_ext rpy2.ipython


The magic function `%%R` is used for running R code in Jupyter

In [45]:
%%R

# Hub pairs to evaluate; each element holds the two hub names of one pair.
hub_pairs <- list(
  c("ttf", "the"),
  c("ttf", "nbp"),
  c("the", "nbp")
)

# Names of the models whose predictions are compared.
models <- c(
  "naive",
  "arima",
  "vecm",
  "tvecm_long_t1",
  "tvecm_long_t2",
  "ann"
)

# Settings handed unchanged to the comparison helpers.
horizon <- 250
window_size <- 5

# Load compare_models.r into the R session (presumably where
# compare_error_metrics is defined).
source("compare_models.r")

# Run the error-metric comparison for the configuration above and keep the
# returned value for later use.
results_list <- compare_error_metrics(
  hub_pairs,
  models,
  horizon,
  window_size
)



$ttf_the
$ttf_the$ttf
               mape  rmse
naive          6.1  2.96 
arima         6.11  2.97 
vecm           6.1  2.92~
tvecm_long_t1 5.95  2.92~
tvecm_long_t2 6.76   3.2 
ann           5.92~ 3.02 

$ttf_the$the
               mape  rmse
naive         6.02  2.94 
arima         6.03  2.95 
vecm          6.03   2.9~
tvecm_long_t1 5.92  2.91 
tvecm_long_t2 6.63  3.16 
ann           5.82~ 2.96 


$ttf_nbp
$ttf_nbp$ttf
               mape  rmse
naive          6.1  2.96 
arima         6.11  2.97 
vecm          5.91~ 2.94~
tvecm_long_t1 6.49  3.04 
tvecm_long_t2 6.41  3.06 
ann           6.23  2.99 

$ttf_nbp$nbp
               mape  rmse
naive          6.5  3.09 
arima         6.51  3.08 
vecm           6.2~ 3.03~
tvecm_long_t1 6.53  3.06 
tvecm_long_t2 6.97  3.23 
ann           6.62  3.12 


$the_nbp
$the_nbp$the
               mape  rmse
naive         6.02  2.94 
arima         6.03  2.95 
vecm          5.86~ 2.93~
tvecm_long_t1 6.44  3.04 
tvecm_long_t2 6.45  3.07 
ann           6.13

In [46]:
%%R
# Re-source compare_models.r (presumably to pick up edits made to the file
# since the earlier cell ran).
source("compare_models.r")
# Run the Diebold-Mariano comparison with the hub pairs, models, horizon and
# window size defined in an earlier cell; the returned value is auto-printed.
# NOTE(review): the printed matrices look like pairwise p-values per hub, with
# stars marking significance levels -- confirm against compare_models.r.
diebold_mariano(hub_pairs, models, horizon, window_size)

$ttf_the
$ttf_the$ttf
                 naive    arima     vecm tvecm_long_t1 tvecm_long_t2      ann
naive             -    0.339    0.536         0.626         0.007**  0.713   
arima         0.661        -    0.555         0.648         0.005**  0.730   
vecm          0.464    0.445        -         0.623         0.038*   0.641   
tvecm_long_t1 0.374    0.352    0.377             -         0.021*   0.565   
tvecm_long_t2 0.993    0.995    0.962         0.979             -    0.993   
ann           0.287    0.270    0.359         0.435         0.007**      -   

$ttf_the$the
                 naive    arima     vecm tvecm_long_t1 tvecm_long_t2      ann
naive             -    0.414    0.521         0.566         0.013*   0.791   
arima         0.586        -    0.532         0.580         0.010**  0.799   
vecm          0.479    0.468        -         0.562         0.058    0.721   
tvecm_long_t1 0.434    0.420    0.438             -         0.037*   0.668   
tvecm_long_t2 0.987    0.990

In [47]:
%%R

# Manual spot check: recompute MAPE and RMSE for two models on one hub pair
# directly from the saved prediction files, and report them for the first hub.
# NOTE(review): `mape` and `rmse` are not defined in this cell; they are
# presumably provided by compare_models.r or a package it loads.

hub1_name <- "ttf"
hub2_name <- "the"
model1_name <- "tvecm_long_t1"
model2_name <- "vecm"
horizon <- 250
window_size <- 5

# Build the path of the predictions file for one model of a hub pair.
predictions_path <- function(hub1, hub2, horizon, window_size, model) {
  paste0(
    "../predictions/test/predictions/",
    hub1, "_", hub2, "_h", horizon, "_w", window_size, "_", model,
    "_predictions.csv"
  )
}

# Build the path of the interpolated actuals file for one hub.
actuals_path <- function(hub) {
  paste0("../data/interpolated/", hub, "_close_interpolated.csv")
}

# Format one metric line, naming the model so that the two models' results
# can be told apart in the printed output.
metric_label <- function(metric, hub, model, value) {
  paste0(metric, " for ", hub, " (", model, "): ", value)
}

model1_filename <- predictions_path(
  hub1_name, hub2_name, horizon, window_size, model1_name
)
model1_predictions <- read.csv(model1_filename)

model2_filename <- predictions_path(
  hub1_name, hub2_name, horizon, window_size, model2_name
)
model2_predictions <- read.csv(model2_filename)

# Keep only the last `horizon` rows of each actuals file and name the two
# columns "Date" and the hub name.
actuals_hub1_filename <- actuals_path(hub1_name)
actuals_hub1 <- tail(read.csv(actuals_hub1_filename), horizon)
colnames(actuals_hub1) <- c("Date", hub1_name)

actuals_hub2_filename <- actuals_path(hub2_name)
actuals_hub2 <- tail(read.csv(actuals_hub2_filename), horizon)
colnames(actuals_hub2) <- c("Date", hub2_name)

# The metrics compare actuals and predictions element by element, so
# misaligned row counts would make them meaningless; fail early instead.
stopifnot(
  nrow(model1_predictions) == nrow(actuals_hub1),
  nrow(model2_predictions) == nrow(actuals_hub1),
  nrow(actuals_hub2) == nrow(actuals_hub1)
)

# MAPE is scaled by 100 to report it as a percentage.
model1_mape_hub1 <-
  mape(actuals_hub1[[hub1_name]], model1_predictions[[hub1_name]]) * 100
model1_rmse_hub1 <-
  rmse(actuals_hub1[[hub1_name]], model1_predictions[[hub1_name]])

model2_mape_hub1 <-
  mape(actuals_hub1[[hub1_name]], model2_predictions[[hub1_name]]) * 100
model2_rmse_hub1 <-
  rmse(actuals_hub1[[hub1_name]], model2_predictions[[hub1_name]])

print(metric_label("MAPE", hub1_name, model1_name, model1_mape_hub1))
print(metric_label("RMSE", hub1_name, model1_name, model1_rmse_hub1))

print(metric_label("MAPE", hub1_name, model2_name, model2_mape_hub1))
print(metric_label("RMSE", hub1_name, model2_name, model2_rmse_hub1))

[1] "MAPE for ttf: 5.95078067443684"
[1] "RMSE for ttf: 2.92374370973872"
[1] "MAPE for ttf: 6.09504827799185"
[1] "RMSE for ttf: 2.91780895245976"


In [48]:
%%R

# Repeat the manual MAPE / RMSE check for the second hub of the pair, using
# the predictions and actuals loaded in an earlier cell.
# NOTE(review): `mape` and `rmse` are not defined in this cell; they are
# presumably provided by compare_models.r or a package it loads.

# The metrics compare actuals and predictions element by element; a missing
# column or a different length would make them meaningless, so fail early.
stopifnot(
  length(model1_predictions[[hub2_name]]) == nrow(actuals_hub2),
  length(model2_predictions[[hub2_name]]) == nrow(actuals_hub2)
)

# MAPE is scaled by 100 to report it as a percentage.
model1_mape_hub2 <-
  mape(actuals_hub2[[hub2_name]], model1_predictions[[hub2_name]]) * 100
model1_rmse_hub2 <-
  rmse(actuals_hub2[[hub2_name]], model1_predictions[[hub2_name]])

model2_mape_hub2 <-
  mape(actuals_hub2[[hub2_name]], model2_predictions[[hub2_name]]) * 100
model2_rmse_hub2 <-
  rmse(actuals_hub2[[hub2_name]], model2_predictions[[hub2_name]])

# Name the model in every label so the two models' results can be told apart.
print(paste0(
  "MAPE for ", hub2_name, " (", model1_name, "): ", model1_mape_hub2
))
print(paste0(
  "RMSE for ", hub2_name, " (", model1_name, "): ", model1_rmse_hub2
))

print(paste0(
  "MAPE for ", hub2_name, " (", model2_name, "): ", model2_mape_hub2
))
print(paste0(
  "RMSE for ", hub2_name, " (", model2_name, "): ", model2_rmse_hub2
))

[1] "MAPE for the: 5.91787470613681"
[1] "RMSE for the: 2.91450453591841"
[1] "MAPE for the: 6.03477430668957"
[1] "RMSE for the: 2.90185965738467"


In [49]:
%%R
# Forecast errors (actual minus predicted) of both models on the first hub.
model1_hub1_resids <-
  actuals_hub1[[hub1_name]] - model1_predictions[[hub1_name]]
model2_hub1_resids <-
  actuals_hub1[[hub1_name]] - model2_predictions[[hub1_name]]

# One-sided Diebold-Mariano test on the two error series.
# NOTE(review): assuming this is forecast::dm.test, alternative = "less" has
# the alternative hypothesis that method 2 is less accurate than method 1 --
# confirm. h = 5 equals the window_size set earlier; verify that is intended.
hub1_dm <- dm.test(
  model1_hub1_resids,
  model2_hub1_resids,
  h = 5,
  power = 1,
  alternative = "less"
)

# Auto-print the test result.
hub1_dm


	Diebold-Mariano Test

data:  model1_hub1_residsmodel2_hub1_resids
DM = -0.31266, Forecast horizon = 5, Loss function power = 1, p-value =
0.3774
alternative hypothesis: less



In [50]:
%%R
# Show the first few residuals (actual minus predicted) of the second model
# on the first hub, as computed in the Diebold-Mariano cell.
head(model2_hub1_resids)

[1]  0.190307  2.733733  1.237164  5.067614  3.821259 10.077344
