# Statistical models in R
This notebook covers:
1. Comparing models using MAPE and RMSE, and performing Diebold-Mariano tests

In [33]:
import pandas as pd
import plotly.express as px
import plotly.graph_objs as go

In [34]:
# rpy2 is a Python package that allows you to run R code from Python
%pip install rpy2

Note: you may need to restart the kernel to use updated packages.


In [35]:
# Load the rpy2 extension to use R in Jupyter
%load_ext rpy2.ipython

The rpy2.ipython extension is already loaded. To reload it, use:
  %reload_ext rpy2.ipython


The magic function `%%R` is used for running R code in Jupyter

In [37]:
%%R

# Configuration for the model comparison. These are variables of the shared R
# session; the later Diebold-Mariano cells pass the same four names to their
# functions, so the names must stay stable.
# hub_pairs: each element is a two-element character vector naming a hub pair.
hub_pairs <- list(c("ttf", "the"),c("ttf", "nbp"), c("the", "nbp"))
# models: names of the forecasting models to compare.
# NOTE(review): the recorded Diebold-Mariano outputs further down also list a
# "tvecm_long_t2" model that is missing here - presumably those cells were run
# with an earlier version of this vector; confirm before re-running.
models <- c("naive", "arima", "vecm", "tvecm_long_t1", "ann")
# horizon: a later cell passes it to tail() to keep the last `horizon` rows of
# the actual series, and uses it for the "_h<horizon>" part of the prediction
# file names.
horizon <- 250
# window_size: used for the "_w<window_size>" part of the prediction file
# names; presumably the forecast window length - TODO confirm.
window_size <- 5

# Load the comparison helpers; compare_error_metrics() is presumably defined
# in this file. It is sourced after the assignments above, so any object it
# defines under one of those names would replace the value set here.
source("compare_models.r")

# Compute the error-metric tables for every hub pair. The result is only
# assigned here, so the tables shown as cell output are presumably printed
# inside compare_error_metrics() itself - verify in compare_models.r.
results_list <- compare_error_metrics(hub_pairs, models, horizon, window_size)



$ttf_the
$ttf_the$ttf
               mape  rmse rmse_return da_return dv_return rmse_spread da_spread
naive          6.1  2.96        8.05         0~     -617~      0.642         0~
arima         6.11  2.97        8.08        52     -1.56       0.719      49.2 
vecm          6.09  2.92        7.93      57.2       153       0.554      55.6 
tvecm_long_t1 5.95  2.92        7.83      58.8       212       0.519~     60.4 
ann           5.51~ 2.85~       7.68~     60.4       155       0.842      57.6 
              corr_spread dv_spread
naive                 NA     -33.9~
arima             -0.032~     2.99 
vecm               0.532      14.8 
tvecm_long_t1      0.591      16.8 
ann                0.408      15.5 

$ttf_the$the
               mape  rmse rmse_return da_return dv_return rmse_spread da_spread
naive         6.02  2.94        7.93       0.4~     -609~      0.642         0~
arima         6.03  2.95        7.97      51.6      24.5       0.719      49.2 
vecm          6.04   2.9    

1: In cor(actual_spread, predicted_spread, method = "pearson") :
  the standard deviation is zero
2: In cor(actual_spread, predicted_spread, method = "pearson") :
  the standard deviation is zero
3: In cor(actual_spread, predicted_spread, method = "pearson") :
  the standard deviation is zero
4: In min(as.numeric(df[[metric]]), na.rm = TRUE) :
  no non-missing arguments to min; returning Inf
5: In min(as.numeric(df[[metric]]), na.rm = TRUE) :
  no non-missing arguments to min; returning Inf
6: In min(as.numeric(df[[metric]]), na.rm = TRUE) :
  no non-missing arguments to min; returning Inf


In [19]:
%%R
# Source the helper file again before use (presumably so that edits to
# compare_models.r are picked up without restarting the R session).
source("compare_models.r")
# Run the pairwise model comparison for every hub pair. Relies on hub_pairs,
# models, horizon and window_size already existing in the R session. The
# returned value is auto-printed as the cell output; the entries look like
# p-values with significance stars - confirm in compare_models.r.
diebold_mariano(hub_pairs, models, horizon, window_size)

$ttf_the
$ttf_the$ttf
                 naive    arima     vecm tvecm_long_t1 tvecm_long_t2      ann
naive             -    0.339    0.536         0.626         0.007**  0.986   
arima         0.661        -    0.555         0.648         0.005**  0.988   
vecm          0.464    0.445        -         0.623         0.038*   0.990   
tvecm_long_t1 0.374    0.352    0.377             -         0.021*   0.929   
tvecm_long_t2 0.993    0.995    0.962         0.979             -    0.999   
ann           0.014*   0.012*   0.010*        0.071         0.001***     -   

$ttf_the$the
                 naive    arima     vecm tvecm_long_t1 tvecm_long_t2      ann
naive             -    0.414    0.521         0.566         0.013*   0.967   
arima         0.586        -    0.532         0.580         0.010**  0.971   
vecm          0.479    0.468        -         0.562         0.058    0.994   
tvecm_long_t1 0.434    0.420    0.438             -         0.037*   0.938   
tvecm_long_t2 0.987    0.990

In [20]:
%%R
# Spread variant of the comparison (judging by the function name); uses the
# same session variables as the previous cell.
# NOTE(review): the recorded output below is identical, value for value, to
# the diebold_mariano() output above and is still split per hub rather than
# per spread. Either the output is stale or the function does not yet use the
# spread - verify in compare_models.r and re-run.
diebold_mariano_spread(hub_pairs, models, horizon, window_size)

$ttf_the
$ttf_the$ttf
                 naive    arima     vecm tvecm_long_t1 tvecm_long_t2      ann
naive             -    0.339    0.536         0.626         0.007**  0.986   
arima         0.661        -    0.555         0.648         0.005**  0.988   
vecm          0.464    0.445        -         0.623         0.038*   0.990   
tvecm_long_t1 0.374    0.352    0.377             -         0.021*   0.929   
tvecm_long_t2 0.993    0.995    0.962         0.979             -    0.999   
ann           0.014*   0.012*   0.010*        0.071         0.001***     -   

$ttf_the$the
                 naive    arima     vecm tvecm_long_t1 tvecm_long_t2      ann
naive             -    0.414    0.521         0.566         0.013*   0.967   
arima         0.586        -    0.532         0.580         0.010**  0.971   
vecm          0.479    0.468        -         0.562         0.058    0.994   
tvecm_long_t1 0.434    0.420    0.438             -         0.037*   0.938   
tvecm_long_t2 0.987    0.990

In [6]:
%%R

# Manual cross-check of the error metrics for one hub pair and two models.
# Reads the stored prediction files and the interpolated actual series, then
# recomputes MAPE and RMSE for hub 1. Every object is created in the global R
# session on purpose: the following cells reuse the predictions, actuals and
# hub names defined here.
# mape() and rmse() are not defined in this cell; presumably they were made
# available by an earlier source("compare_models.r") - TODO confirm.

hub1_name <- "ttf"
hub2_name <- "the"
model1_name <- "tvecm_long_t1"
model2_name <- "vecm"
horizon <- 250
window_size <- 5

# Build the path of a stored prediction file.
# With `hub2 = NULL` the file name starts with a single hub ("<hub1>_h..."),
# the pattern used by the single-hub variant that was previously kept here as
# commented-out code; otherwise it starts with "<hub1>_<hub2>_h...".
nb_prediction_path <- function(model, horizon, window_size, hub1, hub2 = NULL) {
  hub_tag <- paste(c(hub1, hub2), collapse = "_")
  paste0(
    "../predictions/test/predictions/", hub_tag,
    "_h", horizon, "_w", window_size, "_", model, "_predictions.csv"
  )
}

# Read a hub's interpolated close series, keep the last `n_obs` rows and name
# the columns "Date" and `hub` (assumes the file has exactly these two
# columns, in this order - TODO confirm).
nb_load_actuals <- function(path, hub, n_obs) {
  series <- tail(read.csv(path), n_obs)
  colnames(series) <- c("Date", hub)
  series
}

# TRUE when a prediction table has a column for each of the two hubs.
nb_has_hub_columns <- function(predictions) {
  all(c(hub1_name, hub2_name) %in% names(predictions))
}

model1_filename <- nb_prediction_path(
  model1_name, horizon, window_size, hub1_name, hub2_name
)
model1_predictions <- read.csv(model1_filename)

model2_filename <- nb_prediction_path(
  model2_name, horizon, window_size, hub1_name, hub2_name
)
model2_predictions <- read.csv(model2_filename)

actuals_hub1_filename <- paste0(
  "../data/interpolated/", hub1_name, "_close_interpolated.csv"
)
actuals_hub1 <- nb_load_actuals(actuals_hub1_filename, hub1_name, horizon)

actuals_hub2_filename <- paste0(
  "../data/interpolated/", hub2_name, "_close_interpolated.csv"
)
actuals_hub2 <- nb_load_actuals(actuals_hub2_filename, hub2_name, horizon)

# Fail early instead of silently producing wrong metrics: a missing hub column
# would be extracted as NULL, and series of different lengths would be
# recycled against each other.
if (!nb_has_hub_columns(model1_predictions) ||
      !nb_has_hub_columns(model2_predictions)) {
  stop(
    "Prediction files must contain the columns '", hub1_name, "' and '",
    hub2_name, "'.",
    call. = FALSE
  )
}
nb_row_counts <- c(
  nrow(model1_predictions), nrow(model2_predictions),
  nrow(actuals_hub1), nrow(actuals_hub2)
)
if (length(unique(nb_row_counts)) != 1L) {
  stop(
    "Predictions and actuals differ in length (rows: ",
    paste(nb_row_counts, collapse = ", "), ").",
    call. = FALSE
  )
}

# MAPE is scaled by 100 to report it in percent.
model1_mape_hub1 <- mape(actuals_hub1[[hub1_name]], model1_predictions[[hub1_name]]) * 100
model1_rmse_hub1 <- rmse(actuals_hub1[[hub1_name]], model1_predictions[[hub1_name]])

model2_mape_hub1 <- mape(actuals_hub1[[hub1_name]], model2_predictions[[hub1_name]]) * 100
model2_rmse_hub1 <- rmse(actuals_hub1[[hub1_name]], model2_predictions[[hub1_name]])

# The labels only name the hub: the first MAPE/RMSE pair printed belongs to
# model1_name, the second pair to model2_name.
print(paste0("MAPE for ", hub1_name, ": ", model1_mape_hub1))
print(paste0("RMSE for ", hub1_name, ": ", model1_rmse_hub1))

print(paste0("MAPE for ", hub1_name, ": ", model2_mape_hub1))
print(paste0("RMSE for ", hub1_name, ": ", model2_rmse_hub1))

[1] "MAPE for ttf: 5.95078067443684"
[1] "RMSE for ttf: 2.92374370973872"
[1] "MAPE for ttf: 6.09504827799185"
[1] "RMSE for ttf: 2.91780895245976"


In [7]:
%%R

# Hub-2 counterpart of the previous cell: MAPE (in percent) and RMSE of both
# models against the hub-2 actuals. Relies on actuals_hub2, hub2_name and the
# two prediction tables loaded earlier in the session.

# Model 1
model1_mape_hub2 <- 100 * mape(
  actuals_hub2[[hub2_name]],
  model1_predictions[[hub2_name]]
)
model1_rmse_hub2 <- rmse(
  actuals_hub2[[hub2_name]],
  model1_predictions[[hub2_name]]
)

# Model 2
model2_mape_hub2 <- 100 * mape(
  actuals_hub2[[hub2_name]],
  model2_predictions[[hub2_name]]
)
model2_rmse_hub2 <- rmse(
  actuals_hub2[[hub2_name]],
  model2_predictions[[hub2_name]]
)

# Report in the same order as before: model 1 (MAPE, RMSE), then model 2.
# invisible() keeps the list returned by lapply() out of the cell output.
invisible(lapply(
  c(
    paste0("MAPE for ", hub2_name, ": ", model1_mape_hub2),
    paste0("RMSE for ", hub2_name, ": ", model1_rmse_hub2),
    paste0("MAPE for ", hub2_name, ": ", model2_mape_hub2),
    paste0("RMSE for ", hub2_name, ": ", model2_rmse_hub2)
  ),
  print
))

[1] "MAPE for the: 5.91787470613681"
[1] "RMSE for the: 2.91450453591841"
[1] "MAPE for the: 6.03477430668957"
[1] "RMSE for the: 2.90185965738467"


In [8]:
%%R
# Forecast errors (actual minus predicted) of both models for hub 1.
# The variable names are kept as they are: they show up verbatim in the
# "data:" line of the printed test result, and the next cell inspects
# model2_hub1_resids.
model1_hub1_resids <- actuals_hub1[[hub1_name]] - model1_predictions[[hub1_name]]
model2_hub1_resids <- actuals_hub1[[hub1_name]] - model2_predictions[[hub1_name]]
# Diebold-Mariano test on the two error series. dm.test() is not defined in
# this notebook; presumably it is forecast::dm.test, loaded through
# compare_models.r - TODO confirm.
#   h = 5      : reported as "Forecast horizon = 5" in the output. It equals
#                the current value of window_size but is hard-coded -
#                NOTE(review): pass window_size if the two are meant to agree.
#   power = 1  : reported as "Loss function power = 1" in the output.
#   alternative = "less": one-sided test. NOTE(review): per the
#                forecast::dm.test documentation this alternative is that the
#                second series (model 2) is less accurate than the first
#                (model 1) - confirm for the installed package version.
hub1_dm <- dm.test(model1_hub1_resids, model2_hub1_resids, h = 5, power = 1, alternative = "less")
# Auto-print the test result as the cell output.
hub1_dm


	Diebold-Mariano Test

data:  model1_hub1_residsmodel2_hub1_resids
DM = -0.31266, Forecast horizon = 5, Loss function power = 1, p-value =
0.3774
alternative hypothesis: less



In [9]:
%%R
# Show the first few hub-1 forecast errors of model 2 as a quick sanity check.
head(model2_hub1_resids)

[1]  0.190307  2.733733  1.237164  5.067614  3.821259 10.077344
