# Time series cross-validation
# y is a time series
# forecastfunction must return an object of class forecast
# h is number of steps ahead to forecast
# ... are passed to forecastfunction
#' Time series cross-validation
#'
#' \code{tsCV} computes the forecast errors obtained by applying
#' \code{forecastfunction} to subsets of the time series \code{y} using a
#' rolling forecast origin.
#'
#' Let \code{y} contain the time series \eqn{y_1,\dots,y_T}{y[1:T]}. Then
#' \code{forecastfunction} is applied successively to the time series
#' \eqn{y_1,\dots,y_t}{y[1:t]}, for \eqn{t=1,\dots,T-h}, making predictions
#' \eqn{\hat{y}_{t+h|t}}{f[t+h]}. The errors are given by \eqn{e_{t+h} =
#' y_{t+h}-\hat{y}_{t+h|t}}{e[t+h] = y[t+h]-f[t+h]}. If h=1, these are returned as a
#' vector, \eqn{e_1,\dots,e_T}{e[1:T]}. For h>1, they are returned as a matrix with
#' the hth column containing errors for forecast horizon h.
#' The first few errors may be missing as
#' it may not be possible to apply \code{forecastfunction} to very short time
#' series.
#'
#' @param y Univariate time series
#' @param forecastfunction Function to return an object of class
#' \code{forecast}. Its first argument must be a univariate time series, and it
#' must have an argument \code{h} for the forecast horizon. If exogenous predictors are used,
#' then it must also have \code{xreg} and \code{newxreg} arguments corresponding to the
#' training and test periods.
#' @param h Forecast horizon
#' @param window Length of the rolling window. If \code{NULL}, a rolling window will not be used.
#' @param xreg Exogenous predictor variables passed to the forecast function if required.
#' @param initial Initial period of the time series where no cross-validation is performed.
#' @param ... Other arguments are passed to \code{forecastfunction}.
#' @return Numerical time series object containing the forecast errors as a vector (if h=1)
#' and a matrix otherwise. The time index corresponds to the last period of the training
#' data. The columns correspond to the forecast horizons.
#' @author Rob J Hyndman
#' @seealso \link{CV}, \link{CVar}, \link{residuals.Arima}, \url{https://robjhyndman.com/hyndsight/tscv/}.
#'
#' @keywords ts
#' @examples
#'
#' # Fit an AR(2) model to each rolling origin subset
#' far2 <- function(x, h) {
#'   forecast(Arima(x, order = c(2, 0, 0)), h = h)
#' }
#' e <- tsCV(lynx, far2, h = 1)
#'
#' # Fit the same model with a rolling window of length 30
#' e <- tsCV(lynx, far2, h = 1, window = 30)
#'
#' # Example with exogenous predictors
#' far2_xreg <- function(x, h, xreg, newxreg) {
#'   forecast(Arima(x, order = c(2, 0, 0), xreg = xreg), xreg = newxreg)
#' }
#'
#' y <- ts(rnorm(50))
#' xreg <- matrix(rnorm(100), ncol = 2)
#' e <- tsCV(y, far2_xreg, h = 3, xreg = xreg)
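#'
#' # One possible way to summarise the result: root mean squared error by
#' # forecast horizon, ignoring origins where no forecast could be produced
#' sqrt(colMeans(e^2, na.rm = TRUE))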
#'
#' @export
tsCV <- function(y, forecastfunction, h = 1, window = NULL, xreg = NULL, initial = 0, ...) {
  y <- as.ts(y)
  n <- length(y)
  e <- ts(matrix(NA_real_, nrow = n, ncol = h))
  if (initial >= n) stop("initial period too long")
  tsp(e) <- tsp(y)
  if (!is.null(xreg)) {
    # Make xreg a ts object to allow easy subsetting later
    xreg <- ts(as.matrix(xreg))
    if (NROW(xreg) != length(y)) {
      stop("xreg must be of the same size as y")
    }
    # Pad xreg with NAs so future values can be extracted for every origin
    xreg <- ts(rbind(xreg, matrix(NA, nrow = h, ncol = NCOL(xreg))),
      start = start(y),
      frequency = frequency(y))
  }
  # Forecast origins over which to roll
  if (is.null(window)) {
    indx <- seq(1 + initial, n - 1L)
  } else {
    indx <- seq(window + initial, n - 1L, by = 1L)
  }
  for (i in indx) {
    # Training data: the full series up to time i, or the last `window` observations
    y_subset <- subset(
      y,
      start = ifelse(is.null(window), 1L,
        ifelse(i - window >= 0L, i - window + 1L, stop("small window"))),
      end = i)
    if (is.null(xreg)) {
      fc <- try(suppressWarnings(
        forecastfunction(y_subset, h = h, ...)
      ), silent = TRUE)
    } else {
      xreg_subset <- subset(
        xreg,
        start = ifelse(is.null(window), 1L,
          ifelse(i - window >= 0L, i - window + 1L, stop("small window"))),
        end = i)
      xreg_future <- subset(
        xreg,
        start = i + 1,
        end = i + h)
      fc <- try(suppressWarnings(
        forecastfunction(y_subset, h = h, xreg = xreg_subset, newxreg = xreg_future, ...)
      ), silent = TRUE)
    }
    if (!is.element("try-error", class(fc))) {
      # Store the forecast errors for origin i at horizons 1 to h
      e[i, ] <- y[i + seq(h)] - fc$mean[seq(h)]
    }
  }
  if (h == 1) {
    return(e[, 1L])
  } else {
    colnames(e) <- paste("h=", 1:h, sep = "")
    return(e)
  }
}
# Cross-validation for AR models
# By Gabriel Caceres
## Note: additional arguments passed on to FUN must be named
#' k-fold Cross-Validation applied to an autoregressive model
#'
#' \code{CVar} computes the errors obtained by applying an autoregressive
#' modelling function to subsets of the time series \code{y} using k-fold
#' cross-validation as described in Bergmeir, Hyndman and Koo (2018). It also
#' applies a Ljung-Box test to the residuals. If this test is significant
#' (see the returned p-value), there is serial correlation in the residuals and the
#' model can be considered to be underfitting the data. In this case, the
#' cross-validated errors can underestimate the generalization error and should
#' not be used.
#'
#' @aliases print.CVar
#'
#' @param y Univariate time series
#' @param k Number of folds to use for cross-validation.
#' @param FUN Function to fit an autoregressive model. Currently, it only works
#' with the \code{\link{nnetar}} function.
#' @param cvtrace Provide progress information.
#' @param blocked If \code{TRUE}, the folds are contiguous blocks of the series; otherwise
#' observations are assigned to folds at random.
#' @param LBlags Number of lags used for the Ljung-Box test of the residuals. Defaults to 24;
#' for yearly series it can be set to 20.
#' @param ... Other arguments are passed to \code{FUN}.
#' @return A list containing information about the model and accuracy for each
#' fold, plus other summary information computed across folds.
#' @author Gabriel Caceres and Rob J Hyndman
#' @seealso \link{CV}, \link{tsCV}.
#' @references Bergmeir, C., Hyndman, R.J., Koo, B. (2018) A note on the
#' validity of cross-validation for evaluating time series prediction.
#' \emph{Computational Statistics & Data Analysis}, \bold{120}, 70-83.
#' \url{https://robjhyndman.com/publications/cv-time-series/}.
#' @keywords ts
#' @examples
#'
#' modelcv <- CVar(lynx, k=5, lambda=0.15)
#' print(modelcv)
#' print(modelcv$fold1)
#'
#' library(ggplot2)
#' autoplot(lynx, series = "Data") +
#'   autolayer(modelcv$testfit, series = "Fits") +
#'   autolayer(modelcv$residuals, series = "Residuals")
#' ggAcf(modelcv$residuals)
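#'
#' # The accuracy measures averaged across folds and the Ljung-Box p-value
#' # are stored in the returned object
#' modelcv$CVsummary
#' modelcv$LBpvalue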
#'
#' @export
CVar <- function(y, k = 10, FUN = nnetar, cvtrace = FALSE, blocked = FALSE, LBlags = 24, ...) {
  nx <- length(y)
  # Number of folds can be at most the number of observations
  k <- min(as.integer(k), nx)
  if (k <= 1L) {
    stop("k must be at least 2")
  }
  # Set up folds
  ind <- seq_len(nx)
  fold <- if (blocked) {
    sort(rep(1:k, length.out = nx))
  } else {
    sample(rep(1:k, length.out = nx))
  }
  cvacc <- matrix(NA_real_, nrow = k, ncol = 7)
  out <- list()
  alltestfit <- rep(NA, length.out = nx)
  for (i in 1:k) {
    out[[paste0("fold", i)]] <- list()
    testset <- ind[fold == i]
    trainset <- ind[fold != i]
    # Fit the model on the training observations only
    trainmodel <- FUN(y, subset = trainset, ...)
    # Apply the trained model to the full series to obtain fitted values on the test set
    testmodel <- FUN(y, model = trainmodel, xreg = trainmodel$xreg)
    testfit <- fitted(testmodel)
    acc <- accuracy(y, testfit, test = testset)
    cvacc[i, ] <- acc
    out[[paste0("fold", i)]]$model <- trainmodel
    out[[paste0("fold", i)]]$accuracy <- acc
    out[[paste0("fold", i)]]$testfit <- testfit
    out[[paste0("fold", i)]]$testset <- testset
    alltestfit[testset] <- testfit[testset]
    if (isTRUE(cvtrace)) {
      cat("Fold", i, "\n")
      print(acc)
      cat("\n")
    }
  }
  out$testfit <- ts(alltestfit)
  tsp(out$testfit) <- tsp(y)
  out$residuals <- out$testfit - y
  out$LBpvalue <- Box.test(out$residuals, type = "Ljung", lag = LBlags)$p.value
  out$k <- k
  # Calculate mean accuracy across all folds
  CVmean <- matrix(apply(cvacc, 2, FUN = mean, na.rm = TRUE),
    dimnames = list(colnames(acc), "Mean"))
  # Calculate accuracy sd across all folds --- include?
  CVsd <- matrix(apply(cvacc, 2, FUN = sd, na.rm = TRUE),
    dimnames = list(colnames(acc), "SD"))
  out$CVsummary <- cbind(CVmean, CVsd)
  out$series <- deparse(substitute(y))
  out$call <- match.call()
  return(structure(out, class = c("CVar", class(trainmodel))))
}
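# Illustration of how CVar assigns observations to folds, e.g. with
# nx = 10 observations and k = 2 folds:
#   blocked = TRUE : sort(rep(1:2, length.out = 10)) -> 1 1 1 1 1 2 2 2 2 2
#                    (each fold is a contiguous block of the series)
#   blocked = FALSE: sample(rep(1:2, length.out = 10)) -> a random permutation
#                    of the same fold labels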
#' @export
print.CVar <- function(x, ...) {
  cat("Series:", x$series, "\n")
  cat("Call: ")
  print(x$call)
  # Add info about series, function, and parameters
  # Add note about any NA/NaN in folds?
  #
  # Print number of folds
  cat("\n", x$k, "-fold cross-validation\n", sep = "")
  # Print mean & sd accuracy() results
  print(x$CVsummary)
  cat("\n")
  cat("p-value of Ljung-Box test of residuals is ", x$LBpvalue, "\n")
  cat("if this value is significant (<0.05),\n")
  cat("the result of the cross-validation should not be used\n")
  cat("as the model is underfitting the data.\n")
  invisible(x)
}