/
dynamic_factor.py
555 lines (498 loc) · 22.5 KB
/
dynamic_factor.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
# copyright: sktime developers, BSD-3-Clause License (see LICENSE file)
"""Implements DynamicFactor Model as interface to statsmodels."""
import numpy as np
import pandas as pd
from sktime.forecasting.base.adapters import _StatsModelsAdapter
__all__ = ["DynamicFactor"]
__author__ = ["Ris-Bali", "lbventura"]
class DynamicFactor(_StatsModelsAdapter):
"""Dynamic Factor Forecaster.
Direct interface for ``statsmodels.tsa.statespace.dynamic_factor``
Parameters
----------
k_factors : int
The number of unobserved factors.
factor_order : int
The order of vector autoregression followed by factors.
error_cov_type : {'scalar','diagonal','unstructured'} ,default = 'diagonal'
The structure of the covariance matrix of the observation error term, where
"unstructured" puts no restrictions on the matrix, "diagonal" requires it
to be any diagonal matrix (uncorrelated errors), and "scalar" requires it
to be a scalar times the identity matrix.
error_order : int , default = 0
The order of the vector autoregression followed by the observation error
component. Default is None, corresponding to white noise errors.
error_var : bool , default = False , optional
Whether or not to model the errors jointly via a vector autoregression,
rather than as individual autoregression.
enforce_stationarity : bool default = True
Whether or not to model the AR parameters to enforce stationarity in the
autoregressive component of the model.
start_params :array_like ,default = None
Initial guess of the solution for the loglikelihood maximization.
transformed : bool, default = True
Whether or not start_params is already transformed.
includes_fixed : bool , default = False
If parameters were previously fixed with the fix_params method, this argument
describes whether or not start_params also includes the fixed parameters,
in addition to the free parameters.
cov_type : {'opg','oim','approx','robust','robust_approx','none'},default = 'opg'
'opg' for the outer product of gradient estimator
'oim' for the observed information matrix estimator, calculated
using the method of Harvey (1989)
'approx' for the observed information matrix estimator, calculated using
a numerical approximation of the Hessian matrix.
'robust' for an approximate (quasi-maximum likelihood) covariance matrix
that may be valid even in the presence of some misspecifications.
Intermediate calculations use the 'oim' method.
'robust_approx' is the same as 'robust' except that the intermediate
calculations use the 'approx' method.
'none' for no covariance matrix calculation
cov_kwds :dict or None , default = None
'approx_complex_step' : bool, optional - If True, numerical approximations are
computed using complex-step methods. If False, numerical approximations are
computed using finite difference methods. Default is True.
'approx_centered' : bool, optional - If True, numerical approximations computed
using finite difference methods use a centered approximation. Default is False.
method : str , 'lbfgs'
'newton' for Newton-Raphson
'nm' for Nelder-Mead
'bfgs' for Broyden-Fletcher-Goldfarb-Shanno (BFGS)
'lbfgs' for limited-memory BFGS with optional box constraints
'powell' for modified Powell's method
'cg' for conjugate gradient
'ncg' for Newton-conjugate gradient
'basinhopping' for global basin-hopping solver
maxiter : int , optional ,default = 50
The maximum number of iterations to perform.
full_output : bool , default = 1
Set to True to have all available output in the
Results object's mle_retvals attribute.
The output is dependent on the solver.
disp : bool , default = 5
Set to True to print convergence messages.
callback : callable callback(xk) , default = None
Called after each iteration, as callback(xk),
where xk is the current parameter vector.
return_params : bool ,default = False
Whether or not to return only the array of maximizing parameters.
optim_score : {'harvey','approx'} , default = None
The method by which the score vector is calculated. 'harvey' uses the method
from Harvey (1989), 'approx' uses either finite difference or
complex step differentiation depending upon the value of optim_complex_step,
and None uses the built-in gradient approximation of the optimizer.
optim_complex_step : bool , default = True
Whether or not to use complex step differentiation
when approximating the score; if False, finite difference approximation is used.
optim_hessian : {'opg','oim','approx'} , default = None
'opg' uses outer product of gradients, 'oim' uses the information
matrix formula from Harvey (1989), and 'approx' uses numerical approximation.
low_memory : bool , default = False
If set to True, techniques are applied to substantially reduce memory usage.
If used, some features of the results object will not be available
(including smoothed results and in-sample prediction),
although out-of-sample forecasting is possible.
See Also
--------
statsmodels.tsa.statespace.dynamic_factor.DynamicFactor
statsmodels.tsa.statespace.dynamic_factor.DynamicFactorResults
References
----------
.. [1] Lütkepohl, Helmut. 2007. New Introduction to Multiple Time Series Analysis.
Berlin: Springer.
Examples
--------
>>> from sktime.utils._testing.series import _make_series
>>> from sktime.forecasting.dynamic_factor import DynamicFactor
>>> y = _make_series(n_columns=4)
>>> forecaster = DynamicFactor() # doctest: +SKIP
>>> forecaster.fit(y) # doctest: +SKIP
DynamicFactor(...)
>>> y_pred = forecaster.predict(fh=[1,2,3]) # doctest: +SKIP
"""
_tags = {
# packaging info
# --------------
"authors": ["Ris-Bali", "lbventura"],
"maintainers": ["Ris-Bali", "lbventura"],
# python_dependencies: "statsmodels" - inherited from _StatsModelsAdapter
# estimator type
# --------------
"scitype:y": "multivariate",
"ignores-exogeneous-X": False,
"handles-missing-data": True,
"y_inner_mtype": "pd.DataFrame",
"X_inner_mtype": "pd.DataFrame",
"requires-fh-in-fit": False,
"X-y-must-have-same-index": True,
"enforce_index_type": None,
"capability:insample": False,
"capability:pred_int": True,
"capability:pred_int:insample": True,
}
def __init__(
self,
k_factors=1,
factor_order=1,
error_cov_type="diagonal",
error_order=0,
error_var=False,
enforce_stationarity=True,
start_params=None,
transformed=True,
includes_fixed=False,
cov_type=None,
cov_kwds=None,
method="lbfgs",
maxiter=50,
full_output=False,
disp=False,
callback=None,
return_params=False,
optim_score=None,
optim_complex_step=None,
optim_hessian=None,
flags=None,
low_memory=False,
):
# Model Params
self.k_factors = k_factors
self.factor_order = factor_order
self.error_cov_type = error_cov_type
self.error_order = error_order
self.error_var = error_var
self.enforce_stationarity = enforce_stationarity
# Fit Params
self.start_params = start_params
self.transformed = transformed
self.includes_fixed = includes_fixed
self.cov_type = cov_type
self.cov_kwds = cov_kwds
self.method = method
self.maxiter = maxiter
self.full_output = full_output
self.disp = disp
self.callback = callback
self.return_params = return_params
self.optim_score = optim_score
self.optim_complex_step = optim_complex_step
self.optim_hessian = optim_hessian
self.flags = flags
self.low_memory = low_memory
super().__init__()
def _predict(self, fh, X):
"""Make forecasts.
Parameters
----------
fh : ForecastingHorizon
The forecasters horizon with the steps ahead to to predict.
Default is one-step ahead forecast,
i.e. np.array([1])
X : pd.DataFrame, optional (default=None)
Exogenous variables are ignored.
Returns
-------
y_pred : pd.Series
Returns series of predicted values.
"""
# statsmodels requires zero-based indexing starting at the
# beginning of the training series when passing integers
start, end = fh.to_absolute_int(self._y.index[0], self.cutoff)[[0, -1]]
y_pred = self._fitted_forecaster.predict(start=start, end=end, exog=X)
# statsmodels forecasts all periods from start to end of forecasting
# horizon, but only return given time points in forecasting horizon
if "int" in (self._y.index[0]).__class__.__name__: # Rather fishy solution
y_pred.index = np.arange(
start + self._y.index[0], end + self._y.index[0] + 1
)
return y_pred.loc[fh.to_absolute_index(self.cutoff)]
def _predict_interval(self, fh, X, coverage):
"""Compute/return prediction quantiles for a forecast.
private _predict_interval containing the core logic,
called from predict_interval and possibly predict_quantiles
State required:
Requires state to be "fitted".
Accesses in self:
Fitted model attributes ending in "_"
self.cutoff
Parameters
----------
fh : guaranteed to be ForecastingHorizon
The forecasting horizon with the steps ahead to to predict.
X : optional (default=None)
guaranteed to be of a type in self.get_tag("X_inner_mtype")
Exogeneous time series for the forecast
coverage : list of float (guaranteed not None and floats in [0,1] interval)
nominal coverage(s) of predictive interval(s)
Returns
-------
pred_int : pd.DataFrame
Column has multi-index: first level is variable name from y in fit,
second level coverage fractions for which intervals were computed.
in the same order as in input ``coverage``.
Third level is string "lower" or "upper", for lower/upper interval end.
Row index is fh, with additional (upper) levels equal to instance levels,
from y seen in fit, if y_inner_mtype is Panel or Hierarchical.
Entries are forecasts of lower/upper interval end,
for var in col index, at nominal coverage in second col index,
lower/upper depending on third col index, for the row index.
Upper/lower interval end forecasts are equivalent to
quantile forecasts at alpha = 0.5 - c/2, 0.5 + c/2 for c in coverage.
"""
if type(coverage) is not list:
coverage_list = [coverage]
else:
coverage_list = coverage
start, end = fh.to_absolute_int(self._y.index[0], self.cutoff)[[0, -1]]
steps = end - len(self._y) + 1
ix = fh.to_indexer(self.cutoff)
model = self._fitted_forecaster
df_list = []
# generate the forecasts for each alpha/coverage
for coverage in coverage_list:
alpha = 1 - coverage
y_pred = model.get_forecast(steps=steps, exog=X).conf_int(alpha=alpha)
y_pred = y_pred.iloc[ix]
y_pred.rename(
columns={orig_col: orig_col + f" {coverage}" for orig_col in y_pred},
inplace=True,
)
df_list.append(y_pred)
# concatenate the predictions for different values of alpha
predictions_df = pd.concat(df_list, axis=1)
# all the code below is just boilerplate for getting the correct columns
mod_var_list = list(map(lambda x: str(x).replace(" ", ""), model.data.ynames))
rename_list = [
var_name + " " + str(coverage) + " " + bound
for coverage in coverage_list
for bound in ["lower", "upper"]
for var_name in mod_var_list
]
predictions_df.columns = rename_list
final_ord = [
var_name + " " + str(coverage) + " " + bound
for var_name in mod_var_list
for coverage in coverage_list
for bound in ["lower", "upper"]
]
predictions_df = predictions_df[final_ord]
cols = [col_name.split(" ") for col_name in predictions_df.columns]
predictions_df_2 = pd.DataFrame(
predictions_df.values, columns=pd.MultiIndex.from_tuples(cols)
)
final_columns = [
[col_name, float(coverage), bound]
for col_name in model.data.ynames
for coverage in predictions_df_2.columns.get_level_values(1).unique()
for bound in predictions_df_2.columns.get_level_values(2).unique()
]
predictions_df_3 = pd.DataFrame(
predictions_df_2.values, columns=pd.MultiIndex.from_tuples(final_columns)
)
predictions_df_3.index = fh.to_absolute_index(self.cutoff)
return predictions_df_3
def _fit_forecaster(self, y, X=None):
"""Fit to training data.
Parameters
----------
y:pd.Series
Target time series to which forecaster is fit.
X:pd.DataFrame , optional (default=None)
Exogenous variables
"""
from statsmodels.tsa.statespace.dynamic_factor import (
DynamicFactor as _DynamicFactor,
)
self._forecaster = _DynamicFactor(
endog=y,
exog=X,
k_factors=self.k_factors,
factor_order=self.factor_order,
error_order=self.error_order,
error_var=self.error_var,
error_cov_type=self.error_cov_type,
enforce_stationarity=self.enforce_stationarity,
)
self._fitted_forecaster = self._forecaster.fit(
start_params=self.start_params,
transformed=self.transformed,
includes_fixed=self.includes_fixed,
cov_type=self.cov_type,
cov_kwds=self.cov_kwds,
method=self.method,
maxiter=self.maxiter,
full_output=self.full_output,
disp=self.disp,
callback=self.callback,
return_params=self.return_params,
optim_score=self.optim_score,
optim_complex_step=self.optim_complex_step,
optim_hessian=self.optim_hessian,
flags=self.flags,
low_memory=self.low_memory,
)
def summary(self):
"""Get a summary of the fitted forecaster."""
return self._fitted_forecaster.summary()
def simulate(
self,
nsimulations,
measurement_shocks=None,
state_shocks=None,
initial_state=None,
anchor=None,
repetitions=None,
X=None,
extend_model=None,
extend_kwargs=None,
transformed=True,
includes_fixed=False,
**kwargs,
):
r"""Simulate a new time series following the state space model.
Taken from original statsmodels implementation.
Parameters
----------
nsimulations : int
The number of observations to simulate. If the model is time-invariant
this can be any number. If the model is time-varying, then this number
must be less than or equal to the number.
measurement_shocks : array_like , optional
If specified, these are the shocks to the measurement equation,.
If unspecified, these are automatically generated using a
pseudo-random number generator. If specified, must be shaped
nsimulations x k_endog, where k_endog is the same as in
the state space model.
state_shocks : array_like , optional
If specified, these are the shocks to the state equation, .
If unspecified, these are automatically generated using a
pseudo-random number generator. If specified, must be shaped
nsimulations x k_posdef where k_posdef is the same as in the
state space model.
initial_state : array_like , optional
If specified, this is the initial state vector to use in
simulation,which should be shaped (k_states x 1), where
k_states is the same as in the state space model.
If unspecified, but the model has been initialized,
then that initialization is used.
This must be specified if anchor is anything
other than "start" or 0.
anchor : int,str,or datetime , optional
Starting point from which to begin the simulations; type depends
on the index of the given endog model.
Two special cases are the strings 'start' and 'end',
which refer to starting at the beginning and end of the sample,
respectively. If a date/time index was provided to the model,
then this argument can be a date string to parse or a datetime type.
Otherwise, an integer index should be given. Default is 'start'.
repetitions : int , optional
Number of simulated paths to generate. Default is 1 simulated path
Returns
-------
simulated_obs : ndarray
An array of simulated observations. If repetitions=None, then it will
be shaped (nsimulations x k_endog) or (nsimulations,) if k_endog=1.
Otherwise it will be shaped (nsimulations x k_endog x repetitions).
If the model was given Pandas input then the output will be a Pandas object.
If k_endog > 1 and repetitions is not None, then the output will be a Pandas
DataFrame that has a MultiIndex for the columns, with the first level
containing the names of the endog variables and the second level
containing the repetition number.
"""
return self._fitted_forecaster.simulate(
nsimulations=nsimulations,
measurement_shocks=measurement_shocks,
state_shocks=state_shocks,
initial_state=initial_state,
anchor=anchor,
repetitions=repetitions,
exog=X,
extend_model=extend_model,
extend_kwargs=extend_kwargs,
transformed=transformed,
includes_fixed=includes_fixed,
**kwargs,
)
def plot_diagnostics(
self,
variable=0,
lags=10,
fig=None,
figsize=None,
truncate_endog_names=24,
auto_ylims=False,
bartlett_confint=False,
acf_kwargs=None,
):
"""Diagnostic plots for standardized residuals of one endogenous variable.
Parameters
----------
variable : int , optional
Index of the endogenous variable for which the diagnostic
plots should be created. Default is 0.
lags : int , optional
Number of lags to include in the correlogram. Default is 10.
fig : Figure , optional
If given, subplots are created in this figure instead of in
a new figure. Note that the 2x2 grid will be created in the
provided figure using fig.add_subplot().
figsize : tuple , optional
If a figure is created, this argument allows specifying a size.
The tuple is (width, height).
auto_ylims : bool , optional
If True, adjusts automatically the y-axis limits to ACF values.
bartlett_confint : bool , default = True
Confidence intervals for ACF values are generally placed at 2
standard errors around r_k. The formula used for standard error
depends upon the situation. If the autocorrelations are being
used to test for randomness of residuals as part of the ARIMA routine,
the standard errors are determined assuming the residuals are white noise.
The approximate formula for any lag is that standard error
of each r_k = 1/sqrt(N).For the ACF of raw data, the standard error at
a lag k is found as if the right model was an MA(k-1).
This allows the possible interpretation that if all autocorrelations
past a certain lag are within the limits,
the model might be an MA of order defined by
the last significant autocorrelation.
In this case, a moving average model is assumed for the data
and the standard errors for the confidence intervals should be generated
using Bartlett's formula.
acf_kwargs : dict , optional
Optional dictionary of keyword arguments that are directly
passed on to the correlogram Matplotlib plot produced by plot_acf().
Returns
-------
Figure
Figure instance with diagnostic plots.
"""
self._fitted_forecaster.plot_diagonistics(
variable=variable,
lags=lags,
fig=fig,
figsize=figsize,
truncate_endog_names=truncate_endog_names,
auto_ylims=auto_ylims,
bartlett_confint=bartlett_confint,
acf_kwargs=acf_kwargs,
)
@classmethod
def get_test_params(cls, parameter_set="default"):
"""Return testing parameter settings for the estimator.
Parameters
----------
parameter_set : str , default = "default"
Name of the set of test parameters to return, for use in tests. If no
special parameters are defined for a value, will return ``"default"`` set.
There are currently no reserved values for forecasters.
Returns
-------
params :dict or list of dict , default = {}
Parameters to create testing instances of the class
Each dict are parameters to construct an "interesting" test instance, i.e.,
``MyClass(**params)`` or ``MyClass(**params[i])`` creates a valid test
instance.
``create_test_instance`` uses the first (or only) dictionary in ``params``
"""
params1 = {"k_factors": 1, "factor_order": 1}
params2 = {"maxiter": 25, "low_memory": True}
return [params1, params2]