/
structural.py
526 lines (485 loc) · 22.3 KB
/
structural.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
# !/usr/bin/env python3 -u
# copyright: sktime developers, BSD-3-Clause License (see LICENSE file)
"""Wraps the UnobservedComponents (state space) model from statsmodels."""
__all__ = ["UnobservedComponents"]
__author__ = ["juanitorduz"]
import pandas as pd
from sktime.forecasting.base.adapters import _StatsModelsAdapter
class UnobservedComponents(_StatsModelsAdapter):
r"""Wrapper class of the UnobservedComponents model from statsmodels.
Input parameters and doc-stringsare taken from the original implementation.
Parameters
----------
level : {bool, str}, optional
Whether or not to include a level component. Default is False. Can also
be a string specification of the level / trend component.
trend : bool, optional
Whether or not to include a trend component. Default is False. If True,
``level`` must also be True.
seasonal : {int, None}, optional
The period of the seasonal component, if any. Default is None.
freq_seasonal : {list[dict], None}, optional.
Whether (and how) to model seasonal component(s) with trig. functions.
If specified, there is one dictionary for each frequency-domain
seasonal component. Each dictionary must have the key, value pair for
'period' -- integer and may have a key, value pair for
'harmonics' -- integer. If 'harmonics' is not specified in any of the
dictionaries, it defaults to the floor of period/2.
cycle : bool, optional
Whether or not to include a cycle component. Default is False.
autoregressive : {int, None}, optional
The order of the autoregressive component. Default is None.
irregular : bool, optional
Whether or not to include an irregular component. Default is False.
stochastic_level : bool, optional
Whether or not any level component is stochastic. Default is False.
stochastic_trend : bool, optional
Whether or not any trend component is stochastic. Default is False.
stochastic_seasonal : bool, optional
Whether or not any seasonal component is stochastic. Default is True.
stochastic_freq_seasonal : list[bool], optional
Whether or not each seasonal component(s) is (are) stochastic. Default
is True for each component. The list should be of the same length as
freq_seasonal.
stochastic_cycle : bool, optional
Whether or not any cycle component is stochastic. Default is False.
damped_cycle : bool, optional
Whether or not the cycle component is damped. Default is False.
cycle_period_bounds : tuple, optional
A tuple with lower and upper allowed bounds for the period of the
cycle. If not provided, the following default bounds are used:
(1) if no date / time information is provided, the frequency is
constrained to be between zero and :math:`\pi`, so the period is
constrained to be in [0.5, infinity].
(2) If the date / time information is provided, the default bounds
allow the cyclical component to be between 1.5 and 12 years; depending
on the frequency of the endogenous variable, this will imply different
specific bounds.
mle_regression : bool, optional
Whether or not to estimate regression coefficients by maximum
likelihood as one of hyperparameters. Default is True.
If False, the regression coefficients are estimated by recursive OLS,
included in the state vector.
use_exact_diffuse : bool, optional
Whether or not to use exact diffuse initialization for non-stationary
states. Default is False (in which case approximate diffuse
initialization is used).
start_params : array_like, optional
Initial guess of the solution for the loglikelihood maximization.
transformed : bool, optional
Whether or not ``start_params`` is already transformed. Default is
True.
includes_fixed : bool, optional
If parameters were previously fixed with the ``fix_params`` method,
this argument describes whether or not ``start_params`` also includes
the fixed parameters, in addition to the free parameters. Default
is False.
cov_type : str, optional
The ``cov_type`` keyword governs the method for calculating the
covariance matrix of parameter estimates. Can be one of:
- 'opg' for the outer product of gradient estimator
- 'oim' for the observed information matrix estimator, calculated
using the method of Harvey (1989)
- 'approx' for the observed information matrix estimator,
calculated using a numerical approximation of the Hessian matrix.
- 'robust' for an approximate (quasi-maximum likelihood) covariance
matrix that may be valid even in the presence of some
misspecifications. Intermediate calculations use the 'oim'
method.
- 'robust_approx' is the same as 'robust' except that the
intermediate calculations use the 'approx' method.
- 'none' for no covariance matrix calculation.
Default is 'opg' unless memory conservation is used to avoid
computing the loglikelihood values for each observation, in which
case the default is 'approx'.
cov_kwds : dict or None, optional
A dictionary of arguments affecting covariance matrix computation.
**opg, oim, approx, robust, robust_approx**
- 'approx_complex_step' : bool, optional - If True, numerical
approximations are computed using complex-step methods. If False,
numerical approximations are computed using finite difference
methods. Default is True.
- 'approx_centered' : bool, optional - If True, numerical
approximations computed using finite difference methods use a
centered approximation. Default is False.
method : str, optional
The ``method`` determines which solver from ``scipy.optimize``
is used, and it can be chosen from among the following strings:
- 'newton' for Newton-Raphson
- 'nm' for Nelder-Mead
- 'bfgs' for Broyden-Fletcher-Goldfarb-Shanno (BFGS)
- 'lbfgs' for limited-memory BFGS with optional box constraints
- 'powell' for modified Powell's method
- 'cg' for conjugate gradient
- 'ncg' for Newton-conjugate gradient
- 'basinhopping' for global basin-hopping solver
The explicit arguments in ``fit`` are passed to the solver,
with the exception of the basin-hopping solver. Each
solver has several optional arguments that are not the same across
solvers. See the notes section below (or scipy.optimize) for the
available arguments and for the list of explicit arguments that the
basin-hopping solver supports.
maxiter : int, optional
The maximum number of iterations to perform.
full_output : bool, optional
Set to True to have all available output in the Results object's
mle_retvals attribute. The output is dependent on the solver.
See LikelihoodModelResults notes section for more information.
disp : bool, optional
Set to True to print convergence messages. Default is 0.
callback : callable callback(xk), optional
Called after each iteration, as callback(xk), where xk is the
current parameter vector.
return_params : bool, optional
Whether or not to return only the array of maximizing parameters.
Default is False.
optim_score : {'harvey', 'approx'} or None, optional
The method by which the score vector is calculated. 'harvey' uses
the method from Harvey (1989), 'approx' uses either finite
difference or complex step differentiation depending upon the
value of ``optim_complex_step``, and None uses the built-in gradient
approximation of the optimizer. Default is None. This keyword is
only relevant if the optimization method uses the score.
optim_complex_step : bool, optional
Whether or not to use complex step differentiation when
approximating the score; if False, finite difference approximation
is used. Default is True. This keyword is only relevant if
``optim_score`` is set to 'harvey' or 'approx'.
optim_hessian : {'opg','oim','approx'}, optional
The method by which the Hessian is numerically approximated. 'opg'
uses outer product of gradients, 'oim' uses the information
matrix formula from Harvey (1989), and 'approx' uses numerical
approximation. This keyword is only relevant if the
optimization method uses the Hessian matrix.
low_memory : bool, optional
If set to True, techniques are applied to substantially reduce
memory usage. If used, some features of the results object will
not be available (including smoothed results and in-sample
prediction), although out-of-sample forecasting is possible.
Default is False.
random_state : int, RandomState instance or None, optional ,
default=None – If int, random_state is the seed used by the random
number generator; If RandomState instance, random_state is the random
number generator; If None, the random number generator is the
RandomState instance used by np.random.
See Also
--------
statsmodels.tsa.statespace.structural.UnobservedComponents
statsmodels.tsa.statespace.structural.UnobservedComponentsResults
References
----------
.. [1] Seabold, Skipper, and Josef Perktold. "statsmodels: Econometric
and statistical modeling with python." Proceedings of the 9th Python
in Science Conference. 2010.
.. [2] Durbin, James, and Siem Jan Koopman. 2012.
Time Series Analysis by State Space Methods: Second Edition.
Oxford University Press.
Examples
--------
>>> from sktime.datasets import load_airline
>>> from sktime.forecasting.structural import UnobservedComponents
>>> y = load_airline()
>>> forecaster = UnobservedComponents(level='local linear trend') # doctest: +SKIP
>>> forecaster.fit(y) # doctest: +SKIP
UnobservedComponents(...)
>>> y_pred = forecaster.predict(fh=[1, 2, 3]) # doctest: +SKIP
"""
_tags = {
# packaging info
# --------------
"authors": ["juanitorduz"],
"maintainers": ["juanitorduz"],
# python_dependencies: "statsmodels" - inherited from _StatsModelsAdapter
# estimator type
# --------------
"capability:pred_int": True,
"capability:pred_int:insample": True,
"handles-missing-data": False,
"ignores-exogeneous-X": False,
}
def __init__(
self,
level=False,
trend=False,
seasonal=None,
freq_seasonal=None,
cycle=False,
autoregressive=None,
irregular=False,
stochastic_level=False,
stochastic_trend=False,
stochastic_seasonal=True,
stochastic_freq_seasonal=None,
stochastic_cycle=False,
damped_cycle=False,
cycle_period_bounds=None,
mle_regression=True,
use_exact_diffuse=False,
start_params=None,
transformed=True,
includes_fixed=False,
cov_type=None,
cov_kwds=None,
method="lbfgs",
maxiter=50,
full_output=1,
disp=0,
callback=None,
return_params=False,
optim_score=None,
optim_complex_step=None,
optim_hessian=None,
flags=None,
low_memory=False,
random_state=None,
):
# Model params
self.level = level
self.trend = trend
self.seasonal = seasonal
self.freq_seasonal = freq_seasonal
self.cycle = cycle
self.autoregressive = autoregressive
self.irregular = irregular
self.stochastic_level = stochastic_level
self.stochastic_trend = stochastic_trend
self.stochastic_seasonal = stochastic_seasonal
self.stochastic_freq_seasonal = stochastic_freq_seasonal
self.stochastic_cycle = stochastic_cycle
self.damped_cycle = damped_cycle
self.cycle_period_bounds = cycle_period_bounds
self.mle_regression = mle_regression
self.use_exact_diffuse = use_exact_diffuse
# Fit params
self.start_params = start_params
self.transformed = transformed
self.includes_fixed = includes_fixed
self.cov_type = cov_type
self.cov_kwds = cov_kwds
self.method = method
self.maxiter = maxiter
self.full_output = full_output
self.disp = disp
self.callback = callback
self.return_params = return_params
self.optim_score = optim_score
self.optim_complex_step = optim_complex_step
self.optim_hessian = optim_hessian
self.flags = flags
self.low_memory = low_memory
super().__init__(random_state=random_state)
def _fit_forecaster(self, y, X=None):
"""Fit to training data.
Parameters
----------
y : pd.Series
Target time series to which to fit the forecaster.
X : pd.DataFrame, optional (default=None)
Exogenous variables.
"""
from statsmodels.tsa.statespace.structural import (
UnobservedComponents as _UnobservedComponents,
)
self._forecaster = _UnobservedComponents(
endog=y,
exog=X,
level=self.level,
trend=self.trend,
seasonal=self.seasonal,
freq_seasonal=self.freq_seasonal,
cycle=self.cycle,
autoregressive=self.autoregressive,
irregular=self.irregular,
stochastic_level=self.stochastic_level,
stochastic_trend=self.stochastic_trend,
stochastic_seasonal=self.stochastic_seasonal,
stochastic_freq_seasonal=self.stochastic_freq_seasonal,
stochastic_cycle=self.stochastic_cycle,
damped_cycle=self.damped_cycle,
cycle_period_bounds=self.cycle_period_bounds,
mle_regression=self.mle_regression,
use_exact_diffuse=self.use_exact_diffuse,
)
self._fitted_forecaster = self._forecaster.fit(
start_params=self.start_params,
transformed=self.transformed,
includes_fixed=self.includes_fixed,
cov_type=self.cov_type,
cov_kwds=self.cov_kwds,
method=self.method,
maxiter=self.maxiter,
full_output=self.full_output,
disp=self.disp,
callback=self.callback,
return_params=self.return_params,
optim_score=self.optim_score,
optim_complex_step=self.optim_complex_step,
optim_hessian=self.optim_hessian,
flags=self.flags,
low_memory=self.low_memory,
)
@staticmethod
def _extract_conf_int(prediction_results, alpha) -> pd.DataFrame:
"""Construct confidence interval at specified ``alpha`` for each timestep.
Parameters
----------
prediction_results : PredictionResults
results class, as returned by ``self._fitted_forecaster.get_prediction``
alpha : float
one minus nominal coverage
Returns
-------
pd.DataFrame
confidence intervals at each timestep
The dataframe must have at least two columns ``lower`` and ``upper``, and
the row indices must be integers relative to ``self.cutoff``. Order of
columns do not matter, and row indices must be a superset of relative
integer horizon of ``fh``.
"""
conf_int = prediction_results.conf_int(alpha=alpha)
conf_int.columns = ["lower", "upper"]
return conf_int
def summary(self):
"""Get a summary of the fitted forecaster.
This is the same as the implementation in statsmodels:
https://www.statsmodels.org/dev/examples/notebooks/generated/statespace_structural_harvey_jaeger.html
"""
return self._fitted_forecaster.summary()
def simulate(
self,
nsimulations,
X=None,
measurement_shocks=None,
state_shocks=None,
initial_state=None,
anchor=None,
repetitions=None,
**kwargs,
):
r"""Simulate a new time series following the state space model.
Taken from the original statsmodels implementation.
Parameters
----------
nsimulations : int
The number of observations to simulate. If the model is
time-invariant this can be any number. If the model is
time-varying, then this number must be less than or equal to the
number of observations.
X : pd.DataFrame, optional (default=None)
Exogenous variables.
measurement_shocks : array_like, optional
If specified, these are the shocks to the measurement equation,
:math:`\varepsilon_t`. If unspecified, these are automatically
generated using a pseudo-random number generator. If specified,
must be shaped ``nsimulations`` x ``k_endog``, where ``k_endog`` is the
same as in the state space model.
state_shocks : array_like, optional
If specified, these are the shocks to the state equation,
:math:`\eta_t`. If unspecified, these are automatically
generated using a pseudo-random number generator. If specified,
must be shaped ``nsimulations`` x ``k_posdef`` where ``k_posdef`` is the
same as in the state space model.
initial_state : array_like, optional
If specified, this is the initial state vector to use in
simulation, which should be shaped (``k_states`` x 1), where
``k_states`` is the same as in the state space model. If unspecified,
but the model has been initialized, then that initialization is
used. This must be specified if ``anchor`` is anything other than
"start" or 0 (or else you can use the ``simulate`` method on a
results object rather than on the model object).
anchor : int, str, or datetime, optional
First period for simulation. The simulation will be conditional on
all existing datapoints prior to the ``anchor``. Type depends on the
index of the given ``endog`` in the model. Two special cases are the
strings 'start' and 'end'. ``start`` refers to beginning the
simulation at the first period of the sample, and ``end`` refers to
beginning the simulation at the first period after the sample.
Integer values can run from 0 to ``nobs``, or can be negative to
apply negative indexing. Finally, if a date/time index was provided
to the model, then this argument can be a date string to parse or a
datetime type. Default is 'start'.
repetitions : int, optional
Number of simulated paths to generate. Default is 1 simulated path.
See Also
--------
statsmodels.tsa.statespace.mlemodel.MLEResults
Returns
-------
simulated_obs : ndarray
An array of simulated observations. If ``repetitions=None``, then it
will be shaped (nsimulations x k_endog) or (nsimulations,) if
``k_endog=1``. Otherwise it will be shaped
(nsimulations x k_endog x repetitions). If the model was given
Pandas input then the output will be a Pandas object. If
``k_endog > 1`` and ``repetitions`` is not None, then the output will
be a Pandas DataFrame that has a MultiIndex for the columns, with
the first level containing the names of the ``endog`` variables and
the second level containing the repetition number.
"""
return self._fitted_forecaster.simulate(
nsimulations=nsimulations,
measurement_shocks=measurement_shocks,
state_shocks=state_shocks,
initial_state=initial_state,
anchor=anchor,
repetitions=repetitions,
exog=X,
**kwargs,
)
def plot_diagnostics(
self,
variable=0,
lags=10,
fig=None,
figsize=None,
truncate_endog_names=24,
):
"""Diagnostic plots for standardized residuals.
Taken from the original statsmodels implementation.
Parameters
----------
variable : int, optional
Index of the endogenous variable for which the diagnostic plots
should be created. Default is 0.
lags : int, optional
Number of lags to include in the correlogram. Default is 10.
fig : Figure, optional
If given, subplots are created in this figure instead of in a new
figure. Note that the 2x2 grid will be created in the provided
figure using ``fig.add_subplot()``.
figsize : tuple, optional
If a figure is created, this argument allows specifying a size.
The tuple is (width, height).
See Also
--------
statsmodels.tsa.statespace.mlemodel.MLEResults
Returns
-------
Figure
Figure instance with diagnostic plots.
"""
self._fitted_forecaster.plot_diagnostics(
variable=variable,
lags=lags,
fig=fig,
figsize=figsize,
truncate_endog_names=truncate_endog_names,
)
@classmethod
def get_test_params(cls, parameter_set="default"):
"""Return testing parameter settings for the estimator.
Parameters
----------
parameter_set : str, default="default"
Name of the set of test parameters to return, for use in tests. If no
special parameters are defined for a value, will return ``"default"`` set.
Returns
-------
params : dict or list of dict, default = {}
Parameters to create testing instances of the class
Each dict are parameters to construct an "interesting" test instance, i.e.,
``MyClass(**params)`` or ``MyClass(**params[i])`` creates a valid test
instance.
``create_test_instance`` uses the first (or only) dictionary in ``params``
"""
return {"level": "local level"}