/
tsa_model.py
273 lines (234 loc) · 10.1 KB
/
tsa_model.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
import statsmodels.base.model as base
from statsmodels.base import data
import statsmodels.base.wrapper as wrap
from statsmodels.tsa.base import datetools
from numpy import arange, asarray
from pandas import Index
from pandas import datetools as pandas_datetools
import datetime
try:
from pandas import Period
except ImportError: # not sure what version this was added
Period = datetime.datetime # HACK
# Lookup table indexed by a `freq` value. In this module it is used to
# validate user-supplied frequencies and to obtain the pandas frequency
# handed to DatetimeIndex/DateRange when prediction dates are built.
_freq_to_pandas = datetools._freq_to_pandas
# Docstring template shared by time-series models. The %(model)s, %(params)s,
# %(extra_params)s and %(extra_sections)s placeholders are filled in with the
# `%` operator and a dict (see TimeSeriesModel.__doc__).
_tsa_doc = """
%(model)s
Parameters
----------
%(params)s
dates : array-like of datetime, optional
An array-like object of datetime objects. If a pandas object is given
for endog or exog, it is assumed to have a DateIndex.
freq : str, optional
The frequency of the time-series. A Pandas offset or 'B', 'D', 'W',
'M', 'A', or 'Q'. This is optional if dates are given.
%(extra_params)s
%(extra_sections)s
"""
# Values substituted into `_tsa_doc` to build the TimeSeriesModel docstring:
# the one-line model summary, the generic parameter documentation and the
# `missing` parameter documentation (the latter two come from the base model
# module).
_model_doc = "Timeseries model base class"
_generic_params = base._model_params_doc
_missing_param_doc = base._missing_param_doc
class TimeSeriesModel(base.LikelihoodModel):
    # The class docstring is assembled from the shared `_tsa_doc` template
    # instead of being written inline, so no literal docstring appears here.
    __doc__ = _tsa_doc % {"model" : _model_doc, "params" : _generic_params,
                          "extra_params" : _missing_param_doc,
                          "extra_sections" : ""}

    def __init__(self, endog, exog=None, dates=None, freq=None,
                 missing='none'):
        """
        Initialise the underlying likelihood model, then attach the dates
        and frequency to ``self.data`` via `_init_dates`.
        """
        super(TimeSeriesModel, self).__init__(endog, exog, missing=missing)
        self._init_dates(dates, freq)

    def _init_dates(self, dates, freq):
        """
        Validate `dates` and `freq` and store them on ``self.data``.

        Parameters
        ----------
        dates : array-like of datetime or None
            If None, ``self.data.row_labels`` is used instead (which may
            itself be None).
        freq : str or None
            Frequency of the series. When falsy and dates are available it
            is inferred with ``datetools._infer_freq``.

        Raises
        ------
        ValueError
            If the data is pandas-backed and the first label is not a
            datetime/Period, if frequency inference fails, or if the
            resulting `freq` cannot be looked up in ``_freq_to_pandas``.
        """
        if dates is None:
            dates = self.data.row_labels

        if dates is not None:
            if (not isinstance(dates[0], (datetime.datetime, Period)) and
                    isinstance(self.data, data.PandasData)):
                raise ValueError("Given a pandas object and the index does "
                                 "not contain dates")
            if not freq:
                try:
                    freq = datetools._infer_freq(dates)
                except Exception:
                    # `Exception` rather than a bare except so that
                    # KeyboardInterrupt/SystemExit are not masked.
                    raise ValueError("Frequency inference failed. Use `freq` "
                                     "keyword.")
            dates = Index(dates)
        self.data.dates = dates

        if freq:
            try:  # NOTE: Can drop this once we move to pandas >= 0.8.x
                _freq_to_pandas[freq]
            except Exception:
                raise ValueError("freq %s not understood" % freq)
        self.data.freq = freq

    def _get_exog_names(self):
        """Return the exogenous variable names stored on ``self.data``."""
        return self.data.xnames

    def _set_exog_names(self, vals):
        """Store `vals` as the exogenous names, wrapping a non-list in a list."""
        if not isinstance(vals, list):
            vals = [vals]
        self.data.xnames = vals

    # overwrite with writable property for (V)AR models
    exog_names = property(_get_exog_names, _set_exog_names)

    def _get_dates_loc(self, dates, date):
        """
        Return the location of `date` within the index `dates`.

        Uses ``dates.indexMap`` when the index has one (pandas 0.7.x) and
        ``dates.get_loc`` otherwise. A sized result from ``get_loc``
        (pandas 0.8.0 returns a boolean array) is reduced to the single
        position that is set.
        """
        if hasattr(dates, 'indexMap'):  # 0.7.x
            return dates.indexMap[date]

        loc = dates.get_loc(date)
        try:  # pandas 0.8.0 returns a boolean array
            len(loc)
        except TypeError:  # this is expected behavior: loc has no length
            pass
        else:
            from numpy import where
            loc = where(loc)[0].item()
        return loc

    def _str_to_date(self, date):
        """
        Takes a string and returns a datetime object
        """
        return datetools.date_parser(date)

    def _set_predict_start_date(self, start):
        """
        Record the date that corresponds to the integer position `start`
        as ``self.data.predict_start``.

        Does nothing when the model has no dates. A `start` equal to
        ``len(dates)`` is resolved with ``datetools._date_from_idx`` from
        the last date, an offset of 1 and the stored frequency.

        Raises
        ------
        ValueError
            If `start` is greater than ``len(dates)``.
        """
        dates = self.data.dates
        if dates is None:
            return

        n_dates = len(dates)
        if start > n_dates:
            raise ValueError("Start must be <= len(endog)")
        if start == n_dates:
            self.data.predict_start = datetools._date_from_idx(
                dates[-1], 1, self.data.freq)
        else:
            self.data.predict_start = dates[start]

    def _get_predict_start(self, start):
        """
        Returns the index of the given start date. Subclasses should define
        default behavior for start = None. That isn't handled here.

        Start can be a string or an integer if self.data.dates is None.

        Raises
        ------
        ValueError
            If `start` is a string and the model has no dates, or if the
            parsed date is not found in the dates index.
        """
        dates = self.data.dates
        if isinstance(start, str):
            if dates is None:
                raise ValueError("Got a string for start and dates is None")
            dtstart = self._str_to_date(start)
            self.data.predict_start = dtstart
            try:
                start = self._get_dates_loc(dates, dtstart)
            except KeyError:
                raise ValueError("Start must be in dates. Got %s | %s" %
                                 (str(start), str(dtstart)))

        self._set_predict_start_date(start)
        return start

    def _get_predict_end(self, end):
        """
        See _get_predict_start for more information. Subclasses do not
        need to define anything for this.

        Parameters
        ----------
        end : str, int or None
            None means the last in-sample observation.

        Returns
        -------
        end : int
            Index of the last in-sample observation to predict.
        out_of_sample : int
            Amount by which the requested end lies beyond the sample
            (0 when it is in sample).

        Raises
        ------
        ValueError
            If `end` is a string and there are no dates, if the date is not
            in the index and there is no frequency, or if `end` is neither
            a string nor an int while the frequency is None.
        """
        out_of_sample = 0  # will be overwritten if needed
        if end is None:  # use data for ARIMA - endog changes
            end = len(self.data.endog) - 1

        dates = self.data.dates
        freq = self.data.freq

        if isinstance(end, str):
            if dates is None:
                raise ValueError("Got a string for end and dates is None")
            # Parsed outside the try block so that `dtend` is always bound
            # when the KeyError handler below inspects it.
            dtend = self._str_to_date(end)
            self.data.predict_end = dtend
            try:
                end = self._get_dates_loc(dates, dtend)
            except KeyError:  # end is greater than dates[-1]...probably
                if dtend > self.data.dates[-1]:
                    end = len(self.data.endog) - 1
                    freq = self.data.freq
                    out_of_sample = datetools._idx_from_dates(dates[-1],
                                                              dtend, freq)
                elif freq is None:
                    raise ValueError("There is no frequency for these "
                                     "dates and date %s is not in dates "
                                     "index. Try giving a date that is in "
                                     "the dates index or use an integer."
                                     % dtend)
                else:  # pragma: no cover
                    raise  # should never get here
            self._make_predict_dates()  # attaches self.data.predict_dates

        elif isinstance(end, int) and dates is not None:
            try:
                self.data.predict_end = dates[end]
            except IndexError:
                nobs = len(self.data.endog) - 1  # as an index
                out_of_sample = end - nobs
                end = nobs
                if freq is not None:
                    self.data.predict_end = datetools._date_from_idx(
                        dates[-1], out_of_sample, freq)
                elif out_of_sample <= 0:
                    # have no frequency but are in sample
                    # TODO: what error to catch here to make sure dates is
                    # on the index?
                    self.data.predict_end = self._get_dates_loc(dates, end)
                else:
                    # No frequency and out of sample: fall back to integer
                    # positions for both ends of the prediction range.
                    self.data.predict_end = end + out_of_sample
                    self.data.predict_start = self._get_dates_loc(
                        dates, self.data.predict_start)
            self._make_predict_dates()

        elif isinstance(end, int):
            nobs = len(self.data.endog) - 1  # is an index
            if end > nobs:
                out_of_sample = end - nobs
                end = nobs

        elif freq is None:  # should have a date with freq = None
            raise ValueError("When freq is None, you must give an integer "
                             "index for end.")

        return end, out_of_sample

    def _make_predict_dates(self):
        """
        Build the index of prediction dates and store it as
        ``self.data.predict_dates``.

        Reads ``predict_start``, ``predict_end`` and ``freq`` from
        ``self.data``. With a frequency, a pandas date index is generated
        between the two ends; with integer ends and no frequency, an
        integer index is generated; otherwise the stored dates are sliced
        from start to end inclusive.
        """
        model_data = self.data
        dtstart = model_data.predict_start
        dtend = model_data.predict_end
        freq = model_data.freq

        if freq is not None:
            pandas_freq = _freq_to_pandas[freq]
            try:
                from pandas import DatetimeIndex
            except ImportError:
                # NOTE(review): presumably the fallback for older pandas
                # releases that only ship DateRange - confirm.
                from pandas import DateRange
                dates = DateRange(dtstart, dtend, offset=pandas_freq).values
            else:
                dates = DatetimeIndex(start=dtstart, end=dtend,
                                      freq=pandas_freq)
        elif isinstance(dtstart, int) and isinstance(dtend, int):
            dates = Index(range(dtstart, dtend + 1))
        else:
            # if freq is None and dtstart and dtend aren't integers, we're
            # in sample
            dates = model_data.dates
            start = self._get_dates_loc(dates, dtstart)
            end = self._get_dates_loc(dates, dtend)
            dates = dates[start:end + 1]  # is this index inclusive?
        self.data.predict_dates = dates
class TimeSeriesModelResults(base.LikelihoodModelResults):
    """Likelihood-model results that also carry the model's ``data``."""

    def __init__(self, model, params, normalized_cov_params, scale=1.):
        # Expose the model's data object on the results instance before
        # handing over to the base-class initialiser.
        self.data = model.data
        parent = super(TimeSeriesModelResults, self)
        parent.__init__(model, params, normalized_cov_params, scale)
class TimeSeriesResultsWrapper(wrap.ResultsWrapper):
    """Results wrapper for time-series model results."""

    # Nothing to wrap beyond what the likelihood results wrapper declares.
    _attrs = dict()
    _wrap_attrs = wrap.union_dicts(
        base.LikelihoodResultsWrapper._wrap_attrs, _attrs)

    # `predict` is associated with the 'dates' wrapping rule; what the rule
    # does is defined by the statsmodels wrapper machinery.
    _methods = dict(predict='dates')
    _wrap_methods = wrap.union_dicts(
        base.LikelihoodResultsWrapper._wrap_methods, _methods)


# Register the wrapper class for the results class.
wrap.populate_wrapper(TimeSeriesResultsWrapper, TimeSeriesModelResults)
if __name__ == "__main__":
    # Ad-hoc demo: build a DataFrame from the macrodata set, labelled with
    # "year:quarter" strings, and hand it to TimeSeriesModel.
    import statsmodels.api as sm
    import pandas

    # Deliberately NOT named `data`: at module level that would rebind the
    # `data` import which TimeSeriesModel._init_dates reads as
    # `data.PandasData`.
    macro = sm.datasets.macrodata.load()

    # make a DataFrame
    # TODO: attach a DataFrame to some of the datasets, for quicker use
    quarter_labels = [str(int(row[0])) + ':' + str(int(row[1]))
                      for row in macro.data[['year', 'quarter']]]
    df = pandas.DataFrame(macro.data[['realgdp', 'realinv', 'realcons']],
                          index=quarter_labels)
    ex_mod = TimeSeriesModel(df)