-
Notifications
You must be signed in to change notification settings - Fork 91
/
_datetime_encoder.py
325 lines (267 loc) · 11.1 KB
/
_datetime_encoder.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
from datetime import datetime, timezone
import pandas as pd
from sklearn.utils.validation import check_is_fitted
try:
import polars as pl
except ImportError:
pass
from . import _dataframe as sbd
from ._dispatch import dispatch
from ._on_each_column import RejectColumn, SingleColumnTransformer
__all__ = ["DatetimeEncoder"]
_TIME_LEVELS = [
"year",
"month",
"day",
"hour",
"minute",
"second",
"microsecond",
"nanosecond",
]
@dispatch
def _is_date(col):
raise NotImplementedError()
@_is_date.specialize("pandas", argument_type="Column")
def _is_date_pandas(col):
col = sbd.drop_nulls(col)
return (col.dt.date == col).all()
@_is_date.specialize("polars", argument_type="Column")
def _is_date_polars(col):
return (col.dt.date() == col).all()
@dispatch
def _get_dt_feature(col, feature):
raise NotImplementedError()
@_get_dt_feature.specialize("pandas", argument_type="Column")
def _get_dt_feature_pandas(col, feature):
if feature == "total_seconds":
if col.dt.tz is None:
epoch = datetime(1970, 1, 1)
else:
epoch = datetime(1970, 1, 1, tzinfo=timezone.utc)
return ((col - epoch) / pd.Timedelta("1s")).astype("float32")
if feature == "weekday":
return col.dt.day_of_week + 1
assert feature in _TIME_LEVELS
return getattr(col.dt, feature)
@_get_dt_feature.specialize("polars", argument_type="Column")
def _get_dt_feature_polars(col, feature):
if feature == "total_seconds":
return (col.dt.timestamp(time_unit="ms") / 1000).cast(pl.Float32)
assert feature in _TIME_LEVELS + ["weekday"]
return getattr(col.dt, feature)()
class DatetimeEncoder(SingleColumnTransformer):
"""
Extract temporal features such as month, day of the week, … from a datetime column.
All extracted features are provided as float32 columns.
No timezone conversion is performed: if the input column is timezone aware, the
extracted features will be in the column's timezone.
An input column that does not have a Date or Datetime dtype will be
rejected by raising a ``RejectColumn`` exception. See ``ToDatetime`` for
converting strings to proper datetimes. **Note:** the ``TableVectorizer``
only sends datetime columns to its ``datetime_encoder``. Therefore it is
always safe to use a ``DatetimeEncoder`` as the ``TableVectorizer``'s
``datetime_encoder`` parameter.
Parameters
----------
resolution : str or None, default="hour"
If a string, extract up to this resolution. Must be "year", "month",
"day", "hour", "minute", "second", "microsecond", or "nanosecond". For
example, ``resolution="day"`` generates the features "year", "month",
and "day" only. If the input column contains dates with no time
information, time features ("hour", "minute", … ) are never extracted.
If ``None``, the features listed above are not extracted (but day of
the week and total seconds may still be extracted, see below).
add_weekday : bool, default=False
Extract the day of the week as a numerical feature from 1 (Monday) to 7
(Sunday).
add_total_seconds : bool, default=True
Add the total number of seconds since the Unix epoch (00:00:00 UTC on 1
January 1970).
Attributes
----------
extracted_features_ : list of strings
The features that are extracted, a subset of ["year", …, "nanosecond",
"weekday", "total_seconds"]
See Also
--------
ToDatetime :
Convert strings to datetimes.
Examples
--------
>>> import pandas as pd
>>> login = pd.to_datetime(
... pd.Series(
... ["2024-05-13T12:05:36", None, "2024-05-15T13:46:02"], name="login")
... )
>>> login
0 2024-05-13 12:05:36
1 NaT
2 2024-05-15 13:46:02
Name: login, dtype: datetime64[ns]
>>> from skrub import DatetimeEncoder
>>> DatetimeEncoder().fit_transform(login)
login_year login_month login_day login_hour login_total_seconds
0 2024.0 5.0 13.0 12.0 1.715602e+09
1 NaN NaN NaN NaN NaN
2 2024.0 5.0 15.0 13.0 1.715781e+09
We can ask for a finer resolution:
>>> DatetimeEncoder(resolution='second', add_total_seconds=False).fit_transform(
... login
... )
login_year login_month login_day login_hour login_minute login_second
0 2024.0 5.0 13.0 12.0 5.0 36.0
1 NaN NaN NaN NaN NaN NaN
2 2024.0 5.0 15.0 13.0 46.0 2.0
We can also ask for the day of the week. The week starts at 1 on Monday and ends
at 7 on Sunday. This is consistent with the ISO week date system
(https://en.wikipedia.org/wiki/ISO_week_date), the standard library
``datetime.isoweekday()`` and polars ``weekday``, but not with pandas
``day_of_week``, which counts days from 0.
>>> login.dt.strftime('%A = %w')
0 Monday = 1
1 NaN
2 Wednesday = 3
Name: login, dtype: object
>>> login.dt.day_of_week
0 0.0
1 NaN
2 2.0
Name: login, dtype: float64
>>> DatetimeEncoder(add_weekday=True, add_total_seconds=False).fit_transform(login)
login_year login_month login_day login_hour login_weekday
0 2024.0 5.0 13.0 12.0 1.0
1 NaN NaN NaN NaN NaN
2 2024.0 5.0 15.0 13.0 3.0
When a column contains only dates without time information, the time features
are discarded, regardless of ``resolution``.
>>> birthday = pd.to_datetime(
... pd.Series(['2024-04-14', '2024-05-15'], name='birthday')
... )
>>> encoder = DatetimeEncoder(resolution='second')
>>> encoder.fit_transform(birthday)
birthday_year birthday_month birthday_day birthday_total_seconds
0 2024.0 4.0 14.0 1.713053e+09
1 2024.0 5.0 15.0 1.715731e+09
>>> encoder.extracted_features_
['year', 'month', 'day', 'total_seconds']
(The number of seconds since Epoch can still be extracted but not "hour",
"minute", etc.)
Non-datetime columns are rejected by raising a ``RejectColumn`` exception.
>>> s = pd.Series(['2024-04-14', '2024-05-15'], name='birthday')
>>> s
0 2024-04-14
1 2024-05-15
Name: birthday, dtype: object
>>> DatetimeEncoder().fit_transform(s)
Traceback (most recent call last):
...
skrub._on_each_column.RejectColumn: Column 'birthday' does not have Date or Datetime dtype.
:class:`ToDatetime`: can be used for converting strings to datetimes.
>>> from skrub import ToDatetime
>>> from sklearn.pipeline import make_pipeline
>>> make_pipeline(ToDatetime(), DatetimeEncoder()).fit_transform(s)
birthday_year birthday_month birthday_day birthday_total_seconds
0 2024.0 4.0 14.0 1.713053e+09
1 2024.0 5.0 15.0 1.715731e+09
**Time zones**
If the input column has a time zone, the extracted features are in this timezone.
>>> login = pd.to_datetime(
... pd.Series(
... ["2024-05-13T12:05:36", None, "2024-05-15T13:46:02"], name="login")
... ).dt.tz_localize('Europe/Paris')
>>> encoder = DatetimeEncoder()
>>> encoder.fit_transform(login)['login_hour']
0 12.0
1 NaN
2 13.0
Name: login_hour, dtype: float32
No special care is taken to convert inputs to ``transform`` to the same time
zone as the column the encoder was fitted on. The features are always in the
time zone of the input.
>>> login_sp = login.dt.tz_convert('America/Sao_Paulo')
>>> login_sp
0 2024-05-13 07:05:36-03:00
1 NaT
2 2024-05-15 08:46:02-03:00
Name: login, dtype: datetime64[ns, America/Sao_Paulo]
>>> encoder.transform(login_sp)['login_hour']
0 7.0
1 NaN
2 8.0
Name: login_hour, dtype: float32
To ensure datetime columns are in a consistent timezones, use ``ToDatetime``.
>>> encoder = make_pipeline(ToDatetime(), DatetimeEncoder())
>>> encoder.fit_transform(login)['login_hour']
0 12.0
1 NaN
2 13.0
Name: login_hour, dtype: float32
>>> encoder.transform(login_sp)['login_hour']
0 12.0
1 NaN
2 13.0
Name: login_hour, dtype: float32
Here we can see the input to ``transform`` has been converted back to the
timezone used during ``fit`` and that we get the same result for "hour".
""" # noqa: E501
def __init__(self, resolution="hour", add_weekday=False, add_total_seconds=True):
self.resolution = resolution
self.add_weekday = add_weekday
self.add_total_seconds = add_total_seconds
def fit_transform(self, column, y=None):
"""Fit the encoder and transform a column.
Parameters
----------
column : pandas or polars Series with dtype Date or Datetime
The input to transform.
y : None
Ignored.
Returns
-------
transformed : DataFrame
The extracted features.
"""
del y
self._check_params()
if not sbd.is_any_date(column):
raise RejectColumn(
f"Column {sbd.name(column)!r} does not have Date or Datetime dtype."
)
if self.resolution is None:
self.extracted_features_ = []
else:
idx_level = _TIME_LEVELS.index(self.resolution)
if _is_date(column):
idx_level = min(idx_level, _TIME_LEVELS.index("day"))
self.extracted_features_ = _TIME_LEVELS[: idx_level + 1]
if self.add_total_seconds:
self.extracted_features_.append("total_seconds")
if self.add_weekday:
self.extracted_features_.append("weekday")
return self.transform(column)
def transform(self, column):
"""Transform a column.
Parameters
----------
column : pandas or polars Series with dtype Date or Datetime
The input to transform.
Returns
-------
transformed : DataFrame
The extracted features.
"""
check_is_fitted(self, "extracted_features_")
name = sbd.name(column)
all_extracted = []
for feature in self.extracted_features_:
extracted = _get_dt_feature(column, feature).rename(f"{name}_{feature}")
extracted = sbd.to_float32(extracted)
all_extracted.append(extracted)
return sbd.make_dataframe_like(column, all_extracted)
def _check_params(self):
allowed = _TIME_LEVELS + [None]
if self.resolution not in allowed:
raise ValueError(
f"'resolution' options are {allowed}, got {self.resolution!r}."
)