# Datetime preprocessors

> Appropriate handling of datetime features.

In [None]:
# | default_exp preprocessing.datetime

In [None]:
# | hide
from nbdev.showdoc import *

In [None]:
# | export

from __future__ import annotations
from typing import Sequence, Optional, Union, List
from enum import Enum

import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin

In [None]:
# | exports


class DateLevel(Enum):
    """Calendar and clock components that can be extracted from a datetime.

    Each member's value is the name of the attribute that is looked up on a
    `pandas.DatetimeIndex` when the corresponding feature is computed.
    """

    # Calendar components, coarsest first.
    YEAR = "year"
    QUARTER = "quarter"
    MONTH = "month"
    DAY = "day"

    # Clock components, coarsest first.
    HOUR = "hour"
    MINUTE = "minute"
    SECOND = "second"
    MICROSECOND = "microsecond"
    NANOSECOND = "nanosecond"

    # Derived positional components.
    WEEKDAY = "weekday"
    DAYOFYEAR = "dayofyear"
    DAYSINMONTH = "daysinmonth"

In [None]:
# | export


class DatetimeEncoder(BaseEstimator, TransformerMixin):
    """An encoder for datetime columns that outputs integer features.

    `levels` is a list of `DateLevel` that define which date features to extract, i.e,
    [`DateLevel.HOUR`, `DateLevel.MINUTE`] will extract hours and minutes. If left to the
    default `None`, all available features are considered. In both cases, features that
    show zero variance during `fit` are dropped (for example, because the dates don't
    have seconds).

    Parameters
    ----------
    levels :
        Date features to extract.
    fmt :
        Date format for string conversion if inputs are not datetime-like objects.
        Follows standard Pandas/stdlib formatting, for example, '%Y-%m-%d %H:%M:%S'.

    Attributes
    ----------
    valid_features_ :
        Mapping from input column position to the list of `DateLevel` kept for it.
    n_features_in_ :
        Number of input columns seen during `fit`.
    n_features_out_ :
        Total number of output columns produced by `transform`.
    colnames_ :
        Column labels of the DataFrame passed to `fit`. Only present when `fit`
        received a DataFrame.
    """

    def __init__(
        self, levels: Optional[Sequence[DateLevel]] = None, fmt: Optional[str] = None
    ):

        self.levels = levels
        self.fmt = fmt

    def _more_tags(self):
        return {
            "X_types": ["2darray", "string"],
            "preserves_dtype": [],
            "allow_nan": True,
        }

    def _prepare_input(
        self, X: Union[pd.DataFrame, np.ndarray, List], reset: bool
    ) -> np.ndarray:
        """Check dtypes, parse string dates and validate `X`.

        Parameters
        ----------
        X :
            Raw input passed to `fit` or `transform`.
        reset :
            Forwarded to `_validate_data`. `True` while fitting, `False` while
            transforming so that the fitted input metadata is checked instead of
            being overwritten.

        Returns
        -------
        np.ndarray
            Validated array.

        Raises
        ------
        ValueError
            If `X` is a DataFrame with more than one dtype and not all of them are
            datetime64.
        """
        if isinstance(X, pd.DataFrame):
            dtypes = X.dtypes
            if dtypes.nunique() > 1:
                if not all(pd.api.types.is_datetime64_any_dtype(dt) for dt in dtypes):
                    raise ValueError(
                        "If data contains more than one type, they all have to be datetime64 (any)."
                    )
            # `.iloc` is used because `dtypes` is indexed by column label, so `[0]`
            # would be a label lookup. The shape guard avoids indexing an empty
            # Series; empty frames are left for `_validate_data` to handle.
            elif X.shape[1] > 0 and dtypes.iloc[0] in (object, str):
                X = X.apply(pd.to_datetime, format=self.fmt)
        return self._validate_data(
            X=X, y=None, reset=reset, force_all_finite="allow-nan"
        )

    @staticmethod
    def _as_naive_index(values) -> pd.DatetimeIndex:
        """Build a `DatetimeIndex` from `values`, dropping timezone info if present."""
        dates = pd.DatetimeIndex(values)
        if dates.tz is not None:
            dates = dates.tz_convert(None)
        return dates

    def fit(self, X: Union[pd.DataFrame, np.ndarray, List], y=None) -> DatetimeEncoder:
        """Fit the DatetimeEncoder.

        Parameters
        ----------
        X :
            Datetime-like features.
        y :
            Unused.

        Returns
        -------
        DatetimeEncoder
            Fitted `DatetimeEncoder`.
        """
        columns = X.columns if isinstance(X, pd.DataFrame) else None
        X = self._prepare_input(X, reset=True)

        if columns is not None:
            self.colnames_ = columns
        elif hasattr(self, "colnames_"):
            # Refitting on a non-DataFrame input: drop labels from a previous fit so
            # that `get_feature_names_out` does not report stale names.
            del self.colnames_

        levels = self.levels if self.levels else list(DateLevel)

        self.valid_features_ = {}
        for col in range(X.shape[1]):
            # The index only depends on the column, so build it once per column.
            dates = self._as_naive_index(X[:, col])
            self.valid_features_[col] = [
                level
                for level in levels
                if getattr(dates, level.value).nunique() > 1
            ]

        self.n_features_in_ = X.shape[1]
        self.n_features_out_ = sum(
            len(features) for features in self.valid_features_.values()
        )
        return self

    def transform(self, X: Union[pd.DataFrame, np.ndarray, List]) -> np.ndarray:
        """Apply transformation. Will ignore zero variance features seen during `DatetimeEncoder.fit`.

        While this transformer is generally stateless, during `DatetimeEncoder.fit` it checks whether any of
        the extracted features have zero variance (only one unique value) and sets those levels to be
        ignored during `DatetimeEncoder.transform`.

        Parameters
        ----------
        X :
            The data to encode.

        Returns
        -------
        X :
            Transformed input. Has shape `(n_samples, 0)` if no level was kept
            during `DatetimeEncoder.fit`.
        """
        X = self._prepare_input(X, reset=False)

        all_encoded = []
        for col, levels in self.valid_features_.items():
            if not levels:
                continue
            dates = self._as_naive_index(X[:, col])
            all_encoded.extend(getattr(dates, level.value) for level in levels)

        if not all_encoded:
            # `np.stack` rejects an empty sequence; return an explicit empty result.
            return np.empty((X.shape[0], 0))
        return np.stack(all_encoded, axis=1)

    def get_feature_names_out(self, input_features=None) -> List[str]:
        """Get feature names for output.

        Names are built as `<column>_<level>`, where `<column>` is the DataFrame
        column label seen during `fit`, or the column position if `fit` did not
        receive a DataFrame. `input_features` is unused.
        """
        colnames_ = getattr(self, "colnames_", None)
        feature_names = []
        for i, levels in self.valid_features_.items():
            prefix = str(i) if colnames_ is None else colnames_[i]
            feature_names.extend(f"{prefix}_{level.value}" for level in levels)
        return feature_names

    def get_feature_names(self, input_features=None) -> List[str]:
        """Alias of `get_feature_names_out`. `input_features` is unused."""
        return self.get_feature_names_out()

In [None]:
# Render the API documentation for `DatetimeEncoder.fit`.
show_doc(DatetimeEncoder.fit)

---

[source](https://github.com/rxavier/poniard/blob/master/poniard/preprocessing/datetime.py#L64){target="_blank" style="float:right; font-size:smaller"}

### DatetimeEncoder.fit

>      DatetimeEncoder.fit
>                           (X:Union[pandas.core.frame.DataFrame,numpy.ndarray,L
>                           ist], y=None)

Fit the DatetimeEncoder.

|    | **Type** | **Default** | **Details** |
| -- | -------- | ----------- | ----------- |
| X | Union[pd.DataFrame, np.ndarray, List] |  | Datetime-like features.. |
| y | NoneType | None | Unused. |
| **Returns** | **DatetimeEncoder** |  | **Fitted `DatetimeEncoder`.** |

After fitting, the date levels kept for each input column (those with more than one unique value) are held in the `valid_features_` attribute.

In [None]:
# Render the API documentation for `DatetimeEncoder.transform`.
show_doc(DatetimeEncoder.transform)

---

[source](https://github.com/rxavier/poniard/blob/master/poniard/preprocessing/datetime.py#L113){target="_blank" style="float:right; font-size:smaller"}

### DatetimeEncoder.transform

>      DatetimeEncoder.transform
>                                 (X:Union[pandas.core.frame.DataFrame,numpy.nda
>                                 rray,List])

Apply transformation. Will ignore zero variance features seen during `DatetimeEncoder.fit`.

While this transformer is generally stateless, during `DatetimeEncoder.fit` it checks whether any of
the extracted features have zero variance (only one unique value) and sets those levels to be
ignored during `DatetimeEncoder.transform`.

|    | **Type** | **Details** |
| -- | -------- | ----------- |
| X | Union[pd.DataFrame, np.ndarray, List] | The data to encode. |
| **Returns** | **np.ndarray** | **Transformed input.** |

In [None]:
# Render the API documentation for `fit_transform`, which `DatetimeEncoder`
# inherits from `TransformerMixin`.
show_doc(DatetimeEncoder.fit_transform)

---

### TransformerMixin.fit_transform

>      TransformerMixin.fit_transform (X, y=None, **fit_params)

Fit to data, then transform it.

Fits transformer to `X` and `y` with optional parameters `fit_params`
and returns a transformed version of `X`.

|    | **Type** | **Default** | **Details** |
| -- | -------- | ----------- | ----------- |
| X | array-like of shape (n_samples, n_features) |  | Input samples. |
| y | NoneType | None | Target values (None for unsupervised transformations). |
| fit_params |  |  |  |
| **Returns** | **ndarray array of shape (n_samples, n_features_new)** |  | **Transformed array.** |

In [None]:
import pandas as pd

In [None]:
# Two datetime columns: one advancing hour by hour, one day by day.
hourly_stamps = pd.date_range(start="2022-01-01", freq="H", periods=25)
daily_stamps = pd.date_range(start="2022-01-01", freq="D", periods=25)
X = pd.DataFrame({"hours": hourly_stamps, "days": daily_stamps})

# With no explicit levels, every level is tried and constant ones are dropped.
encoder = DatetimeEncoder()
encoded = encoder.fit_transform(X)
pd.DataFrame(encoded, columns=encoder.get_feature_names_out()).head()

Unnamed: 0,hours_day,hours_hour,hours_weekday,hours_dayofyear,days_day,days_weekday,days_dayofyear
0,1,0,5,1,1,5,1
1,1,1,5,1,2,6,2
2,1,2,5,1,3,0,3
3,1,3,5,1,4,1,4
4,1,4,5,1,5,2,5


Dates can be strings as well, but datetimes and strings cannot be combined.

In [None]:
# Same idea as before, but the dates are supplied as formatted strings.
date_format = "%Y-%m-%d"
day_strings = pd.date_range(start="2022-01-01", freq="D", periods=25).strftime(
    date_format
)
quarter_strings = pd.date_range(start="2023-01-01", freq="Q", periods=25).strftime(
    date_format
)
X = pd.DataFrame({"days": day_strings, "quarters": quarter_strings})

# `fmt` tells the encoder how to parse the strings back into datetimes.
encoder = DatetimeEncoder(fmt=date_format)
encoded = encoder.fit_transform(X)
pd.DataFrame(encoded, columns=encoder.get_feature_names_out()).head()

Unnamed: 0,days_day,days_weekday,days_dayofyear,quarters_year,quarters_quarter,quarters_month,quarters_day,quarters_weekday,quarters_dayofyear,quarters_daysinmonth
0,1,5,1,2023,1,3,31,4,90,31
1,2,6,2,2023,2,6,30,4,181,30
2,3,0,3,2023,3,9,30,5,273,30
3,4,1,4,2023,4,12,31,6,365,31
4,5,2,5,2024,1,3,31,6,91,31


Date levels may be chosen explicitly. Levels with zero variance (here, `hour`) are still dropped.

In [None]:
# Restrict extraction to a hand-picked subset of levels.
selected_levels = [DateLevel.DAY, DateLevel.HOUR, DateLevel.MONTH]
encoder = DatetimeEncoder(levels=selected_levels, fmt=date_format)
encoded = encoder.fit_transform(X)
pd.DataFrame(encoded, columns=encoder.get_feature_names_out()).head()

Unnamed: 0,days_day,quarters_day,quarters_month
0,1,31,3
1,2,30,6
2,3,30,9
3,4,31,12
4,5,31,3


In [None]:
# | hide
import nbdev

# Run nbdev's notebook export step.
nbdev.nbdev_export()