# New Implementation of DataFrameEncoder


This time, the class is not responsible for any splitting.
Instead, it acts purely as a column-wise / group-wise transformation of a DataFrame and its index.

In [None]:
%config InteractiveShell.ast_node_interactivity='last_expr_or_assign'  # always print last expr.
%config InlineBackend.figure_format = 'svg'
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [None]:
import logging

# Configure the root logger so that records of level INFO and above are emitted.
logging.basicConfig(level=logging.INFO)

In [None]:
import logging
from collections.abc import Collection, Hashable, Iterable, Mapping
from typing import Optional, Union

import numpy as np
import pandas as pd
import pandas.api.types
from pandas import DataFrame, Index, MultiIndex, Series
from pandas.core.indexes.frozen import FrozenList

from tsdm.encoders import *
from tsdm.encoders import BaseEncoder

# Print NumPy floats with exactly 4 decimals and without scientific notation.
np.set_printoptions(precision=4, floatmode="fixed", suppress=True)
# No seed is passed, so the generator is not reproducible across runs.
rng = np.random.default_rng()

In [None]:
class FrameEncoder(BaseEncoder):
    r"""Apply encoders to (groups of) columns and index levels of a DataFrame.

    Encoders can be given either as a single `BaseEncoder`, which then receives
    the whole frame (resp. the whole index converted via `Index.to_frame`), or
    as a mapping ``group -> encoder``. A group is used verbatim as a pandas
    selector (``frame[group]``), so it is either a single label or a hashable
    list-like of labels such as pandas' `FrozenList`.

    The index is encoded by converting it to a DataFrame, encoding the
    resulting columns and converting the result back to an index. The index
    frame is never merged into the data columns, so data columns and index
    levels may share labels.

    `decode` restores the column order, the dtypes and the index level names
    that were recorded by `fit`.
    """

    KEYS = Hashable
    columns: Index
    r"""Columns of the frame seen by `fit`."""
    dtypes: Series
    r"""Column dtypes of the frame seen by `fit`."""
    index_columns: Index
    r"""Columns of ``index.to_frame()`` for the frame seen by `fit`."""
    index_dtypes: Series
    r"""Dtypes of ``index.to_frame()`` for the frame seen by `fit`."""
    index_names: list
    r"""Index level names of the frame seen by `fit` (may contain ``None``)."""

    column_encoders: Optional[
        Union[BaseEncoder, Mapping[Union[KEYS, Collection[KEYS]], BaseEncoder]]
    ]
    r"""Encoders for the columns."""
    index_encoders: Optional[
        Union[BaseEncoder, Mapping[Union[KEYS, Collection[KEYS]], BaseEncoder]]
    ]
    r"""Optional Encoder for the index."""
    column_decoders: Optional[
        Union[BaseEncoder, dict[Union[KEYS, Collection[KEYS]], BaseEncoder]]
    ]
    r"""Reverse Dictionary from encoded column name -> encoder"""
    index_decoders: Optional[
        Union[BaseEncoder, dict[Union[KEYS, Collection[KEYS]], BaseEncoder]]
    ]
    r"""Reverse Dictionary from encoded index name -> encoder"""

    @staticmethod
    def _names(obj) -> Union[Hashable, FrozenList]:
        r"""Return the label(s) identifying `obj`.

        Returns the level names for a `MultiIndex`, the name for a `Series` or
        `Index`, and the column labels for a `DataFrame`. Multiple labels are
        returned as a `FrozenList`, which is hashable and therefore usable both
        as dictionary key and as pandas column selector.

        Raises
        ------
        ValueError
            If `obj` is none of the supported pandas types.
        """
        if isinstance(obj, MultiIndex):
            return FrozenList(obj.names)
        if isinstance(obj, (Series, Index)):
            return obj.name
        if isinstance(obj, DataFrame):
            return FrozenList(obj.columns)
        raise ValueError(f"Cannot determine names of object of type {type(obj)}.")

    @staticmethod
    def _frame_to_index(frame: DataFrame) -> Index:
        r"""Convert all columns of `frame` into an index (row order is kept)."""
        # set_index is applied to the index frame itself, never to the data
        # frame, so labels shared with data columns cannot clash.
        return frame.set_index(list(frame.columns)).index

    def _fit_group_encoders(self, frame: DataFrame, encoders):
        r"""Fit `encoders` on `frame` and return the matching decoders.

        Returns ``None`` if `encoders` is ``None``, the encoder itself if it is
        a single `BaseEncoder`, and otherwise a dictionary that maps the
        name(s) of each encoded group to the encoder that produced it.

        Raises
        ------
        ValueError
            If two groups are encoded to the same output label, since the
            encoded columns would overwrite each other.
        """
        if encoders is None:
            return None
        if isinstance(encoders, BaseEncoder):
            encoders.fit(frame)
            return encoders

        decoders = {}
        produced = set()
        for group, encoder in encoders.items():
            encoder.fit(frame[group])
            # Encode once to learn under which name(s) the result will appear.
            names = self._names(encoder.encode(frame[group]))
            labels = set(names) if isinstance(names, FrozenList) else {names}
            clash = produced & labels
            if clash:
                raise ValueError(
                    f"Encoded names {sorted(clash, key=str)} of group {group!r}"
                    " are already produced by another encoder."
                )
            produced |= labels
            decoders[names] = encoder
        return decoders

    def _apply_group_encoders(
        self, frame: DataFrame, encoders, method: str
    ) -> DataFrame:
        r"""Replace column groups of `frame` by their transformed counterpart.

        Parameters
        ----------
        frame
            The frame whose columns are transformed. It is not modified.
        encoders
            ``None`` (`frame` is returned as-is), a single `BaseEncoder`
            applied to the whole frame, or a mapping ``group -> encoder``.
        method
            Name of the encoder method to call: ``"encode"`` or ``"decode"``.
        """
        if encoders is None:
            return frame
        if isinstance(encoders, BaseEncoder):
            transformed = getattr(encoders, method)(frame)
            # Keep only the index, then attach the transformed columns.
            result = frame.drop(columns=frame.columns)
            result[self._names(transformed)] = transformed
            return result

        result = frame
        for group, encoder in encoders.items():
            # Always read from the untouched input frame.
            transformed = getattr(encoder, method)(frame[group])
            result = result.drop(columns=group)
            result[self._names(transformed)] = transformed
        return result

    def __init__(
        self,
        column_encoders: Optional[
            Union[BaseEncoder, Mapping[Union[KEYS, Collection[KEYS]], BaseEncoder]]
        ] = None,
        *,
        index_encoders: Optional[
            Union[BaseEncoder, Mapping[Union[KEYS, Collection[KEYS]], BaseEncoder]]
        ] = None,
    ):
        r"""Store the encoder specifications; nothing is fitted here.

        Parameters
        ----------
        column_encoders
            ``None``, a single encoder for all columns, or ``group -> encoder``.
        index_encoders
            ``None``, a single encoder for the whole index frame, or
            ``group -> encoder`` keyed by the columns of ``index.to_frame()``.
        """
        super().__init__()
        self.column_encoders = column_encoders
        self.index_encoders = index_encoders

    def fit(self, data: DataFrame) -> None:
        r"""Record the layout of `data` and fit all column and index encoders."""
        data = data.copy()
        index = data.index.to_frame()
        self.columns = data.columns
        self.dtypes = data.dtypes
        self.index_columns = index.columns
        self.index_dtypes = index.dtypes
        # to_frame() labels unnamed levels 0, 1, ...; keep the real names so
        # that decode can restore them.
        self.index_names = list(data.index.names)

        self.column_decoders = self._fit_group_encoders(data, self.column_encoders)
        self.index_decoders = self._fit_group_encoders(index, self.index_encoders)

    def encode(self, data: DataFrame) -> DataFrame:
        r"""Encode the columns and the index of `data`.

        The input is not modified. If no index encoders were given, the index
        of `data` is kept unchanged.
        """
        data = data.copy(deep=True)
        encoded = self._apply_group_encoders(data, self.column_encoders, "encode")

        if self.index_encoders is not None:
            encoded_inds = self._apply_group_encoders(
                data.index.to_frame(), self.index_encoders, "encode"
            )
            encoded.index = self._frame_to_index(encoded_inds)
        return encoded

    def decode(self, data: DataFrame) -> DataFrame:
        r"""Invert `encode`.

        Column order, column dtypes, index level order, index dtypes and index
        level names are restored to what `fit` recorded. The input is not
        modified.
        """
        data = data.copy(deep=True)
        decoded_cols = self._apply_group_encoders(
            data, self.column_decoders, "decode"
        )
        decoded_inds = self._apply_group_encoders(
            data.index.to_frame(), self.index_decoders, "decode"
        )

        # Restore index order + dtypes
        decoded_inds = decoded_inds[self.index_columns]
        decoded_inds = decoded_inds.astype(self.index_dtypes)

        # Restore column order + dtypes
        decoded = decoded_cols[self.columns]
        decoded = decoded.astype(self.dtypes)

        # Assemble DataFrame; unnamed levels get their original name (None) back.
        new_index = self._frame_to_index(decoded_inds)
        decoded.index = new_index.set_names(self.index_names)
        return decoded

In [None]:
from tsdm.tasks import KIWI_FINAL_PRODUCT

# Load the task and sort its time series along both the row and the column axis.
task = KIWI_FINAL_PRODUCT()
ts = task.timeseries.sort_index(axis="index").sort_index(axis="columns")
# Fraction of non-missing values per channel, sorted ascending.
channel_freq = pd.notna(ts).mean().sort_values()
# Split the channels at a 10% observation rate; FrozenList is hashable, so
# these groups can later serve as dictionary keys.
fast_channels = FrozenList(channel_freq[channel_freq >= 0.1].index)
slow_channels = FrozenList(channel_freq[channel_freq < 0.1].index)
# Per-group frames without the rows in which every channel of the group is missing.
FAST = ts[fast_channels].dropna(how="all")
SLOW = ts[slow_channels].dropna(how="all")
groups = {"fast": fast_channels, "slow": slow_channels}

In [None]:
from tsdm.encoders import *

# One encoder per channel group (keyed by the hashable FrozenLists) and one
# encoder per index level (keyed by the level name).
enc = FrameEncoder(
    column_encoders={
        fast_channels: Standardizer(),
        slow_channels: MinMaxScaler(),
    },
    index_encoders={
        "run_id": IntEncoder(),
        "experiment_id": IntEncoder(),
        "measurement_time": DateTimeEncoder(),
    },
)
enc.fit(ts)

In [None]:
# Round-trip check: decoding the encoded frame must reproduce the original.
encoded = enc.encode(ts)
decoded = enc.decode(encoded)
pd.testing.assert_frame_equal(ts, decoded)

In [None]:
# Display the encoded frame.
encoded

In [None]:
# Drop all data columns, move the last index level into a column and select
# `measurement_time` as a Series (presumably that last level; confirm against
# the index layout of `ts`).
T = ts[[]].reset_index(-1)["measurement_time"]

In [None]:
# Round-trip the time stamps through a stand-alone DateTimeEncoder and display the result.
e = DateTimeEncoder()
e.fit(T)
e.decode(e.encode(T))

In [None]:
# Show only the rows in which the Acetate channel is not missing.
mask = pd.notna(ts.Acetate)
ts.Acetate[mask]

In [None]:
from types import MethodType

# Attach a `to_frame` method to this particular DataFrame instance that simply
# returns the frame itself; the DataFrame class is not changed.
ts.to_frame = MethodType(lambda self: self, ts)

In [None]:
%%timeit
ts.to_frame()

In [None]:
# Fit a stand-alone MinMaxScaler on the fast channels (rebinds `e`).
e = MinMaxScaler()

e.fit(ts[fast_channels])

In [None]:
# Inspect the shape of the scaler's `xmin` attribute after fitting.
e.xmin.shape

In [None]:
# Inspect the number of dimensions of the scaler's `ymin` attribute.
e.ymin.ndim

In [None]:
# Slice the scaler and inspect `xmin` of the result.
# NOTE(review): presumably this restricts the scaler to channels 3..5; confirm
# against MinMaxScaler.__getitem__.
e[3:6].xmin

In [None]:
# Encode the fast channels with the fitted scaler and display the result.
e.encode(ts[fast_channels])