# Encoder experiments

In [None]:
%config InteractiveShell.ast_node_interactivity='last_expr_or_assign'  # display the last expression or assignment of each cell
# Render inline figures as SVG.
%config InlineBackend.figure_format = 'svg'
# Load the autoreload extension and switch it to mode 2.
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [None]:
from collections import defaultdict
from typing import Any, Optional, Union

import numpy as np
import pandas.api.types
import torch
from pandas import NA, DataFrame, Index, MultiIndex, Series

In [None]:
from tsdm.datasets import ETTh1

# Input data for the encoder cells below; presumably a pandas DataFrame
# (later cells use ds.values, ds["OT"] and assert_frame_equal on it) - confirm.
ds = ETTh1.dataset

## Standardizer

In [None]:
from tsdm.encoders import Standardizer

# Fit a fresh Standardizer on the dataset, then encode that same dataset.
encoder = Standardizer()
encoder.fit(ds)
encoded = encoder.encode(ds)

In [None]:
# Decode the *encoded* data (not the raw dataset) so this is an actual round trip.
decoded = encoder.decode(encoded)

## ChainedEncoder

In [None]:
# Combine two Standardizers with the @ operator; the next cell indexes the
# result as enc[0] / enc[1].
enc = Standardizer() @ Standardizer()

In [None]:
# Fit the combined encoder, then show the `mean` attribute of both members.
enc.fit(ds)
enc[0].mean, enc[1].mean

In [None]:
# Same single-Standardizer fit/encode as in the Standardizer section;
# rebinds the notebook-level names `encoder` and `encoded`.
encoder = Standardizer()
encoder.fit(ds)
encoded = encoder.encode(ds)

In [None]:
# Decode the *encoded* data (not the raw dataset) so this is an actual round trip.
decoded = encoder.decode(encoded)

## DataFrameEncoder

In [None]:
# Show the dtypes obtained when the "OT" column is wrapped in a DataFrame.
DataFrame(ds["OT"]).dtypes

In [None]:
from tsdm.datasets import ETTh1
from tsdm.encoders import DataFrameEncoder, DateTimeEncoder, Standardizer

ds = ETTh1.dataset

# Three separate Standardizer instances; several columns deliberately share one.
encoderA, encoderB, encoderC = Standardizer(), Standardizer(), Standardizer()

# Column name -> encoder instance (insertion order matches the column listing).
encoders = dict(
    HUFL=encoderA,
    HULL=encoderB,
    MUFL=encoderA,
    MULL=encoderC,
    LUFL=encoderB,
    LULL=encoderB,
    OT=encoderB,
)

In [None]:
# Column-wise encoders from the mapping above plus a DateTimeEncoder for the index.
# NOTE(review): "h" is presumably an hourly time unit - confirm against DateTimeEncoder.
e = DataFrameEncoder(encoders, index_encoder=DateTimeEncoder("h"))

In [None]:
# Round trip: fit, encode, decode, then assert the decoded frame equals the input.
e.fit(ds)
encoded = e.encode(ds)
decoded = e.decode(encoded)
pandas.testing.assert_frame_equal(ds, decoded)

### DataFrameEncoder Test Implementation

In [None]:
from tsdm.encoders import BaseEncoder


class DataFrameEncoder:
    r"""Combine multiple encoders into a single one.

    It is assumed that the DataFrame Modality doesn't change.

    This test implementation only does the bookkeeping: the constructor records
    which encoder is responsible for which part of a DataFrame and stores the
    result in ``spec``. No ``fit``/``encode``/``decode`` is defined here.

    ``spec`` is a DataFrame indexed by ``(section, partition)``, where ``section``
    is ``"index"`` or ``"columns"``, holding the fields ``col``, ``encoder``,
    ``dim_in`` and ``dim_out``.
    """

    column_encoder: Union[BaseEncoder, dict[Any, BaseEncoder]]
    r"""Encoders for the columns."""
    index_encoder: Optional[BaseEncoder] = None
    r"""Optional Encoder for the index."""
    colspec: Optional[list[str]] = None
    r"""The columns-specification of the DataFrame."""
    encode_index: bool
    r"""Whether to encode the index."""
    column_wise: bool
    r"""Whether to encode column-wise"""
    partitions: Optional[dict] = None
    r"""Contains partitions if used column wise"""
    spec: DataFrame
    r"""Table describing which encoder handles which section/partition."""

    def __init__(
        self,
        encoders: Union[BaseEncoder, dict[Any, BaseEncoder]],
        *,
        index_encoder: Optional[BaseEncoder] = None,
    ):
        r"""Set up the individual encoders.

        Note: the same encoder instance can be used for multiple columns.

        Parameters
        ----------
        encoders
            Either a single encoder applied to all columns, or a dict mapping
            column keys to encoders. In the dict case, all keys mapped to the
            same (hash-equal) encoder are grouped into one partition.
        index_encoder
            Optional encoder for the index. If ``None``, the ``"index"`` section
            of ``spec`` is empty.
        """
        super().__init__()
        self.column_encoder = encoders
        self.index_encoder = index_encoder
        self.column_wise: bool = isinstance(self.column_encoder, dict)
        self.encode_index: bool = index_encoder is not None

        if self.encode_index:
            # Single row for the index encoder; dimensions are not known yet.
            _idxenc_spec = {
                "col": NA,
                "encoder": self.index_encoder,
                "dim_in": NA,
                "dim_out": NA,
            }
            idxenc_spec = DataFrame.from_records(
                _idxenc_spec, index=Index([NA], name="partition")
            )
        else:
            idxenc_spec = DataFrame(
                columns=["col", "encoder", "dim_in", "dim_out"],
                index=Index([], name="partition"),
            )

        if not self.column_wise:
            # One encoder for the whole frame -> a single partition 0.
            _colenc_spec = {
                "col": NA,
                "encoder": self.column_encoder,
                "dim_in": NA,
                "dim_out": NA,
            }
            colenc_spec = DataFrame.from_records(
                _colenc_spec, index=Index([0], name="partition")
            )
        else:
            # Group the column keys by encoder. A dict keeps insertion order, so
            # partition numbers follow the first appearance of each encoder in
            # the mapping and are reproducible; iterating a ``set`` of encoder
            # objects would give an arbitrary order that can differ between runs.
            _columns = defaultdict(list)
            for key, encoder in self.column_encoder.items():
                _columns[encoder].append(key)

            _encoders = tuple(_columns)
            unique_encoders = Series(_encoders, name="encoder")
            partitions = Series(range(len(_encoders)), name="partition")
            columns = Series(_columns, name="col")

            colenc_spec = DataFrame(unique_encoders, index=partitions)
            # Attach to every encoder the list of columns it is responsible for.
            colenc_spec = colenc_spec.join(columns, on="encoder")
            colenc_spec["dim_in"] = colenc_spec["col"].apply(len)
            colenc_spec["dim_out"] = NA

        # Stack both parts under a (section, partition) MultiIndex.
        self.spec = pandas.concat(
            [idxenc_spec, colenc_spec],
            keys=["index", "columns"],
            names=["section", "partition"],
        )

In [None]:
# Build an encoder from the column mapping with a Standardizer for the index,
# then display its spec table.
e = DataFrameEncoder(encoders, index_encoder=Standardizer())
e.spec

In [None]:
# Write 3 into dim_out of the "index" section, then use that stored value as an
# integer index into the last axis of a (3, 4, 5) tensor and show the shape.
e.spec.loc["index", "dim_out"] = 3
torch.randn(3, 4, 5)[..., e.spec.loc["index", "dim_out"].item()].shape

In [None]:
# Read the index section's dim_out back as a plain scalar via .item().
e.spec.loc["index", "dim_out"].item()

In [None]:
# Try Tensor.nanmean with the `axis=None` spelling on the dataset values
# (the cells below try the `dim=None` spellings).
torch.Tensor(ds.values).nanmean(axis=None)

In [None]:
# Plain torch.mean on a tensor that contains a NaN, for comparison with nanmean.
torch.mean(torch.tensor([float("nan"), 2]))

In [None]:
# Display the NumPy nanstd function object (it is not called here).
np.nanstd

In [None]:
# Function form of nanmean with `dim=None` on the dataset values.
torch.nanmean(torch.Tensor(ds.values), dim=None)

In [None]:
# Method form of nanmean with `dim=None` on the dataset values.
torch.Tensor(ds.values).nanmean(dim=None)

In [None]:
# Mean of the "OT" column as computed on the dataset object itself.
ds["OT"].mean()

In [None]:
# Experiment: build a one-row spec frame from a dict of scalars via from_records,
# using a bare object() as a stand-in for an encoder.
indexenc = DataFrame.from_records(
    {"Encoders": object(), "dim_in": 1, "dim_out": 5}, index=Index([0])
)

In [None]:
# Alternative construction of the same record: from_dict with orient="index",
# then transposed.
DataFrame.from_dict({"Encoders": object(), "dim_in": 1, "dim_out": 5}, orient="index").T

In [None]:
# Stack the stand-alone index record on top of the encoder's spec and label the
# two parts; the level name matches the "partition" spelling used by the spec.
pandas.concat(
    [indexenc, e.spec], keys=["index", "columns"], names=["section", "partition"]
)

In [None]:
# Empty two-level (section, partition) MultiIndex: the section level is
# pre-populated with "index"/"columns", the partition level has no values.
idx = MultiIndex(
    levels=[["index", "columns"], []], codes=[[], []], names=["section", "partition"]
)

In [None]:
# Empty spec frame with a flat "partition" index. The commented-out argument is
# the two-level (section, partition) MultiIndex alternative from the previous cell.
spec = DataFrame(
    columns=["col", "encoder", "dim_in", "dim_out"],
    index=Index([], name="partition"),
    # index=MultiIndex(
    #     levels=[["index", "columns"], []],
    #     codes=[[], []],
    #     names=["section", "partition"],
    # ),
)

In [None]:
# Stand-alone version of the single-row spec record: object() stands in for the
# encoder and every other field is left as NA.
_index_encoder_spec = {
    "col": NA,
    "encoder": object(),
    "dim_in": NA,
    "dim_out": NA,
}
index_encoder_spec = DataFrame.from_records(
    _index_encoder_spec, index=Index([0], name="partition")
)

## Concat Encoder

## Time2Float

## DateTimeEncoder

### Combined

In [None]:
from tsdm.encoders import (
    DataFrameEncoder,
    DateTimeEncoder,
    FloatEncoder,
    Standardizer,
    TensorEncoder,
)

In [None]:
# One chained encoder (Standardizer @ FloatEncoder) for all columns, plus a
# DateTimeEncoder for the index. When cells run top to bottom, DataFrameEncoder
# here is the tsdm class re-imported in the previous cell, not the test class
# defined earlier in this notebook.
df_encoder = DataFrameEncoder(
    (Standardizer() @ FloatEncoder()), index_encoder=DateTimeEncoder()
)

# NOTE(review): `Tensorizer` is not referenced anywhere else in the visible cells.
Tensorizer = TensorEncoder()

# Full pipeline: a TensorEncoder chained with the DataFrame encoder via @.
encoder = TensorEncoder() @ df_encoder

In [None]:
from tsdm.datasets import ETTh1

# Rebind `ds` to the ETTh1 dataset for the combined-encoder cells below.
ds = ETTh1.dataset

In [None]:
# Fit the column part of the pipeline (Standardizer @ FloatEncoder) on its own.
enc = Standardizer() @ FloatEncoder()
enc.fit(ds)

In [None]:
# Display the dataset as encoded by the chained column encoder alone.
enc.encode(ds)

In [None]:
# Fit the full pipeline (TensorEncoder @ df_encoder) on the dataset.
encoder.fit(ds)

In [None]:
# Encode with the full pipeline; the last cell indexes the result as
# encoded[0] and encoded[1].
encoded = encoder.encode(ds)

In [None]:
# Decode the pipeline output back again.
decoded = encoder.decode(encoded)

In [None]:
# Perturb the first two items of the encoded output (scale item 0, shift item 1)
# and decode the resulting 2-tuple.
modified = encoded[0] * 2, encoded[1] + 1
encoder.decode(modified)