In [None]:
# Notebook display / reload configuration (IPython magics) and logging setup.
%config InteractiveShell.ast_node_interactivity='last_expr_or_assign'  # always print last expr.
# Render inline figures as SVG.
%config InlineBackend.figure_format = 'svg'
# Load the autoreload extension and set it to mode 2.
%load_ext autoreload
%autoreload 2
%matplotlib inline

import logging

# Configure the root logger at INFO level.
logging.basicConfig(level=logging.INFO)

In [None]:
import numpy as np
import pandas as pd
import torch
from pytest import mark, skip

from tsdm.encoders.numerical import (
    BoundaryEncoder,
    LinearScaler,
    MinMaxScaler,
    StandardScaler,
    get_broadcast,
    get_reduced_axes,
)

# NOTE(review): basicConfig was already called with the same level in the first cell;
# this repeat looks redundant.
logging.basicConfig(level=logging.INFO)
# Notebook-level logger; a child logger is derived from it further down.
__logger__ = logging.getLogger(__name__)

In [None]:
# Smoke test: fit a StandardScaler built with axis=-1 on a random 5x3 DataFrame and encode it.
# No seed is set, so the data differs on every run.
X = pd.DataFrame(np.random.rand(5, 3), columns=["a", "b", "c"])
encoder = StandardScaler(axis=-1)
encoder.fit(X)
encoded = encoder.encode(X)

In [None]:
# Encoder class exercised by the DataFrame round-trip in this cell.
Encoder = StandardScaler

# Random 5x3 frame with labelled columns; the encoder uses its default arguments.
X = pd.DataFrame(np.random.rand(5, 3), columns=list("abc"))
encoder = Encoder()

# --- fitting: the first parameter must hold one entry per column.
encoder.fit(X)
assert encoder.params[0].shape == (3,), f"{encoder.params}"

# --- encoding: container type, shape, columns and index must survive.
encoded = encoder.encode(X)
assert isinstance(encoded, pd.DataFrame)
assert encoded.shape == X.shape
assert encoded.columns.equals(X.columns)
assert encoded.index.equals(X.index)

# Scaler-specific expectations on the encoded values.
if Encoder is MinMaxScaler:
    assert (encoded.min() >= 0.0).all()
    assert (encoded.max() <= 1.0).all()
if Encoder is StandardScaler:
    assert np.allclose(encoded.mean(), 0.0)
    assert np.allclose(encoded.std(ddof=0), 1.0)

# --- decoding: the round trip must reproduce the input.
decoded = encoder.decode(encoded)
pd.testing.assert_frame_equal(X, decoded)
assert np.allclose(X, decoded)

In [None]:
# Inspect NumPy's std applied directly to X, with no axis given.
np.std(X)

In [None]:
# Element-wise comparison of the encoder's `stdv` attribute with the ddof=0 std of X.
encoder.stdv == X.std(ddof=0)

In [None]:
# Display the encoded data.
encoded

In [None]:
# Hand-computed standard deviation along axis 0: root of the NaN-ignoring mean of
# squared deviations from the axis-0 mean.
m = np.mean(X, axis=0)
np.sqrt(np.nanmean((X - m) ** 2, axis=0))

In [None]:
# NumPy's std along axis 0, for comparison with the hand-computed value.
np.std(X, axis=0)

In [None]:
# ddof=0 std of the encoded data (the quantity the StandardScaler assertions compare to 1.0).
encoded.std(ddof=0)

In [None]:
# Small float tensor whose last entry is NaN.
x = torch.tensor([1,2,3, 4.5, float("nan")])

In [None]:
# Indexing with `[:, None]` appends a length-1 axis, turning shape (3,) into (3, 1).
np.array([1,2,3])[:, None]

In [None]:
# Round-trip test on a named 1-D Series, with the encoder built from default arguments.
X = pd.Series([-1.0, 1.2, 2.7, 3.0], name="foo")
encoder = Encoder()

# validate fitting: the first parameter must be a scalar (shape ()).
encoder.fit(X)
assert encoder.params[0].shape == (), f"{encoder.params}"

# validate encoding: container type, shape, name and index must be preserved.
encoded = encoder.encode(X)
assert (
    isinstance(encoded, pd.Series)
    and encoded.shape == X.shape
    and encoded.name == X.name
    and encoded.index.equals(X.index)
)
if Encoder is MinMaxScaler:
    # Series.min()/max() return scalars, so compare them directly; wrapping the
    # comparison in the builtin all() raises TypeError because a bool is not iterable.
    assert encoded.min() >= 0.0
    assert encoded.max() <= 1.0
if Encoder is StandardScaler:
    assert np.allclose(encoded.mean(), 0.0)
    assert np.allclose(encoded.std(ddof=0), 1.0)

# validate decoding: the round trip must reproduce the input.
decoded = encoder.decode(encoded)
pd.testing.assert_series_equal(X, decoded)
assert np.allclose(X, decoded)

In [None]:
# Series of five timedeltas: 0 to 4 seconds.
s = pd.Series(np.arange(5) * np.timedelta64(1, "s"))

In [None]:
# Element-wise self-division; note the first entry divides a zero timedelta by itself.
s / s

In [None]:
# Constant integer Series with one million entries, used as a timing fixture.
s = pd.Series(5, range(int(1e6)))

# Time the divide-by-self-then-fill-NaN-with-1 approach.
%timeit s.div(s).fillna(1)

In [None]:
# Time the alternative: raising every element to the power 0.
%timeit s.pow(0)

In [None]:
# Check `** 0` on a negative 0-d array.
np.array(-1)**0

In [None]:
# Same check with NaN as the base.
np.array(float("nan"))**0

In [None]:
# Round-trip test on a named 1-D Series, this time with the encoder built with axis=-1.
X = pd.Series([-1.0, 1.2, 2.7, 3.0], name="foo")
encoder = Encoder(axis=-1)
encoder.fit(X)

# Encoding must preserve container type, shape, name and index.
encoded = encoder.encode(X)
assert (
    isinstance(encoded, pd.Series)
    and encoded.shape == X.shape
    and encoded.name == X.name
    and encoded.index.equals(X.index)
)

# Decoding must reproduce the input.
decoded = encoder.decode(encoded)
pd.testing.assert_series_equal(X, decoded)
assert np.allclose(X, decoded)
# NOTE(review): X has 4 elements, so the expected shape (3,) looks copied from the
# 3-column DataFrame test — confirm the intended value.
assert encoder.params[0].shape == (3,), f"{encoder.params}"

if Encoder is MinMaxScaler:
    # Series.min()/max() return scalars, so compare them directly; wrapping the
    # comparison in the builtin all() raises TypeError because a bool is not iterable.
    assert encoded.min() >= 0.0
    assert encoded.max() <= 1.0
if Encoder is StandardScaler:
    assert np.allclose(encoded.mean(), 0.0)
    assert np.allclose(encoded.std(ddof=0), 1.0)

In [None]:
# Display the encoder's `stdv` attribute.
encoder.stdv

In [None]:
# std with pandas' default arguments (ddof=1), unlike the ddof=0 used in the assertions.
X.std()

In [None]:
# A bare `raise` with no active exception raises RuntimeError.
# NOTE(review): presumably a deliberate stop so that Run All halts here — confirm.
raise

In [None]:
# Mean of the encoded data.
encoded.mean()

In [None]:
# A bare `raise` with no active exception raises RuntimeError.
# NOTE(review): presumably a deliberate stop so that Run All halts here — confirm.
raise

In [None]:
# Series of four timedeltas: 0 to 3 seconds.
pd.Series(np.arange(4) *np.timedelta64(1, "s"))

In [None]:
# Alias the encoder as `self` so the following cells can use `self.` attribute access.
self = encoder

In [None]:
# Copy the encoder's attributes into local names.
xmin = self.xmin  # [broadcast]
xmax = self.xmax  # [broadcast]
ymin = self.ymin  # [broadcast]
ymax = self.ymax  # [broadcast]
xbar = self.xbar  # [broadcast]
ybar = self.ybar  # [broadcast]
scale = self.scale  # [broadcast]

# Affine map: shift by xbar, multiply by scale, then shift by ybar.
# NOTE(review): `x` is not assigned in this cell; the nearest earlier assignment is the
# torch tensor containing a NaN — confirm that is the intended input.
y = (x - xbar) * scale + ybar

In [None]:
# Only applied when the encoder's `safe_computation` attribute is truthy.
# NOTE(review): the argument order of self.clip is assumed to be (values, lower, upper) — confirm.
if self.safe_computation:
    # ensure the conditions
    # x < xₘᵢₙ ⟹ y < yₘᵢₙ  ∧  x > xₘₐₓ ⟹ y > yₘₐₓ  ∧  x∈[xₘᵢₙ, xₘₐₓ] ⟹ y∈[yₘᵢₙ, yₘₐₓ]
    # Where x is below xmin, use y clipped with ymin as the third argument.
    y = self.where(x < xmin, self.clip(y, None, ymin), y)
    # Where x is above xmax, use y clipped with ymax as the second argument.
    y = self.where(x > xmax, self.clip(y, ymax, None), y)
    # Where x lies inside [xmin, xmax], use y clipped to [ymin, ymax].
    y = self.where((x >= xmin) & (x <= xmax), self.clip(y, ymin, ymax), y)

In [None]:
# Boolean mask of the inputs that lie inside [xmin, xmax].
(x >= xmin) & (x <= xmax)

In [None]:
# Call y's own clip method with ymin/ymax and axis=1.
y.clip(ymin, ymax, axis=1)

In [None]:
# The same clip through the encoder's own `clip` helper, for comparison.
self.clip(y, ymin, ymax)

In [None]:
# The in-range branch of the safe-computation rule, evaluated on its own.
self.where((x >= xmin) & (x <= xmax), self.clip(y, ymin, ymax), y)

In [None]:
# Display ymin.
ymin

In [None]:
# Method-chain spelling of the below-range rule: clip y with ymin as the upper bound,
# then call `.where(x < xmin, y, axis=1)` on the clipped result.
y.clip(None, ymin, axis=1).where(x < xmin, y, axis=1)

In [None]:
# Below-range rule via the encoder's `where`, using y's own clip method; rebinds `y`.
y = self.where(x < xmin, y.clip(None, ymin, axis=1), y)

In [None]:
# Affine map written directly on the notebook-level objects. The expression previously
# ended in a dangling `+` (a SyntaxError); `encoder.ybar` completes it to match
# `y = (x - xbar) * scale + ybar` from the earlier cell.
# NOTE(review): confirm `ybar` was the intended final operand.
(X - encoder.xbar) * encoder.scale + encoder.ybar

In [None]:
# Ratio of the output range (ymax - ymin) to the input range (xmax - xmin).
self = encoder
dx = self.xmax - self.xmin
dy = self.ymax - self.ymin
dy/dx

In [None]:
# A bare `raise` with no active exception raises RuntimeError.
# NOTE(review): presumably a deliberate stop so that Run All halts here — confirm.
raise

In [None]:
# Display the encoder's first parameter.
encoder.params[0]

In [None]:
# IPython help lookup (trailing `?`) for the `equals` method of X.columns; leftover
# interactive introspection.
X.columns.equals?

In [None]:
# Switch the class under test to MinMaxScaler and choose torch.tensor as the array
# constructor for the tensor tests in the following cells.
Encoder = MinMaxScaler
tensor_type = torch.tensor

In [None]:
# Logger named after the encoder class under test.
LOGGER = __logger__.getChild(Encoder.__name__)
LOGGER.info("Testing.")

# --- Case 1: 1-D input, default arguments -> the first parameter is a scalar.
LOGGER.info("Testing without batching.")
X = tensor_type(np.random.rand(3))
encoder = Encoder()
encoder.fit(X)
encoded = encoder.encode(X)
decoded = encoder.decode(encoded)
assert np.allclose(X, decoded)
assert encoder.params[0].shape == (), f"{encoder.params}"

# --- Case 2: 3x5 input, axis=-1 -> one parameter entry per column.
LOGGER.info("Testing with single batch-dim.")
X = tensor_type(np.random.rand(3, 5))
encoder = Encoder(axis=-1)
encoder.fit(X)
encoded = encoder.encode(X)
decoded = encoder.decode(encoded)
assert np.allclose(X, decoded)
assert encoder.params[0].shape == (5,), f"{encoder.params}"

# --- Case 3: index the fitted encoder and apply it to the matching column.
LOGGER.info("Testing slicing.")
Y = encoded  # keep the full encoding for comparison
encoder = encoder[2]  # select the third encoder
# The sliced encoder is used as-is; it is not re-fitted on X[:, 2].
encoded = encoder.encode(X[:, 2])
decoded = encoder.decode(encoded)
assert np.allclose(Y[:, 2], encoded)
assert np.allclose(X[:, 2], decoded)
assert encoder.params[0].shape == ()

In [None]:
# Display the current input X.
X

In [None]:
# Encode the full X with the current `encoder` (the sliced one if the cell above has run).
encoder.encode(X)

In [None]:
# Display the encoder's `where` attribute.
encoder.where

In [None]:
# Stand-alone copy of the batched case: 3x5 input with axis=-1, expecting one
# parameter entry per column.
LOGGER.info("Testing with single batch-dim.")
X = tensor_type(np.random.rand(3, 5))
encoder = Encoder(axis=-1)
encoder.fit(X)

# The round trip must reproduce the input.
encoded = encoder.encode(X)
decoded = encoder.decode(encoded)
assert np.allclose(X, decoded)
assert encoder.params[0].shape == (5,), f"{encoder.params}"

In [None]:
# Stand-alone copy of the slicing case: index the fitted encoder and check it against
# the matching column of the full encoding.
# NOTE(review): `encoder` is rebound to its own slice, so running this cell twice
# indexes the already-sliced encoder — re-run the fitting cell first.
LOGGER.info("Testing slicing.")
encoder = encoder[2]  # select the third encoder
# Keep the full encoding for the column comparison below.
Y = encoded
# encoder.fit(X[:, 2])
encoded = encoder.encode(X[:, 2])
decoded = encoder.decode(encoded)
assert np.allclose(Y[:, 2], encoded)
assert np.allclose(X[:, 2], decoded)
assert encoder.params[0].shape == ()

In [None]:
from tsdm.utils.backends import KernelProvider, get_backend

In [None]:
# Call get_backend on the encoder's parameters and display the result.
get_backend(encoder.params)

In [None]:
# The call was left unterminated (a SyntaxError), so it is completed here.
# NOTE(review): `torch.Tensor` is an assumption based on the torch-tensor fitting
# cells above — confirm the intended type.
isinstance(encoder.params[0], torch.Tensor)

In [None]:
# Wrap a Series created with dtype "float[pyarrow]" in a DataFrame.
pd.DataFrame(pd.Series([1, 2, 3], dtype="float[pyarrow]"))

In [None]:
# Compare a length-3 Series with a length-1 array; the analogous comparison further
# down is marked ✘ by the author.
pd.Series([1, 2, 3]) > np.array([1.2])

In [None]:
# Exact repeat of the comparison in the previous cell.
pd.Series([1, 2, 3]) > np.array([1.2])

In [None]:
# The same comparison on the underlying NumPy array (`.values`) instead of the Series.
pd.Series([1, 2, 3]).values > np.array([1.2])

In [None]:
# The same idea for a 7x2 frame: compare its `.values` array with a length-1 array.
pd.DataFrame(np.random.randn(7, 2)).values > np.array([0.0])

In [None]:
# Shape of the one-element array used on the right-hand side above.
np.array([1.2]).shape

In [None]:
# Repeats imports already made near the top of the notebook.
import numpy as np
import pandas as pd

val = np.array([0.0])  # shape: (1,)

# For each shape, check that comparing a DataFrame with `val` matches the plain NumPy
# comparison once both results are squeezed.
test_shapes = [(1,), (7,), (1, 1), (7, 1), (1, 7)]
for shape in test_shapes:
    data = np.random.randn(*shape)
    assert np.all((pd.DataFrame(data) > val).values.squeeze() == (data > val).squeeze())

# however:
data = np.random.randn(7)

# The author's ✘ marks this Series-vs-array comparison as the case that does not
# behave like the DataFrame ones above.
pd.Series(data) > val  # ✘

In [None]:
# The DataFrame version of the comparison, for the same `data` and `val`.
pd.DataFrame(data) > val

In [None]:
# The same result as a squeezed NumPy array.
(pd.DataFrame(data) > val).values.squeeze()

In [None]:
# Plain NumPy comparison, for reference.
(data > val)

In [None]:
# Pass a 4-D array to the DataFrame constructor.
# NOTE(review): presumably probing how input with more than two dimensions is handled — confirm.
pd.DataFrame(np.random.randn(3, 4, 5, 6))

In [None]:
# Display the current `data` array.
data

In [None]:
# Call `.item()` on a 7-element array; NumPy documents `.item()` without arguments as
# valid only for size-1 arrays, so this is expected to raise ValueError.
np.random.randn(7).item()

In [None]:
# Squeeze `val`, dropping its length-1 axis.
val.squeeze()

In [None]:
from typing import NamedTuple

In [None]:
class Point(NamedTuple):
    """Three-field named tuple of ints; `z` defaults to 1 when omitted."""

    # Required fields.
    x: int
    y: int
    # Optional field with a default value.
    z: int = 1

In [None]:
# `+` here is tuple concatenation, so the result is an ordinary 5-element tuple
# rather than a Point.
Point(0, 0) + (0, 0)