In [None]:
# Notebook-wide IPython configuration.
# 'last_expr_or_assign': a cell displays its final expression *or* the value of its final assignment.
%config InteractiveShell.ast_node_interactivity='last_expr_or_assign'  # always print last expr.
# Inline figures are rendered as SVG.
%config InlineBackend.figure_format = 'svg'
# autoreload mode 2: edited modules are re-imported automatically before code is executed.
%load_ext autoreload
%autoreload 2
%matplotlib inline

import logging

# Configure the root logger to emit messages of level INFO and above.
logging.basicConfig(level=logging.INFO)

In [None]:
import numpy as np
import pandas

from tsdm.datasets import Electricity

In [None]:
# Instantiate the dataset object; because `last_expr_or_assign` is enabled in the
# setup cell, the assigned value is also shown as this cell's output.
# NOTE(review): `x` is rebound to the resampled DataFrame in the data-preparation
# cell further down, so this binding only serves the inspection here.
x = Electricity()

In [None]:
# Scratch check: yearly-spaced timestamps between 2011 and 2015.
# NOTE(review): the "1Y" alias is anchored at year *end* in the pandas versions this
# notebook appears to target, and newer pandas releases spell it "YE" -- confirm
# against the installed version.
pandas.date_range("2011", "2015", freq="1Y")

In [None]:
ds = Electricity().dataset

# Aggregate to hourly totals. resample() returns a new frame, so `ds` itself
# is left untouched and no defensive copy is needed.
x = ds.resample("1h").sum()

# Exact zeros are treated as missing values.
mask = x == 0.0
x = x.mask(mask)

# Overall value range of the data (NaNs are skipped by min/max).
ymin = x.min().min()
ymax = x.max().max()

# Time window used as x-limits of the plots below.
xmin = np.datetime64("2010-10")
xmax = np.datetime64("2015-04")

In [None]:
# One tick per calendar year; the label list and `dates` are the same object.
dates = [f"{year}" for year in range(2011, 2016)]
xlabels = dates
# Year strings are parsed directly into datetime64 tick positions.
xticks = np.asarray(dates, dtype=np.datetime64)
# Tick positions for the logarithmic y-axis.
yticks = [1, 1000, 1000000]

Every year, on the day of the March time change (which has only 23 hours), the values between 1:00 am and 2:00 am are zero for all points. On the day of the October time change (which has 25 hours), the values between 1:00 am and 2:00 am aggregate the consumption of two hours.

In [None]:
import matplotlib.pyplot as plt

In [None]:
plt.style.use("bmh")


def make_figure(cols, data=None, nrows=8, ncols=3):
    """Draw one A4-sized page of smoothed consumption curves, one panel per column.

    Each selected column is smoothed (rolling sum over 4 samples followed by a
    rolling mean over ``24 * 7`` samples) and drawn on a logarithmic y-axis.
    Axis limits and ticks come from the notebook-level ``xmin``, ``xmax``,
    ``xticks``, ``xlabels`` and ``yticks``.

    Parameters
    ----------
    cols : iterable
        Column labels to plot. Only the first ``nrows * ncols`` labels are
        drawn; any further labels are ignored.
    data : pandas.DataFrame, optional
        Frame to read the columns from. Defaults to the notebook-level ``x``,
        looked up when the function is called.
    nrows, ncols : int, optional
        Shape of the panel grid (default 8 x 3).

    Returns
    -------
    fig : matplotlib.figure.Figure
    axes : numpy.ndarray
        2-D array of shape ``(nrows, ncols)`` holding the axes.
    """
    frame = x if data is None else data
    cols = list(cols)

    fig, axes = plt.subplots(
        ncols=ncols,
        nrows=nrows,
        figsize=(8.27, 11.69),  # A4 portrait, in inches
        constrained_layout=True,
        sharex=True,
        sharey=True,
        squeeze=False,  # keep `axes` 2-D even for a single row or column
    )
    panels = axes.flatten()

    for col, ax in zip(cols, panels):
        # NOTE(review): the original comment calls this "kWH aggregation". The
        # default frame `x` already holds hourly sums (see the resample in the
        # preparation cell), so this is a 4-hour rolling sum -- confirm intent.
        series = frame[col].rolling(4).sum()
        series = series.rolling(24 * 7).mean()  # weekly aggregation
        ax.semilogy(series.index, series.values, lw=1)
        ax.set_ylim(0.1, 1_000_000)
        ax.set_xlim(xmin, xmax)
        ax.set_title(col)
        ax.set_yticks(yticks)
        ax.set_xticks(xticks, labels=xlabels)

    n_used = min(len(cols), panels.size)
    if n_used < panels.size:
        # A short page (e.g. the last page of a booklet) would otherwise show
        # empty, ticked panels.
        for ax in panels[n_used:]:
            ax.set_visible(False)
        # With sharex=True only the bottom row carries x tick labels; once that
        # row is hidden, re-enable them on the lowest used panel per column.
        for j in range(min(ncols, n_used)):
            last_row = (n_used - 1 - j) // ncols
            axes[last_row, j].tick_params(axis="x", labelbottom=True)

    return fig, axes


# Single overview page: only the first 8 * 3 = 24 columns fit on it.
fig, _ = make_figure(ds.columns)
fig.savefig("Electricity.pdf", orientation="portrait")

In [None]:
%matplotlib agg
from matplotlib.backends.backend_pdf import PdfPages
from tqdm.auto import trange

plt.style.use("bmh")

L = len(ds.columns)
B = 24

with PdfPages("electricity-booklet.pdf") as pdf:
    for idx in trange(0, L, B):
        cols = ds.columns[idx : idx + B]
        fig, _ = make_figure(cols)
        pdf.savefig(fig)
        plt.close(fig)

### 2.1 Triplet Format

The data is represented as a set of triplets (time, variable, value). All NaNs are dropped.

In [None]:
from tsdm.encoders.functional import (
    make_dense_triplets,
    make_masked_format,
    make_sparse_triplets,
)

In [None]:
# Convert the hourly frame `x` (zeros already replaced by NaN) to the triplet format.
# Per the section text, the result holds (time, variable, value) triplets with NaNs
# dropped; the helper's implementation is not visible in this notebook.
dense_x = make_dense_triplets(x)
dense_x

### 2.2 Sparse Triplet format

The same as before, but the variable tensor is one-hot encoded and the result is stored as a sparse tensor.

In [None]:
# Convert the hourly frame `x` to the sparse triplet format.
# Per the section text, the variable is one-hot encoded and stored sparsely; the
# helper's implementation is not visible in this notebook.
sparse_x = make_sparse_triplets(x)
sparse_x

### 2.3 Masked Format

Here we get 3 tensors:

- x: the original data
- m: a boolean mask, 1: value observed, 0: value not observed (NaN)
- d: time since the channel was last observed

In [None]:
# Bind the first output to a new name instead of overwriting `x`: the cell can then
# be re-run without feeding its own output back in, and the hourly frame `x` stays
# valid for make_figure, which reads it at call time.
x_masked, m, d = make_masked_format(x)
display(x_masked, m, d)

## 3. Visualizing the data

In [None]:
df = Electricity().dataset

# The sampling grid must be perfectly regular: every step equals the first one.
ΔT = np.diff(df.index.to_numpy())
Δt = ΔT[0].astype("timedelta64[m]")
assert (ΔT == Δt).all()

# Shape of the full frame, taken before the first year is cut off.
N, M = df.shape

# remove first year from the data (useless zeros)
span = np.timedelta64(365, "D") // Δt  # number of samples in 365 days
df = df.iloc[span:]

In [None]:
%config InlineBackend.figure_format = 'retina'
%matplotlib inline
from matplotlib import pyplot as plt

fig, ax = plt.subplots(
    nrows=4, ncols=2, figsize=(24, 9), tight_layout=True, sharex="col", sharey=True
)
ax[1, 0].set_title("1h rolling average")
ax[1, 0].set_title("24h rolling average")
ax[2, 0].set_title("7d rolling average")
selection = np.random.randint(low=0, high=M, size=5)
# selection = [319]

a = np.datetime64("2013-01-01")
b = np.datetime64("2013-02-01")
mask = (df.index >= a) & (df.index <= b)


for k, timedelta in enumerate(
    (Δt, np.timedelta64(1, "h"), np.timedelta64(24, "h"), np.timedelta64(7, "D"))
):
    for l in range(2):
        if l == 0:
            data = df.rolling(window=timedelta // Δt, min_periods=1, axis=0).mean()
        if l == 1:
            a = np.datetime64("2013-01-01")
            b = np.datetime64("2013-02-01")
            mask = (df.index >= a) & (df.index <= b)
            data = (
                df[mask].rolling(window=timedelta // Δt, min_periods=1, axis=0).mean()
            )

        for col in data.iloc[:, selection]:
            ax[k, l].plot(data.index, data[col])
        ax[k, l].set_title(f"{timedelta}-rolling average")
        ax[k, l].set_ylabel("electricity consumption in kW")