In [1]:
# Enable IPython's autoreload extension for this kernel session.
%load_ext autoreload
# Mode 2: reload all imported modules before each cell execution, so edits
# to local helper modules are picked up without restarting the kernel.
%autoreload 2

This notebook differs from the create agg data notebook in that we add a new parameter to the grid search,
namely the `preselection factor`.

In [2]:
import itertools
import sys
from pathlib import Path

import numpy as np
import pandas as pd
from plotly_resampler.aggregation import LTTB, MinMaxAggregator
from tqdm.auto import tqdm

sys.path.append("..")
from agg_utils.aggregators import MinMaxLTTB
from agg_utils.path_conf import dataset_dir, loc_data_dir, figure_root_dir


In [3]:
# fmt: off
# Load the series that are used for benchmarking.

# Series stored together with an explicit index column: that column becomes
# the index and a single value column is kept.
btc_series = (
    pd.read_parquet(dataset_dir / "btc_high.parquet")
    .set_index("date")["high"]
)
ball_speed_series = (
    pd.read_parquet(dataset_dir / "ball_speed.parquet")
    .set_index("ts")["v"]
)
power_series = (
    pd.read_parquet(dataset_dir / "electrical_power.parquet")
    .set_index("ts")["mf03"]
)

# Series for which only one value column is selected (index kept as stored).
cinecg = pd.read_parquet(dataset_dir / "cinecg.parquet")["ECG"]
hf_sine = pd.read_parquet(dataset_dir / "hf_sine.parquet")["sine"]
hf_noise = pd.read_parquet(dataset_dir / "hf_noise.parquet")["noise"]

# (name, series) pairs; the sine series is deliberately left out here.
data_tuple = [
    # ("sine", hf_sine),
    ("noise", hf_noise),
    ("cinecg", cinecg),
    # the first 50_000 samples of the ball-speed series are dropped
    ("ball", ball_speed_series.iloc[50_000:]),
    ("power", power_series),
    ("btc", btc_series),
]


# Serializing aggregations and corresponding references

In [4]:
# Directory in which the serialized references and aggregations are stored.
agg_data_dir = Path(figure_root_dir / "preselect_data")
# parents=True also creates missing ancestor directories, so this cell does
# not raise FileNotFoundError when `figure_root_dir` itself does not exist yet
# (e.g. on a fresh checkout); exist_ok=True keeps re-running the cell harmless.
agg_data_dir.mkdir(parents=True, exist_ok=True)


In [5]:
# fmt: off
# Grid-search parameters
n_out_grid = np.arange(200, 4001, 20)          # requested number of output samples
n_grid = [50_000, 200_000, 1_000_000]          # number of leading samples taken from each series
factor_grid = list(np.arange(1, 12, step=1))   # preselection factors 1..11

# (name, series) pairs that are aggregated
data_grid = [
    ("sine", hf_sine),
    ("noise", hf_noise),
    ("cinecg", cinecg),
    ("ball", ball_speed_series.iloc[50_000:]),
    ("power", power_series),
    ("btc", btc_series),
]


def _save_and_register(records, series, file_name, s_name, aggregator_name, n, n_out, factor):
    """Serialize `series` into `agg_data_dir` and log a metadata row in `records`.

    Parameters
    ----------
    records : list
        List to which the row
        ``[s_name, aggregator_name, n, n_out, path, factor]`` is appended.
    series : pd.Series
        The data to store; its index is written as a regular column.
    file_name : str
        Name of the parquet file that is created inside `agg_data_dir`.
    s_name : str
        Name of the dataset the series originates from.
    aggregator_name : str
        Name of the aggregator (or ``"reference"`` for the unaggregated data).
    n : int
        Number of samples that was requested from the original series.
    n_out : int or None
        Requested number of output samples (``None`` for the reference).
    factor : int or None
        Preselection factor (``None`` when no factor applies).
    """
    save_name = agg_data_dir / file_name
    series.reset_index(drop=False).to_parquet(save_name, engine="fastparquet")
    records.append([s_name, aggregator_name, n, n_out, save_name, factor])


df_list = []
for (s_name, data), n in tqdm(list(itertools.product(data_grid, n_grid)), desc="dataset x n"):
    # The reference is the raw (unaggregated) slice holding the first n samples.
    # NOTE(review): `.iloc[:n]` yields fewer than n rows when the series is
    # shorter than n, while the row is still labeled with n -- confirm that
    # every dataset holds at least max(n_grid) samples.
    reference = data.iloc[:n]
    _save_and_register(
        df_list, reference, f"{s_name}_reference_{n}.parquet",
        s_name, "reference", n, None, None,
    )

    # Baselines without a preselection factor: LTTB and MinMax at n_out samples.
    baseline_grid = list(itertools.product(n_out_grid, [LTTB, MinMaxAggregator]))
    for n_out, aggregator_class in tqdm(baseline_grid, desc=f"{s_name} n={n} baselines", leave=False):
        agg_name = aggregator_class.__name__
        s_agg = aggregator_class()._aggregate(reference, n_out)
        _save_and_register(
            df_list, s_agg, f"{s_name}_{agg_name}_{n}_{int(n_out)}.parquet",
            s_name, agg_name, n, n_out, None,
        )

    # Grid over the preselection factor.
    preselect_grid = list(itertools.product(n_out_grid, factor_grid))
    for n_out, factor in tqdm(preselect_grid, desc=f"{s_name} n={n} preselect", leave=False):
        # MinMaxLTTB, with the factor passed as `minmax_ratio`
        s_agg = MinMaxLTTB()._aggregate(reference, n_out=n_out, minmax_ratio=factor)
        _save_and_register(
            df_list, s_agg, f"{s_name}_{MinMaxLTTB.__name__}_{n}_{int(n_out)}_{factor}.parquet",
            s_name, MinMaxLTTB.__name__, n, n_out, factor,
        )
        # Also save the plain MinMax output with n_out * factor output samples;
        # the row keeps the un-multiplied n_out and stores the factor separately.
        s_agg = MinMaxAggregator()._aggregate(reference, n_out=n_out * factor)
        _save_and_register(
            df_list, s_agg, f"{s_name}_{MinMaxAggregator.__name__}_{n}_{int(n_out)}_{factor}.parquet",
            s_name, MinMaxAggregator.__name__, n, n_out, factor,
        )

# Index file that maps every stored parquet file to its grid-search parameters.
pd.DataFrame(
    df_list,
    columns=["data", "aggregator", "n", "n_out", "path", "factor"],
).to_csv(loc_data_dir / "agg_data_preselect.csv", index=False)


  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/382 [00:00<?, ?it/s]

  0%|          | 0/2101 [00:00<?, ?it/s]

  0%|          | 0/382 [00:00<?, ?it/s]

  0%|          | 0/2101 [00:00<?, ?it/s]

  0%|          | 0/382 [00:00<?, ?it/s]

  0%|          | 0/2101 [00:00<?, ?it/s]

  0%|          | 0/382 [00:00<?, ?it/s]

  0%|          | 0/2101 [00:00<?, ?it/s]

  0%|          | 0/382 [00:00<?, ?it/s]

  0%|          | 0/2101 [00:00<?, ?it/s]

  0%|          | 0/382 [00:00<?, ?it/s]

  0%|          | 0/2101 [00:00<?, ?it/s]

  0%|          | 0/382 [00:00<?, ?it/s]

  0%|          | 0/2101 [00:00<?, ?it/s]

  0%|          | 0/382 [00:00<?, ?it/s]

  0%|          | 0/2101 [00:00<?, ?it/s]

  0%|          | 0/382 [00:00<?, ?it/s]

  0%|          | 0/2101 [00:00<?, ?it/s]

  0%|          | 0/382 [00:00<?, ?it/s]

  0%|          | 0/2101 [00:00<?, ?it/s]

  0%|          | 0/382 [00:00<?, ?it/s]

  0%|          | 0/2101 [00:00<?, ?it/s]

  0%|          | 0/382 [00:00<?, ?it/s]

  0%|          | 0/2101 [00:00<?, ?it/s]

  0%|          | 0/382 [00:00<?, ?it/s]

  0%|          | 0/2101 [00:00<?, ?it/s]

  0%|          | 0/382 [00:00<?, ?it/s]

  0%|          | 0/2101 [00:00<?, ?it/s]

  0%|          | 0/382 [00:00<?, ?it/s]

  0%|          | 0/2101 [00:00<?, ?it/s]

  0%|          | 0/382 [00:00<?, ?it/s]

  0%|          | 0/2101 [00:00<?, ?it/s]

  0%|          | 0/382 [00:00<?, ?it/s]

  0%|          | 0/2101 [00:00<?, ?it/s]

  0%|          | 0/382 [00:00<?, ?it/s]

  0%|          | 0/2101 [00:00<?, ?it/s]