# Improved Dataset Pipeline

In [None]:
# Notebook-wide IPython configuration.
%config InteractiveShell.ast_node_interactivity='last_expr_or_assign'  # display the last expression *or assignment* of each cell
# Render inline figures as SVG.
%config InlineBackend.figure_format = 'svg'
# Reload imported modules automatically before executing code (mode 2: all modules).
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [None]:
import tsdm
from tsdm.random.stats import data_overview

In [None]:
import pickle

import pandas as pd

In [None]:
# Create the KIWI_RUNS dataset object.
# NOTE(review): initialize=False presumably skips loading/preprocessing at construction — confirm in tsdm.
ds = tsdm.datasets.KIWI_RUNS(initialize=False)

In [None]:
# Load the raw pickle that the dataset object points to.
# NOTE(review): pickle.load can execute arbitrary code; only run this on a trusted file.
# NOTE(review): `rawdata_paths` is plural but is passed to open() as one path — confirm it is a single path.
with open(ds.rawdata_paths, "rb") as file:
    ds.LOGGER.info("Loading raw data from %s", ds.rawdata_paths)
    data = pickle.load(file)

In [None]:
# Top-level keys of the raw data (labelled "run_id" when the tables are concatenated later on).
data.keys()

In [None]:
# Second-level keys of one sample run (labelled "exp_id" later on).
# NOTE(review): 439 is a hard-coded sample key and must exist in the loaded data.
data[439].keys()

In [None]:
# Names of the tables stored for one sample experiment of that run.
# NOTE(review): 439 / 15325 are hard-coded sample keys and must exist in the loaded data.
data[439][15325].keys()

In [None]:
# Display the "metadata" table of the same sample experiment.
data[439][15325]["metadata"]

## Metadata

In [None]:
# Stack the "metadata" table of every experiment into one frame.
# Each table is keyed by its (outer, inner) dictionary keys, which become
# the index levels "run_id" and "exp_id".
metadata = pd.concat(
    {
        (run_id, exp_id): tables["metadata"]
        for run_id, experiments in data.items()
        for exp_id, tables in experiments.items()
    },
    names=["run_id", "exp_id"],
)

## Setpoints

In [None]:
# Declared dtype for each column of the setpoints table.
# Three leading columns are non-float; every remaining column is float64.
column_dtypes = {
    "cultivation_age": "int64",
    "setpoint_id": "int64",
    "unit": "object",
    **dict.fromkeys(
        [
            "Feed_glc_cum_setpoints",
            "Flow_Air",
            "InducerConcentration",
            "Puls_AceticAcid",
            "Puls_Glucose",
            "Puls_Medium",
            "StirringSpeed",
            "pH",
            "Flow_Nitrogen",
            "Flow_O2",
            "Feed_dextrine_cum_setpoints",
            "Temperature",
        ],
        "float64",
    ),
}

In [None]:
# Stack the "setpoints" table of every experiment into one frame.
# Each table is keyed by its (outer, inner) dictionary keys, which become
# the index levels "run_id" and "exp_id".
setpoints = pd.concat(
    {
        (run_id, exp_id): tables["setpoints"]
        for run_id, experiments in data.items()
        for exp_id, tables in experiments.items()
    },
    names=["run_id", "exp_id"],
    verify_integrity=True,  # raise if the combined index contains duplicates
)
# Drop the last index level, keeping only ("run_id", "exp_id").
setpoints = setpoints.reset_index(level=-1, drop=True)

# Validate the schema. An element-wise `columns == [...]` comparison only
# displays a boolean array and raises an unrelated ValueError when the column
# count differs, so compare plain lists and fail loudly instead.
expected_columns = [
    "cultivation_age",
    "setpoint_id",
    "unit",
    "Feed_glc_cum_setpoints",
    "Flow_Air",
    "InducerConcentration",
    "Puls_AceticAcid",
    "Puls_Glucose",
    "Puls_Medium",
    "StirringSpeed",
    "pH",
    "Flow_Nitrogen",
    "Flow_O2",
    "Feed_dextrine_cum_setpoints",
    "Temperature",
]
assert (
    list(setpoints.columns) == expected_columns
), f"unexpected setpoints columns: {list(setpoints.columns)}"

In [None]:
# Display tsdm's overview of the concatenated setpoints table
# (the report itself is defined by tsdm.random.stats.data_overview).
data_overview(setpoints)

## Measurements Reactor

In [None]:
# Stack the "measurements_reactor" table of every experiment into one frame,
# indexed by ("run_id", "exp_id"); the last index level is dropped afterwards.
measurements_reactor = pd.concat(
    {
        (run_id, exp_id): tables["measurements_reactor"]
        for run_id, experiments in data.items()
        for exp_id, tables in experiments.items()
    },
    names=["run_id", "exp_id"],
    verify_integrity=True,  # raise if the combined index contains duplicates
).reset_index(level=-1, drop=True)

## Measurements Array

In [None]:
# Stack the "measurements_array" table of every experiment into one frame,
# indexed by ("run_id", "exp_id"); the last index level is dropped afterwards.
measurements_array = pd.concat(
    {
        (run_id, exp_id): tables["measurements_array"]
        for run_id, experiments in data.items()
        for exp_id, tables in experiments.items()
    },
    names=["run_id", "exp_id"],
    verify_integrity=True,  # raise if the combined index contains duplicates
).reset_index(level=-1, drop=True)

## Measurements Aggregated

In [None]:
# Stack the "measurements_aggregated" table of every experiment into one frame,
# indexed by ("run_id", "exp_id"); the last index level is dropped afterwards.
measurements_aggregated = pd.concat(
    {
        (run_id, exp_id): tables["measurements_aggregated"]
        for run_id, experiments in data.items()
        for exp_id, tables in experiments.items()
    },
    names=["run_id", "exp_id"],
    verify_integrity=True,  # raise if the combined index contains duplicates
).reset_index(level=-1, drop=True)