In [None]:
# Echo the value of a cell's final expression or (name) assignment.
%config InteractiveShell.ast_node_interactivity='last_expr_or_assign'  # always print last expr.
# Render inline figures as SVG.
%config InlineBackend.figure_format = 'svg'
# autoreload mode 2: re-import modules before each cell runs, so edits to
# imported source files are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
# Draw matplotlib figures inline in the cell output.
%matplotlib inline

In [None]:
import matplotlib.pyplot as plt

# Global look of all figures: the "bmh" style sheet (other good choices are
# 'fivethirtyeight', 'ggplot' and 'seaborn-darkgrid'), with axis ticks and
# grid lines drawn below the plotted artists.
plt.style.use("bmh")
plt.rcParams.update({"axes.axisbelow": True})

In [None]:
import numpy as np
import pandas
import pandas as pd
from pandas import DataFrame, Series, Timedelta

from tsdm.utils.strings import snake2camel

# Seeded, so that anything drawn from `rng` is identical after
# Restart Kernel -> Run All.
rng = np.random.default_rng(seed=0)
# Called without arguments: the current numpy print options are left unchanged.
np.set_printoptions()

## Helper Functions

In [None]:
def data_overview(df: DataFrame, index_name: "str | None" = None) -> DataFrame:
    """Summarise every column of ``df`` as one row of statistics.

    The result has one row per column of ``df`` and the columns
    ``"# datapoints"`` (number of non-missing entries), ``"% missing"``,
    ``"min"``, ``"mean"``, ``"std"``, ``"max"`` (each rounded to two decimals)
    and ``"freq"``.

    ``"freq"`` is derived from ``df["timedelta"]``, read as seconds: for each
    column it is the mean of the ``timedelta`` entries on the rows where that
    column is not missing, rounded to whole seconds and rendered as a string.

    Parameters
    ----------
    df:
        Frame to summarise. Must contain a ``"timedelta"`` column.
    index_name:
        Name for the index of the result. When omitted, the index name of the
        notebook-level ``units`` table is used, so ``units`` must already be
        defined when the function is called that way.

    Returns
    -------
    DataFrame
        The overview table, indexed by the columns of ``df``.

    Raises
    ------
    KeyError
        If ``df`` has no ``"timedelta"`` column.
    """
    if "timedelta" not in df.columns:
        # Fail before any work is done, with a message that names the requirement.
        raise KeyError('a "timedelta" column (in seconds) is required to compute "freq"')
    if index_name is None:
        # Backward-compatible default: take the name from the global `units` table.
        index_name = units.index.name

    missing = pandas.isna(df)
    overview = DataFrame(index=df.columns)
    overview["# datapoints"] = (~missing).sum()
    overview["% missing"] = (missing.mean() * 100).round(2)
    overview["min"] = df.min().round(2)
    overview["mean"] = df.mean().round(2)
    overview["std"] = df.std().round(2)
    overview["max"] = df.max().round(2)

    # The conversion does not depend on the column, so it is done once.
    step = pandas.to_timedelta(df["timedelta"], unit="s")
    mean_step = {col: step[pandas.notna(df[col])].mean() for col in df}
    # Explicit dtype: a Series holding only NaT would otherwise not be
    # guaranteed to come out as timedelta64. Rounding goes through `.dt.round`,
    # since `Series.round` is the numeric round-to-decimals method.
    freq = Series(mean_step, dtype="timedelta64[ns]").dt.round("s")
    overview["freq"] = freq.dt.to_pytimedelta().astype(str)
    overview.index.name = index_name
    return overview

In [None]:
from tsdm.datasets import KIWI_RUNS

# Copies, so the edits below are made on notebook-local objects.
data = KIWI_RUNS.timeseries.copy()
units = KIWI_RUNS.units.copy()
# "s" must be a known category before it can be stored in the categorical "unit" column.
units["unit"] = units["unit"].cat.add_categories("s")
# `.at` addresses a single (row, column) cell; a new row is added with `.loc`.
units.loc["timedelta"] = "s"
units.loc["freq"] = "s"
units

In [None]:
# Seconds between each row and the previous row of the same (run, experiment).
# The values written below are plain floats, so the column starts out as float
# NaN instead of a timedelta64 column that the float assignments would not match.
data["timedelta"] = float("nan")
for run_exp in data.reset_index(level=2).index.unique():
    time = Series(data.loc[run_exp].index)
    # The first row of each group has no predecessor, so its entry stays NaN.
    data.loc[run_exp, "timedelta"] = (time.diff() / Timedelta("1s")).values
data

In [None]:
# Cast every column to float32, then collect one overview table per
# (run_id, experiment_id) group, keyed by that pair.
data = data.astype("float32")
overviews = {}
for (run_id, experiment_id), group in data.groupby(["run_id", "experiment_id"]):
    overviews[run_id, experiment_id] = data_overview(group)

In [None]:
# Stack the per-group tables; the dict keys become the two outer index levels.
omniview = pd.concat(objs=overviews, names=["run_id", "experiment_id"])

In [None]:
# Average each statistic per variable across all (run, experiment) groups.
# `freq` holds strings, so the mean is restricted to the numeric columns.
omniview.groupby("variable").mean(numeric_only=True).round(2)

In [None]:
overview = data_overview(data.reset_index(level=[0, 1], drop=True))
# Assign the "unit" column as a Series, aligned on the variable names.
# Selecting only KIWI_RUNS.timeseries.columns would leave out the "timedelta"
# row, although `units` was given an entry for it.
overview["unit"] = units["unit"]

with pd.option_context("display.float_format", "{:,.2f}".format):
    display(overview)

## Some prep work

In [None]:
from tsdm.datasets import KIWI_RUNS

# Start again from a fresh copy of the time series.
data = KIWI_RUNS.timeseries.copy()
# No copy here: whatever object `KIWI_RUNS.units` returns is used directly.
units = KIWI_RUNS.units

## Overview Raw Data

In [None]:
# Drop the two outer index levels, then summarise all rows in one table.
# NOTE(review): data_overview reads df["timedelta"]; this frame is a fresh copy
# of KIWI_RUNS.timeseries - confirm that it carries that column.
overview = data.reset_index(level=[0, 1], drop=True).pipe(data_overview)
with pd.option_context("display.float_format", "{:,.2f}".format):
    display(overview)

In [None]:
fig, axes = plt.subplots(ncols=4, nrows=4, figsize=(12, 12))
# For "runtime", histogram the step between consecutive rows instead of the raw
# values. The differenced column lives in a copy, so `data` is left unchanged
# and re-running the cell draws the same figure.
# NOTE(review): the diff runs over the whole frame, i.e. also across
# (run, experiment) boundaries - confirm that this is intended.
plot_data = data.copy()
plot_data["runtime"] = data["runtime"].diff()
# zip stops at the shorter input: at most one column per axes.
for col, ax in zip(plot_data, axes.flatten()):
    vals = plot_data[col]
    mask = pandas.notna(vals)
    ax.hist(vals[mask], bins=100, density=True)
    ax.set_title(col)
    # ax.set_xscale("symlog")
    ax.set_yscale("log")

In [None]:
# Print the negative entries of these four columns.
for col in ("OD600", "DOT", "Acetate", "Glucose"):
    print(data.loc[data[col] < 0, col])

## Overview cleaned data

In [None]:
from tsdm.datasets import KIWI_RUNS

# Load the dataset and pass every column label through snake2camel.
data = KIWI_RUNS.dataset.rename(columns=snake2camel)

In [None]:
# "runtime": hours elapsed since the earliest index value of each
# (run, experiment) group.
for run_exp in data.reset_index(level=2).index.unique():
    stamps = data.loc[run_exp].index
    data.loc[run_exp, "runtime"] = (stamps - stamps.min()) / Timedelta("1h")

# NOTE(review): data_overview reads df["timedelta"] - confirm that the frame
# built above carries that column.
overview = data.reset_index(level=[0, 1], drop=True).pipe(data_overview)
with pd.option_context("display.float_format", "{:,.2f}".format):
    display(overview)

In [None]:
fig, axes = plt.subplots(nrows=4, ncols=4, figsize=(12, 12))

# One histogram of the non-missing values per column, on a symlog x-axis.
# zip stops at the shorter input: at most one column per axes.
for ax, col in zip(axes.flat, data):
    column = data[col]
    ax.hist(column[pandas.notna(column)], bins=100)
    ax.set_title(col)
    ax.set_xscale("symlog")
    # ax.set_yscale("log")

## Overview Task data

Pretty much the same as the cleaned data, but without run 355.

In [None]:
from tsdm.tasks import KIWI_RUNS_TASK

# Task time series, with every column label passed through snake2camel.
data = KIWI_RUNS_TASK().timeseries.rename(columns=snake2camel)

# "runtime": hours elapsed since the earliest index value of each
# (run, experiment) group.
for run_exp in data.reset_index(level=2).index.unique():
    stamps = data.loc[run_exp].index
    data.loc[run_exp, "runtime"] = (stamps - stamps.min()) / Timedelta("1h")

# NOTE(review): data_overview reads df["timedelta"] - confirm that the frame
# built above carries that column.
overview = data.reset_index(level=[0, 1], drop=True).pipe(data_overview)
with pd.option_context("display.float_format", "{:,.2f}".format):
    display(overview)

In [None]:
fig, axes = plt.subplots(nrows=3, ncols=5, figsize=(16, 9))

# One histogram of the non-missing values per column, with symlog x- and log
# y-axis, no top/right frame lines, and grid lines along x only.
# zip stops at the shorter input: at most one column per axes.
for ax, col in zip(axes.flat, data):
    column = data[col]
    ax.hist(column[pandas.notna(column)], bins=50)
    ax.set_title(col)
    ax.set_xscale("symlog")
    ax.set_yscale("log")
    for side in ("top", "right"):
        ax.spines[side].set_visible(False)
    ax.grid(axis="x")