# KIWI_RUNS Task Object

Evaluation protocol:


Loss on some values.


The following parameters are required:

- Observation Horizon : 2h (or: all)
- Forecasting Horizon : 1h
- Stride : 15 / 30 / 60
- t0, tmax




In [None]:
from tsdm.datasets import KIWI_RUNS

In [None]:
ds.clean()

In [None]:
ds = KIWI_RUNS()

In [None]:
264

In [None]:
ds.metadata

In [None]:
ds.units

In [None]:
ds.metadata

In [None]:
ds.timeseries

In [None]:
# Notebook setup (IPython magics).
%config InteractiveShell.ast_node_interactivity='last_expr_or_assign'  # always print last expr.
# Render inline figures as SVG.
%config InlineBackend.figure_format = 'svg'
# Enable the autoreload extension; mode 2 reloads all modules before a cell runs.
%load_ext autoreload
%autoreload 2
# Show matplotlib figures inline in the notebook.
%matplotlib inline

In [None]:
import pickle

import numpy as np
import pandas
import pandas as pd
from pandas import DataFrame, Series

pd.set_option("display.max_rows", 10)

In [None]:
import tsdm

ds = tsdm.datasets.KIWI_RUNS()

## Stefan's splits

In [None]:
from cross_validate_kiwi_runs import ReplicateBasedSplitter, create_replicate_dict

# Read the pickled raw data file that ships with the dataset.
with open(ds.rawdata_files, "rb") as raw_file:
    experiments_per_run = pickle.load(raw_file)

# Lookup produced by ``create_replicate_dict``; compared further below against
# the (color, run_id) -> index-entries mapping derived from the metadata.
col_run_to_exp = dict(create_replicate_dict(experiments_per_run))

splitter = ReplicateBasedSplitter()

# Materialize every (train_keys, test_keys) pair the splitter yields.
stefan_splits = [
    (train_keys, test_keys)
    for train_keys, test_keys in splitter.split(col_run_to_exp)
]

stefan_splits

In [None]:
from tsdm.datasets import KIWI_RUNS

ds = KIWI_RUNS()
metadata = ds.metadata
timeseries = ds.timeseries

In [None]:
# Map every (color, run_id) pair to the list of metadata index entries that
# carry this color within this run.
reverse_lookup = {}

for run_id in metadata.index.unique("run_id"):
    # Colors of all experiments belonging to this run.
    colors = metadata["color"][[run_id]]
    for color in colors.unique():
        # Compute the mask once and reuse it for the selection.
        mask = colors == color
        reverse_lookup[(color, run_id)] = colors[mask].index.tolist()

# Sanity check: must agree with the lookup built from the raw data file.
assert reverse_lookup == col_run_to_exp

## groupby solution

https://stackoverflow.com/a/51329888/9318372

In [None]:
# Build the (color, run_id) -> index-entries mapping directly from the groupby
# result instead of looping over runs and colors by hand.
rev = {
    key: members.tolist()
    for key, members in metadata.groupby(["color", "run_id"]).groups.items()
}
assert rev == col_run_to_exp

## Custom splitting logic

In [None]:
from sklearn.model_selection import ShuffleSplit

# Five random splits with a 25% test share; the seed is fixed so the splits
# are reproducible.
splitter = ShuffleSplit(n_splits=5, random_state=0, test_size=0.25)

# Group the experiments by (color, run_id) once; ``group_idx`` holds the
# group number of every metadata row.
groups = metadata.groupby(["color", "run_id"])
group_idx = groups.ngroup()

# One column per split, labelling every metadata row as "train" or "test".
splits = DataFrame(index=metadata.index)
# NOTE(review): ``splitter.split(groups)`` presumably takes ``len(groups)``
# (the number of groups) as the sample count, so whole groups rather than
# single rows are assigned to a side — confirm against the sklearn docs.
for i, (train, _) in enumerate(splitter.split(groups)):
    splits[i] = group_idx.isin(train).map({False: "test", True: "train"})

splits.columns.name = "split"
# Displayed only: the categorical copy is not assigned back to ``splits``.
splits.astype("string").astype("category")

## Loss function

Divide 'Glucose' by 10, 'OD600' by 20, 'DOT' by 100, 'Base' by 200, then use RMSE.

In [None]:
# Columns named in the loss description; each must exist in the timeseries.
targets = {"Glucose", "OD600", "DOT", "Base"}
assert targets.issubset(timeseries.columns)

In [None]:
timeseries.min()

In [None]:
timeseries.max()

In [None]:
timeseries.max() - timeseries.min()

In [None]:
from itertools import product

In [None]:
list(product(range(5), ("train", "test")))

In [None]:
timeseries.dtypes

In [None]:
# Index entries that split 0 assigns to the training side.
mask = splits[0].eq("train")
idx = splits[mask].index

In [None]:
timeseries.reset_index(level=2).loc[idx].set_index(["measurement_time"], append=True)

# Implementation

In [None]:
# Notebook setup for the implementation part (IPython magics).
%config InteractiveShell.ast_node_interactivity='last_expr_or_assign'  # always print last expr.
# Render inline figures as SVG.
%config InlineBackend.figure_format = 'svg'
# Enable the autoreload extension; mode 2 reloads all modules before a cell runs.
%load_ext autoreload
%autoreload 2
# Show matplotlib figures inline in the notebook.
%matplotlib inline

## Splits

In [None]:
from tsdm.tasks import KIWI_RUNS_TASK

TASK = KIWI_RUNS_TASK()

In [None]:
TASK.splits[(4, "train")]

## Preprocessing

### Encoding the metadata

```
Feed_concentration_glc             Int16 → convert to float
OD_Dilution                      Float32 → keep as-is
bioreactor_id                     UInt32 → drop / One-Hot
color                           category → drop / One-Hot
container_number                  UInt32 → drop
end_time                  datetime64[ns] → drop, but use for time-scaling
organism_id                       UInt32 → One-Hot
pH_correction_factor             Float32 → keep as-is
profile_id                        UInt32 → drop
profile_name                    category → drop
run_name                        category → drop
start_time                datetime64[ns] → drop, but use for time-scaling
```

### Consistency check: are the observation timestamps within start_time and end_time reported in metadata?

In [None]:
from tqdm.auto import tqdm

from tsdm.datasets import KIWI_RUNS

In [None]:
ds = KIWI_RUNS()
ts = ds.timeseries
md = ds.metadata

In [None]:
# Turn index level 2 into a regular column, so the frame can be sliced by the
# remaining index levels.
# NOTE: not idempotent — re-running this cell resets another index level.
ts = ts.reset_index(level=2)

# Per metadata entry: first/last observed timestamp (``ts_*``) next to the
# start/end time reported in the metadata (``md_*``).
times = DataFrame(
    index=md.index, columns=["ts_start", "ts_final", "md_start", "md_final"]
)

for idx in md.index:
    # Look the measurement times up once instead of once per statistic.
    measurement_times = ts.loc[idx, "measurement_time"]
    times.loc[idx, "ts_start"] = measurement_times.min()
    times.loc[idx, "ts_final"] = measurement_times.max()
    times.loc[idx, "md_start"] = md.loc[idx, "start_time"]
    times.loc[idx, "md_final"] = md.loc[idx, "end_time"]

times

In [None]:
# Discrepancy between the metadata time window and the observed timestamps.
discr = DataFrame(
    {
        "MD₀-TS₀": times["md_start"] - times["ts_start"],
        "MDₜ-TSₜ": times["md_final"] - times["ts_final"],
    }
)
# Express the differences in seconds, then average them per run.
one_second = np.timedelta64(1, "s")
discr = (discr / one_second).groupby("run_id").mean()
with pd.option_context("display.float_format", "{:.0f}".format):
    display(discr)

### Fixing the error

In [None]:
# Instantiate the dataset once and read both tables from the same object.
ds = KIWI_RUNS()
ts = ds.timeseries
md = ds.metadata

# Column-less view of the timeseries (index only), joined with the
# start/end time of the matching metadata entry.
merged = ts[[]].join(md[["start_time", "end_time"]])
time = merged.index.get_level_values("measurement_time")
# Keep only observations that fall inside [start_time, end_time].
cond = (merged["start_time"] <= time) & (time <= merged["end_time"])
ts[cond]

## What went wrong?

In [None]:
# Instantiate the dataset once and read both tables from the same object.
ds = KIWI_RUNS()
# Index level 2 becomes a regular column.
ts = ds.timeseries.reset_index(level=2)
md = ds.metadata

In [None]:
import matplotlib.pyplot as plt

times.loc[475, 16130]

In [None]:
exp = ts.loc[475, 16130]

In [None]:
pandas.isna(exp).mean()

In [None]:
# Time elapsed since the first row of the experiment.
timestamps = exp["measurement_time"]
T = timestamps - timestamps.iloc[0]
# Plain float arrays of the three channels that get plotted.
DOT, PH, TEMP = (
    exp[column].astype(float).values for column in ("DOT", "pH", "Temperature")
)

In [None]:
exp.set_index("measurement_time")

In [None]:
slc = exp.set_index("measurement_time").loc[:"2021-06-01"]
pandas.isna(slc).mean()

In [None]:
# Plot the three channels over elapsed time on one shared axis.
fig, ax = plt.subplots(figsize=(10, 4))
for channel in (DOT, PH, TEMP):
    ax.plot(T, channel)
ax.legend(["DOT", "pH", "Temp"])

In [None]:
# Drop index label 355 — presumably the entry that carries the missing values,
# since the check below requires the remainder to be NaN-free (TODO confirm).
md = metadata.drop(355)
has_missing = md.isna().any(axis=None)
assert not has_missing, "DataFrame contains NANs!!"

In [None]:
# Metadata columns removed before the numeric encoding.
# NOTE(review): the encoding table in the markdown above lists ``organism_id``
# as "One-Hot", but it is dropped here — confirm which one is intended.
drop_vals = [
    "bioreactor_id",
    "color",
    "container_number",
    "end_time",
    "start_time",
    "profile_id",
    "profile_name",
    "run_name",
    "organism_id",
]

In [None]:
md = md.drop(columns=drop_vals).astype("Float32")

In [None]:
# NOTE(review): redundant when the preceding cell has run — that cell already
# casts every remaining column (this one included) to "Float32".
md["Feed_concentration_glc"] = md["Feed_concentration_glc"].astype("Float32")

## Encoding in torch

In [None]:
import torch

from tsdm.encoders.functional import time2float

# Instantiate the dataset once and read both tables from the same object.
ds = KIWI_RUNS()
ts = ds.timeseries.astype("float32")
md = ds.metadata

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
dtype = torch.float32
ts = ts.reset_index(level=2)  # make measurements regular col

In [None]:
time2float(ts["measurement_time"].values)

In [None]:
# Timestamps converted to floats, and the remaining columns as a value matrix.
time_values = time2float(ts["measurement_time"].values)
channel_values = ts.drop(columns=["measurement_time"]).values

T = torch.tensor(time_values, device=device, dtype=dtype)
X = torch.tensor(channel_values, device=device, dtype=dtype)

## Creating DatasetCollection Object

In [None]:
from torch.utils.data import TensorDataset

from tsdm.datasets import DatasetCollection

In [None]:
# One TensorDataset per distinct index entry, holding the rows of T and X
# whose index equals that entry.
shared_index = ts.index.unique().values
masks = {}
datasets = {}
for idx in shared_index:
    masks[idx] = ts.index == idx
    datasets[idx] = TensorDataset(T[masks[idx]], X[masks[idx]])

In [None]:
from pandas import Series

In [None]:
s = Series(datasets)

In [None]:
dataset = DatasetCollection(datasets)

In [None]:
some_index = (439, 15325)
dataset[some_index]

In [None]:
dataset[some_index][:10]

## Creating CollectionSampler Object

In [None]:
from torch.utils.data import TensorDataset

from tsdm.random.samplers import CollectionSampler, SequenceSampler

# One shuffled sequence sampler (seq_len=100) per member of the collection;
# the collection sampler is built on top of these per-member samplers.
subsamplers = {
    name: SequenceSampler(member, seq_len=100, shuffle=True)
    for name, member in dataset.items()
}
sampler = CollectionSampler(dataset, subsamplers=subsamplers)

In [None]:
sample = next(iter(sampler))
element = dataset[sample]

In [None]:
from tqdm.auto import tqdm

for b in tqdm(sampler): ...

## DataLoader Object

In [None]:
from torch.utils.data import DataLoader

In [None]:
dloader = DataLoader(dataset, sampler=sampler, batch_size=32)

In [None]:
next(iter(dloader))

In [None]:
for batch in tqdm(dloader): ...

## Testing implemented variant

In [None]:
from tsdm.tasks import KIWI_RUNS_TASK

task = KIWI_RUNS_TASK()

In [None]:
task.dataloaders

In [None]:
dloader = task.dataloaders[(0, "train")]

In [None]:
next(iter(dloader))

In [None]:
T.shape, X.shape

In [None]:
for batch in tqdm(dloader): ...

In [None]:
for batch in tqdm(task.dataloaders[(0, "train")]): ...