# Importing KIWI dataset

### Data Validation

Format

```python
data: tuple[
    dict[
        'description': set[str],
        'variables': list[dict[str, str]],
        'time_format': str,
        'series': dict[int, 
            dict['generating_parameters': 
                 dict[
                     'qsmax': float,
                     'qm': float,
                     'qamax': float,
                     'Yem': float,
                     'Yxsof': float,
                     'Yxa': float,
                     'Yos': float,
                     'Yoa': float,
                     'Yas': float,
                     'Kia': float,
                     'Ks': float,
                     'Ko': float,
                     'Kap': float,
                     'Kis': float,
                     'Ksa': float,
                     'Pamax': float,
                     'F0': float,
                     'mu_set': float,
                     'C_feed': float,
                     'Kp': float,
                ]
            ],
        ],
    ], 
    dict[int, DataFrame[columns=['X', 'S', 'A', 'DOTm', 'V', 'pulse', 'kLa']]]
]
```

In [None]:
%config InteractiveShell.ast_node_interactivity='last_expr_or_assign'  # always print last expr.
%config InlineBackend.figure_format = 'svg'
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [None]:
import pickle

import numpy as np
import pandas
import pandas as pd
from pandas import DataFrame, Series

## Data Encoding

Problem: Columns are encoded in wrong data-types (e.g. categoricals as int or string, floats as string, ints as floats etc.)

Solution: Sequentially figure out data types

Data Type Hierarchy:
1. String-Like (`np.flexible`)
    - strings
    - bytes
2. TimeLike types
    - Timestamp (np.datetime64)
    - Timedelta (np.timedelta64)
3. Numerical (`np.number`)
   - floating (`np.floating`)
       - float
       - complex
   - integer (`np.integer`)
       - signed
       - unsigned
4. Boolean (np.bool_)
5. Pandas special types
    - CategoricalDtype
    - DatetimeTZDtype
    - PeriodDtype
    - IntervalDtype

End goal: whenever it is appropriate to do so, perform the following conversions:

1. Get appropriate Nullable Pandas type
2. Downcast int → uint
3. Downcast int64 → int32 → int16 → int8
4. Downcast float64 → float32
5. Downcast complex128 → complex64
6. Convert to categorical datatype

In [None]:
def subdtypes(dtype):
    """Build the subclass tree rooted at ``dtype`` as nested lists.

    A class without subclasses is returned as-is; otherwise the result is
    ``[dtype, [tree_of_child_1, tree_of_child_2, ...]]`` with the children in
    the order reported by ``dtype.__subclasses__()``.
    """
    children = dtype.__subclasses__()
    if children:
        return [dtype, list(map(subdtypes, children))]
    # Leaf of the hierarchy: nothing derives from this class.
    return dtype


subdtypes(np.generic)

### Format


```python
dict[int, # run_id
    dict[int, # experiment_id
         dict[
             'metadata': DataFrame,                 # static
             'setpoints': DataFrame,                # static
             'measurements_reactor': DataFrame,     # TimeTensor
             'measurements_array': DataFrame,       # TimeTensor
             'measurements_aggregated': DataFrame,  # TimeTensor
         ]
    ]
]

```

## Raw-Data

### Stefan's key selection

In [None]:
from cross_validate_kiwi_runs import ReplicateBasedSplitter, create_replicate_dict

# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load this pickle if it comes from a trusted source.
with open("kiwi_experiments_and_run_355.pk", "rb") as f:
    experiments_per_run = pickle.load(f)

col_run_to_exp = create_replicate_dict(experiments_per_run)

splitter = ReplicateBasedSplitter()

# Every key k is used as a two-level lookup experiments_per_run[k[0]][k[1]]
# (presumably (run_id, experiment_id) -- confirm against the splitter).
# NOTE: data_train / data_test are rebound on every iteration, so after the
# loop only the last split produced by the splitter is kept.
for train_keys, test_keys in splitter.split(col_run_to_exp):
    data_train = [experiments_per_run[k[0]][k[1]] for k in train_keys]
    data_test = [experiments_per_run[k[0]][k[1]] for k in test_keys]

### Randolf's data preprocessing

In [None]:
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load this pickle if it comes from a trusted source.
with open("kiwi_experiments_and_run_355.pk", "rb") as file:
    data = pickle.load(file)

col_run_to_exp = create_replicate_dict(data)
splitter = ReplicateBasedSplitter()

# Flatten the nested mapping data[run][exp] into one record per experiment,
# tagging each record with the two keys it was stored under.
DATA = [
    (data[run][exp] | {"run_id": run, "experiment_id": exp})
    for run in data
    for exp in data[run]
]
# One row per (run_id, experiment_id); the remaining columns are whatever keys
# the per-experiment dicts carry.
DF = DataFrame(DATA).set_index(["run_id", "experiment_id"])

In [None]:
# Stack the per-experiment frames stored in each column of DF into one long
# frame per table name.
tables = {}

for key in (
    "metadata",
    "setpoints",
    "measurements_reactor",
    "measurements_array",
    "measurements_aggregated",
):
    if key != "metadata":
        # Prefix every row with the index of DF, discard the original
        # per-frame row index (level 2), then turn the prefix into columns.
        tables[key] = pd.concat(iter(DF[key]), keys=DF[key].index)
        tables[key] = tables[key].reset_index(level=2, drop=True).reset_index()
        continue
    # Metadata frames are concatenated without key prefixes; only a fresh
    # running row index is assigned.
    tables[key] = pd.concat(iter(DF[key])).reset_index(drop=True)

tables

# Preprocessing

In [None]:
def contains_no_information(series) -> bool:
    """Tell whether ``series`` has at most one distinct non-missing value.

    Missing entries are dropped first, so an all-missing or empty series also
    counts as carrying no information.
    """
    distinct_values = series.dropna().unique()
    return not len(distinct_values) > 1


def contains_nan_slice(series, slices, two_enough: bool = False) -> bool:
    """Check whether ``series`` is entirely missing on too many of ``slices``.

    Each element of ``slices`` is used to index ``series``; a slice counts as
    missing when every selected entry is NA.

    With ``two_enough=False`` a single missing slice is enough to flag the
    series.  With ``two_enough=True`` the series is only flagged when at least
    ``len(slices) - 1`` slices are missing.  A flagged series is reported on
    stdout and ``True`` is returned; otherwise ``False``.
    """
    total = len(slices)
    num_missing = sum(1 for idx in slices if pd.isna(series[idx]).all())

    if two_enough:
        flagged = num_missing >= total - 1
    else:
        flagged = num_missing > 0

    if not flagged:
        return False

    print(f"{series.name}: data missing in {num_missing}/{len(slices)} slices!")
    return True


def float_is_int(series) -> bool:
    """Tell whether every non-missing entry of ``series`` is a whole number.

    Missing entries are ignored; each remaining value is tested with
    ``float.is_integer``.
    """
    present = series.loc[pd.notna(series)]
    whole = present.apply(float.is_integer)
    return whole.all()


def is_bool(vals, name=None) -> bool:
    """Decide whether a column's distinct values encode a boolean.

    Parameters
    ----------
    vals
        The distinct, non-missing values of one column, as produced by
        ``Series.unique()`` (a NumPy array or a pandas extension array).
    name
        Optional column label; only used in the diagnostic message.

    Returns
    -------
    bool
        ``True`` for genuine boolean dtypes, for integer/float values drawn
        from ``{0, 1}`` or ``{-1, 1}``, and for string pairs such as
        ``yes/no`` (case-insensitive).  Empty input and inputs with more than
        two distinct values yield ``False``.
    """
    n_distinct = len(vals)
    if n_distinct == 0 or n_distinct > 2:
        # No evidence at all, or too many distinct values for a boolean.
        return False

    label = "<unnamed>" if name is None else name
    dtype = vals.dtype

    # pandas' dtype predicates understand both NumPy and extension dtypes;
    # np.issubdtype raises on the latter.
    if pd.api.types.is_bool_dtype(dtype):
        print(f"Boolean column                       : {label}")
        return True

    if pd.api.types.is_integer_dtype(dtype):
        kind = "integer"
    elif pd.api.types.is_float_dtype(dtype):
        kind = "float"
    else:
        kind = None

    if kind is not None:
        distinct = set(vals)
        if distinct <= {0, 1} or distinct <= {-1, 1}:
            print(f"Boolean column pretending to be {kind}: {label}")
            return True
        return False

    if all(isinstance(val, str) for val in vals):
        lowered = {val.lower() for val in vals}
        if lowered in (
            {"0", "1"},
            {"-1", "+1"},
            {"-1", "1"},
            {"t", "f"},
            {"true", "false"},
            {"y", "n"},
            {"yes", "no"},
        ):
            print(f"Boolean column pretending to be string: {label}")
            return True
    return False


def get_true_column_dtypes(table) -> dict[str, str]:
    """Placeholder for per-column dtype inference.

    NOTE(review): unfinished -- the unique non-missing values of every column
    are computed and discarded, and the function returns ``None`` despite the
    ``dict[str, str]`` annotation.
    """
    for col in table:
        series = table[col]
        mask = pd.notna(series)
        series[mask].unique()


def get_boolean_cols(df) -> set[str]:
    """Return the labels of all columns of ``df`` that look boolean.

    A column qualifies when its distinct non-missing values pass ``is_bool``.
    """
    cols = set()
    # Iterate the frame that was passed in, not a notebook-level global.
    for col in df:
        series = df[col]
        vals = series[pd.notna(series)].unique()
        if is_bool(vals, name=col):
            cols.add(col)
    return cols


def get_integer_cols(table) -> set[str]:
    """Collect the labels of columns that hold integer data.

    A column is included when its dtype is an integer dtype, or when it has a
    floating dtype and ``float_is_int`` reports that all of its non-missing
    values are whole numbers.  Every hit is reported on stdout.
    """
    found = set()
    for col in table:
        column = table[col]
        kind = column.dtype
        if np.issubdtype(kind, np.integer):
            message = f"Integer column                       : {col}"
        elif np.issubdtype(kind, np.floating) and float_is_int(column):
            message = f"Integer column pretending to be float: {col}"
        else:
            continue
        print(message)
        found.add(col)
    return found


def get_useless_cols(table, slices, strict: bool = False) -> set[str]:
    """Collect the labels of columns that are not worth keeping.

    ``run_id`` and ``experiment_id`` are never reported.  Any other column is
    reported when ``contains_no_information`` is true for it, or else when
    ``contains_nan_slice`` flags it for the given ``slices``; ``strict``
    is passed on inverted as ``two_enough``.  Each reported column is also
    announced on stdout.
    """
    identifier_cols = ("run_id", "experiment_id")
    discarded = set()
    for col in table:
        if col in identifier_cols:
            continue
        column = table[col]
        if contains_no_information(column):
            print(f"No information in      {col}")
        elif contains_nan_slice(column, slices, two_enough=(not strict)):
            print(f"Missing for some run   {col}")
        else:
            continue
        discarded.add(col)
    return discarded


def get_μ_set(s: str):
    """Split a ``"µ_set: <percent>, <value>mM IPTG"`` label into its two parts.

    Parameters
    ----------
    s
        The label text, or a missing value.

    Returns
    -------
    The input unchanged when it is missing (``pd.isna``), otherwise the tuple
    ``(percent, value)`` of stripped strings, with a trailing ``IPTG`` and
    ``mM`` removed from the value.

    Raises
    ------
    ValueError
        If the text contains no comma separating percent and value.
    """
    if pd.isna(s):
        return s
    text = s.strip()
    # Remove the literal prefix.  str.lstrip/rstrip take a *set of characters*
    # and would also eat leading/trailing letters of the payload.  Both the
    # micro sign (U+00B5) and the Greek letter mu (U+03BC) are accepted.
    for prefix in ("\u00b5_set:", "\u03bc_set:"):
        if text.startswith(prefix):
            text = text[len(prefix):]
            break
    percent, value = text.split(",", 1)
    value = value.strip().removesuffix("IPTG").strip().removesuffix("mM").strip()
    return percent.strip(), value

### MetaData Preprocessing

In [None]:
# Screen the metadata table: build one boolean row mask per run, then collect
# the columns that carry no usable information.
table = metadata = tables["metadata"]
runs = table["run_id"].dropna().unique()
run_masks = [table["run_id"] == run for run in runs]

table_columns = set(table.columns)
# Automatically detected columns plus a hand-picked list of columns to drop.
useless_cols = get_useless_cols(table, run_masks) | {
    "folder_id_y",
    "ph_Base_conc",
    "ph_Ki",
    "ph_Kp",
    "ph_Tolerance",
    "pms_id",
}
integer_cols = get_integer_cols(table)
# Trailing semicolon keeps the notebook from echoing the set.
remaining_cols = table_columns - useless_cols;

In [None]:
# Target dtype for every metadata column that is kept (first cast).
selected_columns = {
    "Feed_concentration_glc": "float32",
    "OD_Dilution": "float32",
    "bioreactor_id": "UInt32",
    "color": "string",
    "container_number": "UInt32",
    "end_time": "datetime64[ns]",
    "experiment_id": "UInt32",
    "organism_id": "UInt32",
    "pH_correction_factor": "float32",
    "profile_id": "UInt32",
    "profile_name": "string",
    "run_id": "UInt32",
    "run_name": "string",
    "start_time": "datetime64[ns]",
}

# Second cast, applied after the first one, for a subset of the columns.
categorical_columns = {
    "Feed_concentration_glc": "Int16",
    "OD_Dilution": "Float32",
    "color": "category",
    "pH_correction_factor": "Float32",
    "profile_name": "category",
    "run_name": "category",
}

# The encoding must cover exactly the columns that survived the screening.
assert (
    selected_columns.keys() >= remaining_cols
), f"Missing encoding: {remaining_cols - selected_columns.keys()}"

assert (
    selected_columns.keys() <= remaining_cols
), f"Superfluous encoding: {selected_columns.keys() - remaining_cols}"

# Every second-stage cast must refer to a selected column.
assert set(categorical_columns) <= set(
    selected_columns
), f"Superfluous encoding: {set(categorical_columns) - set(selected_columns)}"

In [None]:
# Target dtype for every metadata column that is kept (first cast).
selected_columns = {
    "experiment_id": "UInt32",
    "bioreactor_id": "UInt32",
    "container_number": "UInt32",
    "profile_id": "UInt32",
    "color": "string",
    "profile_name": "string",
    "organism_id": "UInt32",
    "run_id": "UInt32",
    "OD_Dilution": "float32",
    "Feed_concentration_glc": "float32",
    "run_name": "string",
    "pH_correction_factor": "float32",
    "start_time": "datetime64[ns]",
    "end_time": "datetime64[ns]",
}

# Second cast, applied after the first one, for a subset of the columns.
categorical_columns = {
    "pH_correction_factor": "Float32",
    "Feed_concentration_glc": "Int16",
    "profile_name": "category",
    "run_name": "category",
    "color": "category",
    "OD_Dilution": "Float32",
}

# The encoding must cover exactly the columns that survived the screening;
# condition and message both refer to remaining_cols so they cannot disagree.
assert (
    selected_columns.keys() >= remaining_cols
), f"Missing encoding: {remaining_cols - selected_columns.keys()}"

assert (
    selected_columns.keys() <= remaining_cols
), f"Superfluous encoding: {selected_columns.keys() - remaining_cols}"

# Every second-stage cast must refer to a selected column.
assert set(categorical_columns) <= set(
    selected_columns
), f"Superfluous encoding: {set(categorical_columns) - set(selected_columns)}"

In [None]:
# Keep only the selected columns.  Index with a list of labels: pandas >= 2.0
# raises a TypeError when a dict is used as an indexer.
table = table[list(selected_columns)]
# First cast to the base dtypes, then apply the second-stage dtypes.
table = table.astype(selected_columns)
table = table.astype(categorical_columns)
metadata = table.set_index(["run_id", "experiment_id"])

## Setpoint Preprocessing

In [None]:
# Screen the setpoints table: build one boolean row mask per run, then collect
# the columns that carry no usable information.
table = setpoints = tables["setpoints"]
runs = table["run_id"].dropna().unique()
run_masks = [table["run_id"] == run for run in runs]

table_columns = set(table.columns)
useless_cols = get_useless_cols(table, run_masks)
integer_cols = get_integer_cols(table)
remaining_cols = table_columns - useless_cols

In [None]:
setpoints["unit"].unique()

In [None]:
# Target dtype for every setpoints column that is kept; commented-out entries
# are columns that are deliberately left out.
selected_columns = {
    "experiment_id": "UInt32",
    "run_id": "UInt32",
    "cultivation_age": "UInt32",
    "setpoint_id": "UInt32",
    "unit": "string",
    # "Puls_AceticAcid": "Float32",
    "Puls_Glucose": "Float32",
    # "Puls_Medium": "Float32",
    "StirringSpeed": "UInt16",
    # "pH": "Float32",
    "Feed_glc_cum_setpoints": "UInt16",
    "Flow_Air": "UInt8",
    "InducerConcentration": "Float32",
    # "Flow_Nitrogen": "Float32",
    # "Flow_O2": "Float32",
    # "Feed_dextrine_cum_setpoints": "Float32",
}

# Second cast, applied after the first one.
categorical_columns = {
    "unit": "category",
}

# The encoding must cover exactly the columns that survived the screening;
# condition and message both refer to remaining_cols so they cannot disagree.
assert (
    selected_columns.keys() >= remaining_cols
), f"You forgot to check {remaining_cols - selected_columns.keys()}"

assert (
    selected_columns.keys() <= remaining_cols
), f"Superfluous {selected_columns.keys() - remaining_cols}"

# Every second-stage cast must refer to a selected column.
assert set(categorical_columns) <= set(
    selected_columns
), f"Superfluous encoding {set(categorical_columns) - set(selected_columns)}"

In [None]:
# Treat the "-" placeholder as a missing unit.
table["unit"] = table["unit"].replace(to_replace="-", value=pd.NA)
# Keep only the selected columns.  Index with a list of labels: pandas >= 2.0
# raises a TypeError when a dict is used as an indexer.
table = table[list(selected_columns)]
# First cast to the base dtypes, then apply the second-stage dtypes.
table = table.astype(selected_columns)
table = table.astype(categorical_columns)
setpoints = table.set_index(["run_id", "experiment_id"])

##  Measurements Reactor

In [None]:
# Screen the reactor measurements: build one boolean row mask per run, then
# collect the columns that carry no usable information.
table = reactor = tables["measurements_reactor"]
runs = table["run_id"].dropna().unique()
run_masks = [table["run_id"] == run for run in runs]

table_columns = set(table.columns)
useless_cols = get_useless_cols(table, run_masks)
integer_cols = get_integer_cols(table)
remaining_cols = table_columns - useless_cols

In [None]:
# Target dtype for every reactor-measurement column that is kept.
selected_columns = {
    "Acetate": "Float32",
    "Base": "Int16",
    "Cumulated_feed_volume_glucose": "Int16",
    "Cumulated_feed_volume_medium": "Float32",
    "DOT": "Float32",
    "Fluo_GFP": "Float32",
    "Glucose": "Float32",
    "InducerConcentration": "Float32",
    "OD600": "Float32",
    "Probe_Volume": "Int16",
    "Volume": "Float32",
    "experiment_id": "UInt32",
    "measurement_id": "UInt32",
    "measurement_time": "datetime64[ns]",
    "pH": "Float32",
    "run_id": "UInt32",
    "unit": "string",
}

# Second cast, applied after the first one.
categorical_columns = {
    "unit": "category",
}

# The encoding must cover exactly the columns that survived the screening;
# condition and message both refer to remaining_cols so they cannot disagree.
assert (
    selected_columns.keys() >= remaining_cols
), f"You forgot to check {remaining_cols - selected_columns.keys()}"

assert (
    selected_columns.keys() <= remaining_cols
), f"Superfluous {selected_columns.keys() - remaining_cols}"

# Every second-stage cast must refer to a selected column.
assert set(categorical_columns) <= set(
    selected_columns
), f"Superfluous encoding {set(categorical_columns) - set(selected_columns)}"

In [None]:
# Treat the "-" placeholder as a missing unit.
table["unit"] = table["unit"].replace(to_replace="-", value=pd.NA)
# Keep only the selected columns.  Index with a list of labels: pandas >= 2.0
# raises a TypeError when a dict is used as an indexer.
table = table[list(selected_columns)]
# First cast to the base dtypes, then apply the second-stage dtypes.
table = table.astype(selected_columns)
table = table.astype(categorical_columns)
reactor = table.set_index(["run_id", "experiment_id"])

## Measurements_Array

In [None]:
# Screen the array measurements: build one boolean row mask per run, then
# collect the columns that carry no usable information.
table = array = tables["measurements_array"]
runs = table["run_id"].dropna().unique()
run_masks = [table["run_id"] == run for run in runs]

table_columns = set(table.columns)
useless_cols = get_useless_cols(table, run_masks)
integer_cols = get_integer_cols(table)
remaining_cols = table_columns - useless_cols

In [None]:
# Target dtype for every array-measurement column that is kept; commented-out
# entries are columns that are deliberately left out.
selected_columns = {
    "run_id": "UInt32",
    "experiment_id": "UInt32",
    "measurement_time": "datetime64[ns]",
    "measurement_id": "UInt32",
    "unit": "string",
    "Flow_Air": "Float32",
    # "Flow_Nitrogen"      :         "float64",
    # "Flow_O2"            :         "float64",
    "StirringSpeed": "Int16",
    "Temperature": "Float32",
}

# Second cast, applied after the first one.
categorical_columns = {
    "unit": "category",
}

# The encoding must cover exactly the columns that survived the screening;
# condition and message both refer to remaining_cols so they cannot disagree.
assert (
    selected_columns.keys() >= remaining_cols
), f"You forgot to check {remaining_cols - selected_columns.keys()}"

assert (
    selected_columns.keys() <= remaining_cols
), f"Superfluous {selected_columns.keys() - remaining_cols}"

# Every second-stage cast must refer to a selected column.
assert set(categorical_columns) <= set(
    selected_columns
), f"Superfluous encoding {set(categorical_columns) - set(selected_columns)}"

In [None]:
# Treat the "-" placeholder as a missing unit.
table["unit"] = table["unit"].replace(to_replace="-", value=pd.NA)
# Keep only the selected columns.  Index with a list of labels: pandas >= 2.0
# raises a TypeError when a dict is used as an indexer.
table = table[list(selected_columns)]
# First cast to the base dtypes, then apply the second-stage dtypes.
table = table.astype(selected_columns)
table = table.astype(categorical_columns)
array = table.set_index(["run_id", "experiment_id"])

In [None]:
array.dropna(how="all").groupby(["run_id", "experiment_id", "measurement_time"]).mean()

## Measurements Aggregated

In [None]:
# Screen the aggregated measurements: build one boolean row mask per run, then
# collect the columns that carry no usable information.
table = aggregated = tables["measurements_aggregated"]
runs = table["run_id"].dropna().unique()
run_masks = [table["run_id"] == run for run in runs]

table_columns = set(table.columns)
useless_cols = get_useless_cols(table, run_masks)
integer_cols = get_integer_cols(table)
remaining_cols = table_columns - useless_cols

In [None]:
# Target dtype for every aggregated-measurement column that is kept;
# commented-out entries are columns that are deliberately left out.
selected_columns = {
    "run_id": "UInt32",
    "experiment_id": "UInt32",
    "measurement_time": "datetime64[ns]",
    "unit": "string",
    "Flow_Air": "Float32",
    # "Flow_Nitrogen"                 :          "Float32",
    # "Flow_O2"                       :          "Int32",
    "StirringSpeed": "Int16",
    "Temperature": "Float32",
    "Acetate": "Float32",
    # "Acid"                          :          "Float32",
    "Base": "Int16",
    "Cumulated_feed_volume_glucose": "Int16",
    "Cumulated_feed_volume_medium": "Float32",
    "DOT": "Float32",
    # "Fluo_CFP"                      :          "Float32",
    # "Fluo_RFP"                      :          "Float32",
    # "Fluo_YFP"                      :          "Float32",
    "Glucose": "Float32",
    "OD600": "Float32",
    "Probe_Volume": "Int16",
    "pH": "Float32",
    "Fluo_GFP": "Float32",
    "InducerConcentration": "Float32",
    # "remark"                        :           "string",
    "Volume": "Float32",
}

# Second-stage dtype for the unit column.
categorical_columns = {"unit": "category"}

# The encoding must cover exactly the columns that survived the screening;
# condition and message both refer to remaining_cols so they cannot disagree.
assert (
    selected_columns.keys() >= remaining_cols
), f"You forgot to check {remaining_cols - selected_columns.keys()}"

assert (
    selected_columns.keys() <= remaining_cols
), f"Superfluous {selected_columns.keys() - remaining_cols}"

# Every second-stage cast must refer to a selected column.
assert set(categorical_columns) <= set(
    selected_columns
), f"Superfluous encoding {set(categorical_columns) - set(selected_columns)}"

In [None]:
# Treat the "-" placeholder as a missing unit.
table["unit"] = table["unit"].replace(to_replace="-", value=pd.NA)
# Keep only the selected columns and cast them to their base dtypes.  Index
# with a list of labels: pandas >= 2.0 raises a TypeError when a dict is used
# as an indexer.  categorical_columns is intentionally not applied here.
table = table[list(selected_columns)].astype(selected_columns)
aggregated = table.set_index(["run_id", "experiment_id"])

## TimeSeries

In [None]:
# Work on a copy so that `aggregated` itself is left untouched.
table = aggregated.copy()
# The unit column is removed before averaging.
table = table.drop(columns="unit")
# Average all rows that share (run_id, experiment_id, measurement_time).
table = table.groupby(["run_id", "experiment_id", "measurement_time"]).mean()

In [None]:
# Cast the value columns to float32 while the group keys are still the index.
table = table.astype("float32")
# Move the group keys back into ordinary columns.
table = table.reset_index()
# NOTE(review): this second cast now also covers run_id, experiment_id and
# measurement_time; casting a datetime column to float32 may raise or produce
# meaningless values depending on the pandas version -- confirm it is intended.
table = table.astype("float32")

# Same screening as for the raw tables: one boolean row mask per run.
runs = table["run_id"].dropna().unique()
run_masks = [table["run_id"] == run for run in runs]

table_columns = set(table.columns)
useless_cols = get_useless_cols(table, run_masks)
integer_cols = get_integer_cols(table)
remaining_cols = table_columns - useless_cols

In [None]:
aggregated.drop(columns="unit").groupby([
    "run_id", "experiment_id", "measurement_time"
]).mean()

In [None]:
import tsdm

In [None]:
tsdm.datasets.KIWI_RUNS.metadata

In [None]:
tsdm.datasets.KIWI_RUNS.dataset

In [None]:
from tsdm.tasks import KIWI_RUNS_TASK

In [None]:
task = KIWI_RUNS_TASK()
task.split_idx

In [None]:
ts, md = task.splits((2, "test"))

In [None]:
ts

In [None]:
md

## Testing Implementation

In [None]:
from tsdm.datasets import KIWI_RUNS

In [None]:
KIWI_RUNS

In [None]:
md = KIWI_RUNS.metadata

In [None]:
# remove run_id 355
md = md.drop(355)

In [None]:
# now there are no missing values!
pandas.isna(md).sum()

In [None]:
# almost all data is highly compressible.
Series({col: len(md[col].unique()) for col in md})

In [None]:
md.dtypes

In [None]:
pd.isna(md).sum()

In [None]:
# Regarding profile_id: we see that profile_id is useless, however, since
# it is a categorical that is different for every input

In [None]:
# Number of distinct profile names in the metadata table.
len(KIWI_RUNS.metadata["profile_name"].unique())

Feed_concentration_glc             Int16 → convert to float
OD_Dilution                      Float32 → keep as-is
bioreactor_id                     UInt32 → drop / One-Hot
color                           category → drop / One-Hot
container_number                  UInt32 → drop
end_time                  datetime64[ns] → drop, but use for time-scaling
organism_id                       UInt32 → One-Hot
pH_correction_factor             Float32 → keep as-is
profile_id                        UInt32 → drop
profile_name                    category → drop
run_name                        category → drop
start_time                datetime64[ns] → drop, but use for time-scaling