# Importing KIWI dataset

### Data Validation

Format

```python
data: tuple[
    dict[
        'description': set[str],
        'variables': list[dict[str, str]],
        'time_format': str,
        'series': dict[int, 
            dict['generating_parameters': 
                 dict[
                     'qsmax': float,
                     'qm': float,
                     'qamax': float,
                     'Yem': float,
                     'Yxsof': float,
                     'Yxa': float,
                     'Yos': float,
                     'Yoa': float,
                     'Yas': float,
                     'Kia': float,
                     'Ks': float,
                     'Ko': float,
                     'Kap': float,
                     'Kis': float,
                     'Ksa': float,
                     'Pamax': float,
                     'F0': float,
                     'mu_set': float,
                     'C_feed': float,
                     'Kp': float,
                ]
            ],
        ],
    ], 
    dict[int, DataFrame[columns=['X', 'S', 'A', 'DOTm', 'V', 'pulse', 'kLa']]]
]
```

In [None]:
%config InteractiveShell.ast_node_interactivity='last_expr_or_assign'  # always print last expr.
%config InlineBackend.figure_format = 'svg'
%load_ext autoreload
%autoreload 2
%matplotlib inline

### Format


```python
dict[int, # run_id
    dict[int, # experiment_id
         dict[
             'metadata': DataFrame,                 # static
             'setpoints': DataFrame,                # static
             'measurements_reactor': DataFrame,     # TimeTensor
             'measurements_array': DataFrame,       # TimeTensor
             'measurements_aggregated': DataFrame,  # TimeTensor
         ]
    ]
]

```

In [None]:
import pickle

import numpy as np
import pandas
import pandas as pd
from cross_validate_kiwi_runs import ReplicateBasedSplitter, create_replicate_dict
from pandas import DataFrame

# Load the pickled KIWI experiment data from the current working directory.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only open pickles that come from a trusted source.
with open("kiwi_experiments_and_run_355.pk", "rb") as file:
    data = pickle.load(file)

# Both helpers come from the project-local cross_validate_kiwi_runs module;
# their exact semantics are defined there, not in this notebook.
col_run_to_exp = create_replicate_dict(data)
splitter = ReplicateBasedSplitter()

In [None]:
# Flatten the nested {run_id: {experiment_id: record}} mapping into one row
# per experiment, tagging every record with the two ids it was stored under.
DATA = [
    {**record, "run_id": run, "experiment_id": exp}
    for run, experiments in data.items()
    for exp, record in experiments.items()
]
# One row per (run_id, experiment_id); each cell holds that experiment's table.
DF = DataFrame(DATA).set_index(["run_id", "experiment_id"])

## Raw-Data

In [None]:
# Metadata frames are simply stacked and given a fresh RangeIndex.
tables = {"metadata": pd.concat(iter(DF["metadata"])).reset_index(drop=True)}

# Every other table is stacked with the (run_id, experiment_id) index of DF as
# concat keys; the original per-frame row index (level 2) is discarded and the
# two ids are turned back into ordinary columns.
for key in (
    "setpoints",
    "measurements_reactor",
    "measurements_array",
    "measurements_aggregated",
):
    per_experiment = DF[key]
    stacked = pd.concat(iter(per_experiment), keys=per_experiment.index)
    tables[key] = stacked.reset_index(level=2, drop=True).reset_index()

tables

# Preprocessing

In [39]:
np.number?

In [32]:
def contains_no_information(series) -> bool:
    """Return True when *series* has at most one distinct non-missing value."""
    distinct_values = series.nunique(dropna=True)
    return distinct_values <= 1


def contains_nan_slice(series, slices, two_enough: bool = False) -> bool:
    """Report whether *series* is entirely missing inside too many slices.

    Args:
        series: The column to inspect.
        slices: Indexers (e.g. boolean masks) applied as ``series[idx]``.
        two_enough: When False, a single all-missing slice is enough to
            report True. When True, True is reported only if the number of
            all-missing slices is at least ``len(slices) - 1``.

    Returns:
        True (after printing a diagnostic line) when the threshold selected
        by *two_enough* is reached, otherwise False.
    """
    total = len(slices)
    num_missing = sum(1 for idx in slices if pd.isna(series[idx]).all())

    if two_enough:
        too_many = num_missing >= total - 1
    else:
        too_many = num_missing > 0

    if not too_many:
        return False

    print(f"{series.name}: data missing in {num_missing}/{len(slices)} slices!")
    return True

def float_is_int(series) -> bool:
    """Return True when every non-missing value of *series* is a whole number.

    The check is vectorized, so it also works for integer-typed and nullable
    (e.g. ``"Float32"``, ``"UInt32"``) series, where applying
    ``float.is_integer`` element-wise raises ``TypeError``.

    Args:
        series: A numeric pandas Series; missing values are ignored.

    Returns:
        True if all remaining values are finite and integral. An empty or
        all-missing series yields True. Infinite values count as non-integral.
    """
    values = series.dropna().to_numpy(dtype=float)
    # isfinite guards against +-inf, for which floor(x) == x would hold.
    return bool(np.all(np.isfinite(values) & (np.floor(values) == values)))

def is_bool(vals) -> bool:
    """Decide whether an array of values encodes a boolean variable.

    Args:
        vals: Array-like with a ``dtype`` attribute (NumPy array or pandas
            array). It is expected to hold the non-missing values of a
            column, as produced by ``series[mask].unique()``.

    Returns:
        True when
          * the dtype is boolean, or
          * there are exactly two numeric entries, all in {0, 1} or all in
            {-1, 1}, or
          * there are exactly two string entries forming (case-insensitively)
            one of the pairs 0/1, -1/+1, -1/1, t/f, true/false, y/n, yes/no.
        False otherwise.
    """
    dtype = vals.dtype
    # pandas' dtype predicates accept both NumPy and extension dtypes, whereas
    # np.issubdtype raises TypeError for the latter.
    if pd.api.types.is_bool_dtype(dtype):
        return True
    if len(vals) != 2:
        return False

    if pd.api.types.is_integer_dtype(dtype) or pd.api.types.is_float_dtype(dtype):
        numbers = np.asarray(vals, dtype=float)
        # Parentheses are required: `^`/`|` bind tighter than `==`.
        zero_one = np.all((numbers == 0) | (numbers == 1))
        sign_pair = np.all((numbers == -1) | (numbers == 1))
        return bool(zero_one or sign_pair)

    if pd.api.types.is_string_dtype(vals):
        labels = {str(val).lower() for val in vals}
        return labels in (
            {"0", "1"},
            {"-1", "+1"},
            {"-1", "1"},
            {"t", "f"},
            {"true", "false"},
            {"y", "n"},
            {"yes", "no"},
        )
    return False


def get_true_column_dtypes(table) -> dict[str, str]:
    """Infer, per column, the kind of data actually stored in *table*.

    Missing values are ignored. The label is whatever
    ``pandas.api.types.infer_dtype`` reports for the distinct non-missing
    values (e.g. ``"integer"``, ``"floating"``, ``"string"``, ``"boolean"``),
    except that a floating column whose values are all whole numbers is
    reported as ``"integer"``.

    NOTE(review): the original function was an unfinished stub that returned
    nothing; the labelling scheme here is a proposal — confirm it matches the
    intended use.

    Args:
        table: A pandas DataFrame with unique column labels.

    Returns:
        Mapping from column label to the inferred kind.
    """
    dtypes = {}
    for col in table:
        series = table[col]
        vals = series[pd.notna(series)].unique()
        kind = pd.api.types.infer_dtype(vals, skipna=True)
        # Integers stored next to NaN are upcast to float by pandas; undo that.
        if (
            kind == "floating"
            and len(vals) > 0
            and all(float(val).is_integer() for val in vals)
        ):
            kind = "integer"
        dtypes[col] = kind
    return dtypes

def get_integer_cols(df) -> set[str]:
    """Collect the columns of *df* that hold only whole numbers.

    A column qualifies when its dtype is an integer dtype, or when it is a
    floating dtype and ``float_is_int`` reports that all its non-missing
    values are integral. A line is printed for every column found.

    The columns of the *df* argument are inspected. Previously the function
    ignored its argument and read the notebook globals ``table`` and
    ``selected_columns``; pass ``table[list(selected_columns)]`` to restrict
    the check to a selection.

    Args:
        df: A pandas DataFrame.

    Returns:
        The set of qualifying column labels.
    """
    cols = set()
    for col in df:
        # pandas' predicates also understand nullable extension dtypes such as
        # "UInt32"/"Float32", for which np.issubdtype raises TypeError.
        dtype = df[col].dtype
        if pd.api.types.is_integer_dtype(dtype):
            print(f"Integer column                       : {col}")
            cols.add(col)
        elif pd.api.types.is_float_dtype(dtype) and float_is_int(df[col]):
            print(f"Integer column pretending to be float: {col}")
            cols.add(col)
    return cols

def get_useless_cols(df, strict: bool = False) -> set[str]:
    """Find columns of *df* that carry no usable information.

    A column is reported when ``contains_no_information`` is true for it, or
    when ``contains_nan_slice`` flags it across the per-run row masks. The id
    columns ``run_id`` and ``experiment_id`` are never reported. A line is
    printed for every reported column.

    The function works on the *df* argument and derives the per-run masks
    from its ``run_id`` column. Previously it ignored the argument and read
    the notebook globals ``table`` and ``run_masks``.

    Args:
        df: A pandas DataFrame that has a ``run_id`` column.
        strict: Forwarded to ``contains_nan_slice`` as
            ``two_enough=(not strict)``. With ``strict=True`` a column that is
            entirely missing for any single run is reported.

    Returns:
        The set of column labels considered useless.

    Raises:
        KeyError: If *df* has no ``run_id`` column.
    """
    run_ids = df["run_id"]
    # One boolean row mask per distinct, non-missing run id.
    masks = [run_ids == run for run in run_ids.dropna().unique()]

    useless_cols = set()
    for col in df:
        if col in ("run_id", "experiment_id"):
            continue
        column = df[col]
        if contains_no_information(column):
            print(f"No information in      {col}")
            useless_cols.add(col)
        elif contains_nan_slice(column, masks, two_enough=(not strict)):
            print(f"Missing for some run   {col}")
            useless_cols.add(col)
    return useless_cols

def get_μ_set(s: str):
    """Split a ``"µ_set: <percent>, <value> mM IPTG"`` label into its parts.

    The label prefix and the unit suffix are removed as whole substrings.
    ``str.lstrip``/``str.rstrip`` treat their argument as a *set of
    characters* and could therefore also eat leading or trailing characters
    of the payload itself.

    Args:
        s: The label text, or a missing value.

    Returns:
        *s* unchanged when it is missing (``pd.isna``), otherwise the tuple
        ``(percent, value)`` of strings.

    Raises:
        ValueError: If the text does not contain exactly one ``", "``.
    """
    if pd.isna(s):
        return s
    s = s.strip()
    # Accept both the micro sign (U+00B5) and the Greek letter mu (U+03BC).
    for prefix in ("\u00b5_set:", "\u03bc_set:"):
        if s.startswith(prefix):
            s = s[len(prefix):]
            break
    percent, s = s.strip().split(", ")
    value = s.strip()
    # Peel the unit off from the right: first "IPTG", then "mM".
    for suffix in ("IPTG", "mM"):
        value = value.removesuffix(suffix).strip()
    return percent, value

### MetaData Preprocessing

In [None]:
# `table` and `metadata` are two names for the same DataFrame object.
table = metadata = tables["metadata"]
runs = table["run_id"].dropna().unique()
# One boolean row mask per distinct run.
run_masks = [table["run_id"] == run for run in runs]

# strict=True makes the helper call contains_nan_slice with two_enough=False,
# i.e. a column is dropped as soon as it is entirely missing for any one run.
useless_cols = get_useless_cols(table, strict=True)

# drop the following specific columns
useless_cols |= {
    "folder_id_y",
    "ph_Base_conc",
    "ph_Ki",
    "ph_Kp",
    "ph_Tolerance",
    "pms_id",
}

In [None]:
# Columns of the metadata table that are kept for further processing.
metadata_columns = {
    "bioreactor_id",
    "container_number",
    "experiment_id",
    "run_id",
    "profile_id",
    "color",
    "profile_name",
    "organism_id",
    "OD_Dilution",
    "run_name",
    "start_time",
    "end_time",
}

# The manual selection must coincide exactly with the columns that survived
# screening. Each message reports the columns that make its own check fail.
remaining_cols = set(metadata.columns) - useless_cols
assert (
    metadata_columns >= remaining_cols
), f"You forgot to check {remaining_cols - metadata_columns}"
assert (
    metadata_columns <= remaining_cols
), f"Superfluous {metadata_columns - remaining_cols}"


# Target dtype for every kept column (first casting pass).
metadata_dtypes = {
    "experiment_id": "UInt32",
    "bioreactor_id": "UInt32",
    "container_number": "UInt32",
    "profile_id": "UInt32",
    "color": "string",
    "profile_name": "string",
    "organism_id": "UInt32",
    "run_id": "UInt32",
    "OD_Dilution": "float32",
    "run_name": "string",
    "start_time": "datetime64[ns]",
    "end_time": "datetime64[ns]",
}

assert metadata_columns >= set(
    metadata_dtypes
), f"Superfluous encoing  {set(metadata_dtypes) - metadata_columns}"
assert metadata_columns <= set(
    metadata_dtypes
), f"You forgot to encode {metadata_columns - set(metadata_dtypes)}"

# Second casting pass, applied after metadata_dtypes; it overrides the dtype
# of the listed columns.
metadata_categoricals = {
    "profile_name": "category",
    "run_name": "category",
    "color": "category",
    "OD_Dilution": "Float32",
}

assert metadata_columns >= set(
    metadata_categoricals
), f"Superfluous encoing {set(metadata_categoricals) - metadata_columns}"

In [None]:
# Two casting passes: base dtypes first, then the categorical overrides.
metadata = metadata.astype(metadata_dtypes)
metadata = metadata.astype(metadata_categoricals)
# pandas no longer accepts a set as an indexer, so select with a list; this
# also keeps the columns in their original, deterministic order.
metadata = metadata[[col for col in metadata.columns if col in metadata_columns]]
metadata = metadata.set_index(["run_id", "experiment_id"])

## Setpoint Preprocessing

In [None]:
# `table` and `setpoints` are two names for the same DataFrame object.
table = setpoints = tables["setpoints"]
runs = table["run_id"].dropna().unique()
# One boolean row mask per distinct run.
run_masks = [table["run_id"] == run for run in runs]

# The default strict=False makes the helper call contains_nan_slice with
# two_enough=True.
useless_cols = get_useless_cols(table)

# No further columns are dropped by hand here; show the screening result.
useless_cols

In [None]:
# All columns of the raw table, and those that survived screening.
setpoint_columns = set(setpoints.columns)
remaining_cols = setpoint_columns.difference(useless_cols)

# Columns of the setpoints table that are kept for further processing.
selected_columns = {
    "experiment_id",
    "run_id",
    "cultivation_age",
    "setpoint_id",
    "unit",
    "Puls_Glucose",
    "StirringSpeed",
    "Feed_glc_cum_setpoints",
    "Flow_Air",
    "InducerConcentration",
}

# The manual selection must coincide exactly with the surviving columns.
assert selected_columns.issuperset(
    remaining_cols
), f"You forgot to check {remaining_cols - selected_columns}"
assert selected_columns.issubset(
    remaining_cols
), f"Superfluous {selected_columns - remaining_cols}"

# Target dtypes; may list columns of the raw table that are not selected.
setpoints_dtypes = {
    "experiment_id": "UInt32",
    "run_id": "UInt32",
    "cultivation_age": "UInt32",
    "setpoint_id": "UInt32",
    "unit": "string",
    "Puls_AceticAcid": "Float32",
    "Puls_Glucose": "Float32",
    "Puls_Medium": "Float32",
    "StirringSpeed": "UInt16",
    "pH": "Float32",
    "Feed_glc_cum_setpoints": "UInt16",
    "Flow_Air": "UInt8",
    "InducerConcentration": "Float32",
    "Flow_Nitrogen": "Float32",
    "Flow_O2": "Float32",
    "Feed_dextrine_cum_setpoints": "Float32",
}

# Every encoded column must exist, and every selected column must be encoded.
assert (
    setpoints_dtypes.keys() <= setpoint_columns
), f"Superfluous encoing  {set(setpoints_dtypes) - setpoint_columns}"
assert (
    setpoints_dtypes.keys() >= selected_columns
), f"You forgot to encode {selected_columns - set(setpoints_dtypes)}"

# Second casting pass, applied after setpoints_dtypes.
setpoints_categoricals = {
    "unit": "category",
}

assert (
    setpoints_categoricals.keys() <= setpoints_dtypes.keys()
), f"Superfluous encoing {set(setpoints_categoricals) - set(setpoints_dtypes)}"

In [None]:
# A lone "-" in the unit column stands for "no unit"; turn it into a missing value.
setpoints["unit"] = setpoints["unit"].replace(to_replace="-", value=pd.NA)
# Two casting passes: base dtypes first, then the categorical overrides.
setpoints = setpoints.astype(setpoints_dtypes)
setpoints = setpoints.astype(setpoints_categoricals)
# pandas no longer accepts a set as an indexer, so select with a list; this
# also keeps the columns in their original, deterministic order.
setpoints = setpoints[[col for col in setpoints.columns if col in selected_columns]]
setpoints = setpoints.set_index(["run_id", "experiment_id"])

##  Measurements Reactor

In [34]:
# Work on the reactor measurements; `table` and `measurements_reactor` are two
# names for the same DataFrame object.
table = measurements_reactor = tables["measurements_reactor"]
# One boolean row mask per distinct, non-missing run_id.
runs = table["run_id"].dropna().unique()
run_masks = [table["run_id"] == run for run in runs]

# Column screening is delegated to the helpers defined in the preprocessing cell.
useless_cols = get_useless_cols(table)
integer_cols = get_integer_cols(table)

In [None]:
# All columns of the current table, and those not flagged as useless.
table_columns = set(table.columns)
remaining_cols = table_columns - useless_cols

In [None]:
# Columns of the reactor measurements that are kept for further processing.
selected_columns = {
    "Acetate",
    "Base",
    "Cumulated_feed_volume_glucose",
    "Cumulated_feed_volume_medium",
    "DOT",
    "Fluo_GFP",
    "Glucose",
    "InducerConcentration",
    "OD600",
    "Probe_Volume",
    "Volume",
    "experiment_id",
    "measurement_id",
    "measurement_time",
    "pH",
    "run_id",
    "unit",
}

# The manual selection must coincide exactly with the columns that survived
# screening (table_columns minus useless_cols).
surviving = table_columns.difference(useless_cols)
assert selected_columns.issuperset(
    surviving
), f"You forgot to check {remaining_cols - selected_columns}"
assert selected_columns.issubset(
    surviving
), f"Superfluous {selected_columns - remaining_cols}"

In [24]:
# Show the current dtype of every selected column, ordered by column name.
# A set is not a valid indexer in current pandas, hence the sorted list.
table[sorted(selected_columns)].dtypes.sort_index()

In [19]:
# Inspect the distinct values of the "Base" column.
table["Base"].unique()

In [None]:
# Show the dtype of every column (`dtyp` was a typo and raised AttributeError).
table.dtypes

In [None]:
reactor_dtypes = {
    "Acetate"                       : "Float32",
    "Base"                          : "Int32",
    "Cumulated_feed_volume_glucose" : "",
    "Cumulated_feed_volume_medium"  : "",
    "DOT"                           : "",
    "Fluo_GFP"                      : "",
    "Glucose"                       : "",
    "InducerConcentration"          : "",
    "OD600"                         : "",
    "Probe_Volume"                  : "",
    "Volume"                        : "",
    "experiment_id"                 : "",
    "measurement_id"                : "",
    "measurement_time"              : "",
    "pH"                            : "",
    "run_id"                        : "",
    "unit"                          : "",
}

assert (
    set(setpoints_dtypes) <= setpoint_columns
), f"Superfluous encoing  {set(setpoints_dtypes) - setpoint_columns}"
assert (
    set(setpoints_dtypes) >= selected_columns
), f"You forgot to encode {selected_columns - set(setpoints_dtypes)}"

setpoints_categoricals = {
    "unit": "category",
}

assert set(setpoints_categoricals) <= set(
    setpoints_dtypes
), f"Superfluous encoing {set(setpoints_categoricals) - set(setpoints_dtypes)}"

In [None]:
metadata = metadata.astype(metadata_dtypes)
metadata = metadata.astype(metadata_categoricals)
metadata = metadata[metadata_columns]
metadata = metadata.set_index(["run_id", "experiment_id"])

## Measurements_Array

## Measurements Aggregated