# Improved Dataset Pipeline

In [None]:
%config InteractiveShell.ast_node_interactivity='last_expr_or_assign'  # display the last expression or assignment of each cell
%config InlineBackend.figure_format = 'svg'
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [None]:
import tsdm
from tsdm.random.stats import data_overview

In [None]:
import pickle

import numpy as np
import pandas as pd
from pandas import DataFrame, Series


def float_is_int(series: Series) -> bool:
    r"""Check if all non-missing float values are integers.

    Missing values (as reported by `pd.notna`) are ignored, infinite values do
    not count as integers, and a series without any non-missing value yields
    ``True``.
    """
    mask = pd.notna(series)
    # Vectorised test instead of calling ``float.is_integer`` per element; it
    # does not require the elements to be instances of Python's ``float``.
    values = series[mask].to_numpy(dtype=float)
    is_integral = np.isfinite(values) & (np.floor(values) == values)
    # Cast so that a plain ``bool`` is returned, as annotated.
    return bool(is_integral.all())


def get_integer_cols(table: DataFrame) -> set[str]:
    r"""Get all columns that contain only integers.

    A column qualifies if it has an integer dtype, or if it has a float dtype
    and `float_is_int` reports that all of its non-missing values are integral.
    The dtype checks use the pandas predicates so that extension integer dtypes
    such as ``Int32`` are recognised; ``np.issubdtype`` raises ``TypeError``
    when handed a pandas extension dtype.
    """
    is_integer_dtype = pd.api.types.is_integer_dtype
    is_float_dtype = pd.api.types.is_float_dtype

    cols = set()
    for col in table:
        column = table[col]
        if is_integer_dtype(column.dtype):
            # Integer column.
            cols.add(col)
        elif is_float_dtype(column.dtype) and float_is_int(column):
            # Integer column pretending to be float.
            cols.add(col)
    return cols

In [None]:
# Create the KIWI_RUNS dataset object.
# NOTE(review): initialize=False presumably skips automatic loading on construction — confirm against tsdm.
ds = tsdm.datasets.KIWI_RUNS(initialize=False)

In [None]:
# Load the pickled raw data from the location reported by the dataset object.
# NOTE(review): pickle.load can execute arbitrary code contained in the file;
# only use it on raw data from a trusted source.
with open(ds.rawdata_paths, "rb") as file:
    ds.LOGGER.info("Loading raw data from %s", ds.rawdata_paths)
    data = pickle.load(file)

In [None]:
# Inspect the top-level keys of the raw data.
data.keys()

In [None]:
# Inspect the second-level keys below one hard-coded top-level key.
data[439].keys()

In [None]:
# Inspect which tables are stored for one hard-coded (439, 15325) entry.
data[439][15325].keys()

In [None]:
# Display the "metadata" table of the same hard-coded entry.
data[439][15325]["metadata"]

## Metadata

In [None]:
# Gather the "metadata" table of every entry of the raw data and stack them
# under a two-level key labelled (run_id, exp_id).
metadata = pd.concat(
    {
        (run_key, exp_key): exp_tables["metadata"]
        for run_key, run in data.items()
        for exp_key, exp_tables in run.items()
    },
    names=["run_id", "exp_id"],
)

# Split "description_x" on single spaces into five named tokens.
mu_sets = metadata["description_x"].str.split(" ", expand=True)
mu_sets.columns = ["name", "percent", "amount", "unit", "chemical"]
# Keep only the part of the "percent" token that precedes the first "%".
mu_sets["percent"] = mu_sets["percent"].str.split("%", expand=True)[0]
# Store the two extracted tokens as new metadata columns.
metadata["μ_set"] = mu_sets["percent"]
metadata["IPTG"] = mu_sets["amount"]

In [None]:
# fmt: off
# Target dtype of every metadata column (nullable pandas dtypes).
column_dtypes = {
    "experiment_id"          : "Int32",
    "bioreactor_id"          : "Int32",
    "container_number"       : "Int32",
    "profile_id"             : "Int32",
    "starter_culture_id"     : "Int32",
    "inactivation_method_id" : "Int32",
    "description_x"          : "string[pyarrow]",
    "color"                  : "string[pyarrow]",
    "profile_name"           : "string[pyarrow]",
    "folder_id_x"            : "Int32",
    "organism_id"            : "Int32",
    "plasmid_id"             : "Int32",
    "medium_id"              : "Int32",
    "description_y"          : "string[pyarrow]",
    "run_id"                 : "Int32",
    "Acetate_Dilution"       : "Int32",
    "Feed_concentration_glc" : "Int32",
    "InducerConcentration"   : "Int32",
    "OD_Dilution"            : "Int32",
    "Stir_Max_Restarts"      : "Int32",
    "pH_correction_factor"   : "Float32",
    "ph_Base_conc"           : "Float32",
    "ph_Ki"                  : "Float32",
    "ph_Kp"                  : "Float32",
    "ph_Tolerance"           : "Float32",
    "run_name"               : "string[pyarrow]",
    "folder_id_y"            : "Int32",
    "pms_id"                 : "Int32",
    "status_id"              : "Int32",
    "start_time"             : "datetime64[ns]",
    "end_time"               : "datetime64[ns]",
    "description"            : "string[pyarrow]",
    "conclusion"             : "string[pyarrow]",
    "container_label"        : "string[pyarrow]",
    "is_template"            : "Int32",
    "Glucose_Dilution"       : "Int32",
    "ph_Acid_conc"           : "Int32",
    # added columns
    "μ_set"                  : "Int32",
    "IPTG"                   : "Float32",
}
# fmt: on
# Cast every listed column to its target dtype.
metadata = metadata.astype(column_dtypes)
# Keep the start/end time columns under their own names.
tmin = metadata["start_time"]
tmax = metadata["end_time"]
data_overview(metadata)

In [None]:
# fmt: off
# Switchboard of metadata columns: True keeps the column, False drops it.
selected_columns = {
    "experiment_id"          : True,
    "bioreactor_id"          : True,
    "container_number"       : True,
    "profile_id"             : False,
    "description_x"          : False,
    "color"                  : True,
    "profile_name"           : True,
    "plasmid_id"             : True,
    "run_id"                 : True,
    "Feed_concentration_glc" : True,
    "OD_Dilution"            : True,
    "pH_correction_factor"   : True,
    "ph_Tolerance"           : True,
    "run_name"               : False,
    "folder_id_y"            : False,
    "start_time"             : False,
    "end_time"               : False,
    "description"            : False,
    "μ_set"                  : True,
    "IPTG"                   : True,
}
# fmt: on
# Collapse the switchboard into the list of column names flagged True.
selected_columns = [name for name, keep in selected_columns.items() if keep]
metadata = metadata.loc[:, selected_columns]
data_overview(metadata)

## Setpoints

In [None]:
# Stack the "setpoints" table of every entry of the raw data under a
# (run_id, exp_id) key; duplicate index entries raise an error.
setpoints = pd.concat(
    {
        (run_key, exp_key): exp_tables["setpoints"]
        for run_key, run in data.items()
        for exp_key, exp_tables in run.items()
    },
    names=["run_id", "exp_id"],
    verify_integrity=True,
)
# Drop the innermost index level, keeping the run_id / exp_id levels.
setpoints = setpoints.reset_index(level=-1, drop=True)

In [None]:
# fmt: off
# Target dtype of every setpoints column (nullable pandas dtypes).
column_dtypes = {
    "cultivation_age"             : "Int32",
    "setpoint_id"                 : "Int32",
    "unit"                        : "string[pyarrow]",
    "Feed_glc_cum_setpoints"      : "Int32",
    "Flow_Air"                    : "Int32",
    "InducerConcentration"        : "Float32",
    "Puls_AceticAcid"             : "Int32",
    "Puls_Glucose"                : "Int32",
    "Puls_Medium"                 : "Int32",
    "StirringSpeed"               : "Int32",
    "pH"                          : "Float32",
    "Flow_Nitrogen"               : "Int32",
    "Flow_O2"                     : "Int32",
    "Feed_dextrine_cum_setpoints" : "Int32",
    "Temperature"                 : "Int32",
}
# fmt: on

# Per-column markers that stand for a missing value.
# NOTE(review): na_values is not read anywhere in this cell; the "-" marker is
# replaced explicitly below.
na_values = {
    "unit": ["-"],
}
# Cast every listed column to its target dtype.
setpoints = setpoints.astype(column_dtypes)
# Treat the "-" placeholder in "unit" as missing.
setpoints["unit"] = setpoints["unit"].replace("-", pd.NA)
data_overview(setpoints)

In [None]:
# fmt: off
# Switchboard of setpoints columns: True keeps the column, False drops it.
selected_columns = {
    "cultivation_age"             : True,
    "setpoint_id"                 : False,
    "unit"                        : True,
    "Feed_glc_cum_setpoints"      : True,
    "Flow_Air"                    : True,
    "InducerConcentration"        : True,
    "Puls_AceticAcid"             : False,
    "Puls_Glucose"                : False,
    "Puls_Medium"                 : False,
    "StirringSpeed"               : True,
    "pH"                          : True,
    "Flow_Nitrogen"               : False,
    "Flow_O2"                     : False,
    "Feed_dextrine_cum_setpoints" : False,
    "Temperature"                 : False,
}
# fmt: on
# Collapse the switchboard into the list of column names flagged True.
selected_columns = [name for name, keep in selected_columns.items() if keep]
setpoints = setpoints.loc[:, selected_columns]

## Measurements Reactor

In [None]:
# Stack the "measurements_reactor" table of every entry of the raw data under
# a (run_id, exp_id) key; duplicate index entries raise an error.
measurements_reactor = pd.concat(
    {
        (run_key, exp_key): exp_tables["measurements_reactor"]
        for run_key, run in data.items()
        for exp_key, exp_tables in run.items()
    },
    names=["run_id", "exp_id"],
    verify_integrity=True,
)
# Drop the innermost index level, keeping the run_id / exp_id levels.
measurements_reactor = measurements_reactor.reset_index(level=-1, drop=True)

## Measurements Array

In [None]:
# Stack the "measurements_array" table of every entry of the raw data under
# a (run_id, exp_id) key; duplicate index entries raise an error.
measurements_array = pd.concat(
    {
        (run_key, exp_key): exp_tables["measurements_array"]
        for run_key, run in data.items()
        for exp_key, exp_tables in run.items()
    },
    names=["run_id", "exp_id"],
    verify_integrity=True,
)
# Drop the innermost index level, keeping the run_id / exp_id levels.
measurements_array = measurements_array.reset_index(level=-1, drop=True)

## Measurements Aggregated

In [None]:
# Stack the "measurements_aggregated" table of every entry of the raw data
# under a (run_id, exp_id) key; duplicate index entries raise an error.
measurements_aggregated = pd.concat(
    {
        (run_key, exp_key): exp_tables["measurements_aggregated"]
        for run_key, run in data.items()
        for exp_key, exp_tables in run.items()
    },
    names=["run_id", "exp_id"],
    verify_integrity=True,
)
# Replace the innermost index level by the "measurement_time" column, so the
# index becomes (run_id, exp_id, measurement_time).
measurements_aggregated = measurements_aggregated.reset_index(
    level=-1, drop=True
).set_index("measurement_time", append=True)

In [None]:
# fmt: off
# Target dtype of every column of the aggregated measurements.
column_dtypes = {
    # "measurement_time"                : "datetime64[ns]",
    "unit"                            : "string[pyarrow]",
    "Flow_Air"                        : "Int32",
    "Flow_Nitrogen"                   : "Int32",
    "Flow_O2"                         : "Int32",
    "StirringSpeed"                   : "Int32",
    "Temperature"                     : "Float32",
    "Acetate"                         : "Float32",
    "Base"                            : "Int32",
    "Cumulated_feed_volume_glucose"   : "Int32",
    "Cumulated_feed_volume_medium"    : "Float32",
    "DOT"                             : "Float32",
    "Fluo_GFP"                        : "Float32",
    "Glucose"                         : "Float32",
    "OD600"                           : "Float32",
    "Probe_Volume"                    : "Int32",
    "pH"                              : "Float32",
    "InducerConcentration"            : "Float32",
    "remark"                          : "string",
    "Volume"                          : "Float32",
    "Acid"                            : "Int32",
}

# Per-column markers that stand for a missing value.
# NOTE(review): na_values is not read anywhere in this cell; only the "-"
# marker of "unit" is replaced below, the "remark" entry is never applied.
na_values = {
    "unit" : ["-"],
    "remark" : ["value written to setpoints has been transferred to this table."],
}
# fmt: on
# Assign the result back instead of calling replace(..., inplace=True) on the
# selected column: the in-place form acts on a temporary Series and does not
# reliably update the frame (it never does under pandas Copy-on-Write).
measurements_aggregated["unit"] = measurements_aggregated["unit"].replace("-", pd.NA)
# Cast every listed column to its target dtype.
measurements_aggregated = measurements_aggregated.astype(column_dtypes)
data_overview(measurements_aggregated, index_col="measurement_time")

In [None]:
# Number of distinct values of every column within each run_id.
uniques_per_run_id = measurements_aggregated.groupby("run_id").nunique()
# Per column: is there more than one run with more than one distinct value?
(uniques_per_run_id > 1).sum() > 1

In [None]:
# fmt: off
# Switchboard of measurement columns: True keeps the column, False drops it.
selected_columns = {
    "unit"                          : False,
    "Flow_Air"                      : True,
    "Flow_Nitrogen"                 : False,
    "Flow_O2"                       : False,
    "StirringSpeed"                 : True,
    "Temperature"                   : True,
    "Acetate"                       : True,
    "Base"                          : True,
    "Cumulated_feed_volume_glucose" : True,
    "Cumulated_feed_volume_medium"  : True,
    "DOT"                           : True,
    "Fluo_GFP"                      : True,
    "Glucose"                       : True,
    "OD600"                         : True,
    "Probe_Volume"                  : True,
    "pH"                            : True,
    "InducerConcentration"          : True,
    "remark"                        : False,
    "Volume"                        : True,
    "Acid"                          : False,
}
# Collapse the switchboard into the list of column names flagged True.
selected_columns = [name for name, keep in selected_columns.items() if keep]
# fmt: on
measurements_aggregated = measurements_aggregated[selected_columns]

## Aggregate overlapping measurements excluding duplicates

In [None]:
# Preview: mean of every column per (run_id, exp_id, measurement_time).
# The result is only displayed, not stored.
measurements_aggregated.groupby(["run_id", "exp_id", "measurement_time"]).mean()

In [None]:
# Count the distinct values of every column at each
# (run_id, exp_id, measurement_time) index entry. This is computed once; the
# same groupby/nunique used to be evaluated a second time with an identical
# result.
uniques_per_timestamp = measurements_aggregated.groupby(
    ["run_id", "exp_id", "measurement_time"]
).nunique()
# True for index entries where no column holds more than one distinct value.
mask = (uniques_per_timestamp <= 1).all(axis="columns")

# Spot-check the rows stored for one hard-coded index entry.
measurements_aggregated.loc[(439, 15325, "2020-12-09 09:48:38")]

In [None]:
# Average the rows selected by `mask` per (run_id, exp_id, measurement_time).
df = (
    measurements_aggregated.loc[mask]
    .groupby(["run_id", "exp_id", "measurement_time"])
    .mean()
)
# Append the rows not selected by `mask` unchanged and sort by index.
# The combined frame is only displayed, not stored.
pd.concat([df, measurements_aggregated.loc[~mask]]).sort_index()

In [None]:
# fmt: off
# Pair of bounds per variable; presumably (lower, upper) with None meaning
# "unbounded on that side" — TODO confirm against the code that consumes BOUNDS.
# NOTE(review): several keys ("FlowAir", "CumulatedFeedGlucose",
# "CumulatedFeedMedium", "ProbeVolume", "FluoGFP") differ from the column names
# selected earlier in this notebook ("Flow_Air", "Cumulated_feed_volume_glucose",
# "Cumulated_feed_volume_medium", "Probe_Volume", "Fluo_GFP") — verify that the
# columns are renamed before these bounds are applied.
BOUNDS = {
    "Acetate":              (0, 2.5),       # concentration like
    "Glucose":              (0, 20),        # concentration like
    "DOT":                  (0, 100),       # percent like
    "OD600":                (0, 100),       # percent like
    "FlowAir":              (0, None),      # possibly log-transform
    "Base":                 (0, None),      # possibly log-transform
    "Volume":               (0, None),      # possibly log-transform
    "CumulatedFeedGlucose": (0, None),      # possibly log-transform
    "CumulatedFeedMedium":  (0, None),      # possibly log-transform
    "ProbeVolume":          (0, None),      # possibly log-transform
    "StirringSpeed":        (0, None),      # possibly log-transform
    "InducerConcentration": (0, None),      # possibly log-transform
    "FluoGFP":              (0, 1_000_000),
    "pH":                   (4, 10),        # log scale
    "Temperature":          (20, 45),
}
# fmt: on