# Improved Dataset Pipeline

In [None]:
# Display the value of a cell's trailing expression *or* trailing assignment.
%config InteractiveShell.ast_node_interactivity='last_expr_or_assign'  # always print last expr.
# Render inline figures as SVG.
%config InlineBackend.figure_format = 'svg'
# Load the autoreload extension; mode 2 is meant to pick up edits to imported modules.
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [None]:
import pickle

import numpy as np
import pandas as pd
from pandas import DataFrame, Series

import tsdm
from tsdm.random.stats import data_overview

In [None]:
# fmt: off
# Target dtype of every column, keyed by table name. Each inner mapping is passed
# to `DataFrame.astype` further down, so every key must exist in the respective
# raw table. Nullable pandas dtypes ("Int32", "Float32", "string[pyarrow]") are
# used so that missing values survive the cast.
column_dtypes = {
    "metadata": {
        # "experiment_id"          : "Int32",
        "bioreactor_id"          : "Int32",
        "container_number"       : "Int32",
        "profile_id"             : "Int32",
        "starter_culture_id"     : "Int32",
        "inactivation_method_id" : "Int32",
        "description_x"          : "string[pyarrow]",
        "color"                  : "string[pyarrow]",
        "profile_name"           : "string[pyarrow]",
        "folder_id_x"            : "Int32",
        "organism_id"            : "Int32",
        "plasmid_id"             : "Int32",
        "medium_id"              : "Int32",
        "description_y"          : "string[pyarrow]",
        # "run_id"                 : "Int32",
        "Acetate_Dilution"       : "Int32",
        "Feed_concentration_glc" : "Int32",
        "InducerConcentration"   : "Int32",
        "OD_Dilution"            : "Int32",
        "Stir_Max_Restarts"      : "Int32",
        "pH_correction_factor"   : "Float32",
        "ph_Base_conc"           : "Float32",
        "ph_Ki"                  : "Float32",
        "ph_Kp"                  : "Float32",
        "ph_Tolerance"           : "Float32",
        "run_name"               : "string[pyarrow]",
        "folder_id_y"            : "Int32",
        "pms_id"                 : "Int32",
        "status_id"              : "Int32",
        "start_time"             : "datetime64[ns]",
        "end_time"               : "datetime64[ns]",
        "description"            : "string[pyarrow]",
        "conclusion"             : "string[pyarrow]",
        "container_label"        : "string[pyarrow]",
        "is_template"            : "Int32",
        "Glucose_Dilution"       : "Int32",
        "ph_Acid_conc"           : "Int32",
        # added columns
        "μ_set"                  : "Int32",
        "IPTG"                   : "Float32",
    },
    "timeseries": {
        # "measurement_time"              : "datetime64[ns]",
        "unit"                            : "string[pyarrow]",
        "Flow_Air"                        : "Int32",
        "Flow_Nitrogen"                   : "Int32",
        "Flow_O2"                         : "Int32",
        "StirringSpeed"                   : "Int32",
        "Temperature"                     : "Float32",
        "Acetate"                         : "Float32",
        "Base"                            : "Int32",
        "Cumulated_feed_volume_glucose"   : "Int32",
        "Cumulated_feed_volume_medium"    : "Float32",
        "DOT"                             : "Float32",
        "Fluo_GFP"                        : "Float32",
        "Glucose"                         : "Float32",
        "OD600"                           : "Float32",
        "Probe_Volume"                    : "Int32",
        "pH"                              : "Float32",
        "InducerConcentration"            : "Float32",
        # same pyarrow-backed string dtype as every other text column
        "remark"                          : "string[pyarrow]",
        "Volume"                          : "Float32",
        "Acid"                            : "Int32",
    },
    "setpoints" : {
        "cultivation_age"             : "Int32",
        "setpoint_id"                 : "Int32",
        "unit"                        : "string[pyarrow]",
        "Feed_glc_cum_setpoints"      : "Int32",
        "Flow_Air"                    : "Int32",
        "InducerConcentration"        : "Float32",
        "Puls_AceticAcid"             : "Int32",
        "Puls_Glucose"                : "Int32",
        "Puls_Medium"                 : "Int32",
        "StirringSpeed"               : "Int32",
        "pH"                          : "Float32",
        "Flow_Nitrogen"               : "Int32",
        "Flow_O2"                     : "Int32",
        "Feed_dextrine_cum_setpoints" : "Int32",
        "Temperature"                 : "Int32",
    },
    # The three *_features tables share one schema: unit, scale, lower and upper bound.
    "metadata_features" : {
        "unit"  : "string[pyarrow]",
        "scale" : "string[pyarrow]",
        "lower" : "Float32",
        "upper" : "Float32",
    },
    "timeseries_features" : {
        "unit"  : "string[pyarrow]",
        "scale" : "string[pyarrow]",
        "lower" : "Float32",
        "upper" : "Float32",
    },
    "setpoints_features" : {
        "unit"  : "string[pyarrow]",
        "scale" : "string[pyarrow]",
        "lower" : "Float32",
        "upper" : "Float32",
    },
}
# fmt: on

# fmt: off
# Per-table column switch: the selection cells below keep exactly the columns
# mapped to True (`[key for key, val in ... if val]`), in the order listed here,
# so reordering entries changes the column order of the resulting frames.
selected_columns = {
    "metadata" : {
        # "experiment_id"          : True,
        "bioreactor_id"          : True,
        "container_number"       : True,
        "profile_id"             : False,
        "description_x"          : False,
        "color"                  : True,
        "profile_name"           : True,
        "plasmid_id"             : True,
        # "run_id"                 : True,
        "Feed_concentration_glc" : True,
        "OD_Dilution"            : True,
        "pH_correction_factor"   : True,
        "ph_Tolerance"           : True,
        "run_name"               : False,
        "folder_id_y"            : False,
        "start_time"             : False,
        "end_time"               : False,
        "description"            : False,
        "μ_set"                  : True,
        "IPTG"                   : True,
    },
    "setpoints": {
        "cultivation_age"             : False,
        "setpoint_id"                 : False,
        "unit"                        : False,
        "Feed_glc_cum_setpoints"      : True,
        "Flow_Air"                    : True,
        "InducerConcentration"        : True,
        "Puls_AceticAcid"             : False,
        "Puls_Glucose"                : False,
        "Puls_Medium"                 : False,
        "StirringSpeed"               : True,
        "pH"                          : True,
        "Flow_Nitrogen"               : False,
        "Flow_O2"                     : False,
        "Feed_dextrine_cum_setpoints" : False,
        "Temperature"                 : False,
    },
    "timeseries" :  {
        "unit"                          : False,
        "Flow_Air"                      : True,
        "Flow_Nitrogen"                 : False,
        "Flow_O2"                       : False,
        "StirringSpeed"                 : True,
        "Temperature"                   : True,
        "Acetate"                       : True,
        "Base"                          : True,
        "Cumulated_feed_volume_glucose" : True,
        "Cumulated_feed_volume_medium"  : True,
        "DOT"                           : True,
        "Fluo_GFP"                      : True,
        "Glucose"                       : True,
        "OD600"                         : True,
        "Probe_Volume"                  : True,
        "pH"                            : True,
        "InducerConcentration"          : True,
        "remark"                        : False,
        "Volume"                        : True,
        "Acid"                          : False,
    }
}
# fmt: on

# fmt: off
# Raw description of every selected metadata column. The dict is later turned
# into a table with `DataFrame.from_dict(..., orient="index")`, one row per key,
# and its keys are asserted to equal the selected metadata columns.
# Missing entries are written as pd.NA or None.
metadata_features = {
    # column                   [unit, scale, lower bound, upper bound]
    "bioreactor_id"          : [pd.NA, "category", pd.NA, pd.NA],
    "container_number"       : [pd.NA, "category", pd.NA, pd.NA],
    "color"                  : [pd.NA, "category", pd.NA, pd.NA],
    "profile_name"           : [pd.NA, "category", pd.NA, pd.NA],
    "plasmid_id"             : [pd.NA, "category", pd.NA, pd.NA],
    "Feed_concentration_glc" : ["g/L", "absolute", pd.NA, pd.NA],
    "OD_Dilution"            : ["%",   "percent",  0,     100  ],
    "pH_correction_factor"   : [pd.NA, "factor",   0,     None ],
    "ph_Tolerance"           : [pd.NA, "linear",   0,     None ],
    "μ_set"                  : ["%",   "percent",  0,     100  ],
    "IPTG"                   : ["mM",  "absolute", 0,     None ],
}
# fmt: on

# fmt: off
# Raw description of the time series columns, one entry per column that is
# flagged True in selected_columns["timeseries"]. Converted into a table with
# `DataFrame.from_dict(..., orient="index")` in the "Timeseries Features" cell.
timeseries_features = {
    # column                          [unit, scale, lower bound, upper bound]; None = no bound given
    "Acetate"                       : ["%",      "percent",   0,   2.5      ],
    "Base"                          : ["uL",     "absolute",  0,   None     ],
    "Cumulated_feed_volume_glucose" : ["uL",     "absolute",  0,   None     ],
    "Cumulated_feed_volume_medium"  : ["uL",     "absolute",  0,   None     ],
    "DOT"                           : ["%",      "percent",   0,   100      ],
    "Flow_Air"                      : ["Ln/min", "absolute",  0,   None     ],
    "Fluo_GFP"                      : ["RFU",    "absolute",  0,   1_000_000],
    "Glucose"                       : ["g/L",    "absolute",  0,   20       ],
    "InducerConcentration"          : ["mM",     "absolute",  0,   None     ],
    "OD600"                         : ["%",      "percent",   0,   100      ],
    "Probe_Volume"                  : ["uL",     "absolute",  0,   None     ],
    "StirringSpeed"                 : ["U/min",  "absolute",  0,   None     ],
    "Temperature"                   : ["°C",     "linear",    20,  45       ],
    "Volume"                        : ["mL",     "absolute",  0,   None     ],
    "pH"                            : ["pH",     "linear",    4,   10       ],
}
# fmt: on

In [None]:
def float_is_int(series: Series) -> bool:
    r"""Check if all non-missing float values are integers.

    Parameters
    ----------
    series: Series
        Numeric series; missing entries (NaN / pd.NA) are ignored.

    Returns
    -------
    bool
        True if every remaining value is finite and has no fractional part.
        An empty or all-missing series yields True.
    """
    values = series.dropna().to_numpy(dtype="float64")
    # ±inf has no fractional part under floor(), so exclude it explicitly
    # (float.is_integer also treats it as non-integer).
    return bool(np.all(np.isfinite(values) & (values == np.floor(values))))


def get_integer_cols(table: DataFrame) -> set[str]:
    r"""Get all columns that contain only integers.

    A column qualifies if it has an integer dtype, or a float dtype whose
    non-missing values are all whole numbers. The pandas dtype predicates are
    used instead of ``np.issubdtype`` so that nullable extension dtypes
    (e.g. ``Int32``, ``Float32``, ``string[pyarrow]``) are handled instead of
    raising ``TypeError``.

    Parameters
    ----------
    table: DataFrame

    Returns
    -------
    set[str]
        Labels of the integer-valued columns.
    """
    cols = set()
    for col in table:
        dtype = table[col].dtype
        if pd.api.types.is_integer_dtype(dtype):
            # genuine integer column
            cols.add(col)
        elif pd.api.types.is_float_dtype(dtype) and float_is_int(table[col]):
            # integer column pretending to be float
            cols.add(col)
    return cols

In [None]:
# Create the KIWI_RUNS dataset object with initialize=False; only its
# `rawdata_paths` and `LOGGER` attributes are used by the following cell.
# NOTE(review): presumably initialize=False skips loading/preprocessing — confirm in tsdm.
ds = tsdm.datasets.KIWI_RUNS(initialize=False)

In [None]:
# Read the pickled raw data into `data`.
# NOTE: unpickling can execute arbitrary code — only load raw files from a trusted source.
with open(ds.rawdata_paths, mode="rb") as raw_file:
    ds.LOGGER.info("Loading raw data from %s", ds.rawdata_paths)
    data = pickle.load(raw_file)

In [None]:
# Top-level keys of the raw data; they are used as `run_id` when the tables are concatenated below.
data.keys()

In [None]:
# Keys nested under run 439; they are used as `exp_id` when the tables are concatenated below.
data[439].keys()

In [None]:
# Names of the tables stored for a single experiment (run 439, experiment 15325).
data[439][15325].keys()

In [None]:
# Peek at the raw "metadata" table of experiment 15325 in run 439 before any processing.
data[439][15325]["metadata"]

# Metadata

## Implementation

In [None]:
# Stack the "metadata" table of every experiment into one frame whose outer
# index levels are (run_id, exp_id).
metadata = {
    (outer_key, inner_key): tables["metadata"]
    for outer_key, experiment in data.items()
    for inner_key, tables in experiment.items()
}
metadata = pd.concat(metadata, names=["run_id", "exp_id"])

# generate μ_set column
# `description_x` is split on single spaces; assigning five column names below
# requires the split to yield exactly five tokens. The second token, cut at "%",
# becomes μ_set and the third token becomes the IPTG amount.
mu_sets = metadata["description_x"].str.split(" ", expand=True)
mu_sets = mu_sets.astype("string[pyarrow]")
mu_sets.columns = ["name", "percent", "amount", "unit", "chemical"]
mu_sets["percent"] = mu_sets["percent"].str.split("%", expand=True)[0]
metadata["μ_set"] = mu_sets["percent"]
metadata["IPTG"] = mu_sets["amount"]

# cast dtypes
metadata = metadata.astype(column_dtypes["metadata"])

# validate units of μ-set
mask = mu_sets["amount"].notna()
mu_set_unit = list(mu_sets["unit"].loc[mask].unique())
assert len(mu_set_unit) == 1, f"Expected a single IPTG unit, found {mu_set_unit}!"
assert mu_set_unit[0] == "mM", f"Expected IPTG unit 'mM', found {mu_set_unit[0]!r}!"

tmin = metadata["start_time"]
tmax = metadata["end_time"]
# tmin.to_parquet(self.dataset_paths["tmin"])
# tmax.to_parquet(self.dataset_paths["tmax"])

columns = [key for key, val in selected_columns["metadata"].items() if val]
metadata = metadata[columns]
# metadata.to_parquet(self.dataset_paths["metadata"])

# metadata_features
# Only convert the raw dict once: when this cell is re-run, `metadata_features`
# is already a DataFrame and `DataFrame.from_dict` would reject it.
if not isinstance(metadata_features, DataFrame):
    metadata_features = DataFrame.from_dict(
        metadata_features,
        orient="index",
        columns=list(column_dtypes["metadata_features"]),
    )
# Every selected metadata column needs a feature description and vice versa.
assert set(metadata_features.index) == set(
    metadata
), "metadata_features does not match the selected metadata columns!"
metadata_features = metadata_features.astype(column_dtypes["metadata_features"])
# metadata_features.to_parquet(self.dataset_paths["metadata_features"])

## Manual

In [None]:
# Rebuild the stacked metadata frame step by step. The "run_id" and
# "experiment_id" columns are dropped; (run_id, exp_id) are the index levels
# created by the concatenation.
metadata = pd.concat(
    {
        (run_key, exp_key): exp_tables["metadata"]
        for run_key, run_experiments in data.items()
        for exp_key, exp_tables in run_experiments.items()
    },
    names=["run_id", "exp_id"],
).drop(columns=["run_id", "experiment_id"])

# Split the description into its space-separated tokens and derive the
# μ_set (text before "%") and IPTG (amount token) columns from them.
mu_sets = (
    metadata["description_x"]
    .str.split(" ", expand=True)
    .astype("string[pyarrow]")
)
mu_sets.columns = ["name", "percent", "amount", "unit", "chemical"]
mu_sets["percent"] = mu_sets["percent"].str.split("%", expand=True)[0]
metadata["μ_set"] = mu_sets["percent"]
metadata["IPTG"] = mu_sets["amount"]

In [None]:
# Apply the declared metadata dtypes, remember the start/end timestamps of
# every experiment, and summarize the result.
metadata = metadata.astype(column_dtypes["metadata"])
tmin, tmax = metadata["start_time"], metadata["end_time"]
data_overview(metadata)

In [None]:
# Units of the two derived columns: the IPTG unit is read from the description
# tokens (it must be unique among rows that carry an amount), μ_set is a percentage.
units = {}
mask = mu_sets["amount"].notna()
mu_set_unit = list(mu_sets.loc[mask, "unit"].unique())
assert len(mu_set_unit) == 1
units.update({"IPTG": mu_set_unit[0], "μ_set": "%"})

In [None]:
# Keep only the metadata columns flagged True in `selected_columns`.
columns = [name for name, keep in selected_columns["metadata"].items() if keep]
metadata = metadata.loc[:, columns]
data_overview(metadata)

## Metadata Features

In [None]:
# Turn the raw feature description into a typed table (one row per column).
# The conversion is skipped when `metadata_features` is already a DataFrame:
# the "Implementation" cell above converts it too, and `DataFrame.from_dict`
# raises when handed a DataFrame instead of a dict.
if not isinstance(metadata_features, DataFrame):
    metadata_features = DataFrame.from_dict(
        metadata_features, orient="index", columns=column_dtypes["metadata_features"]
    )
metadata_features = metadata_features.astype(column_dtypes["metadata_features"])

In [None]:
# fmt: off
# Unit of every selected metadata column (pd.NA where none is given); the
# values repeat the "unit" entries of the raw `metadata_features` definition.
units = {
    "bioreactor_id"          : pd.NA,
    "container_number"       : pd.NA,
    "color"                  : pd.NA,
    "profile_name"           : pd.NA,
    "plasmid_id"             : pd.NA,
    "Feed_concentration_glc" : "g/L",
    "OD_Dilution"            : "%",
    "pH_correction_factor"   : pd.NA,
    "ph_Tolerance"           : pd.NA,
    "μ_set"                  : "%",
    "IPTG"                   : "mM",
}

# Empty placeholder: no bounds are filled in here.
bounds = {}
# fmt: on

# Setpoints

In [None]:
# Stack the "setpoints" table of every experiment, drop the innermost (original)
# index level, cast to the declared dtypes, treat the unit "-" as missing and
# append "cultivation_age" to the (run_id, exp_id) index.
setpoints = (
    pd.concat(
        {
            (run_key, exp_key): exp_tables["setpoints"]
            for run_key, run_experiments in data.items()
            for exp_key, exp_tables in run_experiments.items()
        },
        names=["run_id", "exp_id"],
        verify_integrity=True,
    )
    .reset_index(-1, drop=True)
    .astype(column_dtypes["setpoints"])
    .assign(unit=lambda frame: frame["unit"].replace("-", pd.NA))
    .set_index("cultivation_age", append=True)
)

### Validate units – make sure each quantity is always measured in the same unit

In [None]:
# Columns that carry setpoint values (everything except id, unit and index columns).
values = [
    "Feed_glc_cum_setpoints", "Flow_Air", "InducerConcentration",
    "Puls_AceticAcid", "Puls_Glucose", "Puls_Medium",
    "StirringSpeed", "pH",
    "Flow_Nitrogen", "Flow_O2",
    "Feed_dextrine_cum_setpoints", "Temperature",
]
# Every row must hold exactly one non-missing value among these columns.
assert (
    setpoints[values].notna().sum(axis=1).eq(1).all()
), "multiple measurements!"

### Use the column holding the non-missing value to look up each row's unit

In [None]:
# For every value column, gather the units of the rows where that column is set
# and require them to collapse to a single entry.
units = {}
for col in values:
    mask = setpoints[col].notna()
    units[col] = list(setpoints["unit"].loc[mask].unique())
    assert len(units[col]) == 1, f"Multiple different units in {col}!"

# Collapse the one-element lists into a string series; pH gets its unit by hand.
units = Series(
    {column: found[0] for column, found in units.items()},
    dtype="string[pyarrow]",
)
units.loc["pH"] = "pH"
units

In [None]:
# Keep only the setpoint columns flagged True in `selected_columns`.
columns = [name for name, keep in selected_columns["setpoints"].items() if keep]
setpoints = setpoints.loc[:, columns]

## Setpoint Features

In [None]:
# Unlike `metadata_features` and `timeseries_features`, no raw definition of
# `setpoints_features` exists in this notebook, so referencing it raised a
# NameError. Fall back to the units derived from the data above.
# TODO: scale and bounds are unknown here and left missing — fill them in by hand.
try:
    setpoints_features
except NameError:
    setpoints_features = {
        col: [units.get(col, pd.NA), pd.NA, pd.NA, pd.NA] for col in setpoints.columns
    }

# Convert only once so that the cell can be re-run.
if not isinstance(setpoints_features, DataFrame):
    setpoints_features = DataFrame.from_dict(
        setpoints_features,
        orient="index",
        columns=list(column_dtypes["setpoints_features"]),
    )
setpoints_features = setpoints_features.astype(column_dtypes["setpoints_features"])

# Timeseries

In [None]:
# Stack the "measurements_aggregated" table of every experiment, drop the
# innermost (original) index level and append "measurement_time" to the
# (run_id, exp_id) index.
measurements_aggregated = (
    pd.concat(
        {
            (run_key, exp_key): exp_tables["measurements_aggregated"]
            for run_key, run_experiments in data.items()
            for exp_key, exp_tables in run_experiments.items()
        },
        names=["run_id", "exp_id"],
        verify_integrity=True,
    )
    .reset_index(-1, drop=True)
    .set_index("measurement_time", append=True)
)

In [None]:
# Cast to the declared dtypes and treat the unit "-" as missing.
measurements_aggregated = measurements_aggregated.astype(column_dtypes["timeseries"])
# Assign the result back instead of calling `replace(..., inplace=True)` on the
# selected column: under pandas copy-on-write the chained in-place call only
# modifies a temporary and leaves the frame untouched.
measurements_aggregated["unit"] = measurements_aggregated["unit"].replace("-", pd.NA)
data_overview(measurements_aggregated, index_col="measurement_time")

## Fix units

In [None]:
# use the column of the notna values to look up unit values
# The unit column gets its own name: rebinding `values` here would clobber the
# list of setpoint columns defined in the Setpoints section and break those
# cells when they are re-run.
unit_column = measurements_aggregated["unit"]
units = {}
for col in measurements_aggregated.columns.drop("unit"):
    mask = measurements_aggregated[col].notna()
    units[col] = list(unit_column.loc[mask].unique())
    assert len(units[col]) == 1, f"Multiple different units in {col}!"

# Collapse the one-element lists into a string series, then set the units of
# these four columns by hand.
units = Series({k: v[0] for k, v in units.items()}, dtype="string[pyarrow]")
units[["Acetate", "OD600", "DOT", "pH"]] = ["%", "%", "%", "pH"]
units

In [None]:
# Count distinct values per column within every run, then flag the columns
# that take more than one distinct value in more than one run.
uniques_per_run_id = measurements_aggregated.groupby("run_id").nunique()
uniques_per_run_id.gt(1).sum().gt(1)

In [None]:
# Keep only the time series columns flagged True in `selected_columns`.
columns = [name for name, keep in selected_columns["timeseries"].items() if keep]
measurements_aggregated = measurements_aggregated.loc[:, columns]

## Aggregate overlapping measurements excluding duplicates

In [None]:
# Average all rows that share the same (run_id, exp_id, measurement_time).
(
    measurements_aggregated
    .groupby(["run_id", "exp_id", "measurement_time"])
    .agg("mean")
)

In [None]:
# Number of distinct non-missing values per column for every timestamp.
# (Computed once; the original ran the identical groupby a second time.)
uniques_per_timestamp = measurements_aggregated.groupby(
    ["run_id", "exp_id", "measurement_time"]
).nunique()
# True for timestamps where no column holds more than one distinct value.
mask = (uniques_per_timestamp <= 1).all(axis="columns")

# Inspect the rows recorded for one hard-coded example timestamp.
measurements_aggregated.loc[(439, 15325, "2020-12-09 09:48:38")]

In [None]:
# Average the rows of timestamps selected by `mask`; rows of the remaining
# timestamps are appended unchanged and the result is sorted by index.
df = measurements_aggregated.loc[mask]
df = df.groupby(["run_id", "exp_id", "measurement_time"]).mean()
pd.concat([df, measurements_aggregated.loc[~mask]]).sort_index()

In [None]:
# Show the units of the selected time series columns.
columns = [name for name, keep in selected_columns["timeseries"].items() if keep]
units.loc[columns]

## Timeseries Features

In [None]:
# Turn the raw feature description into a typed table (one row per column).
# The conversion is skipped when `timeseries_features` is already a DataFrame,
# so that re-running this cell does not hand a DataFrame to
# `DataFrame.from_dict`, which raises.
if not isinstance(timeseries_features, DataFrame):
    timeseries_features = DataFrame.from_dict(
        timeseries_features, orient="index", columns=column_dtypes["timeseries_features"]
    )
timeseries_features = timeseries_features.astype(column_dtypes["timeseries_features"])

# Measurements Reactor

In [None]:
# Stack the "measurements_reactor" table of every experiment under a
# (run_id, exp_id) index and drop the innermost (original) index level.
measurements_reactor = pd.concat(
    {
        (run_key, exp_key): exp_tables["measurements_reactor"]
        for run_key, run_experiments in data.items()
        for exp_key, exp_tables in run_experiments.items()
    },
    names=["run_id", "exp_id"],
    verify_integrity=True,
).reset_index(-1, drop=True)

# Measurements Array

In [None]:
# Stack the "measurements_array" table of every experiment under a
# (run_id, exp_id) index and drop the innermost (original) index level.
measurements_array = pd.concat(
    {
        (run_key, exp_key): exp_tables["measurements_array"]
        for run_key, run_experiments in data.items()
        for exp_key, exp_tables in run_experiments.items()
    },
    names=["run_id", "exp_id"],
    verify_integrity=True,
).reset_index(-1, drop=True)