# Improved Dataset Pipeline

In [None]:
# Notebook-wide IPython settings: echo the value of a trailing expression or
# assignment, render inline figures as SVG, and enable the autoreload extension
# in mode 2.
%config InteractiveShell.ast_node_interactivity='last_expr_or_assign'  # always print last expr.
%config InlineBackend.figure_format = 'svg'
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [None]:
import pickle

import numpy as np
import pandas as pd
from pandas import DataFrame, Series

import tsdm
from tsdm.random.stats import data_overview

In [None]:
# fmt: off
# Target dtype of every column, grouped by table name.
# The "metadata", "timeseries" and "setpoints" entries are passed to
# DataFrame.astype when the corresponding raw table is assembled; the
# "*_features" entries describe the per-column feature tables
# (unit, scale, lower bound, upper bound).
column_dtypes = {
    "metadata": {
        # "experiment_id"          : "Int32",
        "bioreactor_id"          : "Int32",
        "container_number"       : "Int32",
        "profile_id"             : "Int32",
        "starter_culture_id"     : "Int32",
        "inactivation_method_id" : "Int32",
        "description_x"          : "string[pyarrow]",
        "color"                  : "string[pyarrow]",
        "profile_name"           : "string[pyarrow]",
        "folder_id_x"            : "Int32",
        "organism_id"            : "Int32",
        "plasmid_id"             : "Int32",
        "medium_id"              : "Int32",
        "description_y"          : "string[pyarrow]",
        # "run_id"                 : "Int32",
        "Acetate_Dilution"       : "Int32",
        "Feed_concentration_glc" : "Int32",
        "InducerConcentration"   : "Int32",
        "OD_Dilution"            : "Int32",
        "Stir_Max_Restarts"      : "Int32",
        "pH_correction_factor"   : "Float32",
        "ph_Base_conc"           : "Float32",
        "ph_Ki"                  : "Float32",
        "ph_Kp"                  : "Float32",
        "ph_Tolerance"           : "Float32",
        "run_name"               : "string[pyarrow]",
        "folder_id_y"            : "Int32",
        "pms_id"                 : "Int32",
        "status_id"              : "Int32",
        "start_time"             : "datetime64[ns]",
        "end_time"               : "datetime64[ns]",
        "description"            : "string[pyarrow]",
        "conclusion"             : "string[pyarrow]",
        "container_label"        : "string[pyarrow]",
        "is_template"            : "Int32",
        "Glucose_Dilution"       : "Int32",
        "ph_Acid_conc"           : "Int32",
        # added columns
        "μ_set"                  : "Int32",
        "IPTG"                   : "Float32",
    },
    "timeseries": {
        # "measurement_time"              : "datetime64[ns]",
        "unit"                            : "string[pyarrow]",
        "Flow_Air"                        : "Int32",
        "Flow_Nitrogen"                   : "Int32",
        "Flow_O2"                         : "Int32",
        "StirringSpeed"                   : "Int32",
        "Temperature"                     : "Float32",
        "Acetate"                         : "Float32",
        "Base"                            : "Int32",
        "Cumulated_feed_volume_glucose"   : "Int32",
        "Cumulated_feed_volume_medium"    : "Float32",
        "DOT"                             : "Float32",
        "Fluo_GFP"                        : "Float32",
        "Glucose"                         : "Float32",
        "OD600"                           : "Float32",
        "Probe_Volume"                    : "Int32",
        "pH"                              : "Float32",
        "InducerConcentration"            : "Float32",
        # NOTE(review): the only entry using plain "string" instead of
        # "string[pyarrow]" — confirm this is intended.
        "remark"                          : "string",
        "Volume"                          : "Float32",
        "Acid"                            : "Int32",
    },
    "setpoints" : {
        "cultivation_age"             : "Int32",
        "setpoint_id"                 : "Int32",
        "unit"                        : "string[pyarrow]",
        "Feed_glc_cum_setpoints"      : "Int32",
        "Flow_Air"                    : "Int32",
        "InducerConcentration"        : "Float32",
        "Puls_AceticAcid"             : "Int32",
        "Puls_Glucose"                : "Int32",
        "Puls_Medium"                 : "Int32",
        "StirringSpeed"               : "Int32",
        "pH"                          : "Float32",
        "Flow_Nitrogen"               : "Int32",
        "Flow_O2"                     : "Int32",
        "Feed_dextrine_cum_setpoints" : "Int32",
        "Temperature"                 : "Int32",
    },
    "metadata_features" : {
        "unit"  : "string[pyarrow]",
        "scale" : "string[pyarrow]",
        "lower" : "Float32",
        "upper" : "Float32",
    },
    "timeseries_features" : {
        "unit"  : "string[pyarrow]",
        "scale" : "string[pyarrow]",
        "lower" : "Float32",
        "upper" : "Float32",
    },
    "setpoints_features" : {
        "unit"  : "string[pyarrow]",
        "scale" : "string[pyarrow]",
        "lower" : "Float32",
        "upper" : "Float32",
    },
}
# fmt: on

# fmt: off
# Per-table switchboard: True keeps a column, False drops it.
# The "metadata" and "timeseries" entries are read by the "Select columns" cells.
# NOTE(review): the "setpoints" entry is not read by any cell visible in this
# notebook — confirm whether a selection step for setpoints is missing.
selected_columns = {
    "metadata" : {
        # "experiment_id"          : True,
        "bioreactor_id"          : True,
        "container_number"       : True,
        "profile_id"             : False,
        "description_x"          : False,
        "color"                  : True,
        "profile_name"           : True,
        "plasmid_id"             : True,
        # "run_id"                 : True,
        "Feed_concentration_glc" : True,
        "OD_Dilution"            : True,
        "pH_correction_factor"   : True,
        "ph_Tolerance"           : True,
        "run_name"               : False,
        "folder_id_y"            : False,
        "start_time"             : False,
        "end_time"               : False,
        "description"            : False,
        "μ_set"                  : True,
        "IPTG"                   : True,
    },
    "setpoints": {
        "cultivation_age"             : False,
        "setpoint_id"                 : False,
        "unit"                        : False,
        "Feed_glc_cum_setpoints"      : True,
        "Flow_Air"                    : True,
        "InducerConcentration"        : True,
        "Puls_AceticAcid"             : False,
        "Puls_Glucose"                : False,
        "Puls_Medium"                 : False,
        "StirringSpeed"               : True,
        "pH"                          : True,
        "Flow_Nitrogen"               : False,
        "Flow_O2"                     : False,
        "Feed_dextrine_cum_setpoints" : False,
        "Temperature"                 : False,
    },
    "timeseries" :  {
        "unit"                          : False,
        "Flow_Air"                      : True,
        "Flow_Nitrogen"                 : False,
        "Flow_O2"                       : False,
        "StirringSpeed"                 : True,
        "Temperature"                   : True,
        "Acetate"                       : True,
        "Base"                          : True,
        "Cumulated_feed_volume_glucose" : True,
        "Cumulated_feed_volume_medium"  : True,
        "DOT"                           : True,
        "Fluo_GFP"                      : True,
        "Glucose"                       : True,
        "OD600"                         : True,
        "Probe_Volume"                  : True,
        "pH"                            : True,
        "InducerConcentration"          : True,
        "remark"                        : False,
        "Volume"                        : False,
        "Acid"                          : False,
    }
}
# fmt: on

In [None]:
def float_is_int(series: Series) -> bool:
    r"""Check if all non-missing float values are whole numbers.

    Missing entries (``NaN`` / ``pd.NA``) are ignored, so an empty or
    all-missing series counts as integer-valued. Infinite values are never
    considered integers.

    Parameters
    ----------
    series: Series
        Column with a floating dtype (NumPy or nullable pandas float).

    Returns
    -------
    bool
        ``True`` if every remaining value equals its own floor.
    """
    present = series[pd.notna(series)]
    # Vectorised check instead of one ``float.is_integer`` call per element.
    values = present.to_numpy(dtype=float)
    # ``isfinite`` is needed because ``floor(inf) == inf`` would otherwise pass.
    whole = np.isfinite(values) & (values == np.floor(values))
    # Convert numpy's bool_ so the return value matches the annotation.
    return bool(whole.all())


def get_integer_cols(table: DataFrame) -> set[str]:
    r"""Get all columns that contain only integers.

    A column qualifies if its dtype is an integer dtype, or if it has a
    floating dtype and every non-missing value is a whole number.

    The dtype checks go through ``pandas.api.types`` because ``np.issubdtype``
    raises ``TypeError`` on pandas extension dtypes such as ``"Int32"``,
    ``"Float32"`` or ``"string[pyarrow]"``.

    Parameters
    ----------
    table: DataFrame
        Table whose columns are inspected.

    Returns
    -------
    set[str]
        Labels of the integer-valued columns.
    """
    cols = set()
    for col in table:
        dtype = table[col].dtype
        if pd.api.types.is_integer_dtype(dtype):
            # genuine integer column
            cols.add(col)
        elif pd.api.types.is_float_dtype(dtype) and float_is_int(table[col]):
            # integer column pretending to be float
            cols.add(col)
    return cols

In [None]:
# Dataset handle; only its `rawdata_paths` and `LOGGER` attributes are used below.
# NOTE(review): `initialize=False` presumably skips the regular loading so the raw
# pickle can be processed by hand in this notebook — confirm.
ds = tsdm.datasets.KIWI_RUNS(initialize=False)

In [None]:
# Read the raw pickled payload of the dataset into `data`.
# NOTE(security): pickle.load can execute arbitrary code — only run this on a
# raw-data file obtained from a trusted source.
with open(ds.rawdata_paths, "rb") as file:
    ds.LOGGER.info("Loading raw data from %s", ds.rawdata_paths)
    data = pickle.load(file)

In [None]:
# Outer keys of the raw payload (used as run_id further down).
data.keys()

In [None]:
# Inner keys of one outer entry (used as experiment_id further down).
data[439].keys()

In [None]:
# Names of the tables stored for a single (run, experiment) pair.
data[439][15325].keys()

In [None]:
# Peek at the raw metadata table of that pair.
data[439][15325]["metadata"]

# Metadata

## Generate DataFrame

In [None]:
# Stack the "metadata" table of every (run, experiment) pair into one frame;
# the dictionary keys become the two outer index levels.
metadata = pd.concat(
    {
        (run_key, experiment_key): tables["metadata"]
        for run_key, experiments in data.items()
        for experiment_key, tables in experiments.items()
    },
    names=["run_id", "experiment_id"],
)
# Discard the innermost (per-table) index level, then the columns that carry
# the same names as the index levels.
metadata = metadata.reset_index(-1, drop=True).drop(
    columns=["run_id", "experiment_id"]
)

### Generate μ-set columns

In [None]:
# Break the free-text description into its space-separated fields.
# assumes no description has more than five fields — TODO confirm
mu_sets = (
    metadata["description_x"]
    .str.split(" ", expand=True)
    .astype("string[pyarrow]")
)
mu_sets.columns = ["name", "percent", "amount", "unit", "chemical"]

# Keep only the text in front of the "%" sign.
mu_sets["percent"] = mu_sets["percent"].str.split("%", expand=True)[0]

# Expose the two parsed fields as additional metadata columns.
metadata["μ_set"] = mu_sets["percent"]
metadata["IPTG"] = mu_sets["amount"]

### Fix datatypes

In [None]:
# Cast every metadata column to the dtype declared in column_dtypes["metadata"]
# (this also converts the freshly added string columns "μ_set" and "IPTG"),
# then display the overview produced by tsdm's data_overview helper.
metadata = metadata.astype(column_dtypes["metadata"])
data_overview(metadata)

### Select columns

In [None]:
# Remember each experiment's time window before those columns are dropped;
# the time-bounds filter of the timeseries section needs them.
tmin, tmax = metadata["start_time"], metadata["end_time"]

# Keep only the columns flagged True in selected_columns["metadata"].
columns = [name for name, keep in selected_columns["metadata"].items() if keep]
metadata = metadata[columns]

## Metadata Features

In [None]:
# Derive the units of the two parsed columns.
units = {}
# rows for which an amount could be parsed from the description
mask = mu_sets["amount"].notna()
mu_set_unit = list(mu_sets["unit"].loc[mask].unique())
# all parsed amounts must share one and the same unit label
assert len(mu_set_unit) == 1
units["IPTG"] = mu_set_unit[0]
units["μ_set"] = "%"
units

In [None]:
# fmt: off
# Feature description of every selected metadata column: one row per column
# holding its unit, scale type and admissible [lower, upper] range.
# pd.NA marks "not applicable".
metadata_features = {
    # column                   [unit, scale, lower bound, upper bound]
    "bioreactor_id"          : [pd.NA, "category", pd.NA, pd.NA ],
    "container_number"       : [pd.NA, "category", pd.NA, pd.NA ],
    "color"                  : [pd.NA, "category", pd.NA, pd.NA ],
    "profile_name"           : [pd.NA, "category", pd.NA, pd.NA ],
    "plasmid_id"             : [pd.NA, "category", pd.NA, pd.NA ],
    "Feed_concentration_glc" : ["g/L", "absolute", pd.NA, pd.NA ],
    "OD_Dilution"            : ["%",   "percent",  0,     100   ],
    "pH_correction_factor"   : [pd.NA, "factor",   0,     np.inf],
    "ph_Tolerance"           : [pd.NA, "linear",   0,     np.inf],
    "μ_set"                  : ["%",   "percent",  0,     100   ],
    "IPTG"                   : ["mM",  "absolute", 0,     np.inf],
}
# fmt: on

# The dtype mapping doubles as the list of column names: passing the dict as
# `columns` uses its keys ("unit", "scale", "lower", "upper").
metadata_features = DataFrame.from_dict(
    metadata_features, orient="index", columns=column_dtypes["metadata_features"]
)
metadata_features = metadata_features.astype(column_dtypes["metadata_features"])

## Remove values out of bounds

In [None]:
# Replace every metadata value that lies outside the [lower, upper] range
# declared for its column in metadata_features by pd.NA.
for col in metadata:
    lower = metadata_features.loc[col, "lower"]
    upper = metadata_features.loc[col, "upper"]
    value = metadata[col]
    # True where the value violates one of the two bounds
    mask = (lower > value) | (value > upper)
    if mask.any():
        print(f"Removing {mask.mean():.2%} of data that does not match {col} bounds")
        metadata.loc[mask, col] = pd.NA

## Finalize Tables

In [None]:
# Drop rows in which every column is missing, then write the metadata table and
# its feature description as parquet files into the current working directory.
metadata = metadata.dropna(how="all")
metadata.to_parquet("metadata.parquet")
metadata_features.to_parquet("metadata_features.parquet")

# Timeseries

## Generate DataFrame

In [None]:
# Collect the "measurements_aggregated" table of every (run, experiment) pair.
timeseries = {
    (outer_key, inner_key): tables["measurements_aggregated"]
    for outer_key, experiment in data.items()
    for inner_key, tables in experiment.items()
}

# Stack them; the dictionary keys become the index levels run_id / experiment_id.
timeseries = pd.concat(
    timeseries, names=["run_id", "experiment_id"], verify_integrity=True
)
# Drop the per-table row index and index by measurement time instead.
timeseries = timeseries.reset_index(-1, drop=True)
timeseries = timeseries.set_index("measurement_time", append=True)

# fix data types
timeseries = timeseries.astype(column_dtypes["timeseries"])

# replace spurious na values
# The result is assigned back explicitly: `timeseries["unit"].replace(...,
# inplace=True)` operates on a temporary column selection and leaves the frame
# unchanged under pandas Copy-on-Write.
timeseries["unit"] = timeseries["unit"].replace("-", pd.NA)
data_overview(timeseries, index_col="measurement_time")

## Select columns

In [None]:
# Keep the unit labels aside (they are validated in a later cell) and remove
# them from the measurement frame.
timeseries_units = timeseries["unit"]
timeseries = timeseries.drop(columns=["unit"])
# remove non-informative columns
# columns with single value carry no information
mask = timeseries.nunique() > 1
# only keep columns that have at least one value in more than half of the runs
mask &= (timeseries.groupby("run_id").nunique() > 0).mean() > 0.5
timeseries = timeseries[timeseries.columns[mask]]

## Validate units

In [None]:
# Every row must hold at most one non-missing measurement, so that the row's
# unit label can be attributed to exactly one column.
assert all(timeseries.notna().sum(axis=1) <= 1), "multiple measurements!"

# Collect, per column, the distinct unit labels of the rows carrying a value.
units = {}
for col in timeseries:
    mask = timeseries[col].notna()
    units[col] = list(timeseries_units.loc[mask].unique())
    assert len(units[col]) == 1, f"Multiple different units in {col}!"

units = Series({k: v[0] for k, v in units.items()}, dtype="string[pyarrow]")
# Overwrite the recorded unit labels of these four columns by hand.
units[["Acetate", "OD600", "DOT", "pH"]] = ["%", "%", "%", "pH"]
units

In [None]:
# Check data is non-trivial:
# every column must take more than one distinct value in more than one run.
uniques_per_run_id = timeseries.groupby("run_id").nunique()
assert ((uniques_per_run_id > 1).sum() > 1).all()

## Select Columns

In [None]:
# Keep only the columns flagged True in selected_columns["timeseries"].
# NOTE(review): a flagged column that was removed by the non-informative filter
# above would make this selection raise a KeyError — confirm that cannot occur.
columns = [key for key, val in selected_columns["timeseries"].items() if val]
timeseries = timeseries[columns]

## Timeseries Features

In [None]:
# Overview of the selected columns, for reference when setting the bounds below.
data_overview(timeseries)

In [None]:
# fmt: off
# Feature description of the timeseries columns:
# column -> [unit, scale, lower bound, upper bound].
# "Volume" is described here although selected_columns["timeseries"] disables it.
timeseries_features = {
    "Acetate"                       : ["%",      "percent",   0,   100      ],
    "Base"                          : ["uL",     "absolute",  0,   np.inf   ],
    "Cumulated_feed_volume_glucose" : ["uL",     "absolute",  0,   np.inf   ],
    "Cumulated_feed_volume_medium"  : ["uL",     "absolute",  0,   np.inf   ],
    "DOT"                           : ["%",      "percent",   0,   100      ],
    "Flow_Air"                      : ["Ln/min", "absolute",  0,   np.inf   ],
    "Fluo_GFP"                      : ["RFU",    "absolute",  0,   1_000_000],
    "Glucose"                       : ["g/L",    "absolute",  0,   20       ],
    "InducerConcentration"          : ["mM",     "absolute",  0,   np.inf   ],
    "OD600"                         : ["%",      "percent",   0,   100      ],
    "Probe_Volume"                  : ["uL",     "absolute",  0,   np.inf   ],
    "StirringSpeed"                 : ["U/min",  "absolute",  0,   np.inf   ],
    "Temperature"                   : ["°C",     "linear",    20,  45       ],
    "Volume"                        : ["mL",     "absolute",  0,   np.inf   ],
    "pH"                            : ["pH",     "linear",    4,   10       ],
}
# fmt: on

# The keys of the dtype mapping serve as the column names of the feature table.
timeseries_features = DataFrame.from_dict(
    timeseries_features,
    orient="index",
    columns=column_dtypes["timeseries_features"],
)
timeseries_features = timeseries_features.astype(column_dtypes["timeseries_features"])

## Clean data

In [None]:
# Log-scaled density histogram of every column BEFORE out-of-bounds values are removed.
%matplotlib inline
fig = timeseries.hist(figsize=(24, 6), density=True, log=True, bins=20, layout=(3, 5));

## Remove values out of bounds

In [None]:
# Replace every measurement that lies outside the [lower, upper] range declared
# for its column in timeseries_features by pd.NA.
for col in timeseries:
    lower = timeseries_features.loc[col, "lower"]
    upper = timeseries_features.loc[col, "upper"]
    value = timeseries[col]
    # True where the value violates one of the two bounds
    mask = (lower > value) | (value > upper)
    if mask.any():
        print(f"Removing {mask.mean():.2%} of data that does not match {col} bounds")
        timeseries.loc[mask, col] = pd.NA

In [None]:
# Same histogram AFTER the out-of-bounds values were removed, for comparison.
%matplotlib inline
fig = timeseries.hist(figsize=(24, 6), density=True, log=True, bins=20, layout=(3, 5));

## Remove data outside of time bounds

In [None]:
# Attach each experiment's start/end time to its rows, keep only measurements
# taken inside that window, and re-express measurement_time relative to the start.
ts = timeseries.reset_index("measurement_time")
# tmin / tmax are the start_time / end_time series saved in the metadata section
ts = ts.join([tmin, tmax])
cond = (ts["start_time"] <= ts["measurement_time"]) & (
    ts["measurement_time"] <= ts["end_time"]
)
print(f"Removing {(~cond).mean():.2%} of data that does not match tmin/tmax")
ts = ts[cond]
# from here on measurement_time is an offset from start_time, not a timestamp
ts["measurement_time"] = ts["measurement_time"] - ts["start_time"]
ts = ts.set_index("measurement_time", append=True)
# selecting the original columns drops the helper start_time / end_time columns
timeseries = ts[timeseries.columns]

## KIWI_RUNS - The booklet

In [None]:
def make_all_plots(key, ts):
    r"""Plot every column of one experiment's time series on a 5x3 grid.

    Relies on names defined at notebook level: ``overview`` (per-column
    "min"/"max" used for shared y-limits), ``tmin``/``tmax`` (used for shared
    x-limits), ``plt`` and ``snake2camel``.

    Parameters
    ----------
    key:
        Pair ``(run_id, experiment_id)``; only used for the figure title.
    ts:
        Frame holding one column per measured quantity; its index is turned
        into hours elapsed since the first row.

    Returns
    -------
    The created matplotlib figure.
    """
    ts = ts.astype("float32")
    # hours elapsed since the first sample of this experiment
    T = ((ts.index - ts.index[0]) / pd.Timedelta("1h")).values

    # The x-limits are identical for all panels, so compute them once:
    # from zero to the longest experiment duration, padded by 5 % on each side.
    xmin, xmax = 0, (tmax - tmin).max() / np.timedelta64(1, "h")
    xpad = (xmax - xmin) / 20.0

    fig, axes = plt.subplots(
        nrows=5, ncols=3, figsize=(10, 14), constrained_layout=True, sharex=True
    )
    # zip stops at the shorter input: columns beyond the 15th are not drawn.
    for col, ax in zip(ts.columns, axes.flatten()):
        vals = ts[col]
        mask = pd.notna(vals)
        ax.plot(
            T[mask],
            vals[mask],
            ls="-",
            lw=0.5,
            marker=".",
            ms=3,
        )
        ax.set_title(snake2camel(col))

        # y-limits span the column's global min/max so pages are comparable
        ymin, ymax = overview["min"][col], overview["max"][col]
        ypad = (ymax - ymin) / 20.0
        ax.set_ylim(ymin - ypad, ymax + ypad)
        ax.set_xlim(xmin - xpad, xmax + xpad)
    fig.suptitle(f"Run {key[0]} -- Experiment {key[1]}")
    return fig

In [None]:
import matplotlib.pyplot as plt
import pandas
from matplotlib.backends.backend_pdf import PdfPages
from pandas import DataFrame, Series, Timedelta
from tqdm.auto import tqdm

from tsdm.utils.strings import snake2camel

# Per-column overview; make_all_plots reads its "min"/"max" entries for the y-limits.
overview = data_overview(timeseries)
plt.style.use("bmh")
plt.rcParams["axes.axisbelow"] = True
# Preview: plot a single (run_id, experiment_id) pair with the columns ordered
# by their snake2camel-converted names.
ts = timeseries.copy()
ts = ts[sorted(ts.columns, key=snake2camel)]
key = 439, 15325
ts = ts.loc[key]

fig = make_all_plots(key, ts);

In [None]:
# Switch to the agg backend while the PDF pages are generated.
%matplotlib agg

ts = timeseries.copy()
ts = ts[sorted(ts.columns, key=snake2camel)]

# Write one page per (run_id, experiment_id) group into the booklet.
with PdfPages("kiwi-runs-booklet.pdf") as pdf:
    groups = ts.groupby(["run_id", "experiment_id"])

    for key, slc in tqdm(groups):
        # make_all_plots expects the time index only
        slc = slc.reset_index(["run_id", "experiment_id"], drop=True)
        fig = make_all_plots(key, slc)
        pdf.savefig(fig)
        # close the figure so that figures do not accumulate in memory
        plt.close(fig)
        # NOTE(review): this `break` leaves the loop after the first group, so the
        # PDF contains a single page — confirm whether it is a debugging leftover.
        break

## Aggregate Measurements (non-destructive)

In [None]:
# total number of rows expected
# Per (run, experiment, timestamp): the largest count of non-missing values found
# in any single column; summed over all timestamps.
# NOTE(review): total_rows is not referenced by any later cell in this notebook —
# presumably meant for checking the aggregated frame; confirm.
total_rows = (
    timeseries.notna()
    .groupby(["run_id", "experiment_id", "measurement_time"])
    .sum()
    .max(axis=1)
    .sum()
)

In [None]:
# https://stackoverflow.com/questions/74115705
# TODO: is there a way to do it without stacking?
# Long format: one row per (run_id, experiment_id, measurement_time, column) value.
ts = timeseries.stack().to_frame(name="val")
# running number of each value within its (run, experiment, time, column) group
counts = ts.groupby(level=[0, 1, 2, 3]).cumcount()
# Pivot back to wide format. The counter is appended as an extra index level so
# that repeated values of one column at the same timestamp stay on separate rows
# while values of different columns share a row; afterwards the original column
# order is restored, the counter level is dropped and the dtypes are re-applied.
timeseries = (
    ts.set_index(counts, append=True)
    .loc[:, "val"]
    .unstack(level=3)
    .reindex(timeseries.columns, axis=1)
    .reset_index(level=3, drop=True)
    .astype(timeseries.dtypes)
)

## Finalize Tables

In [None]:
# Drop rows without any measurement, order the rows by the three index levels
# and write the table plus its feature description as parquet files.
timeseries = timeseries.dropna(how="all")
timeseries = timeseries.sort_values(["run_id", "experiment_id", "measurement_time"])
timeseries.to_parquet("timeseries.parquet")
timeseries_features.to_parquet("timeseries_features.parquet")

# Setpoints

In [None]:
# Stack the "setpoints" table of every (run, experiment) pair into one frame;
# the dictionary keys become the two outer index levels.
setpoints = pd.concat(
    {
        (run_key, experiment_key): tables["setpoints"]
        for run_key, experiments in data.items()
        for experiment_key, tables in experiments.items()
    },
    names=["run_id", "experiment_id"],
    verify_integrity=True,
)
# Drop the per-table row index and apply the declared dtypes.
setpoints = setpoints.reset_index(-1, drop=True).astype(column_dtypes["setpoints"])

# "-" is a placeholder for "no unit".
setpoints["unit"] = setpoints["unit"].replace("-", pd.NA)

# Index the rows by cultivation age in addition to run / experiment.
setpoints = setpoints.set_index("cultivation_age", append=True)

## Select columns

In [None]:
# Keep the unit labels aside (validated in the next cell) and drop the id/unit columns.
setpoints_units = setpoints["unit"]
setpoints = setpoints.drop(columns=["setpoint_id", "unit"])
display(data_overview(setpoints))

# remove non-informative columns
# (columns that do not take more than one distinct value)
mask = setpoints.nunique() > 1
setpoints = setpoints[setpoints.columns[mask]]

## Validate units

In [None]:
# Every row must hold at most one non-missing setpoint, so that the row's unit
# label can be attributed to exactly one column.
assert all(setpoints.notna().sum(axis=1) <= 1), "multiple measurements!"

# Collect, per column, the distinct unit labels of the rows carrying a value.
units = {}
for col in setpoints:
    mask = setpoints[col].notna()
    units[col] = list(setpoints_units.loc[mask].unique())
    assert len(units[col]) == 1, f"Multiple different units in {col}!"

units = Series({k: v[0] for k, v in units.items()}, dtype="string[pyarrow]")
# Overwrite the recorded unit label of the pH column by hand.
units["pH"] = "pH"
units

## Setpoint Features

In [None]:
# Overview of the remaining setpoint columns, for reference when setting the bounds below.
data_overview(setpoints)

In [None]:
# fmt: off
# Feature description of the setpoint columns:
# column -> [unit, scale, lower bound, upper bound].
# NOTE(review): the open upper bound is written as None here, whereas the other
# feature tables use np.inf; the unit is spelled "µL" here and "uL" in the
# timeseries table — confirm both are intended.
setpoints_features = {
    "Feed_glc_cum_setpoints" : ["µL",     "absolute", 0,    None],
    "Flow_Air"               : ["Ln/min", "absolute", 0,    10.0],
    "InducerConcentration"   : ["mM",     "absolute", 0,    2.0 ],
    "StirringSpeed"          : ["U/min",  "absolute", 2000, 3000],
    "pH"                     : ["pH",     "linear",   6.0,  7.5 ],
}
# fmt: on
# The keys of the dtype mapping serve as the column names of the feature table.
setpoints_features = DataFrame.from_dict(
    setpoints_features,
    orient="index",
    columns=column_dtypes["setpoints_features"],
)
setpoints_features = setpoints_features.astype(column_dtypes["setpoints_features"])

## Remove values out of bounds

In [None]:
# Replace every setpoint that lies outside the [lower, upper] range declared for
# its column in setpoints_features by pd.NA.
# NOTE(review): setpoints_features describes five columns only; any other column
# still present in `setpoints` makes the `.loc[col, ...]` lookup raise a KeyError.
# selected_columns["setpoints"] is never applied in the visible cells — confirm.
for col in setpoints:
    lower = setpoints_features.loc[col, "lower"]
    upper = setpoints_features.loc[col, "upper"]
    value = setpoints[col]
    # True where the value violates one of the two bounds
    mask = (lower > value) | (value > upper)
    if mask.any():
        print(f"Removing {mask.mean():.2%} of data that does not match {col} bounds")
        setpoints.loc[mask, col] = pd.NA

## Finalize Tables

In [None]:
# Drop rows in which every column is missing, then write the setpoints table and
# its feature description as parquet files into the current working directory.
setpoints = setpoints.dropna(how="all")
setpoints.to_parquet("setpoints.parquet")
setpoints_features.to_parquet("setpoints_features.parquet")

# Measurements Reactor

In [None]:
# Stack the "measurements_reactor" table of every (run, experiment) pair into
# one frame; the dictionary keys become the two outer index levels.
measurements_reactor = pd.concat(
    {
        (run_key, experiment_key): tables["measurements_reactor"]
        for run_key, experiments in data.items()
        for experiment_key, tables in experiments.items()
    },
    names=["run_id", "experiment_id"],
    verify_integrity=True,
)
# Drop the innermost (per-table) index level.
measurements_reactor = measurements_reactor.reset_index(-1, drop=True)

# Measurements Array

In [None]:
# Stack the "measurements_array" table of every (run, experiment) pair into
# one frame; the dictionary keys become the two outer index levels.
measurements_array = pd.concat(
    {
        (run_key, experiment_key): tables["measurements_array"]
        for run_key, experiments in data.items()
        for experiment_key, tables in experiments.items()
    },
    names=["run_id", "experiment_id"],
    verify_integrity=True,
)
# Drop the innermost (per-table) index level.
measurements_array = measurements_array.reset_index(-1, drop=True)

In [None]:
# Scratch: the index levels of `timeseries` as ordinary columns of a frame.
index = timeseries.index.to_frame().reset_index(drop=True)

In [None]:
# first index entry of the timeseries frame
timeseries.index[0]

In [None]:
# display the aggregated frame
timeseries

In [None]:
# look up rows using the first row of the index frame as the key
timeseries.loc[index.loc[0]]

In [None]:
# n = number of columns, m = number of rows
n = len(timeseries.columns)
m = len(timeseries.index)

In [None]:
from collections import Counter

In [None]:
# Scratch experiments on hashing (toy list containing a duplicate element).
x = ["eggs", "ham", "eggs"]

In [None]:
# first row of the frame as a namedtuple
next(iter(timeseries.itertuples()))

In [None]:
# multiset of the element hashes
mdict = Counter((hash(y) for y in x))

In [None]:
# multiset of the elements themselves
c = Counter(["eggs", "ham", "eggs"])

In [None]:
# the (element, count) pairs as a set
set(c.items())

In [None]:
# the same counts as a plain dict
dict(c)

In [None]:
# random permutation of the column positions (unseeded: differs between runs)
# NOTE(review): `pi` is not used by any later cell in this notebook.
pi = np.random.permutation(n)

In [None]:
# one hash per row, with the index values included in the hash
elwise = pandas.util.hash_pandas_object(timeseries, index=True)

In [None]:
# order-dependent hash of all row hashes
hash(tuple(elwise))

In [None]:
# row hashes renumbered 0..m-1
row_hash = pandas.util.hash_pandas_object(timeseries).reset_index(drop=True)

In [None]:
# Set of (position, row hash) pairs.
# Series.iteritems() was removed in pandas 2.0; items() yields the same pairs.
frozenset(row_hash.items())

In [None]:
# row hashes as a two-column frame (position, hash)
pandas.util.hash_pandas_object(timeseries).reset_index(drop=True).reset_index()

In [None]:
# hash of the row hashes as a set, after shuffling the rows (unseeded permutation);
# a frozenset is unordered, so this should equal the value of the next cell
hash(frozenset(elwise.iloc[np.random.permutation(m)]))

In [None]:
# hash of the row hashes as a set, in the original row order
hash(frozenset(elwise))

In [None]:
# one hash per row, ignoring the index values
pandas.util.hash_pandas_object(timeseries, index=False)