# Importing KIWI dataset

### Data Validation

Format

```python
data: tuple[
    dict[
        'description': set[str],
        'variables': list[dict[str, str]],
        'time_format': str,
        'series': dict[int, 
            dict['generating_parameters': 
                 dict[
                     'qsmax': float,
                     'qm': float,
                     'qamax': float,
                     'Yem': float,
                     'Yxsof': float,
                     'Yxa': float,
                     'Yos': float,
                     'Yoa': float,
                     'Yas': float,
                     'Kia': float,
                     'Ks': float,
                     'Ko': float,
                     'Kap': float,
                     'Kis': float,
                     'Ksa': float,
                     'Pamax': float,
                     'F0': float,
                     'mu_set': float,
                     'C_feed': float,
                     'Kp': float,
                ]
            ],
        ],
    ], 
    dict[int, DataFrame[columns=['X', 'S', 'A', 'DOTm', 'V', 'pulse', 'kLa']]]
]
```

In [127]:
%config InteractiveShell.ast_node_interactivity='last_expr_or_assign'  # always print last expr.
%config InlineBackend.figure_format = 'svg'
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [178]:
import pickle

import numpy as np
import pandas
import pandas as pd
from pandas import DataFrame, Series, Interval, Period, Timestamp, Timedelta

## Data Encoding

Problem: Columns are encoded with the wrong data types (e.g. categoricals as ints or strings, floats as strings, ints as floats, etc.)

Solution: Sequentially figure out data types

Data Type Hierarchy:
1. String-Like (`np.flexible`)
    - strings
    - bytes
2. TimeLike types
    - Timestamp (np.datetime64)
    - Timedelta (np.timedelta64)
3. Numerical (`np.number`)
   - inexact (`np.inexact`)
       - float (`np.floating`)
       - complex (`np.complexfloating`)
   - integer (`np.integer`)
       - signed
       - unsigned
4. Boolean (np.bool_)
5. Pandas special types
    - CategoricalDtype
    - DatetimeTZDtype
    - PeriodDtype
    - IntervalDtype

End goal: whenever it is appropriate to do so, perform the following conversions:

1. Get appropriate Nullable Pandas type
2. Downcast int → uint
3. Downcast int64 → int32 → int16 → int8
4. Downcast float64 → float32
5. Downcast complex128 → complex64
6. Convert to categorical datatype

In [162]:
def subdtypes(dtype):
    """Return the subclass tree rooted at ``dtype`` as nested lists.

    A class without subclasses is returned as is. Otherwise the result is
    ``[dtype, [tree_of_child_1, tree_of_child_2, ...]]``, where each child
    tree is built by the same rule.
    """
    children = dtype.__subclasses__()
    if children:
        return [dtype, list(map(subdtypes, children))]
    return dtype


subdtypes(np.generic)

In [172]:
type(pandas.Series([1.3]).dtype)

### Format


```python
dict[int, # run_id
    dict[int, # experiment_id
         dict[
             'metadata': DataFrame,                 # static
             'setpoints': DataFrame,                # static
             'measurements_reactor': DataFrame,     # TimeTensor
             'measurements_array': DataFrame,       # TimeTensor
             'measurements_aggregated': DataFrame,  # TimeTensor
         ]
    ]
]

```

In [2]:
# Load the raw experiment data from the pickle file.
# NOTE(review): unpickling can execute arbitrary code embedded in the file;
# only load pickles that come from a trusted source.
with open("kiwi_experiments_and_run_355.pk", "rb") as file:
    data = pickle.load(file)

# NOTE(review): create_replicate_dict and ReplicateBasedSplitter are imported
# from cross_validate_kiwi_runs in a separate cell of this notebook; that
# cell has to be executed before this one.
col_run_to_exp = create_replicate_dict(data)
splitter = ReplicateBasedSplitter()

In [3]:
# Flatten the nested {run: {experiment: record}} mapping into a list with one
# record per (run, experiment) pair; the two ids are merged into each record.
DATA = [
    (data[run][exp] | {"run_id": run, "experiment_id": exp})
    for run in data
    for exp in data[run]
]
# One row per (run_id, experiment_id); the remaining record keys become columns.
DF = DataFrame(DATA).set_index(["run_id", "experiment_id"])

## Raw-Data

In [None]:
from cross_validate_kiwi_runs import ReplicateBasedSplitter, create_replicate_dict

In [4]:
# Build one long table per record key by stacking the per-experiment frames
# stored in the corresponding column of DF.
tables = {}

for key in (
    "metadata",
    "setpoints",
    "measurements_reactor",
    "measurements_array",
    "measurements_aggregated",
):
    if key == "metadata":
        # Plain concatenation; the original row index is discarded.
        tables[key] = pd.concat(iter(DF[key])).reset_index(drop=True)
    else:
        # Tag every row with the (run_id, experiment_id) key of its source
        # frame, drop the third index level (presumably the frames' own row
        # index — TODO confirm they are single-level) and turn the ids into
        # regular columns.
        tables[key] = (
            pd.concat(iter(DF[key]), keys=DF[key].index)
            .reset_index(level=2, drop=True)
            .reset_index()
        )

tables

# Preprocessing

In [5]:
?np.number

In [416]:
metadata["OD_Dilution"]

In [420]:
# Candidate encodings of boolean values: each dict maps one pair of raw
# values (strings, integers or floats) to False / True.
BOOLS = [
    {"f": False, "t": True},
    {"false": False, "true": True},
    {"n": False, "y": True},
    {"no": False, "yes": True},
    {"-": False, "+": True},
    {0: False, 1: True},
    {-1: False, +1: True},
    {0.0: False, 1.0: True},
    {-1.0: False, +1.0: True},
]

In [435]:
x, y = np.random.randn(2, 1000)

In [436]:
x.dot(y) / (np.linalg.norm(x) * np.linalg.norm(y))

In [428]:
%%timeit

Series(np.random.rand(10 ** 7))

In [429]:
%%timeit

Series(np.random.rand(10 ** 7)).unique()

In [425]:
Series([+1, -1, +1, -1]).astype("boolean")

In [422]:
any({0, 1} <= pair.keys() for pair in BOOLS)

In [417]:
pandas.api.types.is_string_dtype(metadata["OD_Dilution"])

In [410]:
Series([1, -1, 1]).astype("Int8").unique()

In [47]:
def contains_no_information(series) -> bool:
    """Tell whether ``series`` has at most one distinct non-missing value."""
    distinct = series.dropna().unique()
    return len(distinct) < 2


def contains_nan_slice(series, slices, two_enough: bool = False) -> bool:
    """Report whether ``series`` is entirely missing on too many slices.

    Parameters
    ----------
    series:
        The column to inspect.
    slices:
        Indexers into ``series`` (e.g. boolean masks), one per slice.
    two_enough:
        If ``False``, a single fully-missing slice is enough to flag the
        column. If ``True``, the column is flagged only when at most one
        slice contains any data.

    Returns
    -------
    bool
        ``True`` if the column was flagged (a message is printed as well),
        ``False`` otherwise.
    """
    num_missing = sum(1 for idx in slices if pd.isna(series[idx]).all())
    total = len(slices)

    if two_enough:
        flagged = num_missing >= total - 1
    else:
        flagged = num_missing > 0

    if not flagged:
        return False

    print(f"{series.name}: data missing in {num_missing}/{len(slices)} slices!")
    return True


def float_is_int(series) -> bool:
    """Tell whether every non-missing entry of ``series`` is a whole number."""
    present = series[pd.notna(series)]
    integral = present.apply(float.is_integer)
    return integral.all()


def is_bool(vals) -> bool:
    """Check whether a collection of distinct, non-missing values encodes a boolean.

    Parameters
    ----------
    vals:
        Array-like holding the distinct non-missing values of one column
        (e.g. the result of ``series.dropna().unique()``). It must expose
        ``dtype`` and support ``len`` and iteration. If it carries a ``name``
        attribute, that name labels the diagnostic output.

    Returns
    -------
    bool
        ``True`` if the values are genuine booleans, integers or floats drawn
        from ``{0, 1}`` or ``{-1, 1}``, or strings that form one of the usual
        true/false pairs (compared case-insensitively); ``False`` otherwise.
    """
    # A boolean column has at most two distinct values; without any value
    # there is nothing to base a decision on.
    if len(vals) == 0 or len(vals) > 2:
        return False

    label = getattr(vals, "name", None)
    dtype = vals.dtype
    # The pandas helpers understand both numpy and pandas extension dtypes,
    # whereas np.issubdtype raises on the latter.
    types = pandas.api.types

    if types.is_bool_dtype(dtype):
        print(f"Boolean column                       : {label}")
        return True

    is_integer = types.is_integer_dtype(dtype)
    if is_integer or types.is_float_dtype(dtype):
        distinct = set(vals)
        if distinct <= {0, 1} or distinct <= {-1, 1}:
            kind = "integer" if is_integer else "float"
            print(f"Boolean column pretending to be {kind}: {label}")
            return True
        return False

    if types.is_string_dtype(dtype) or types.is_object_dtype(dtype):
        tokens = {str(val).strip().lower() for val in vals}
        # Both members of a pair must be present; a single token such as
        # "1" or "y" alone is too ambiguous to be called boolean.
        if tokens in (
            {"0", "1"},
            {"-1", "+1"},
            {"-1", "1"},
            {"t", "f"},
            {"true", "false"},
            {"y", "n"},
            {"yes", "no"},
        ):
            print(f"Boolean column pretending to be string: {label}")
            return True

    return False


def get_true_column_dtypes(table) -> dict[str, str]:
    """Unfinished helper, presumably meant to map each column to its actual dtype.

    NOTE(review): the body is incomplete. ``dtypes`` is never filled and
    nothing is returned, so the function currently yields ``None`` despite the
    annotated ``dict[str, str]`` return type.
    """
    dtypes = {}
    for col in table:
        series = table[col]
        # Boolean mask selecting the non-missing entries of the column.
        mask = pd.notna(series)
        # Distinct non-missing values; computed but not used yet.
        vals = series[mask].unique()


def get_boolean_cols(df) -> set[str]:
    """Collect the names of all columns of ``df`` that encode boolean values.

    Parameters
    ----------
    df:
        The table to inspect. Only this table is looked at; no module-level
        state is consulted.

    Returns
    -------
    set[str]
        Names of the columns whose distinct non-missing values pass
        ``is_bool``.
    """
    cols = set()
    for col in df:
        # Keep the column name attached to the distinct values so that the
        # name stays available to whoever inspects them.
        distinct = pd.Series(df[col].dropna().unique(), name=col)
        if is_bool(distinct):
            cols.add(col)
    return cols


def get_integer_cols(df) -> set[str]:
    """Collect the names of all columns of ``df`` that hold integer values.

    A column qualifies if it has an integer dtype (numpy or pandas nullable),
    or if it has a float dtype but every non-missing entry is a whole number.

    Parameters
    ----------
    df:
        The table to inspect. Only this table is looked at; no module-level
        state is consulted.

    Returns
    -------
    set[str]
        Names of the qualifying columns. A message is printed for each.
    """
    # The pandas helpers understand both numpy and pandas extension dtypes,
    # whereas np.issubdtype raises on the latter.
    types = pandas.api.types
    cols = set()
    for col in df:
        series = df[col]
        if types.is_integer_dtype(series.dtype):
            print(f"Integer column                       : {col}")
            cols.add(col)
        elif types.is_float_dtype(series.dtype) and float_is_int(series):
            print(f"Integer column pretending to be float: {col}")
            cols.add(col)
    return cols


def get_useless_cols(df, strict: bool = False) -> set[str]:
    """Collect the names of columns of ``df`` that carry no usable data.

    A column is considered useless if it has at most one distinct non-missing
    value, or if it is entirely missing for too many runs. The per-run masks
    are derived from the ``run_id`` column of ``df`` itself, so the result
    does not depend on any module-level state.

    Parameters
    ----------
    df:
        The table to inspect. The columns ``run_id`` and ``experiment_id``
        are never reported.
    strict:
        If ``True``, a single run without data is enough to flag a column.
        If ``False``, a column is flagged only when at most one run contains
        data for it.

    Returns
    -------
    set[str]
        Names of the flagged columns. A message is printed for each.
    """
    if "run_id" in df:
        run_ids = df["run_id"]
        run_masks = [run_ids == run for run in run_ids.dropna().unique()]
    else:
        # Without run information the per-run check cannot be performed.
        run_masks = []

    useless_cols = set()
    for col in df:
        if col in ("run_id", "experiment_id"):
            continue
        s = df[col]
        if contains_no_information(s):
            print(f"No information in      {col}")
            useless_cols.add(col)
        elif run_masks and contains_nan_slice(s, run_masks, two_enough=(not strict)):
            print(f"Missing for some run   {col}")
            useless_cols.add(col)
    return useless_cols


def get_μ_set(s: str):
    """Parse an entry of the form ``"µ_set: <percent>, <value>mM IPTG"``.

    Parameters
    ----------
    s:
        The raw entry, or a missing value.

    Returns
    -------
    tuple[str, str]
        ``(percent, value)``: the text between the ``µ_set:`` label and the
        comma, and the text after the comma with the trailing ``mM`` /
        ``IPTG`` unit removed. A missing input is handed back unchanged.

    Raises
    ------
    ValueError
        If the entry does not consist of exactly two ``", "``-separated parts.
    """
    if pd.isna(s):
        return s

    text = s.strip()
    # str.lstrip / str.rstrip take a *set of characters*, not a prefix or
    # suffix, and would also eat leading characters of the payload (e.g. the
    # "set" of "set1"). Remove the label and the units as whole words instead.
    for label in ("\u00b5_set", "\u03bc_set"):  # micro sign and Greek mu
        if text.startswith(label):
            text = text[len(label):].lstrip(": ")
            break

    percent, rest = text.strip().split(", ")
    value = rest.strip().removesuffix("IPTG").rstrip().removesuffix("mM").strip()
    return percent, value

In [148]:
np.datetime64

In [157]:
pandas.Series([pandas.Interval(0, 1)]).dtype == pandas.IntervalDtype()

In [152]:
np.issubdtype(pandas.Series([pandas.Interval(0, 1)]).dtype, pandas.IntervalDtype)

In [189]:
pandas.api.types.is_datetime64_dtype(Series(["2021"]).dtype)

In [193]:
pandas.api.types.is_datetime64_dtype(Series([Timestamp("2021")]).unique())

In [202]:
pandas.api.types.is_unsigned_integer_dtype(pandas.Series([1, 2, 3, 4, 5]))

In [211]:
isinstance(Series([1, 2, 3]).astype("Int64").dtype, pandas.Int64Dtype)

In [212]:
# Maps each pandas extension dtype class to a short string name. Several
# entries are only the base of a parametrised alias (see the inline comments),
# so the value is not always usable with ``astype`` as is.
PANDAS_DTYPES = {
    pandas.BooleanDtype: "boolean",
    pandas.CategoricalDtype: "category",
    pandas.DatetimeTZDtype: "datetime64",  # datetime64[ns, <tz>]
    pandas.Float32Dtype: "Float32",
    pandas.Float64Dtype: "Float64",
    pandas.Int16Dtype: "Int16",
    pandas.Int32Dtype: "Int32",
    pandas.Int64Dtype: "Int64",
    pandas.Int8Dtype: "Int8",
    pandas.IntervalDtype: "interval",  # e.g. to denote ranges of variables
    pandas.PeriodDtype: "period",  # period[<freq>]
    pandas.SparseDtype: "Sparse",
    pandas.StringDtype: "string",
    pandas.UInt16Dtype: "UInt16",
    pandas.UInt32Dtype: "UInt32",
    pandas.UInt64Dtype: "UInt64",
    pandas.UInt8Dtype: "UInt8",
}
r"""Dictionary of all :mod:`pandas` data types."""

In [218]:
Series([(1, 2), (3, 4)]).astype("interval")

In [371]:
s = pandas.to_numeric(metadata["experiment_id"], downcast="unsigned")

In [376]:
# Series.append was removed in pandas 2.0; pandas.concat is its replacement.
pandas.to_numeric(pandas.concat([s, Series([70000])]), downcast="unsigned") - 70000

In [358]:
metadata["OD_Dilution"].unique()

In [393]:
Series([True, False, pd.NA]).astype("boolean")

In [346]:
s = pandas.Series([None, "2", "NaN"]).convert_dtypes()

In [348]:
mask = pandas.isna(s)

In [350]:
type(pd.NaT)

In [327]:
metadata.convert_dtypes().dtypes

In [320]:
Series(["123", "456"]).astype("string").unique()

In [325]:
for col in metadata:
    pandas.to_numeric(metadata[col])

In [None]:
def infer_dtype(series: Series):
    """Try to infer the data type of ``series``.

    Parameters
    ----------
    series:
        The column whose dtype should be determined.

    Returns
    -------
    The dtype of ``series`` if it already is one of the pandas extension
    dtypes listed in ``PANDAS_DTYPES``.

    Raises
    ------
    NotImplementedError
        For every other dtype; the actual inference is not written yet.
    """
    dtype = series.dtype
    if type(dtype) in PANDAS_DTYPES:
        return dtype
    # TODO: inspect the distinct non-missing values
    # (``series[pandas.notna(series)].unique()``) to decide on a dtype.
    raise NotImplementedError(f"dtype inference is not implemented for {dtype}")
    
    
    

In [308]:
from pandas.api.extensions import ExtensionDtype

In [258]:
pandas.api.types.is_

In [259]:
pandas.api.type.issubdtype

In [136]:
def infer_datatypes(table: DataFrame) -> dict[str, np.dtype]:
    """Try to infer the data types of the columns of a DataFrame.

    In particular, the goal is to identify ill-specified columns:

    - columns that contain booleans but are formatted as integers
    - columns that contain integers but are formatted as floating
    - columns that contain floating but are formatted as strings

    Parameters
    ----------
    table:
        The table whose column dtypes should be inferred.

    Returns
    -------
    dict[str, np.dtype]
        Mapping from column name to dtype. The inference itself is not
        implemented yet, so the current dtypes are returned unchanged.
    """
    dtypes = dict(table.dtypes)
    new_dtypes = dtypes.copy()

    for col, dtype in dtypes.items():
        # Leave pandas specific columns alone
        if type(dtype) in PANDAS_DTYPES:
            continue
        # Leave time-like columns alone (any resolution, with or without tz)
        if pandas.api.types.is_datetime64_any_dtype(dtype):
            continue
        if pandas.api.types.is_timedelta64_dtype(dtype):
            continue
        if pandas.api.types.is_object_dtype(dtype):
            # TODO: inspect the values of ``table[col]`` to find the real type
            continue
        # TODO: detect booleans stored as integers and integers stored as floats

    return new_dtypes
    

In [68]:
set("111,5156_44")

In [67]:
s = s.strip()  # remove leading and trailing whitespace

chars = set(s)

assert chars <= {".", ",", "_", "0", "1", "2", "3", "4", "5", "6", "7", "8", "9"}

if 

s.replace("_", "")




### MetaData Preprocessing

In [54]:
np.array([1.0, 2.0]) == {1, 2}

In [48]:
get_boolean_cols(table)

In [62]:
pd.to_numeric("10_000", errors="coerce")

In [76]:
s = table["ph_Timesetpoint"]

In [82]:
s.unique()

In [84]:
mask = pd.notna(s)

In [87]:
pandas.to_numeric(s, downcast=True)

In [85]:
pd.api.types.is_numeric_dtype(s[mask])

In [196]:
for col in metadata:
    series = metadata[col]
    mask = pandas.notna(series)
    values = series[mask]
    print(pandas.api.types.infer_dtype(values))

In [195]:
metadata.dtypes

In [71]:
metadata.dtypes

In [65]:
float("10.000")

In [59]:
for x in table["ph_Ki"]:
    pd.to_numeric(x)

In [103]:
def is_numeric(series):
    """Tell whether every entry of ``series`` can be parsed as a number.

    Parameters
    ----------
    series:
        The column to test; missing values are accepted by the conversion.

    Returns
    -------
    bool
        ``True`` if ``pd.to_numeric`` converts the column without error,
        ``False`` if it rejects a value.
    """
    try:
        pd.to_numeric(series)
    # Only a failed conversion means "not numeric"; a bare ``except`` would
    # also swallow KeyboardInterrupt and genuine programming errors.
    except (ValueError, TypeError):
        return False
    return True

In [None]:
def get_unqiues(series, uniques = None, mask = None):
    """Unimplemented stub.

    NOTE(review): the function has no body yet. Judging by the signature it
    is presumably meant to return the distinct values of ``series``, reusing
    precomputed ``uniques`` / ``mask`` when given — TODO confirm and implement.
    """
    
    

In [None]:
def is_categorical(series, ratio=0.01, uniques=None, mask=None):
    """Test whether a column could be categorical.

    Planned tests:

    1. The number of unique values is smaller than ``ratio`` times the number
       of recorded values.
    2. The column is integer-valued and its unique values form a ``range()``.

    NOTE(review): none of this is implemented yet; the function has no body
    besides this docstring and therefore returns ``None``.
    """

In [126]:
table["OD_Dilution"].astype(pd.Int64Dtype())

In [None]:
def is_empty(series):
    """Tell whether ``series`` holds nothing but missing values."""
    missing = pd.isna(series)
    return missing.all()

In [118]:
for col in table:
    series = table[col]
    dtype = series.dtype
    if np.issubdtype(dtype, pandas.StringDtype):
        if is_numeric(series):
            print(f"Numerical column pretending to be string: {col=}")

In [88]:
# Work on the metadata table; ``table`` and ``metadata`` refer to the same object.
table = metadata = tables["metadata"]
# One boolean row mask per distinct run.
runs = table["run_id"].dropna().unique()
run_masks = [table["run_id"] == run for run in runs]

useless_cols = set()

for col in table:
    s = table[col]
    # The identifier columns are always kept.
    if col in ("run_id", "experiment_id"):
        continue
    if contains_no_information(s):
        print(f"No information in      {col}")
        useless_cols.add(col)
    # Strict variant: a single run without data is enough to drop the column.
    elif contains_nan_slice(s, run_masks):
        print(f"Missing for some run   {col}")
        useless_cols.add(col)

# drop the following specific columns
useless_cols |= {
    "folder_id_y",
    "ph_Base_conc",
    "ph_Ki",
    "ph_Kp",
    "ph_Tolerance",
    "pms_id",
}

In [95]:
table["OD_Dilution"].unique()

In [101]:
float("100.0")

In [98]:
pd.isna(table["OD_Dilution"])

In [93]:
table["OD_Dilution"].astype("int32")

In [11]:
# Columns of the metadata table that are kept.
metadata_columns = {
    "bioreactor_id",
    "container_number",
    "experiment_id",
    "run_id",
    "profile_id",
    "color",
    "profile_name",
    "organism_id",
    "OD_Dilution",
    "run_name",
    "start_time",
    "end_time",
}

# The selection must match exactly the columns that were not flagged as useless.
remaining_cols = set(metadata.columns) - useless_cols
# A remaining column missing from the selection has not been reviewed yet.
assert (
    metadata_columns >= remaining_cols
), f"You forgot to check {remaining_cols - metadata_columns}"
# A selected column that is not among the remaining ones is superfluous.
assert (
    metadata_columns <= remaining_cols
), f"Superfluous {metadata_columns - remaining_cols}"


# Target dtype of every selected column.
metadata_dtypes = {
    "experiment_id": "UInt32",
    "bioreactor_id": "UInt32",
    "container_number": "UInt32",
    "profile_id": "UInt32",
    "color": "string",
    "profile_name": "string",
    "organism_id": "UInt32",
    "run_id": "UInt32",
    "OD_Dilution": "float32",
    "run_name": "string",
    "start_time": "datetime64[ns]",
    "end_time": "datetime64[ns]",
}

assert metadata_columns >= set(
    metadata_dtypes
), f"Superfluous encoding  {set(metadata_dtypes) - metadata_columns}"
assert metadata_columns <= set(
    metadata_dtypes
), f"You forgot to encode {metadata_columns - set(metadata_dtypes)}"

# Second conversion pass, applied after ``metadata_dtypes``.
metadata_categoricals = {
    "profile_name": "category",
    "run_name": "category",
    "color": "category",
    "OD_Dilution": "Float32",
}

assert metadata_columns >= set(
    metadata_categoricals
), f"Superfluous encoding {set(metadata_categoricals) - metadata_columns}"

In [12]:
# Apply the dtype specification, then the second (categorical) pass.
metadata = metadata.astype(metadata_dtypes)
metadata = metadata.astype(metadata_categoricals)
# pandas does not accept a set as column indexer; pass a list instead
# (sorted, so that the column order is reproducible).
metadata = metadata[sorted(metadata_columns)]
metadata = metadata.set_index(["run_id", "experiment_id"])

## Setpoint Preprocessing

In [13]:
# Work on the setpoints table; ``table`` and ``setpoints`` refer to the same object.
table = setpoints = tables["setpoints"]
# One boolean row mask per distinct run.
runs = table["run_id"].dropna().unique()
run_masks = [table["run_id"] == run for run in runs]

useless_cols = set()

for col in table:
    s = table[col]
    # The identifier columns are always kept.
    if col in ("run_id", "experiment_id"):
        continue
    if contains_no_information(s):
        print(f"No information in      {col}")
        useless_cols.add(col)
    # Lenient variant: the column is dropped only if at most one run has data.
    elif contains_nan_slice(s, run_masks, two_enough=True):
        print(f"Missing for some run   {col}")
        useless_cols.add(col)

# drop the following specific columns
useless_cols

In [14]:
# All columns of the setpoints table and those not flagged as useless.
setpoint_columns = set(setpoints.columns)
remaining_cols = setpoint_columns - useless_cols

# Columns of the setpoints table that are kept.
selected_columns = {
    "experiment_id",
    "run_id",
    "cultivation_age",
    "setpoint_id",
    "unit",
    "Puls_Glucose",
    "StirringSpeed",
    "Feed_glc_cum_setpoints",
    "Flow_Air",
    "InducerConcentration",
}

# The selection must match exactly the columns that were not flagged as useless.
assert (
    selected_columns >= setpoint_columns - useless_cols
), f"You forgot to check {remaining_cols - selected_columns}"
assert (
    selected_columns <= setpoint_columns - useless_cols
), f"Superfluous {selected_columns - remaining_cols}"

# Target dtypes; this also lists columns that are not part of the selection.
setpoints_dtypes = {
    "experiment_id": "UInt32",
    "run_id": "UInt32",
    "cultivation_age": "UInt32",
    "setpoint_id": "UInt32",
    "unit": "string",
    "Puls_AceticAcid": "Float32",
    "Puls_Glucose": "Float32",
    "Puls_Medium": "Float32",
    "StirringSpeed": "UInt16",
    "pH": "Float32",
    "Feed_glc_cum_setpoints": "UInt16",
    "Flow_Air": "UInt8",
    "InducerConcentration": "Float32",
    "Flow_Nitrogen": "Float32",
    "Flow_O2": "Float32",
    "Feed_dextrine_cum_setpoints": "Float32",
}

# Every encoded column must exist, and every selected column must be encoded.
assert (
    set(setpoints_dtypes) <= setpoint_columns
), f"Superfluous encoing  {set(setpoints_dtypes) - setpoint_columns}"
assert (
    set(setpoints_dtypes) >= selected_columns
), f"You forgot to encode {selected_columns - set(setpoints_dtypes)}"

# Second conversion pass, applied after ``setpoints_dtypes``.
setpoints_categoricals = {
    "unit": "category",
}

assert set(setpoints_categoricals) <= set(
    setpoints_dtypes
), f"Superfluous encoing {set(setpoints_categoricals) - set(setpoints_dtypes)}"

In [15]:
# "-" is presumably a placeholder for "no unit"; store it as a missing value.
setpoints["unit"] = setpoints["unit"].replace(to_replace="-", value=pd.NA)
# Apply the dtype specification, then the second (categorical) pass.
setpoints = setpoints.astype(setpoints_dtypes)
setpoints = setpoints.astype(setpoints_categoricals)
# pandas does not accept a set as column indexer; pass a list instead
# (sorted, so that the column order is reproducible).
setpoints = setpoints[sorted(selected_columns)]
setpoints = setpoints.set_index(["run_id", "experiment_id"])

##  Measurements Reactor

In [None]:
# Work on the reactor measurements; both names refer to the same object.
table = measurements_reactor = tables["measurements_reactor"]
# One boolean row mask per distinct run.
runs = table["run_id"].dropna().unique()
run_masks = [table["run_id"] == run for run in runs]

# NOTE(review): both helpers are defined with a ``df`` parameter but their
# bodies read the module-level ``table`` (and ``run_masks``), so the values
# assigned above are what they actually operate on.
useless_cols = get_useless_cols(table)
integer_cols = get_integer_cols(table)

In [None]:
table_columns = set(table.columns)
remaining_cols = table_columns - useless_cols

In [None]:
# Columns of the reactor measurement table that are kept.
selected_columns = {
    "Acetate",
    "Base",
    "Cumulated_feed_volume_glucose",
    "Cumulated_feed_volume_medium",
    "DOT",
    "Fluo_GFP",
    "Glucose",
    "InducerConcentration",
    "OD600",
    "Probe_Volume",
    "Volume",
    "experiment_id",
    "measurement_id",
    "measurement_time",
    "pH",
    "run_id",
    "unit",
}

# The selection must match exactly the columns that were not flagged as useless.
assert (
    selected_columns >= table_columns - useless_cols
), f"You forgot to check {remaining_cols - selected_columns}"
assert (
    selected_columns <= table_columns - useless_cols
), f"Superfluous {selected_columns - remaining_cols}"

In [None]:
# pandas does not accept a set as column indexer; pass a list instead.
table[sorted(selected_columns)].dtypes.sort_index()

In [None]:
table["Base"].unique()

In [None]:
table.dtypes

In [None]:
# Target dtype of every selected reactor column.
# TODO: the empty strings are placeholders; a dtype still has to be chosen
# for these columns before the mapping can be passed to ``astype``.
reactor_dtypes = {
    "Acetate": "Float32",
    "Base": "Int32",
    "Cumulated_feed_volume_glucose": "",
    "Cumulated_feed_volume_medium": "",
    "DOT": "",
    "Fluo_GFP": "",
    "Glucose": "",
    "InducerConcentration": "",
    "OD600": "",
    "Probe_Volume": "",
    "Volume": "",
    "experiment_id": "",
    "measurement_id": "",
    "measurement_time": "",
    "pH": "",
    "run_id": "",
    "unit": "",
}

# Validate the reactor specification against the reactor table. These checks
# previously referred to the setpoints names and could therefore never pass
# for the reactor selection.
assert (
    set(reactor_dtypes) <= table_columns
), f"Superfluous encoding  {set(reactor_dtypes) - table_columns}"
assert (
    set(reactor_dtypes) >= selected_columns
), f"You forgot to encode {selected_columns - set(reactor_dtypes)}"

# Second conversion pass, applied after ``reactor_dtypes``.
reactor_categoricals = {
    "unit": "category",
}

assert set(reactor_categoricals) <= set(
    reactor_dtypes
), f"Superfluous encoding {set(reactor_categoricals) - set(reactor_dtypes)}"

In [None]:
# NOTE(review): this cell repeats the metadata conversion inside the reactor
# section; it looks like a copy-paste placeholder for the corresponding
# ``measurements_reactor`` conversion — TODO confirm and adapt.
metadata = metadata.astype(metadata_dtypes)
metadata = metadata.astype(metadata_categoricals)
metadata = metadata[metadata_columns]
metadata = metadata.set_index(["run_id", "experiment_id"])

## Measurements_Array

## Measurements Aggregated