# KIWI experiment data: format and preprocessing

In [1]:
%config InteractiveShell.ast_node_interactivity='last_expr_or_assign'  # always print last expr.
%config InlineBackend.figure_format = 'svg'
%load_ext autoreload
%autoreload 2
%matplotlib inline

### Format


```python
dict[int, # run_id
    dict[int, # experiment_id
         dict[
             'metadata': DataFrame,                 # static
             'setpoints': DataFrame,                # static
             'measurements_reactor': DataFrame,     # TimeTensor
             'measurements_array': DataFrame,       # TimeTensor
             'measurements_aggregated': DataFrame,  # TimeTensor
         ]
    ]
]

```

In [19]:
import pickle
import pandas
import pandas as pd
import numpy as np
from pandas import DataFrame
from cross_validate_kiwi_runs import create_replicate_dict, ReplicateBasedSplitter

# SECURITY: pickle.load can execute arbitrary code while deserialising;
# only open this file if it comes from a trusted source.
# `data` is presumably the nested {run_id: {experiment_id: {...}}} mapping
# sketched under "Format" above — TODO confirm.
with open("kiwi_experiments_and_run_355.pk", "rb") as file:
    data = pickle.load(file)

# Both helpers come from the project-local module cross_validate_kiwi_runs.
# `col_run_to_exp` is what later gets passed to `splitter.split(...)`.
col_run_to_exp = create_replicate_dict(data)
splitter = ReplicateBasedSplitter()

In [57]:
# Flatten the nested {run: {experiment: tables}} mapping into one record per
# experiment, tagging every record with the ids it was stored under.
DATA = [
    experiment | {"run_id": run, "experiment_id": exp}
    for run, experiments in data.items()
    for exp, experiment in experiments.items()
]
# One row per (run_id, experiment_id); each remaining cell holds a whole table.
DF = DataFrame(DATA).set_index(["run_id", "experiment_id"])

In [75]:
# Concatenate the per-experiment frames of every table kind into one frame each.
# - "metadata" is concatenated as-is and gets a fresh integer index.
# - Every other table is concatenated with the (run_id, experiment_id) index of
#   DF as keys; the frames' own index (level 2) is dropped and the two key
#   levels are turned back into ordinary columns.
tables = {
    key: (
        pd.concat(iter(DF[key])).reset_index(drop=True)
        if key == "metadata"
        else pd.concat(iter(DF[key]), keys=DF[key].index)
        .reset_index(level=2, drop=True)
        .reset_index()
    )
    for key in (
        'metadata',
        'setpoints',
        'measurements_reactor',
        'measurements_array',
        'measurements_aggregated',
    )
}

tables

## MetaData Preprocessing

In [76]:
def contains_no_information(series) -> bool:
    """Return True when `series` has at most one distinct non-missing value."""
    distinct_values = series.dropna().unique()
    return len(distinct_values) < 2

def contains_nan_slice(series, slices, two_enough: bool=False) -> bool:
    """Report whether `series` is entirely missing on too many of `slices`.

    Each element of `slices` is used to index `series`; a slice counts as
    missing when every selected value is NA.

    - Default: any missing slice triggers a True result.
    - With `two_enough`: True only when at least ``len(slices) - 1`` slices
      are missing, i.e. fewer than two slices carry any data.

    When the result is True, the missing count is printed first.
    """
    num_missing = sum(1 for idx in slices if pd.isna(series[idx]).all())

    if two_enough:
        too_many_missing = num_missing >= len(slices) - 1
    else:
        too_many_missing = num_missing > 0

    if not too_many_missing:
        return False

    print(f"{num_missing}/{len(slices)} missing")
    return True

def get_μ_set(s: str):
    """Split a label like ``'µ_set: 15%, 0.5 mM IPTG'`` into its two parts.

    Parameters
    ----------
    s : str or missing value
        The raw label. Missing values (anything `pd.isna` accepts) are
        returned unchanged.

    Returns
    -------
    tuple[str, str] or the missing value
        ``(percent, value)``, e.g. ``('15%', '0.5')``; both remain strings.

    Raises
    ------
    ValueError
        If the text does not split into exactly two parts on ``", "``.
    """
    if pd.isna(s):
        return s

    text = s.strip()
    # str.lstrip/rstrip take a *set of characters*, not a prefix/suffix, so the
    # literal markers are removed explicitly instead. Both the micro sign
    # (U+00B5) and the Greek letter mu (U+03BC) are accepted, since they look
    # identical but are different code points.
    for prefix in ("\u00b5_set:", "\u03bc_set:"):
        if text.startswith(prefix):
            text = text.removeprefix(prefix)
            break
    text = text.strip()

    percent, rest = text.split(", ")
    # Drop the trailing unit; each part is optional, as it was with rstrip.
    value = rest.strip().removesuffix("IPTG").strip().removesuffix("mM").strip()
    return percent, value

In [77]:
# `table` is the generic handle for the scan below; `metadata` names the same frame.
metadata = tables["metadata"]
table = metadata
runs = table["run_id"].dropna().unique()
# One boolean mask per run, selecting that run's rows.
run_masks = [table["run_id"] == run_id for run_id in runs]

useless_cols = set()

for col in table:
    s = table[col]
    # The identifier columns are always kept.
    if col not in ("run_id", "experiment_id"):
        if contains_no_information(s):
            print(f"No information in      {col}")
        elif contains_nan_slice(s, run_masks):
            print(f"Missing for some run   {col}")
        else:
            continue
        useless_cols.add(col)

# In addition, always drop these hand-picked columns.
useless_cols |= {'folder_id_y', 'ph_Base_conc', 'ph_Ki', 'ph_Kp', 'ph_Tolerance', 'pms_id'}

In [101]:
# Columns expected to remain once the uninformative ones have been removed.
metadata_columns = {
    "bioreactor_id",
    "container_number",
    "experiment_id",
    "run_id",
    "profile_id",
    "color",
    "profile_name",
    "organism_id",
    "OD_Dilution",
    "run_name",
    "start_time",
    "end_time",
}

remaining_cols = set(metadata.columns) - useless_cols
# The hand-written list must match the surviving columns exactly.
# First check: a surviving column is missing from the list.
# Second check: a listed column did not survive (or never existed).
assert metadata_columns >= remaining_cols, f"You forgot to check {remaining_cols - metadata_columns}"
assert metadata_columns <= remaining_cols, f"Superfluous {metadata_columns - remaining_cols}"


# Target dtype of every kept column (first cast).
metadata_dtypes = {
    "experiment_id": "UInt32",
    "bioreactor_id": "UInt32", 
    "container_number": "UInt32",
    "profile_id": "UInt32",
    "color": "string", 
    "profile_name": "string",
    "organism_id": "UInt32",
    "run_id": "UInt32",
    "OD_Dilution": "float32",
    "run_name": "string",
    "start_time": "datetime64[ns]",
    "end_time": "datetime64[ns]", 
}

# Every kept column needs exactly one dtype entry.
assert metadata_columns >= set(metadata_dtypes), f"Superfluous encoing  {set(metadata_dtypes) - metadata_columns}"
assert metadata_columns <= set(metadata_dtypes), f"You forgot to encode {metadata_columns - set(metadata_dtypes)}"

# Second cast, applied after `metadata_dtypes`.
# NOTE(review): "OD_Dilution" is listed here with the nullable "Float32"
# dtype rather than "category" — confirm this is intentional.
metadata_categoricals = {
    "profile_name": "category",
    "run_name": "category",
    "color": "category",
    "OD_Dilution": "Float32",
}

assert metadata_columns >= set(metadata_categoricals), f"Superfluous encoing {set(metadata_categoricals) - metadata_columns}"

In [102]:
# Build the cleaned metadata table from the raw concatenated table, so the
# cell gives the same result however often it is re-run.
# Columns are selected with a list in the order of `metadata_dtypes`:
# indexing a DataFrame with a set is rejected by pandas 2.x and would give a
# nondeterministic column order anyway.
metadata = (
    tables["metadata"]
    .astype(metadata_dtypes)
    .astype(metadata_categoricals)
    .loc[:, [col for col in metadata_dtypes if col in metadata_columns]]
    .set_index(["run_id", "experiment_id"])
)

## Setpoint Preprocessing

In [113]:
# `table` is the generic handle for the scan below; `setpoints` names the same frame.
setpoints = tables["setpoints"]
table = setpoints
runs = table["run_id"].dropna().unique()
# One boolean mask per run, selecting that run's rows.
run_masks = [table["run_id"] == run_id for run_id in runs]

useless_cols = set()

for col in table:
    s = table[col]
    # The identifier columns are always kept.
    if col not in ("run_id", "experiment_id"):
        if contains_no_information(s):
            print(f"No information in      {col}")
        elif contains_nan_slice(s, run_masks, two_enough=True):
            print(f"Missing for some run   {col}")
        else:
            continue
        useless_cols.add(col)

# Show the columns flagged for removal.
useless_cols

In [114]:
setpoint_columns = set(setpoints.columns)
remaining_cols = setpoint_columns - useless_cols

# Columns expected to remain once the uninformative ones have been removed.
selected_columns = {
    "experiment_id",
    "run_id",
    'cultivation_age',
    'setpoint_id',
    'unit',
    'Puls_Glucose',
    'StirringSpeed',
    'Feed_glc_cum_setpoints',
    'Flow_Air',
    'InducerConcentration',
}

# The hand-written selection must match the surviving columns exactly.
assert selected_columns >= remaining_cols, f"You forgot to check {remaining_cols - selected_columns}"
assert selected_columns <= remaining_cols, f"Superfluous {selected_columns - remaining_cols}"

# Target dtypes. This also covers columns that are not selected; the first
# assert below only requires that every key exists in the table.
setpoints_dtypes = {
    "experiment_id": "UInt32",
    "run_id": "UInt32",
    "cultivation_age": "UInt32",
    "setpoint_id": "UInt32",
    "unit": "string",
    "Puls_AceticAcid": "Float32",
    "Puls_Glucose": "Float32",
    "Puls_Medium": "Float32",
    "StirringSpeed": "UInt16",
    "pH": "Float32",
    "Feed_glc_cum_setpoints": "UInt16",
    "Flow_Air": "UInt8",
    "InducerConcentration": "Float32",
    "Flow_Nitrogen": "Float32",
    "Flow_O2": "Float32",
    "Feed_dextrine_cum_setpoints": "Float32",
}

assert set(setpoints_dtypes) <= setpoint_columns, f"Superfluous encoing  {set(setpoints_dtypes) - setpoint_columns}"
assert set(setpoints_dtypes) >= selected_columns, f"You forgot to encode {selected_columns - set(setpoints_dtypes)}"

# Second cast, applied after `setpoints_dtypes`.
setpoints_categoricals = {"unit": "category"}

assert set(setpoints_categoricals) <= set(setpoints_dtypes), f"Superfluous encoing {set(setpoints_categoricals) - set(setpoints_dtypes)}"

In [115]:
# Build the cleaned setpoints table from the raw concatenated table without
# mutating it, so the cell gives the same result however often it is re-run.
# Columns are selected with a list in the order of `setpoints_dtypes`:
# indexing a DataFrame with a set is rejected by pandas 2.x and would give a
# nondeterministic column order anyway.
setpoints = (
    tables["setpoints"]
    # "-" is a placeholder for "no unit"; turn it into a proper missing value.
    .assign(unit=lambda df: df["unit"].replace(to_replace="-", value=pd.NA))
    .astype(setpoints_dtypes)
    .astype(setpoints_categoricals)
    .loc[:, [col for col in setpoints_dtypes if col in selected_columns]]
    .set_index(["run_id", "experiment_id"])
)

In [35]:
# Scratch check: distinct values of the "unit" column.
setpoints["unit"].unique()

In [32]:
# Scratch check: columns with more than two distinct values.
[col for col in setpoints if len(setpoints[col].unique())>2]

In [23]:
# Scratch check: does Feed_glc_cum_setpoints survive a round trip through
# Int32 unchanged (missing values compared as equal)?
np.allclose(setpoints["Feed_glc_cum_setpoints"], setpoints["Feed_glc_cum_setpoints"].astype("Int32").astype("float"), equal_nan=True)

In [33]:
# NOTE(review): "pH" is not in `selected_columns`, so this lookup only works on
# the setpoints table from before the column selection. The execution counts
# suggest these cells were run out of order.
setpoints["pH"].unique()

In [193]:
# NOTE(review): "pH_correction_factor" is not in `metadata_columns`, so this
# only works on the metadata table from before the column selection.
metadata["pH_correction_factor"].astype(pd.Float32Dtype())

In [187]:
# NOTE(review): `dtypes` is not defined anywhere in the visible notebook;
# possibly an earlier name for `metadata_dtypes` — confirm or remove.
metadata.astype(dtypes)

In [185]:
# Scratch check: distinct organism ids.
metadata["organism_id"].unique()

In [144]:
# NOTE(review): 'folder_id_x' is not in `metadata_columns` either; same
# out-of-order caveat as the first cell of this group.
metadata['folder_id_x'].unique()

In [121]:
# Scratch: different views of the "setpoints" column of DF.
DF['setpoints'].reset_index()

In [106]:
# Concatenate via a dict, so the DF index entries serve as concat keys.
pd.concat(DF['setpoints'].to_dict())

In [71]:
# Double brackets: a one-column DataFrame rather than a Series.
DF[['setpoints']]

In [81]:
# Scratch: wrap the (column label, column Series) pairs of the one-column frame
# in a DataFrame. `.items()` replaces `.iteritems()`, which pandas 2.0 removed.
DataFrame(list(DF[['setpoints']].items()))

In [93]:
# Scratch: the "setpoints" column of DF as a Series.
DF['setpoints']

In [98]:
# Same one-column frame as a dict in "split" orientation.
DF[['setpoints']].to_dict("split")

In [31]:
# Look at one table of one experiment, addressed by hard-coded run and experiment ids.
data[355][11722]['measurements_reactor']

In [35]:
# Collect the "metadata" entry of every experiment of every run.
meta = [data[run][exp]["metadata"] for run in data for exp in data[run]]

In [39]:
pd.concat(meta)

In [40]:
for train_keys, test_keys in splitter.split(col_run_to_exp):
    # Both names are rebound on every pass, so only the last split remains
    # available once the loop has finished.
    # Training data: a list with one dict per key, tagged with the key's ids.
    data_train = [
        data[key[0]][key[1]] | {"run_id": key[0], "experiment_id": key[1]}
        for key in train_keys
    ]
    # Test data: a dict keyed by the split key itself.
    # NOTE: cells that index `data_train` by a (run_id, experiment_id) tuple
    # need it in this dict shape instead of the list built above.
    data_test = {
        key: data[key[0]][key[1]]
        for key in test_keys
    }


```python
data_train: dict[(int, int), 
    dict[
        'metadata': DataFrame,                  # MetaData
        'setpoints': DataFrame,                 # MetaData
        'measurements_reactor': DataFrame,      # TimeTensor 
        'measurements_array': DataFrame,        # TimeTensor
        'measurements_aggregated' : DataFrame,  # TimeTensor
    ]
]
```

In [43]:
# Rebinds DF (previously built from all of `data`) to a frame built from `data_train`.
DF = DataFrame(data_train).set_index(["run_id", "experiment_id"])

In [48]:
pd.concat(list(DF["metadata"]))

In [5]:
# NOTE(review): a tuple key only works if `data_train` is a mapping keyed by
# (run_id, experiment_id), as in the format sketch above. The split loop
# assigns a list — confirm which shape is intended.
single_run = data_train[(484, 16331)]

In [14]:
# NOTE(review): DataFrame.from_dict is documented with the parameters
# data/orient/dtype/columns only; `index=` does not appear to be accepted —
# confirm this cell runs.
DataFrame.from_dict(data_train,orient='index', index=["run_id", "experiment_id"])

In [9]:
# Empty frame whose columns are the keys of one experiment's dict.
big_df = DataFrame(columns=single_run.keys())

In [10]:
# Assigns `single_run` as a new column labelled 123, then displays the frame.
big_df[123] = single_run
big_df

In [44]:
# NOTE(review): same `index=` concern as the from_dict call above.
DataFrame.from_dict(single_run, index=[(484, 16331)])

In [32]:
# NOTE(review): which shape of `data_train` this expects is unclear — verify.
pd.concat(data_train)

In [27]:
# Number of splits the splitter produces.
len(list(splitter.split(col_run_to_exp)))

In [25]:
# NOTE(review): tuple-key lookup again; see the note on `single_run` above.
data_train[(484,16331)]['measurements_aggregated']

### Data Validation

Format

```python
data: tuple[
    dict[
        'description': set[str],
        'variables': list[dict[str, str]],
        'time_format': str,
        'series': dict[int, 
            dict['generating_parameters': 
                 dict[
                     'qsmax': float,
                     'qm': float,
                     'qamax': float,
                     'Yem': float,
                     'Yxsof': float,
                     'Yxa': float,
                     'Yos': float,
                     'Yoa': float,
                     'Yas': float,
                     'Kia': float,
                     'Ks': float,
                     'Ko': float,
                     'Kap': float,
                     'Kis': float,
                     'Ksa': float,
                     'Pamax': float,
                     'F0': float,
                     'mu_set': float,
                     'C_feed': float,
                     'Kp': float,
                ]
            ],
        ],
    ], 
    dict[int, DataFrame[columns=['X', 'S', 'A', 'DOTm', 'V', 'pulse', 'kLa']]]
]
```

In [28]:
# NOTE(review): `data` is indexed here like the tuple sketched under
# "Data Validation" (element 0 being a dict with 'series' and 'variables'),
# not like the {run_id: ...} mapping loaded from the pickle at the top.
# Presumably a different dataset was bound to `data` — confirm.
data[0]['series'][4882]['generating_parameters']

In [9]:
data[0]['variables']