# Create a set of experiments

This notebook creates a set of experiments. The configuration can be changed in the configuration cell below. The following steps are performed:
1. load all objects from the database (including polymer_type)
2. load all containers from the database
3. filter objects either by "included_types" and/or "non_usable_types" (see configuration)
4. stratified sampling of objects based on polymer_type
5. create a set of experiments with a random number of objects per experiment (see configuration, based on the scipy.stats distribution and its parameters)
6. create a set of containers (randomly sampled from the containers in the database)
7. write experiments to database
8. write a set of json instruction files to data folder

## Imports and Configuration

In [244]:
from typing import Optional

import numpy as np
import pandas as pd

import db_queries
import db_builder
import create_experiments_helpers as helpers

In [245]:
# configuration
# Session identifier for this batch of experiments, obtained from the database helper.
session_name: str = db_builder.get_next_session_id()
min_length: int = 2 # min number of foreign objects per experiment
max_length: int = 5 # max number of foreign objects per experiment
n_experiments: int = 10 # number of experiments to create
# Base seed for reproducibility, derived from the complete numeric part of the session
# name (e.g. 'S0040' -> 400). Taking only the last two characters would give sessions
# that are 100 apart (e.g. 'S0040' and 'S0140') the same seed.
random_state: int = int(''.join(ch for ch in session_name if ch.isdigit())) * 10
distribution_name: str = 'alpha' # distribution of length of experiment, choose any scipy.stats distribution
distribution_params: dict = {'a': 4} # parameters of distribution
non_usable_types: list = ['no sample', 'unclear'] # types that are not usable for experiments
included_types: Optional[list] = None # ['pe-hd','pe-ld','pp','pet'] # if not None, only these types are included in experiments

# name of target column, stratified sampling is based on this column
# NOTE(review): the objects table loaded in this notebook exposes 'polymer_type', not 'type' — confirm the intended column name
target_name: str = 'type'

## Load objects and containers

In [246]:
# Open the database connection, then read the objects flagged with a reference image
# and the full containers table into DataFrames.
con = db_builder.get_db_connection()

df_objects = pd.read_sql(sql="SELECT * FROM objects WHERE reference_image == True", con=con)
df_containers = pd.read_sql(sql="SELECT * FROM containers", con=con)

In [247]:
# Preview the first five loaded objects.
df_objects.head(n=5)

Unnamed: 0,object_id,polymer_type,length,texture,stiffness,color,contamination,form,note,reference_image
0,1.1,pe,33.0,rough,soft,gray,dirty,foil,,1
1,1.2,cellophan,12.0,rough,soft,gray,dirty,,60% Übereinstimmung,1
2,1.3,pp,10.0,rough,soft,white,dirty,foil,,1
3,1.4,pe,20.0,rough,soft,white,clean,fiber,Dünne Faser,1
4,1.5,pp,25.0,rough,soft,white,clean,fiber,Dünne Faser,1


In [248]:
# Preview the first five loaded containers.
df_containers.head(n=5)

Unnamed: 0,material_type,container_id,company,location,date,note
0,compost,CO-01-a,,,,
1,compost,Co-01-b,,,,
2,compost,CO-01-c,,,,
3,digestive,DI-01-a,,,,
4,digestive,DI-01-b,,,,


## Pre-Processing of objects

In [249]:
# Keep only the identifier and the polymer type; the other columns are not used for sampling.
df_objects = df_objects.loc[:, ['object_id', 'polymer_type']]

In [250]:
# Remove every object whose polymer type is listed in `non_usable_types`.
is_unusable = df_objects['polymer_type'].isin(non_usable_types)
df_objects = df_objects[~is_unusable]

In [251]:
# Optional whitelist: when `included_types` is set, keep only objects of those polymer types.
if included_types is not None:
    is_included = df_objects['polymer_type'].isin(included_types)
    df_objects = df_objects.loc[is_included]

## Stratified sampling

In [252]:
# Build the experiment-length distribution from the configured scipy.stats name and parameters.
distribution = helpers.load_scipy_distribution_by_name(distribution_name, distribution_params)
# Presumably draws `n_experiments` raw samples from the distribution — TODO confirm the helper returns a frozen distribution.
# NOTE(review): `n_objects` is not referenced again in the visible cells and this draw is not seeded with `random_state` — confirm whether it is still needed.
n_objects = distribution.rvs(n_experiments)

In [253]:
# Number of objects for each experiment, computed by the helper from the distribution,
# the number of experiments and the configured min/max bounds.
# NOTE(review): presumably the values are limited to [min_length, max_length] and the draw is unseeded — confirm in create_experiments_helpers.
experiment_lengths = helpers.get_experiment_lengths(distribution, n_experiments, min_length, max_length)

In [254]:
# Draw one container per experiment (with replacement) from the containers table,
# seeded with `random_state`, and keep only the container ids.
sampled_containers = df_containers.sample(n=n_experiments, replace=True, random_state=random_state)
containers = sampled_containers['container_id'].values

In [255]:
# Manual override: this replaces whatever was previously assigned to `containers`
# with a draw (with replacement) from a fixed list of container ids.
available_containers = np.array(['CO-06', 'DI-07', 'DI-05', 'DI-11', 'CO-03', 'CO-04'])
# Use a generator seeded with `random_state` so the container assignment is reproducible;
# the global, unseeded `np.random.choice` gave a different assignment on every run.
containers = np.random.default_rng(random_state).choice(available_containers, size=n_experiments, replace=True)

## Create experiments

In [256]:
# Build one row per experiment: reserve the experiment ids, then draw the objects for
# each experiment from `df_objects`.
# NOTE(review): the draw below is a plain random sample without replacement; it is not
# stratified by polymer type — confirm whether stratification is still intended.
start_id = db_builder.get_next_experiment_id()
session_id = session_name
experiment_ids = helpers.get_experiment_ids(start_id, n_experiments)

created_experiments = pd.DataFrame(columns=['session_id', 'experiment_id', 'container_id', 'objects', 'n_objects'])

for i, n in enumerate(experiment_lengths):
    # Derive a distinct seed per experiment from the base seed instead of incrementing
    # the global `random_state`, so re-running this cell draws the same objects again.
    experiment_seed = random_state + i
    ids = df_objects.sample(n=n, replace=False, random_state=experiment_seed)['object_id'].values
    # Object ids are stored with '_' instead of '.' (e.g. '3.10' -> '3_10').
    ids = [str(a).replace('.', '_') for a in ids]
    created_experiments.loc[len(created_experiments)] = {
        'session_id': session_id.strip(),
        'experiment_id': experiment_ids[i].strip(),
        'container_id': containers[i].strip(),
        'objects': ids,
        'n_objects': len(ids)
    }

created_experiments.head()

Unnamed: 0,session_id,experiment_id,container_id,objects,n_objects
0,S0040,E0440,CO-04,"[3_10, 11_15]",2
1,S0040,E0441,CO-04,"[2_2, 13_8]",2
2,S0040,E0442,CO-04,"[8_6, 4_6, 13_4]",3
3,S0040,E0443,DI-11,"[7_15, 13_10, 4_10, 7_14, 7_5]",5
4,S0040,E0444,CO-04,"[4_9, 11_15]",2


### Create Session

In [257]:
# Register the session in the database before its experiments are written.
session_record = {
    'session_id': session_id,
    'n_experiments': n_experiments,
    'note': 'testing sample software',
    'responsible': "Silvan Rehm",
    'start_date': '2023-05-15 00:00:00',
    'end_date': '2023-05-15 00:00:00',
}
db_queries.put_session(**session_record)

## Update database

In [258]:
# Write the experiments without the 'objects' list column; the object links are stored separately.
experiments_to_store = created_experiments.drop(columns=['objects'])
db_queries.put_multiple_experiments(experiments_to_store)

In [259]:
# Store, for every created experiment, the link to its list of object ids.
experiment_object_pairs = zip(created_experiments['experiment_id'], created_experiments['objects'])
for experiment_id, object_ids in experiment_object_pairs:
    db_queries.link_experiment_objects(experiment_id, object_ids)

In [260]:
# Sanity check: show the ten most recent experiment/object links.
db_queries.run_query(
    "SELECT * FROM experiment_objects "
    "ORDER BY experiment_id DESC LIMIT 10"
)

Unnamed: 0,experiment_id,object_id
0,E0449,17_9
1,E0449,13_2
2,E0448,17_1
3,E0448,15_10
4,E0448,11_11
5,E0447,9_4
6,E0447,15_3
7,E0446,15_1
8,E0446,14_7
9,E0445,9_5
