# Create a set of experiments

This notebook creates a set of experiments. The configuration can be changed in the configuration cell below. The following steps are performed:
1. load all objects from the database (including polymer_type)
2. load all containers from the database
3. filter objects either by "included_types" and/or "non_usable_types" (see configuration)
4. stratified sampling of objects based on polymer_type
5. create a set of experiments with a random number of objects per experiment (see configuration, based on the scipy.stats distribution and its parameters)
6. create a set of containers (randomly sampled from the containers in the database)
7. write experiments to database
8. write a set of json instruction files to data folder

## Imports and Configuration

In [2]:
import numpy as np
import pandas as pd
import db_queries
import db_builder
import create_experiments_helpers as helpers

In [3]:
# configuration: every value a reader may want to tune is set in this cell
session_name: str = db_builder.get_next_session_id() # session id as returned by db_builder.get_next_session_id()
min_length: int = 2 # min number of foreign objects per experiment
max_length: int = 5 # max number of foreign objects per experiment
n_experiments: int = 5 # number of experiments to create
random_state: int = 42 # random state for reproducibility
distribution_name: str = 'alpha' # distribution of length of experiment, choose any scipy.stats distribution
distribution_params: dict = {'a': 4} # parameters of distribution, handed to helpers.load_scipy_distribution_by_name together with distribution_name
non_usable_types: list = ['no sample', 'unclear'] # types that are not usable for experiments
included_types: list = ['pe-hd','pe-ld','pp','pet'] # if not None, only these types are included in experiments

# NOTE(review): the objects table names this column 'polymer_type'; confirm that 'type' is the intended value
target_name: str = 'type' # name of target column, stratified sampling is based on this column

## Load objects and containers

In [4]:
# open a database connection and read the complete objects and containers tables into DataFrames
# NOTE(review): the connection is not closed in this cell — confirm whether db_builder manages its lifetime
con = db_builder.get_db_connection()

df_objects = pd.read_sql("SELECT * FROM objects", con)
df_containers = pd.read_sql("SELECT * FROM containers", con)

In [5]:
# preview the first rows of the objects table as loaded from the database
df_objects.head()

Unnamed: 0,object_id,polymer_type,length,texture,stiffness,color,contamination,form,note,reference_image
0,1.1,pe,33.0,rough,soft,gray,dirty,foil,,1
1,1.2,cellophan,12.0,rough,soft,gray,dirty,,60% Übereinstimmung,1
2,1.3,pp,10.0,rough,soft,white,dirty,foil,,1
3,1.4,pe,20.0,rough,soft,white,clean,fiber,Dünne Faser,1
4,1.5,pp,25.0,rough,soft,white,clean,fiber,Dünne Faser,1


In [6]:
# preview the first rows of the containers table as loaded from the database
df_containers.head()

Unnamed: 0,material_type,container_id,company,location,date,note
0,compost,CO-01-a,,,,
1,compost,Co-01-b,,,,
2,compost,CO-01-c,,,,
3,digestive,DI-01-a,,,,
4,digestive,DI-01-b,,,,


## Pre-Processing of objects

In [7]:
# keep only the identifier and the polymer type; the remaining columns are not used below
df_objects = df_objects.loc[:, ['object_id', 'polymer_type']]

In [8]:
# remove every object whose polymer type is listed in `non_usable_types`
is_non_usable = df_objects['polymer_type'].isin(non_usable_types)
df_objects = df_objects[~is_non_usable]

In [10]:
# when an allow-list is configured, keep only objects whose polymer type is on it
if included_types is not None:
    is_included = df_objects['polymer_type'].isin(included_types)
    df_objects = df_objects[is_included]

## Stratified sampling

In [12]:
# Build the configured distribution and draw one raw value per experiment.
# The draw is seeded with `random_state` so re-running the notebook yields the
# same values (the configuration declares `random_state` for reproducibility).
distribution = helpers.load_scipy_distribution_by_name(distribution_name, distribution_params)
# assumes `distribution` is a frozen scipy.stats distribution, whose rvs() accepts `random_state` — TODO confirm in helpers
n_objects = distribution.rvs(n_experiments, random_state=random_state)

In [14]:
# number of objects per experiment, computed by the helper from the distribution and the configured bounds
# (presumably one integer per experiment within [min_length, max_length] — confirm in create_experiments_helpers)
experiment_lengths = helpers.get_experiment_lengths(distribution, n_experiments, min_length, max_length)

In [16]:
# Pick one container id per experiment. Sampling is done with replacement, so
# the same container may be assigned to several experiments. The draw is seeded
# with `random_state` so the assignment is reproducible across runs.
containers = df_containers.sample(n=n_experiments, replace=True, random_state=random_state)['container_id'].values

## Create experiments

In [22]:
# Draw the objects for every experiment and assemble the experiments table.
# NOTE: objects are drawn uniformly at random from `df_objects`; this cell does
# not stratify the draw by polymer type.
start_id = db_builder.get_next_experiment_id()
session_id = db_builder.get_next_session_id()
experiment_ids = helpers.get_experiment_ids(start_id, n_experiments)

# One shared random state for all draws: seeding every `sample` call with the
# same integer would make each experiment start from the same shuffled order.
rng = np.random.RandomState(random_state)

experiment_records = []
for i, n in enumerate(experiment_lengths):
    # without replacement: an object appears at most once within an experiment
    sampled_ids = df_objects.sample(n=n, replace=False, random_state=rng)['object_id'].values
    # object ids are written with '_' instead of '.' (e.g. 1.1 -> 1_1)
    ids = [str(object_id).replace('.', '_') for object_id in sampled_ids]
    experiment_records.append({
        'session_id': session_id.strip(),
        'experiment_id': experiment_ids[i].strip(),
        'container_id': containers[i].strip(),
        'objects': ids,
        'n_objects': len(ids)
    })

# build the frame once from all records instead of growing it row by row
created_experiments = pd.DataFrame(
    experiment_records,
    columns=['session_id', 'experiment_id', 'container_id', 'objects', 'n_objects'],
)

created_experiments.head()

Unnamed: 0,session_id,experiment_id,container_id,objects,n_objects
0,S0001,E0040,CO-01-a,"[15_6, 6_8]",2
1,S0001,E0041,CO-01-a,"[7_1, 11_2]",2
2,S0001,E0042,DI-01-a,"[16_4, 6_5a, 3_13, 11_6, 11_14]",5
3,S0001,E0043,DI-01-b,"[16_14, 12_4, 13_8]",3
4,S0001,E0044,Co-01-b,"[9_1, 12_8a, 14_6]",3


### Create Session

In [32]:
# create the session entry for this batch of experiments via db_queries
db_queries.put_session(
    session_id=session_id,
    n_experiments=n_experiments,
    note='db_test',
    responsible="Roman Studer",
    start_date='',
    end_date='',
)

## Update database

In [24]:
# the list-valued 'objects' column is dropped here; the object links are written separately
experiments_without_objects = created_experiments.drop(columns=['objects'])
db_queries.put_multiple_experiments(experiments_without_objects)

In [30]:
# Link every experiment to the objects sampled for it, one call per experiment.
# NOTE(review): the captured output shows a UNIQUE constraint failure on
# (experiment_id, object_id), presumably from running this cell a second time
# for the same experiments — confirm before re-running.
for experiment_id, object_ids in zip(created_experiments['experiment_id'], created_experiments['objects']):
    db_queries.link_experiment_objects(experiment_id, object_ids)

IntegrityError: UNIQUE constraint failed: experiment_objects.experiment_id, experiment_objects.object_id

In [29]:
# sanity check: show the ten experiment/object links with the highest experiment ids
db_queries.run_query("SELECT * FROM experiment_objects ORDER BY experiment_id DESC LIMIT 10")

Unnamed: 0,experiment_id,object_id
0,E0044,9_1
1,E0044,14_6
2,E0044,12_8a
3,E0043,16_14
4,E0043,13_8
5,E0043,12_4
6,E0042,6_5a
7,E0042,3_13
8,E0042,16_4
9,E0042,11_6
