In [None]:
%reload_ext autoreload
%autoreload 2

import getpass
import mysql.connector
import pandas as pd
import MPDB_utils as ut
from MPDB_settings import MPDB_server, particleQuery
from MPDB_procedures import blank_procedure, blind_procedure, update_env_and_blind, make_syn_blind

## Establish a connection to the MPDB
Enter your username and password when prompted.

In [None]:
# Open the MPDB connection; credentials are requested interactively so that
# nothing sensitive is stored in the notebook itself.
# NOTE(review): no visible cell closes this connection — consider closing it
# once the particle query has been read.
connection = mysql.connector.connect(
    host=MPDB_server,
    user=input('Username: '),
    passwd=getpass.getpass(prompt='Password: '),  # hidden input, not echoed
    db='micropoll',
)

## All data wrangling happens in the modules of the following pipeline

- starting with the call of all particles in the MPDB
- certain polymers get excluded (i.e. own contamination or unclear dye signatures)
- each particle is repeated according to the fraction of the sample that was analysed
- geometric mean sizes are calculated from size 1 and 2 (length and width of the GEPARD-fitted ellipses)
- sizes outside the target spectrum are excluded (here only particles >= 50 µm in `Size_1` are targeted)
- shape and colour get simplified for later distinct phenotype creation
- the particle ID column is used as the index

All of the above steps take the combined `MP` dataframe as input AND return it as output. In the last step this dataframe is filtered to only include the particles relevant to this analysis and is split into different sub-dataframes:
- `env_MP` contains the environmental MP particles (i.e. from "real" samples)
- `IOW_blind_MP` contains the MP particles from procedural IOW blind samples
- `samples_MP` is a concatenation of the two above
- `IPF_blank_MP` contains the MP particles from IPF lab blanks

In [None]:
# Query all particles from the MPDB and run them through the wrangling pipeline.
# Every stage receives the combined MP dataframe from the previous one; the
# final stage yields the four sub-dataframes that are unpacked here.
env_MP, IOW_blind_MP, samples_MP, IPF_blank_MP = (
    pd.read_sql_query(particleQuery, connection)
    # exclude polymers which are on polyDropList in MPDB_settings.py
    .pipe(ut.poly_exclude)
    # repeat each particle according to fraction analysed
    .pipe(ut.particle_amplification)
    # geometric mean of particle sizes
    # TODO: do we want to calculate it based on two or three dimensions?
    .pipe(ut.geom_mean)
    # currently filters Size1 >= 50 µm
    # TODO: do we want to apply this here or only filter during analysis?
    .pipe(ut.size_filter)
    # condense shape and colour into few unambiguous categories
    .pipe(ut.shape_colour)
    # use the particle ID column as the index
    .pipe(ut.set_id_to_index)
    # separate environmental MP from IOW blinds and IPF blanks
    .pipe(ut.separate_MPs)
)

# Report which IOW blind samples ended up in the analysis.
print(f'Using {len(IOW_blind_MP.Sample.unique())} IOW Blind samples:  ')
print(IOW_blind_MP.Sample.unique())

## Removing particles due to lab blanks...

In [None]:
# Apply the lab-blank procedure to the combined `samples_MP` frame, using the
# particles from the IPF lab blanks.
# Presumably returns a reduced copy of the samples plus the list of particles
# that were eliminated — verify against MPDB_procedures.blank_procedure.
samples_MP_copy, IPF_elimination_list = blank_procedure(samples_MP, IPF_blank_MP)

## Some intermediate steps...
- The blank procedure was conducted on the combined `samples_MP` dataframe. The results are now ported to the separate `env_MP` and `IOW_blind_MP` dataframes.
- A synthesised blind particle dataframe `syn_blind` is generated for the blind procedure.

In [None]:
# Carry the blank-corrected result over to the separate blind and
# environmental dataframes.
# NOTE(review): this rebinds IOW_blind_MP (which is also an input), so
# re-running the cell feeds the already-updated frame back in — confirm
# that update_env_and_blind is safe to run more than once.
IOW_blind_MP, env_MP = update_env_and_blind(samples_MP_copy, IOW_blind_MP)

# Build the synthesised blind particle dataframe consumed by the blind procedure.
syn_blind = make_syn_blind(IOW_blind_MP)

## Removing particles due to procedural blinds...

In [None]:
# Apply the procedural-blind procedure to the environmental particles, using
# the synthesised blind dataframe.
# Presumably returns a reduced copy of env_MP plus the list of particles that
# were eliminated — verify against MPDB_procedures.blind_procedure.
env_MP_copy, IOW_elimination_list = blind_procedure(env_MP, syn_blind)

## Export final list of valid MP particles

In [None]:
# Write the remaining environmental particles to CSV. The path is relative to
# the notebook's working directory; with pandas' default `index=True` the
# dataframe index is written as the first column.
env_MP_copy.to_csv('../data/mp_pdd.csv')