# Calculate metrics for filtering

In [2]:
from pathlib import Path
import numpy as np
import pandas as pd
from definitions import ROOT_DIR

- Provide inputs

In [3]:
# --- METASPACE dataset selection ---
dataset_id = '2021-02-17_18h14m40s'
# NOTE(review): presumably a (database name, version) pair -- confirm.
database = ('Spotting_project_compounds-v9', 'feb2021')
fdr = 0.5

# --- File locations: the input report and the extended output share a folder ---
p_root_dir = Path(ROOT_DIR)
p_test = p_root_dir.joinpath("testing_generate_pixel_stats")
p_stats = p_test.joinpath(f"{dataset_id}_bigreport.csv")
p_out = p_test.joinpath(f"{dataset_id}_bigreport_extended.csv")

# --- Grid specs: well 2 followed by wells 10..189 ---
sample_wells = [2, *range(10, 190)]

- Load stats file
- Set [multiindex](https://jessicastringham.net/2019/12/10/multiindex/)
- [Filter multiindex](https://pandas.pydata.org/pandas-docs/stable/user_guide/advanced.html#sorting-a-multiindex)

In [4]:
# Read the per-well statistics report and index it by image
# (formula, adduct, neutral_loss) and well, sorted for MultiIndex slicing.
#
# Missing neutral_loss values are replaced by '' before the column becomes an
# index level. This is done through an explicit column assignment: calling
# `stats.neutral_loss.fillna('', inplace=True)` mutates an intermediate Series,
# which pandas reports as chained assignment and which does not update `stats`
# when Copy-on-Write is active.
index_levels = ['formula', 'adduct', 'neutral_loss', 'well']
stats = (
    pd.read_csv(p_stats)
    .assign(neutral_loss=lambda df: df['neutral_loss'].fillna(''))
    .set_index(index_levels)
    .sort_index()
)

In [5]:
# Preview the first five rows of the indexed report.
stats.head(n=5)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,dataset_id,sum,average,occupancy,stdev,msm,is_target
formula,adduct,neutral_loss,well,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
C10H12N2O,+H,,0,2021-02-17_18h14m40s,87568.94,3.04006,0.045131,257.70724,0.967008,False
C10H12N2O,+H,,2,2021-02-17_18h14m40s,0.0,0.0,0.0,0.0,0.967008,False
C10H12N2O,+H,,10,2021-02-17_18h14m40s,0.0,0.0,0.0,0.0,0.967008,False
C10H12N2O,+H,,11,2021-02-17_18h14m40s,0.0,0.0,0.0,0.0,0.967008,False
C10H12N2O,+H,,12,2021-02-17_18h14m40s,0.0,0.0,0.0,0.0,0.967008,False


### Identify empty images

- Calculate total occupancy per image by summing it over all 'wells' ('well' = 0 is off-sample region)
- Mark empty images in a new column 'is_empty'

In [6]:
# An image is one (formula, adduct, neutral_loss) combination; it counts as
# empty when its occupancy summed over all wells (well 0 included) does not
# exceed the threshold.
threshold_image_occupancy = 0
image_levels = ['formula', 'adduct', 'neutral_loss']

# One boolean per image, indexed by the three image levels.
is_empty_series = (
    stats.groupby(level=image_levels)['occupancy'].sum() <= threshold_image_occupancy
)

# Broadcast the per-image flag to every (image, well) row explicitly.
# The mask has three index levels while `stats` has four, so passing it
# straight to `.loc` relies on implicit alignment between indexes of
# different depth; reindexing on the image part of the row index does not.
stats['is_empty'] = is_empty_series.reindex(stats.index.droplevel('well')).to_numpy()

### Calculate metric ratios between spots and background
- Off-sample avg intensity is the avg intensity in 'well' 0
- Off-sample occupancy is the % of occupied pixels in 'well' 0
- Ratio of signal to background is (average intensity in well n) / (average intensity in well 0 + 1)
- Ratio of occupancy is (occupancy in well n) / (occupancy in well 0 + 1)
- Note: the code adds 1 to each denominator, presumably to avoid division by zero when the off-sample value is 0

In [7]:
# Off-sample reference values: the rows with well == 0.
# `xs` selects on the 'well' level only and drops it, so the result is indexed
# by (formula, adduct, neutral_loss). It replaces a 4-tuple `.loc` key without
# a column indexer, a form pandas documents as ambiguous between row and
# column selection.
off_sample_view = stats.xs(0, level='well')

# Image key of every row in `stats`, in row order.
image_keys = stats.index.droplevel('well')

# Keep the loud failure of a per-row lookup: every image needs a well-0 row.
if not image_keys.isin(off_sample_view.index).all():
    raise KeyError("no off-sample row (well == 0) for at least one image")

# Broadcast the per-image values back onto every row with one vectorised
# reindex instead of a Python-level `.loc` lookup per row.
# NB: despite its name, `off_sample_sum_int` holds the 'average' column.
off_sample_sum_int = off_sample_view['average']
stats['off_sample_avg_int'] = off_sample_sum_int.reindex(image_keys).to_numpy()

off_sample_occupancy = off_sample_view['occupancy']
stats['off_sample_occupancy'] = off_sample_occupancy.reindex(image_keys).to_numpy()

In [8]:
# Compare each well with the off-sample region (well 0) of the same image.
# Each denominator is shifted by a constant offset of 1.
# NOTE(review): the displayed occupancy values look like fractions, so an
# offset of 1 dominates the occupancy denominator -- confirm this is intended.
denominator_offset = 1
stats['on_off_ratio'] = stats['average'].div(
    stats['off_sample_avg_int'].add(denominator_offset)
)
stats['occupancy_ratio'] = stats['occupancy'].div(
    stats['off_sample_occupancy'].add(denominator_offset)
)

In [9]:
# Preview the first five rows, now including the derived columns.
stats.head(n=5)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,dataset_id,sum,average,occupancy,stdev,msm,is_target,is_empty,off_sample_avg_int,off_sample_occupancy,on_off_ratio,occupancy_ratio
formula,adduct,neutral_loss,well,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
C10H12N2O,+H,,0,2021-02-17_18h14m40s,87568.94,3.04006,0.045131,257.70724,0.967008,False,False,3.04006,0.045131,0.752479,0.043182
C10H12N2O,+H,,2,2021-02-17_18h14m40s,0.0,0.0,0.0,0.0,0.967008,False,False,3.04006,0.045131,0.0,0.0
C10H12N2O,+H,,10,2021-02-17_18h14m40s,0.0,0.0,0.0,0.0,0.967008,False,False,3.04006,0.045131,0.0,0.0
C10H12N2O,+H,,11,2021-02-17_18h14m40s,0.0,0.0,0.0,0.0,0.967008,False,False,3.04006,0.045131,0.0,0.0
C10H12N2O,+H,,12,2021-02-17_18h14m40s,0.0,0.0,0.0,0.0,0.967008,False,False,3.04006,0.045131,0.0,0.0


In [None]:
# Write the report, including the columns calculated above, to `p_out`.
# The index levels are first turned back into ordinary columns, so the CSV
# is written without a separate index column.
stats_flat = stats.reset_index()
stats_flat.to_csv(p_out, index=False)