# Calculate metrics for filtering

In [2]:
from pathlib import Path
import numpy as np
import pandas as pd
from definitions import ROOT_DIR

- Provide inputs

In [11]:
# Paths: every folder used by this notebook is resolved relative to the project root.
p_root_dir = Path(ROOT_DIR)
# Folder that is searched for the input "*bigreport.csv" files.
p_stats = p_root_dir.joinpath("3_metric_extraction")
# Folder that receives the "*_extended.csv" files.
p_analysis = p_root_dir.joinpath("4_metric_extension")

- Load stats file
- Set [multiindex](https://jessicastringham.net/2019/12/10/multiindex/)
- [Filter multiindex](https://pandas.pydata.org/pandas-docs/stable/user_guide/advanced.html#sorting-a-multiindex)

### Identify empty images

- Calculate total occupancy per image by summing it over all 'wells' ('well' = 0 is off-sample region)
- Mark empty images in a new column 'is_empty'

### Calculate metric ratios between spots and background
- Off-sample average intensity is the average intensity in 'well' 0
- Off sample occupancy is % of occupied pixels in 'well' 0
- Ratio of signal to background is (average intensity in well n) / (average intensity in well 0 + 1)
- Ratio of occupancy is (occupancy in well n) / (occupancy in well 0 + 1)

In [14]:
# Extend every "*bigreport.csv" found under p_stats with per-image metrics and
# write the result to p_analysis as "<original stem>_extended.csv".
#
# Columns added to each report:
#   is_empty             - True for every row of an image whose occupancy, summed
#                          over all wells (well 0 included), is <= the threshold
#   off_sample_avg_int   - 'average' of the image's well-0 row
#   off_sample_occupancy - 'occupancy' of the image's well-0 row
#   on_off_ratio         - average / (off_sample_avg_int + 1)
#   occupancy_ratio      - occupancy / (off_sample_occupancy + 1)

# Images with a total occupancy at or below this value are flagged as empty.
threshold_image_occupancy = 0

# Index levels that together identify one image; 'well' is the fourth level.
image_levels = ['formula', 'adduct', 'neutral_loss']

# Create the output folder up front so the first to_csv() cannot fail on a
# fresh checkout where it does not exist yet.
p_analysis.mkdir(parents=True, exist_ok=True)

for i in p_stats.rglob("*bigreport.csv"):

    fname = i.stem
    p_out = p_analysis / f"{fname}_extended.csv"

    ## Load pixel stats report
    stats = pd.read_csv(i)
    # Missing neutral losses become '' because groupby drops NaN keys by default.
    # The filled column is assigned back: calling fillna(inplace=True) on the
    # column accessor acts on a temporary object and is not guaranteed to reach
    # the frame (it is a no-op under pandas Copy-on-Write).
    stats['neutral_loss'] = stats['neutral_loss'].fillna('')
    stats = stats.set_index(image_levels + ['well']).sort_index()

    ## Identify empty images
    # transform('sum') broadcasts each image's total back onto its own rows, so
    # the comparison is already aligned with `stats` and no cross-index boolean
    # masking is needed.
    total_occupancy = stats.groupby(level=image_levels)['occupancy'].transform('sum')
    stats['is_empty'] = total_occupancy <= threshold_image_occupancy

    ## Calculate additional metrics
    # Rows with well == 0, re-indexed by image only (the 'well' level is dropped).
    is_off_sample = stats.index.get_level_values('well') == 0
    off_sample_view = stats.loc[is_off_sample].droplevel('well')

    # Image key of every row; used to look up the matching well-0 values in one
    # vectorized reindex instead of one .loc call per row. An image without a
    # well-0 row gets NaN; more than one well-0 row per image raises.
    image_keys = stats.index.droplevel('well')
    stats['off_sample_avg_int'] = off_sample_view['average'].reindex(image_keys).to_numpy()
    stats['off_sample_occupancy'] = off_sample_view['occupancy'].reindex(image_keys).to_numpy()

    # Calculate both ratios
    # 1 is added to each denominator, so a ratio stays finite when the
    # off-sample value is 0.
    stats['on_off_ratio'] = stats['average'] / (stats['off_sample_avg_int'] + 1)
    stats['occupancy_ratio'] = stats['occupancy'] / (stats['off_sample_occupancy'] + 1)

    # Save new spreadsheet with additional columns that were calculated
    stats.reset_index().to_csv(p_out, index=False)

126.95870637893677
165.43778014183044
124.0313811302185
165.90669989585876
124.82832670211792
166.73460817337036
136.39005756378174
177.09295892715454
123.98486638069153
154.54887175559998
124.35954761505127
167.45342707633972
126.9845860004425
168.51591229438782
137.3595907688141
181.58058261871338
125.21700501441956
165.25028443336487
115.1875159740448
153.47208738327026


In [13]:
# stats.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,dataset_id,sum,average,occupancy,stdev,msm,is_target,is_empty,off_sample_avg_int,off_sample_occupancy,on_off_ratio,occupancy_ratio
formula,adduct,neutral_loss,well,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
C10H12N2O,+Cl,,0,2021-05-07_19h40m30s,8554.455,0.357284,0.242242,7.331347,0.0,False,False,0.357284,0.242242,0.263235,0.195004
C10H12N2O,+Cl,,2,2021-05-07_19h40m30s,0.0,0.0,0.0,0.0,0.0,False,False,0.357284,0.242242,0.0,0.0
C10H12N2O,+Cl,,10,2021-05-07_19h40m30s,0.0,0.0,0.0,0.0,0.0,False,False,0.357284,0.242242,0.0,0.0
C10H12N2O,+Cl,,11,2021-05-07_19h40m30s,121.67274,1.763373,1.449275,14.541146,0.0,False,False,0.357284,0.242242,1.299192,1.166661
C10H12N2O,+Cl,,12,2021-05-07_19h40m30s,0.0,0.0,0.0,0.0,0.0,False,False,0.357284,0.242242,0.0,0.0
