# Notebook to generate pixel statistics from spotting datasets

In [1]:
%matplotlib inline

In [2]:
from pathlib import Path
import numpy as np
import pandas as pd
from metaspace.sm_annotation_utils import SMInstance
from getpass import getpass
import matplotlib.pyplot as plt
import itertools

In [3]:
from definitions import ROOT_DIR

In [4]:
def statcrunch(well, formula, adduct, neutral_loss, images, grid_mask):
    """
    Measure one ion image inside one region of a grid mask.

    The ion image is looked up in ``images`` under the key
    ``(formula, adduct, neutral_loss)``. The region consists of every pixel
    where ``grid_mask`` equals ``well``.

    :param well: label value that selects the region in ``grid_mask``
    :param formula: first element of the image key
    :param adduct: second element of the image key
    :param neutral_loss: third element of the image key
    :param images: mapping from ``(formula, adduct, neutral_loss)`` to an image array
    :param grid_mask: array of labels, compared element-wise against ``well``
    :returns: list ``[sum, mean, % occupancy, standard deviation]`` of the
        pixel intensities in the region; four zeros if the region is empty
    """
    in_region = grid_mask == well
    region = images[(formula, adduct, neutral_loss)][in_region]

    # No pixel carries this label: report zeros rather than computing
    # statistics on an empty array.
    if region.size == 0:
        return [0, 0, 0, 0]

    # Share of region pixels with a non-zero intensity, in percent
    occupied_pct = (np.count_nonzero(region) / region.size) * 100

    return [region.sum(), region.mean(), occupied_pct, region.std()]

- Provide inputs

In [9]:
# METASPACE
# Passed as the `database` and `fdr` arguments to the METASPACE queries below
database = ('Spotting_project_compounds-v9', 'feb2021')
fdr = 0.5

# Paths
p_root_dir = Path(ROOT_DIR)
p_analysis = p_root_dir / "3_metric_extraction"
# Join the two segments with `/` instead of embedding a backslash in one
# string: a backslash is only a separator on Windows, so on other platforms
# the old form named a single (non-existent) directory.
p_grids = p_root_dir / "2_grid_calibration" / "grid_masks"
p_wellmap = p_analysis / "Molecules.csv"  # this is well-to-molecule mapping

# Grid specs
# Wells kept in the grid masks; every other label is reset to 0 before analysis
sample_wells = [2] + list(range(10, 190))

- Load well to molecule mapping

In [10]:
# Well-to-molecule mapping table
wellmap = pd.read_csv(p_wellmap)

# Fail early with a clear message if the columns used further down
# (`well` and `formula`) are not present in the file.
missing_columns = {"well", "formula"} - set(wellmap.columns)
if missing_columns:
    raise ValueError(
        f"{p_wellmap} is missing required column(s): {sorted(missing_columns)}"
    )

# test
wellmap.head()

Unnamed: 0,well,name,formula,mass
0,0,Off-sample region,,
1,2,1-palmitoyl-2-oleoyl-sn-glycero-3-phospho-(1'-...,C40H77O10P,748.525437
2,10,Indole,C8H7N,117.057849
3,11,Prostaglandin E1,C20H34O5,354.240625
4,12,1-palmitoyl-2-oleoyl-sn-glycero-3-phosphate (s...,C37H71O8P,674.488657


- Log in to METASPACE
- Load dataset

In [11]:
# Connect to METASPACE and ask for an API key only when no session exists yet.
# NOTE(review): the client targets the staging host while the prompt points at
# the production profile page - confirm which site the key must come from.
sm = SMInstance(host='https://staging.metaspace2020.eu')
if sm.logged_in():
    print('Already logged in')
else:
    print('Enter your API key from https://metaspace2020.eu/user/me')
    # getpass keeps the key out of the notebook source and output
    sm.login(api_key=getpass())

Enter your API key from https://metaspace2020.eu/user/me
········


- Load full grid
- Subset it to include only wells of interest

- Make a dictionary, where (formula, adduct, neutral loss) is a key and first isotope image is a value
- Get results for the whole dataset

Generate a complete set of pixel-by-pixel info

- For every formula-adduct-neutral loss combination in every dataset, calculate the following
        - sum and average intensity of each target molecule in each well and off sample
        - number of occupied pixels in each well and off sample
        - standard deviation of intensities within each well
        
- Compile into one table per dataset (saved as CSV), keyed by dataset-well-formula-adduct-neutral loss
          

In [13]:
# Every distinct formula listed in the well map (rows without a formula are skipped)
formulas = wellmap.formula.dropna().unique()

# One report per grid mask file found under p_grids
for p_grid in p_grids.rglob("*.npy"):

    # load grid
    grid = np.load(p_grid)  # from alignment tool
    # Remove unused wells from gridmask: anything not in sample_wells becomes 0
    grid[~np.isin(grid, sample_wells)] = 0
    wells = np.unique(grid)

    fname = p_grid.stem
    # The last 20 characters of the file stem are used as the dataset id
    # (presumably the stem ends with the METASPACE id - confirm naming scheme)
    dataset_id = fname[-20:]
    p_out = p_analysis / f"{fname}_bigreport.csv"

    ds = sm.dataset(id=dataset_id)

    # make a dict out of all images:
    # (formula, adduct, neutral_loss) -> first image of each annotation
    images = ds.all_annotation_images(fdr=fdr,
                                      database=database,
                                      only_first_isotope=False,
                                      scale_intensity=True,
                                      hotspot_clipping=False)
    images = {(img.formula, img.adduct, img.neutral_loss): img[0] for img in images}

    # get results
    results = ds.results(fdr=fdr,
                         database=database,
                         include_neutral_losses=True)

    # calculate stats
    adductlist = ds.adducts
    # '' stands for "no neutral loss". The previous default of [''] produced
    # ['', ''] when the config had no neutral losses, which duplicated every
    # row; fall back to an empty list and drop repeats while keeping order.
    configured_losses = ds.config['isotope_generation'].get('neutral_losses') or []
    neutral_losses = list(dict.fromkeys([''] + list(configured_losses)))

    # Formula expected in each well, looked up once per grid instead of once
    # per (formula, adduct, neutral loss, well). `.item()` still raises if a
    # well does not match exactly one row of the well map.
    expected_formula = {
        well: wellmap.loc[wellmap.well == well, 'formula'].item()
        for well in wells
    }

    addframes = []
    for formula, adduct, neutral_loss in itertools.product(formulas, adductlist, neutral_losses):
        # Annotation status and MSM do not depend on the well, so resolve
        # them once per combination.
        ion_key = (formula, adduct, neutral_loss)
        annotated = ion_key in results.index
        # NOTE(review): the placeholders for unannotated ions are kept as the
        # strings '0' so the CSV text is unchanged; they make these columns
        # object-dtype in `statframe`.
        msm = results.loc[ion_key, 'msm'] if annotated else '0'

        for well in wells:
            if annotated:
                stat_sum, stat_ave, stat_occ, stat_std = statcrunch(
                    well, formula, adduct, neutral_loss, images, grid)
            else:
                stat_sum = stat_ave = stat_occ = stat_std = '0'

            addframes.append({
                'dataset_id': dataset_id,
                'well': well,
                'formula': formula,
                'adduct': adduct,
                'neutral_loss': neutral_loss,
                'sum': stat_sum,
                'average': stat_ave,
                'occupancy': stat_occ,
                'stdev': stat_std,
                'msm': msm,
                # Is this formula expected in this well?
                'is_target': expected_formula[well] == formula,
            })

    # Build the frame once from the collected records and write the report
    statframe = pd.DataFrame(addframes)
    statframe.to_csv(p_out, index=False)

1193.8513717651367
1547.864743232727
1166.015422821045
1580.275161266327
1187.8191714286804
1717.570696592331
1397.6427297592163
1755.7991993427277
1288.6113345623016
1799.9397172927856
1220.7362859249115
1705.0644536018372
1233.8087327480316
1616.553698539734
1475.3150708675385
1941.393452167511
1316.0111668109894
1680.6537399291992
1104.5182552337646
1501.3794050216675
