# Notebook to generate pixel statistics from spotting datasets

In [1]:
%matplotlib inline

In [2]:
from pathlib import Path
import numpy as np
import pandas as pd
from metaspace.sm_annotation_utils import SMInstance
from getpass import getpass
import matplotlib.pyplot as plt
import itertools

In [14]:
from definitions import ROOT_DIR

In [15]:
# Display the project root imported from `definitions`, so the paths derived
# from it in the inputs cell can be sanity-checked.
ROOT_DIR

'd:\\saharuka\\spotting\\analysis_prototype'

- Provide inputs

In [17]:
# METASPACE
dataset_id = '2021-02-17_18h14m40s'
database = ('Spotting_project_compounds-v9', 'feb2021')  # presumably (name, version) - confirm against METASPACE
fdr = 0.5  # FDR threshold passed to both the image and the results queries

# Paths
p_root_dir = Path(ROOT_DIR)
p_test = p_root_dir / "testing_generate_pixel_stats"
# Join the path component by component: the previous single string
# "Masks\grid_masks\\..." contained the invalid escape "\g" and only formed
# a nested path on Windows.
p_grid = p_test / "Masks" / "grid_masks" / f"{dataset_id}.npy"
p_wellmap = p_test / "Molecules.csv"  # this is well-to-molecule mapping
p_out = p_test / f"{dataset_id}_bigreport.csv"

# Grid specs
# Well labels to keep in the grid mask: well 2 plus wells 10..189.
sample_wells = [2, *range(10, 190)]

- Load full grid
- Subset it to include only wells of interest

In [4]:
# Grid mask exported by the alignment tool; pixels are compared against well
# labels further down (`grid_mask == well`).
grid = np.load(p_grid)

# Zero out every pixel whose label is not one of the sample wells.
grid[np.isin(grid, sample_wells, invert=True)] = 0

# Sorted unique labels left in the mask. 0 is among them whenever at least one
# pixel is zero (including the pixels zeroed just above).
wells = np.unique(grid)

# test
# plt.imshow(grid,vmax=1) 
# plt.show()
# print(wells)

- Load well to molecule mapping

In [5]:
# Well-to-molecule mapping table. Later cells read its `well` and `formula`
# columns (`wellmap.well == well`, `wellmap.formula.dropna()`), so both must
# be present in the CSV.
wellmap = pd.read_csv(p_wellmap)

# test
# wellmap.head()

- Log in to METASPACE
- Load dataset

In [6]:
sm = SMInstance()
if sm.logged_in():
    print('Already logged in')
else:
    # Prompt with getpass so the API key is never echoed into the notebook
    # source or its saved output.
    print('Enter your API key from https://metaspace2020.eu/user/me')
    sm.login(api_key=getpass())

Enter your API key from https://metaspace2020.eu/user/me
········


In [7]:
# Handle to the METASPACE dataset selected by `dataset_id`; the cells below
# query it for annotation images, annotation results, adducts and config.
ds = sm.dataset(id = dataset_id)

- Make a dictionary whose keys are (formula, adduct, neutral loss) tuples and whose values are the corresponding first isotope images

In [8]:
# TODO:find out what scale intensity does: https://metaspace2020.readthedocs.io/en/latest/content/apireference/sm_annotation_utils.html?highlight=.all_annotation_images#metaspace.sm_annotation_utils.SMDataset.all_annotation_images
images = ds.all_annotation_images(
    fdr=fdr,
    database=database,
    only_first_isotope=False,  # True doesn't work as expected
    scale_intensity=True,
    hotspot_clipping=False,
)

# Re-key the image sets by (formula, adduct, neutral_loss) and keep only the
# first image of each set (element 0), i.e. the first isotope image.
images = {
    (img.formula, img.adduct, img.neutral_loss): img[0]
    for img in images
}

- Get results for the whole dataset

In [9]:
# Annotation results for the whole dataset at the chosen FDR, including
# neutral-loss annotations. Later cells look rows up by the tuple
# (formula, adduct, neutral_loss) in `results.index` and read the 'msm' column.
results = ds.results(fdr = fdr,
                     database = database,
                     include_neutral_losses = True)

# test
# results.loc[('C5H14N4',  '+H', ''), 'msm']

In [10]:
def statcrunch(well, formula, adduct, neutral_loss, images, grid_mask):
    """
    Measure one region of one ion image.

    The ion image is looked up in `images` under the key
    (formula, adduct, neutral_loss). The region is every pixel where
    `grid_mask` equals `well`.

    :param well: label that selects the region in `grid_mask`
    :param formula: first element of the image key
    :param adduct: second element of the image key
    :param neutral_loss: third element of the image key
    :param images: mapping from (formula, adduct, neutral_loss) to an image array
    :param grid_mask: label array used to index the image
    :returns: list [sum intensity, mean intensity, % occupancy, standard deviation];
              four zeros when no pixel of `grid_mask` equals `well`
    :raises KeyError: if the key is not present in `images`
    """
    ion_image = images[(formula, adduct, neutral_loss)]
    region = ion_image[grid_mask == well]

    # An empty region has no meaningful mean/std, so report plain zeros.
    if region.size == 0:
        return [0, 0, 0, 0]

    # Occupancy = share of region pixels with a non-zero intensity, in percent.
    occupied_pct = (np.count_nonzero(region) / region.size) * 100

    return [
        np.sum(region),
        np.average(region),
        occupied_pct,
        np.std(region),
    ]

# test
# statcrunch(120, 'C9H14N3O8P', "+H", '', images, grid)

[2800.6604, 75.69353, 54.054054054054056, 82.43422]

Generate a complete set of per-well pixel statistics

- For every (formula, adduct, neutral loss) triplet, calculate the following
        - sum and average intensity of each target molecule in each well and off sample
        - percentage of occupied (non-zero) pixels in each well and off sample
        - standard deviation of intensities within each well

- Compile into a table with one row per dataset-well-formula-adduct-neutral loss combination
          

In [11]:
# test
# import time
# start = time.time()

# Build one record per (formula, adduct, neutral loss, well) combination and
# assemble them into `statframe` in a single DataFrame construction.

formulas = wellmap.formula.dropna().unique()
adductlist = ds.adducts
# '' stands for "no neutral loss". The fallback is an empty list: falling back
# to [''] would yield ['', ''] and emit every no-loss row twice.
# dict.fromkeys drops repeated entries while keeping their order.
neutral_losses = list(dict.fromkeys(
    [''] + list(ds.config['isotope_generation'].get('neutral_losses') or [])
))

addframes = []

for formula in formulas:

    # Is this formula expected in this well? Depends only on (well, formula),
    # so compute it once per formula instead of once per adduct/neutral loss.
    # .any() (rather than .item()) also copes with wells that have no row in
    # the mapping (e.g. the background label 0) or more than one row.
    is_target_by_well = {
        well: bool((wellmap.loc[wellmap.well == well, 'formula'] == formula).any())
        for well in wells
    }

    for adduct, neutral_loss in itertools.product(adductlist, neutral_losses):

        # Annotation status and msm do not depend on the well: look them up once.
        ion_key = (formula, adduct, neutral_loss)
        is_annotated = ion_key in results.index
        msm = results.loc[ion_key, 'msm'] if is_annotated else 0.0

        for well in wells:

            if is_annotated:
                out_sum, out_ave, out_occ, out_std = statcrunch(
                    well, formula, adduct, neutral_loss, images, grid
                )
            else:
                # Not annotated: numeric zeros (not the strings '0') so the
                # resulting columns keep a numeric dtype.
                out_sum = out_ave = out_occ = out_std = 0.0

            addframes.append({
                'dataset_id': dataset_id,
                'well': well,
                'formula': formula,
                'adduct': adduct,
                'neutral_loss': neutral_loss,
                'sum': out_sum,
                'average': out_ave,
                'occupancy': out_occ,
                'stdev': out_std,
                'msm': msm,
                'is_target': is_target_by_well[well],
            })

statframe = pd.DataFrame(addframes)

# test
# end = time.time()
# print(end - start)

80.26296019554138


In [12]:
# Write the per-well statistics table to `p_out`. With pandas' default
# settings the DataFrame index is written too, as an unnamed first column.
statframe.to_csv(p_out)