In [86]:
import pandas as pd
import numpy as np
from pathlib import Path
from definitions import ROOT_DIR
import seaborn as sns

In [87]:
def calculate_detected_intensities(df, threshold=0.8):
    '''
    Flag detected ions and compute their effective intensity.

    Parameters
    ----------
    df : DataFrame with the columns `pred_val` and `spot_intensity_bgr_corrected`
    threshold : ions with `pred_val` at or above this value count as detected (default 0.8)

    Returns
    -------
    A new DataFrame (the input is left untouched) with two extra columns:
    - `detectability`: boolean, True where pred_val >= threshold
    - `effective_intensity`: the background corrected intensity for detected ions,
      0 for ions that are not detected, with negative values changed to zero
    '''

    detected = df['pred_val'] >= threshold
    # Multiplying by the boolean mask zeroes the intensity of every non-detected ion
    masked_intensity = df['spot_intensity_bgr_corrected'] * detected
    # assign() builds a new frame, so the caller's DataFrame is never modified in place
    return df.assign(detectability=detected,
                     effective_intensity=masked_intensity.clip(lower=0))

In [88]:
# Parent folder of ROOT_DIR; the analysis inputs live in its "5_data_analysis" sub-folder
p_root_dir = Path(ROOT_DIR).parents[0]
p_data = p_root_dir.joinpath("5_data_analysis")

# Per-ion metrics and Catboost predictions (each ion in its target well)
p_predictions = p_data.joinpath("all_predictions_curated_11-Dec-2021.csv")

# Dataset information: lab, matrix, polarity, m/z range, ids, ...
p_datasets = p_data.joinpath("Datasets_1Jul2022.csv")

# Metabolite classification tables (chemical classes and pathways)
p_chem_class = p_data.joinpath("custom_classification_v2.csv")
p_pathways = p_data.joinpath("pathways_v2.csv")

In [89]:
# Load predictions and keep only the columns used in this notebook
predictions = pd.read_csv(p_predictions, index_col=0)[['dataset_id',
                                                       'name_short',
                                                       'adduct',
                                                       'neutral_loss',
                                                       'pred_val',
                                                       'spot_intensity_bgr_corrected']]

# Format the neutral loss column: a missing neutral loss becomes an empty string.
# The column is reassigned explicitly because an in-place fillna on `predictions.neutral_loss`
# acts on an intermediate Series and does not update the frame under pandas Copy-on-Write.
predictions = predictions.assign(neutral_loss=predictions['neutral_loss'].fillna(''))

In [90]:
# Chemical classification: the main class per metabolite, plus the subclass table
# (coarse_class is kept next to fine_class so subclasses can be sorted by it if wanted)
classes1 = pd.read_csv(p_chem_class, index_col='internal_id')
chem_class = classes1[['name_short', 'main_coarse_class']].drop_duplicates()
chem_subclass = classes1[['name_short', 'coarse_class', 'fine_class']]

# Pathway classification, laid out the same way
classes2 = pd.read_csv(p_pathways, index_col='internal_id')
pathways = classes2[['name_short', 'main_coarse_path']].drop_duplicates()
pathway_subclass = classes2[['name_short', 'coarse_path', 'fine_path']]

# Reduce the Datasets file to one row per 'Dataset ID' with the most relevant columns;
# sample_name duplicates the short matrix name
datasets = pd.read_csv(p_datasets)
datasets_info = datasets.groupby('Dataset ID').first()[
    ['Polarity', 'Matrix short', 'Matrix long', 'Slide code', 'EMBL']
]
datasets_info = datasets_info.assign(sample_name=datasets_info['Matrix short'])

# Attach the dataset metadata to every prediction row (left join keeps all predictions)
df = predictions.merge(datasets_info, left_on='dataset_id', right_on='Dataset ID', how='left')

In [91]:
# Keep only the rows whose EMBL flag is set, i.e. the datasets chosen for the matrix comparison plots
df = df.loc[df['EMBL']]

In [92]:
# Flag detected ions (pred_val >= 0.8), then continue with the detected ions only
df = calculate_detected_intensities(df, threshold=0.8)
data = df.loc[df['detectability']]

### Here you would also have relevant filtering steps

## Example for any axis values except class, subclass, pathway and pathway subclass

In [93]:
# Aggregate data from individual ions per metabolite ('name_short'), per dataset ('dataset_id') and axis values

# for example for these axes
X='adduct'
Y='Matrix short'

# Aggregations are given as strings ('sum', 'max'): passing the Python built-ins as
# aggregation callables is deprecated in recent pandas and emits a FutureWarning.
step1 = data.pivot_table(index=['dataset_id', 'name_short', X, Y],
            values=['effective_intensity', 'detectability'],
            aggfunc=
                {'effective_intensity':'sum',   # total intensity over all ions of the metabolite
                'detectability':'max'})         # True if at least one ion was detected

step1.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,detectability,effective_intensity
dataset_id,name_short,adduct,Matrix short,Unnamed: 4_level_1,Unnamed: 5_level_1
2021-06-18_10h37m54s,2-Oxoglutaric acid,+Cl,9AA,True,151.03702
2021-06-18_10h37m54s,2-Oxoglutaric acid,-H,9AA,True,65553.20693
2021-06-18_10h37m54s,3-Hydroxyanthranilic acid,+Cl,9AA,True,41.60611
2021-06-18_10h37m54s,3-Hydroxyanthranilic acid,-H,9AA,True,5885.24473
2021-06-18_10h37m54s,3-Hydroxyanthranilic acid,[M]-,9AA,True,2547.082945


In [94]:
# Aggregate data per dataset and axis values
# Calculate what fraction of metabolites in this dataset were detected with a given X, Y axis value
# There are 172 metabolites in total

n_metabolites = df.name_short.nunique()
step2 = step1.groupby(['dataset_id', X, Y]).agg({
                                                'effective_intensity':'mean',
                                                # vectorised Series.sum() instead of the Python built-in,
                                                # which iterates over each group element by element
                                                'detectability': lambda x: x.sum()/n_metabolites})


step2.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,effective_intensity,detectability
dataset_id,adduct,Matrix short,Unnamed: 3_level_1,Unnamed: 4_level_1
2021-06-18_10h37m54s,+Cl,9AA,752.467978,0.360465
2021-06-18_10h37m54s,-H,9AA,98242.664881,0.77907
2021-06-18_10h37m54s,[M]-,9AA,7726.594086,0.186047
2021-06-18_10h41m59s,+Cl,CHCA,80.200649,0.02907
2021-06-18_10h41m59s,-H,CHCA,11340.135429,0.523256


In [95]:
# Last step: average the per-dataset results for every (X, Y) combination.
# The averaged detectability is reported as 'fraction_detected', and the
# averaged intensity is additionally given on a log10(x + 1) scale.

step3 = (
    step2
    .groupby([X, Y])
    .agg({'effective_intensity': 'mean', 'detectability': 'mean'})
    .rename(columns={'detectability': 'fraction_detected'})
    .assign(log10_intensity=lambda averaged: np.log10(averaged['effective_intensity'] + 1))
)
step3.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,effective_intensity,fraction_detected,log10_intensity
adduct,Matrix short,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
+Cl,9AA,752.467978,0.360465,2.877065
+Cl,CHCA,80.200649,0.02907,1.90956
+Cl,CMBT,97.961136,0.081395,1.995465
+Cl,ClCCA,15.033169,0.023256,1.205019
+Cl,DAN,411.088459,0.215116,2.61499


## Example for when you have class in axes

- chem_class.main_coarse_class are chemical classes
- chem_subclass.fine_class are chemical subclasses
- pathways.main_coarse_path is the main pathway
- pathway_subclass.fine_path is the pathway "subclass"

In [96]:
# Here I use an example of fine chemical class
# I calculate the size of each class first

# .copy() gives an independent frame, so adding the column below does not write to a slice of chem_subclass
chem = chem_subclass[chem_subclass.coarse_class != 'Thermometers'].copy()
# Class sizes are counted on the full chem_subclass table (before the 'Thermometers' rows are removed)
sizes = chem_subclass.fine_class.value_counts()
# Look up the size of every row's fine class
chem['class_size'] = chem.fine_class.map(sizes)

In [97]:
# for coarse class I would use chem_class[chem_class.main_coarse_class != 'Thermometers']... etc.

In [98]:
# Attach the class information to the detected-ion table.
# A right join keeps every row of `chem`; class rows without a matching ion get NaN in the ion columns.
# Note: `data` is overwritten here, so running this cell a second time merges `chem` in again.
data = pd.merge(data, chem, on='name_short', how='right')

In [99]:
# Step 1: collapse individual ions to one row per metabolite, dataset and axis values

# Matrix and fine chemical class serve as the example axes
X = 'Matrix short'
Y = 'fine_class'

step1 = pd.pivot_table(
    data,
    index=['dataset_id', 'name_short', X, Y],
    values=['effective_intensity', 'detectability', 'class_size'],
    aggfunc={
        'effective_intensity': 'sum',   # total intensity over the metabolite's ions
        'detectability': 'max',         # detected if any of its ions is detected
        'class_size': 'first',          # carried along from the class table
    },
    fill_value=0,
)

step1.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,class_size,detectability,effective_intensity
dataset_id,name_short,Matrix short,fine_class,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2021-06-18_10h37m54s,2-Oxoglutaric acid,9AA,Keto acid,4,True,65704.24395
2021-06-18_10h37m54s,3-Hydroxyanthranilic acid,9AA,Acidic amino acids,8,True,8473.933785
2021-06-18_10h37m54s,3-Phosphoglyceric acid,9AA,Carboxylic acid phosphate,3,True,164535.04748
2021-06-18_10h37m54s,"4,5-Dihydroorotic acid",9AA,Nucleobases and analogs,10,True,110215.13376
2021-06-18_10h37m54s,4-Hydroxyproline,9AA,Polar amino acids,10,True,2378.876126


In [100]:
# Step 2: aggregate per dataset and axis values

step2 = pd.pivot_table(
    step1,
    index=['dataset_id', X, Y],
    values=['effective_intensity', 'detectability', 'class_size'],
    aggfunc={
        'class_size': 'first',
        'effective_intensity': 'mean',  # only when considering only 'detected' data
        'detectability': 'sum',         # number of detected metabolites in the class
    },
    fill_value=0,
)

# Detected metabolites divided by the size of their class
step2['fraction_detected'] = step2['detectability'] / step2['class_size']
step2.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,class_size,detectability,effective_intensity,fraction_detected
dataset_id,Matrix short,fine_class,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2021-06-18_10h37m54s,9AA,Acidic amino acids,8,7,65789.609979,0.875
2021-06-18_10h37m54s,9AA,Arginine derivatives (guanidines),5,4,3334.858593,0.8
2021-06-18_10h37m54s,9AA,Aromatic acids,3,3,27943.51053,1.0
2021-06-18_10h37m54s,9AA,Aromatic amino acids,6,6,654.876302,1.0
2021-06-18_10h37m54s,9AA,Carbohydrate amines,2,2,2873.306163,1.0


In [101]:
# Last step: average the per-dataset results for every (X, Y) combination

step3 = step2.groupby([X, Y]).agg({'effective_intensity': 'mean',
                                   'fraction_detected': 'mean'})

# Averaged intensity on a log10(x + 1) scale, computed on the whole column at once
step3['log10_intensity'] = np.log10(step3['effective_intensity'] + 1)
step3.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,effective_intensity,fraction_detected,log10_intensity
Matrix short,fine_class,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
9AA,Acidic amino acids,34563.441418,0.6875,4.53863
9AA,Arginine derivatives (guanidines),5204.756144,0.9,3.716484
9AA,Aromatic acids,14517.407365,0.666667,4.161919
9AA,Aromatic amino acids,398.639074,0.75,2.601668
9AA,Carbohydrate amines,1776.816771,0.75,3.249887
