In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
from definitions import ROOT_DIR
import seaborn as sns

In [2]:
def calculate_detected_intensities(df, threshold=0.8):
    '''
    Flag detected compounds and derive their effective intensity.

    Two columns are added to ``df`` in place and the same object is returned:

    - ``detectability``: True where ``pred_val >= threshold``.
    - ``effective_intensity``: ``spot_intensity_bgr_corrected`` for detected rows,
      with negative values raised to 0, and exactly 0 for rows that are not detected.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``pred_val`` and ``spot_intensity_bgr_corrected``.
    threshold : float, default 0.8
        Minimum prediction value for a row to be labelled as detected.

    Returns
    -------
    pandas.DataFrame
        The input frame with the two columns added.
    '''
    detected = df['pred_val'] >= threshold
    df['detectability'] = detected

    # Select with where() instead of multiplying by the boolean mask:
    # NaN * False is NaN, so a missing intensity on a non-detected row would
    # otherwise stay NaN instead of becoming the documented 0.
    df['effective_intensity'] = (
        df['spot_intensity_bgr_corrected'].where(detected, 0).clip(lower=0)
    )
    return df

def get_class_size(metadata, class_column):
    '''
    Add a ``class_size`` column holding how many rows share each row's class.

    The column is added to ``metadata`` in place and the same object is returned.

    Parameters
    ----------
    metadata : pandas.DataFrame
        Table containing ``class_column``.
    class_column : str
        Name of the column whose values define the classes.

    Returns
    -------
    pandas.DataFrame
        ``metadata`` with the ``class_size`` column added. Rows whose class is
        missing get NaN, because ``value_counts`` does not count missing values.
    '''
    sizes = metadata[class_column].value_counts()
    # map() looks every class up in one vectorised pass and yields NaN for
    # labels absent from `sizes` (i.e. missing classes) instead of raising KeyError.
    metadata['class_size'] = metadata[class_column].map(sizes)
    return metadata

In [3]:
# Data lives in "5_data_analysis", one level above ROOT_DIR
p_root_dir = Path(ROOT_DIR).parents[0]
p_data = p_root_dir.joinpath("5_data_analysis")

# Metrics and Catboost predictions for all ions in their target wells.
# The toy example file is loaded here; the alternative input in the same folder is
# "all_predictions_curated_11-Dec-2021.csv".
p_predictions = p_data.joinpath("toy_example_missing_data.csv")

# Dataset info (lab, matrix, polarity, m/z range, ids, etc.)
p_datasets = p_data.joinpath("Datasets_14Jul2022.csv")

# Classification tables: chemical classes and pathways
p_chem_class = p_data.joinpath("custom_classification_v2.csv")
p_pathways = p_data.joinpath("pathways_v2.csv")

In [5]:
# Load the predictions table, using the first CSV column as the index.
# NOTE: this read is the only active statement in the cell. The rest of the
# pipeline below (column selection, neutral-loss formatting, merge with dataset
# metadata, EMBL filter, detection filter) is commented out, so `df` and the
# filtered `data` frame are NOT created here. Cells that use `df` or `data`
# need them to be defined elsewhere (e.g. the later `data = predictions` cell).
predictions = pd.read_csv(p_predictions, index_col=0)
# Disabled: column subset originally taken from the predictions file
# [['dataset_id', 
#                                                        'name_short', 
#                                                        'adduct', 
#                                                        'neutral_loss', 
#                                                        'pred_val',
#                                                        'spot_intensity_bgr_corrected']]
# Disabled: replace missing neutral losses by an empty string
# predictions.neutral_loss.fillna('', inplace=True)

# # Add dataset metadata 
# datasets = pd.read_csv(p_datasets)
# datasets_info = datasets.groupby('Dataset ID').first()[['Polarity', 'Matrix short', 'Matrix long', 'Slide code', 'EMBL']]
# datasets_info['sample_name'] = datasets_info['Matrix short']

# # Merge with predictions
# df = pd.merge(predictions, datasets_info, left_on='dataset_id', right_on='Dataset ID', how='left')

# # Filter to keep only datasets chosen for plots about matrix comparison
# df = df[df.EMBL]

# # only consider data of detected ions
# df = calculate_detected_intensities(df, threshold=0.8)
# data = df[df.detectability]

In [None]:
# Load classification, add class size info
# Load the chemical and pathway classifications and attach class sizes.
# Every table is de-duplicated before counting, so each (metabolite, class) pair
# contributes exactly once to class_size. drop_duplicates() also hands
# get_class_size a fresh frame rather than a column slice of the loaded file,
# which matters because that function assigns a new column to its argument.
chem_classification_file = pd.read_csv(p_chem_class, index_col='internal_id')
chem_class = get_class_size(
    chem_classification_file[['name_short', 'main_coarse_class']].drop_duplicates(),
    'main_coarse_class')
chem_subclass = get_class_size(
    chem_classification_file[['name_short', 'fine_class']].drop_duplicates(),
    'fine_class')

pathway_classification_file = pd.read_csv(p_pathways, index_col='internal_id')
pathway_class = get_class_size(
    pathway_classification_file[['name_short', 'main_coarse_path']].drop_duplicates(),
    'main_coarse_path')
pathway_subclass = get_class_size(
    pathway_classification_file[['name_short', 'fine_path']].drop_duplicates(),
    'fine_path')

### Here you would also have relevant filtering steps

## Example for any axis values except class, subclass, pathway and pathway subclass

In [None]:
# Aggregate individual ions into one row per metabolite ('name_short'),
# per dataset ('dataset_id') and per axis-value combination.

# Example axes
X = 'adduct'
Y = 'Polarity'

# Aggregations are given by name ('sum', 'max') rather than as the Python
# builtins: recent pandas emits a FutureWarning for builtin callables and asks
# for the string form, which is also what the class-based example below uses.
step1 = data.pivot_table(index=['dataset_id', 'name_short', X, Y],
                         values=['effective_intensity', 'detectability'],
                         aggfunc={'effective_intensity': 'sum',
                                  'detectability': 'max'})

step1.head()

In [None]:
# Collapse to one row per dataset and (X, Y) combination:
#   detectability       -> fraction of all metabolites detected with this X, Y value
#   effective_intensity -> mean over the rows of the group
# (original note: there are 172 metabolites in total)

n_metabolites = df['name_short'].nunique()

step2 = (
    step1
    .groupby(['dataset_id', X, Y])
    .agg({
        'detectability': lambda detected: sum(detected) / n_metabolites,
        'effective_intensity': 'mean',
    })
)

step2.head()

In [None]:
# Last step: average the per-dataset results over all datasets

step3 = (
    step2
    .groupby([X, Y])
    .agg({'effective_intensity': 'mean', 'detectability': 'mean'})
    .rename(columns={'detectability': 'fraction_detected'})
)

# log10(intensity + 1): the +1 keeps a zero intensity at 0 on the log scale
step3['log10_intensity'] = np.log10(step3['effective_intensity'] + 1)
step3.head()

In [None]:
# Display the whole step3 table (the cell that builds it shows only .head())
step3

## Example for when you have class in axes

- chem_class.main_coarse_class are chemical classes
- chem_subclass.fine_class are chemical subclasses
- pathway_class.main_coarse_path is the main pathway
- pathway_subclass.fine_path is the pathway "subclass"

In [None]:
# Join the class table onto the prediction data via 'name_short'.
# how='right' keeps every row of chem_class, including metabolites that have
# no matching row in `data`.
data = pd.merge(data, chem_class, on='name_short', how='right')

In [None]:
# chem_class goes with 'main_coarse_class'
# chem_subclass goes with 'fine_class'
# pathway_class goes with 'main_coarse_path'
# pathway_subclass goes with 'fine_path'

In [6]:
# Use the loaded predictions table as `data` for the toy example.
# No copy is made: `data` and `predictions` are the same object, so any
# in-place change to one is visible through the other.
data = predictions

In [10]:
# Move the index into a regular column. Rebinding `data` to the returned frame
# (instead of inplace=True) leaves the object loaded as `predictions` untouched,
# so re-running `data = predictions` followed by this cell always starts clean.
data = data.reset_index()

In [25]:
# TOY
# Step 1: sum 'Intensity' over all rows that share the same dataset,
# axis values and metabolite ('name_short')

# Example axes: polarity and class
X = 'Polarity'
Y = 'class'

step1 = pd.pivot_table(
    data,
    index=['Dataset', X, Y, 'name_short'],
    values=['Intensity'],
    aggfunc={'Intensity': 'sum'},
    fill_value=0,
)

step1

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Intensity
Dataset,Polarity,class,name_short,Unnamed: 4_level_1
9AA+,pos,acids,acetate,50
9AA-,neg,acids,acetate,265
9AA-,neg,carbs,fructose,1
9AA-,neg,carbs,glucose,35
DHB+,pos,acids,acetate,18
DHB+,pos,carbs,fructose,100
DHB+,pos,carbs,glucose,125
DHB-,neg,acids,acetate,65
DHB-,neg,carbs,glucose,85


In [26]:
# TOY

# Step 2: aggregate per dataset and axis values (mean over metabolites)

step2 = pd.pivot_table(
    step1,
    index=['Dataset', X, Y],
    values=['Intensity'],
    aggfunc={
        'Intensity': 'mean',  # only when considering only 'detected' data
    },
    fill_value=0,
)

step2

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Intensity
Dataset,Polarity,class,Unnamed: 3_level_1
9AA+,pos,acids,50.0
9AA-,neg,acids,265.0
9AA-,neg,carbs,18.0
DHB+,pos,acids,18.0
DHB+,pos,carbs,112.5
DHB-,neg,acids,65.0
DHB-,neg,carbs,85.0


In [23]:
# TOY
# Step 3: average the per-dataset results over all datasets

step3 = step2.groupby([X, Y])[['Intensity']].mean()

# Not applied to the toy data:
# step3['log10_intensity'] = step3['effective_intensity'].apply(lambda x: np.log10(x+1))
step3.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Intensity
Polarity,class,Unnamed: 2_level_1
neg,acids,165.0
neg,carbs,51.5
pos,acids,34.0
pos,carbs,112.5


In [27]:
# Display the whole toy step3 table
step3

Unnamed: 0_level_0,Unnamed: 1_level_0,Intensity
Polarity,class,Unnamed: 2_level_1
neg,acids,165.0
neg,carbs,51.5
pos,acids,34.0
pos,carbs,112.5


In [None]:
# Step 1: aggregate per metabolite, dataset and axis values
#   effective_intensity -> sum
#   detectability       -> max
#   class_size          -> first value of the group

# Example axes: matrix and chemical class
X = 'Matrix short'
Y = 'main_coarse_class'

step1 = pd.pivot_table(
    data,
    index=['dataset_id', 'name_short', X, Y],
    values=['effective_intensity', 'detectability', 'class_size'],
    aggfunc={
        'effective_intensity': 'sum',
        'detectability': 'max',
        'class_size': 'first',
    },
    fill_value=0,
)

step1.head()

In [None]:
# Step 2: aggregate per dataset and axis values

step2 = pd.pivot_table(
    step1,
    index=['dataset_id', X, Y],
    values=['effective_intensity', 'detectability', 'class_size'],
    aggfunc={
        'class_size': 'first',
        'effective_intensity': 'mean',  # only when considering only 'detected' data
        'detectability': 'sum',
    },
    fill_value=0,
)

# Share of the class detected: summed detectability divided by the class size
step2['fraction_detected'] = step2['detectability'] / step2['class_size']
step2.head()

In [None]:
# Finally, average the per-dataset results over all datasets

step3 = step2.groupby([X, Y]).agg({
    'effective_intensity': 'mean',
    'fraction_detected': 'mean'})

# log10(intensity + 1), applied to the whole column in one numpy call instead
# of element by element through .apply; the +1 keeps a zero intensity at 0.
step3['log10_intensity'] = np.log10(step3['effective_intensity'] + 1)
step3.head()