Import libraries

In [None]:
import os 
import collections
import numpy as np
from tqdm import tqdm
from msbuddy import Msbuddy
from msbuddy.base import MetaFeature, Spectrum

from utils import load_pickle
import matplotlib.pyplot as plt

# from matchms import Spectrum
from matchms import calculate_scores
from matchms.similarity import CosineGreedy

Helper Functions

In [24]:
def group_molecules_by_experimental_conditions(data_list):
    """Bucket spectra by molecule and acquisition settings.

    Every item is filed in a four-level nested dictionary keyed, in order, by
    the first 14 characters of ``metadata["inchikey"]`` (the original author
    notes this keeps only the 2D information), then
    ``metadata["instrument_type"]``, ``metadata["adduct"]`` and
    ``metadata["collision_energy"]``.

    Parameters
    ----------
    data_list : iterable
        Objects exposing a ``metadata`` mapping that contains the four keys
        listed above.

    Returns
    -------
    dict
        ``{inchikey[:14]: {instrument: {adduct: {energy: [item, ...]}}}}``.
        Items keep their input order inside each innermost list.
    """
    grouped = {}

    for spectrum in tqdm(data_list):
        inchikey_2d = spectrum.metadata["inchikey"][:14]
        instrument_type = spectrum.metadata["instrument_type"]
        adduct_type = spectrum.metadata["adduct"]
        collision_energy = spectrum.metadata["collision_energy"]

        # setdefault creates each missing level on the way down, so the
        # innermost list always exists by the time we append to it.
        bucket = (
            grouped
            .setdefault(inchikey_2d, {})
            .setdefault(instrument_type, {})
            .setdefault(adduct_type, {})
            .setdefault(collision_energy, [])
        )
        bucket.append(spectrum)

    return grouped

Load in the data

In [3]:
# --- Settings ---------------------------------------------------------------
cache_folder = "./cache"        # folder the two .pkl inputs are read from
output_folder = "./results"     # folder the figure is saved into further down
greedy_cs = CosineGreedy()      # similarity function passed to calculate_scores below

# --- Input paths ------------------------------------------------------------
massspecgym_list_path = os.path.join(cache_folder, "massspecgym_list.pkl")
NIST2023_list_path = os.path.join(cache_folder, "NIST2023_list.pkl")

# --- Load in the data -------------------------------------------------------
# load_pickle comes from the project's utils module; both results are iterated
# as collections of spectra by the grouping step.
massspecgym_list = load_pickle(massspecgym_list_path)
NIST2023_list = load_pickle(NIST2023_list_path)

Group data by experimental conditions

In [25]:
# Nest each dataset as inchikey[:14] -> instrument_type -> adduct -> collision_energy -> [spectra].
# The tqdm output recorded for this cell shows roughly 50 s for MassSpecGym and
# about 9.5 min for NIST2023, so expect it to be slow on a fresh run.
massspecgym_grouped = group_molecules_by_experimental_conditions(massspecgym_list)
NIST2023_grouped = group_molecules_by_experimental_conditions(NIST2023_list)

100%|██████████| 121746/121746 [00:50<00:00, 2395.70it/s]
100%|██████████| 1344134/1344134 [09:34<00:00, 2338.50it/s]


Get pairwise similarity between LC-MS/MS spectra acquired under the same experimental conditions

In [51]:
def summarize_replicate_similarity(grouped, similarity):
    """Score every group of spectra that share molecule and conditions.

    Parameters
    ----------
    grouped : dict
        Four-level nested mapping
        ``{inchikey: {instrument: {adduct: {energy: [spectrum, ...]}}}}``.
    similarity
        Similarity function handed to ``calculate_scores``.

    Returns
    -------
    tuple
        ``(group_scores, group_sizes, n_repeats, n_combi)`` where

        * ``group_scores`` holds one value per group with more than one
          spectrum (see the NOTE below for how it is computed),
        * ``group_sizes`` holds the size of every group, singletons included,
        * ``n_repeats`` counts the groups with more than one spectrum,
        * ``n_combi`` counts all groups.
    """
    group_scores = []
    group_sizes = []
    n_repeats = 0
    n_combi = 0

    for by_instrument in tqdm(grouped.values()):
        for by_adduct in by_instrument.values():
            for by_energy in by_adduct.values():
                for spectra in by_energy.values():

                    n_combi += 1
                    group_sizes.append(len(spectra))
                    if len(spectra) <= 1:
                        continue  # nothing to compare a single spectrum against

                    n_repeats += 1
                    score_matrix = calculate_scores(
                        spectra, spectra, similarity, is_symmetric=True
                    ).to_array()

                    # Each cell is indexable; element 0 is taken as the score
                    # (presumably the cosine value, with the match count after it).
                    # Only row 0 is ever used, so only row 0 is unpacked.
                    first_row_scores = np.vectorize(lambda cell: cell[0])(score_matrix[0])

                    # NOTE(review): despite the "*_all_min_scores" names used by
                    # the callers, this is the MEAN of row 0 only, i.e. the first
                    # spectrum compared against every spectrum in the group,
                    # presumably including itself. It is neither a minimum nor an
                    # all-pairs statistic. Kept as-is so the reported numbers do
                    # not change; confirm the intended metric.
                    group_scores.append(np.mean(first_row_scores))

    return group_scores, group_sizes, n_repeats, n_combi


(
    massspecgym_all_min_scores,
    massspecgym_group_size,
    massspecgym_n_repeats,
    massspecgym_n_combi,
) = summarize_replicate_similarity(massspecgym_grouped, greedy_cs)

(
    NIST2023_all_min_scores,
    NIST2023_group_size,
    NIST2023_n_repeats,
    NIST2023_n_combi,
) = summarize_replicate_similarity(NIST2023_grouped, greedy_cs)

100%|██████████| 17268/17268 [00:12<00:00, 1424.36it/s] 
100%|██████████| 39513/39513 [00:04<00:00, 8989.56it/s] 


Get the percentage of groups (and of spectra) with repeated measurements

In [54]:
# Percentage of (molecule, instrument, adduct, energy) groups that hold more
# than one spectrum: MassSpecGym first, then NIST2023.
for n_repeats, n_combi in (
    (massspecgym_n_repeats, massspecgym_n_combi),
    (NIST2023_n_repeats, NIST2023_n_combi),
):
    print(n_repeats / n_combi * 100)

31.329374143444493
0.8810642203765169


In [56]:
# Percentage of spectra that sit in a group with more than one member:
# MassSpecGym first, then NIST2023.
for group_sizes in (massspecgym_group_size, NIST2023_group_size):
    n_in_repeated_groups = sum(size for size in group_sizes if size > 1)
    print(n_in_repeated_groups / sum(group_sizes) * 100)

56.785438535968325
1.8660341900435522


Build the cumulative distribution of these scores

In [None]:
def _percent_at_or_above(scores, thresholds):
    """Return, for each threshold, the percentage (0-100) of ``scores`` that are >= it.

    Raises
    ------
    ValueError
        If ``scores`` is empty, since a percentage is undefined in that case.
    """
    scores = np.asarray(scores, dtype=float)
    if scores.size == 0:
        raise ValueError("no scores available; cannot compute percentages")
    # Broadcast to a (n_scores, n_thresholds) boolean table, then average each column.
    return (scores[:, None] >= np.asarray(thresholds)[None, :]).mean(axis=0) * 100


# Thresholds for the x-axis, in descending order: 0.99, 0.98, ..., 0.00
min_score = np.arange(0, 1.0, step = 0.01)[::-1]

# Percentage of groups whose score is at or above each threshold.
# Scaled to 0-100 so the values agree with the "Percentage" axis label and title.
massspecgym_percent = _percent_at_or_above(massspecgym_all_min_scores, min_score)
NIST2023_percent = _percent_at_or_above(NIST2023_all_min_scores, min_score)

# One curve per dataset
fig, ax = plt.subplots()
ax.plot(min_score, massspecgym_percent, label = "MassSpecGym", color = "blue")
ax.plot(min_score, NIST2023_percent, label = "NIST2023", color = "green")

# Label the axes
ax.set_xlabel("Threshold")
ax.set_xlim([0, 1.0])
ax.set_ylabel("Percentage of groups")
ax.set_title("Percentage of groups with score above threshold")
ax.legend()

# Save and show; create the output folder first so savefig cannot fail on a fresh checkout
os.makedirs(output_folder, exist_ok=True)
fig.savefig(os.path.join(output_folder, "percentage_group.png"))
plt.show()

NameError: name 'np' is not defined

Get percentage based on thresholds

In [102]:
# Score cut-off: groups scoring strictly below it are counted
threshold = 0.85

# MassSpecGym: number of groups below the cut-off, then as a percentage of all scored groups
massspecgym_n_below_threshold = sum(
    1 for score in massspecgym_all_min_scores if score < threshold
)
massspecgym_percent_below_threshold = round(
    massspecgym_n_below_threshold / len(massspecgym_all_min_scores) * 100, 3
)

# NIST2023: same computation
NIST2023_n_below_threshold = sum(
    1 for score in NIST2023_all_min_scores if score < threshold
)
nist2023_percent_below_threshold = round(
    NIST2023_n_below_threshold / len(NIST2023_all_min_scores) * 100, 3
)

print(massspecgym_percent_below_threshold, nist2023_percent_below_threshold)

9.224 3.002
