Import libraries

In [2]:
import os
import pickle
import numpy as np
from tqdm import tqdm
from pprint import pprint

from msbuddy.base import MetaFeature, Spectrum
from msbuddy import assign_subformula, Msbuddy, MsbuddyConfig

Helper Functions

In [3]:
def load_pickle(path):
    """Deserialize and return the object stored in the pickle file at ``path``.

    The file is opened in binary mode and closed before returning.
    Only use this on trusted files: unpickling can execute arbitrary code.
    """
    with open(path, "rb") as handle:
        payload = pickle.load(handle)
    return payload

In [4]:
def pickle_data(data, path):
    """Serialize ``data`` with pickle and write it to the file at ``path``.

    The file is opened in binary write mode, so an existing file is overwritten.
    """
    with open(path, "wb") as sink:
        pickle.dump(data, sink)

In [5]:
def get_results(subformla_list):
    """Reduce each subformula result to a single formula string.

    For every entry in ``subformla_list`` the ``formula`` attribute of the first
    element of its ``subform_list`` is taken; entries whose ``subform_list`` is
    empty contribute an empty string.

    Returns a list with one item per input entry, in input order.
    """

    def _first_formula(entry):
        # Read the candidate list once, then keep only its first element's formula.
        candidates = entry.subform_list
        return candidates[0].formula if len(candidates) != 0 else ""

    return [_first_formula(entry) for entry in subformla_list]
    

Settings


In [6]:
# Benchmark root folder, and the frag_trees/noisy_set subfolder beneath it.
data_folder = "/data/rbg/users/klingmin/projects/MS_processing/benchmarks/massspec_gym/noisy_lookup/"
frags_folder = "/data/rbg/users/klingmin/projects/MS_processing/benchmarks/massspec_gym/noisy_lookup/frag_trees/noisy_set"

# Load the pickled noisy set, clean set and DB from data_folder, in that order.
noisy_set, clean_set, DB = (
    load_pickle(os.path.join(data_folder, file_name))
    for file_name in ("noisy_set.pkl", "clean_set.pkl", "DB.pkl")
)

Check how well we can get the chemical formula

In [7]:
# Configuration shared by every msbuddy annotation run in this notebook.
# NOTE(review): msbuddy documents `ms_instr` as an instrument preset for the
# mass tolerances; confirm whether it takes precedence over the explicit
# ppm / ms1_tol / ms2_tol values given here.
msb_config = MsbuddyConfig(
    ms_instr="orbitrap",
    ppm=True,
    ms1_tol=5,
    ms2_tol=10,
    halogen=True,
    timeout_secs=200,
)

# Engine instance that later cells feed with data and query for summaries.
msb_engine = Msbuddy(msb_config)

msbuddy: molecular formula annotation for MS-based small molecule analysis.
Developed and maintained by Shipei Xing.


In [None]:
    # NOTE(review): this cell is indented but contains no enclosing loop or
    # function, and neither `rec` nor `formula` is assigned in any cell shown
    # above it. It looks like a detached loop body and cannot run on a fresh
    # kernel as written — TODO restore the missing header or delete the cell.

    # Call msbuddy's assign_subformula on the record's m/z values, passing the
    # precursor formula and the adduct stored in the record's metadata.
    subformla_list = assign_subformula(rec.mz,
                                       precursor_formula = formula,
                                       adduct = rec.metadata["adduct"])
    
    # Collapse each result to one formula string ("" when it has no candidates).
    subformla_list = get_results(subformla_list)

Get formula

In [8]:
# Wrap every record of the noisy set in msbuddy's input types.
all_features_noisy_set = []

for rec in tqdm(noisy_set):

    # One MetaFeature per record; its MS2 spectrum is built from the record's
    # m/z and intensity values, everything else comes from the record metadata.
    all_features_noisy_set.append(
        MetaFeature(
            identifier=rec.metadata["identifier"],
            mz=rec.metadata["precursor_mz"],
            rt=None,
            charge=1,
            adduct=rec.metadata["adduct"],
            ms2=Spectrum(
                mz_array=np.array(rec.mz),
                int_array=np.array(rec.intensities),
            ),
        )
    )

# Hand the features to the engine and run formula annotation.
msb_engine.add_data(all_features_noisy_set)
msb_engine.annotate_formula()

100%|██████████| 1380/1380 [00:00<00:00, 3775.17it/s]


1380 queries loaded.
2 batches in total.
Batch 1/2:
Candidate space generation: 100%|[32m██████████[0m| 1000/1000 [01:47<00:00,  9.31it/s]
Subformula assignment: 100%|[32m██████████[0m| 1000/1000 [32:19<00:00,  1.94s/it]
Candidate formula ranking...
FDR calculation: 100%|[32m██████████[0m| 1000/1000 [00:00<00:00, 9054.38it/s]
Batch 2/2:
Candidate space generation: 100%|[32m██████████[0m| 380/380 [01:26<00:00,  4.41it/s]
Subformula assignment: 100%|[32m██████████[0m| 380/380 [25:06<00:00,  3.96s/it] 
Candidate formula ranking...
FDR calculation: 100%|[32m██████████[0m| 380/380 [00:00<00:00, 4166.71it/s]
Job finished.


In [13]:
# Fetch the engine's annotation summary and pickle it next to the input data.
noisy_results = msb_engine.get_summary()
noisy_results_path = os.path.join(data_folder, "noisy_set_formula.pkl")
pickle_data(noisy_results, noisy_results_path)

In [None]:
# Wrap every record of the clean set in msbuddy's input types.
all_features_clean_set = []

for rec in tqdm(clean_set):

    # One MetaFeature per record; its MS2 spectrum is built from the record's
    # m/z and intensity values, everything else comes from the record metadata.
    all_features_clean_set.append(
        MetaFeature(
            identifier=rec.metadata["identifier"],
            mz=rec.metadata["precursor_mz"],
            rt=None,
            charge=1,
            adduct=rec.metadata["adduct"],
            ms2=Spectrum(
                mz_array=np.array(rec.mz),
                int_array=np.array(rec.intensities),
            ),
        )
    )

# Hand the features to the engine and run formula annotation.
msb_engine.add_data(all_features_clean_set)
msb_engine.annotate_formula()

# Fetch the annotation summary and pickle it next to the input data.
clean_results = msb_engine.get_summary()
clean_results_path = os.path.join(data_folder, "clean_set_formula.pkl")
pickle_data(clean_results, clean_results_path)

100%|██████████| 67736/67736 [00:19<00:00, 3400.48it/s]


67736 queries loaded.
68 batches in total.
Batch 1/68:
Candidate space generation: 100%|[32m██████████[0m| 1000/1000 [01:27<00:00, 11.42it/s]
Subformula assignment:  11%|[32m█         [0m| 112/1000 [01:19<25:17,  1.71s/it] 

Save data

In [None]:
# NOTE(review): this cell cannot run as written. The two top-level statements
# are followed by an indented tail with no enclosing loop or function, and
# `noisy_set_added` is never initialised in the cells visible here. It looks
# like the remains of a per-record loop — TODO restore the loop or remove it.

# Take the first entry of the engine's summary and read its "formula_rank_1" field.
results = msb_engine.get_summary()[0]
formula = results["formula_rank_1"]


    # Store the formula and the subformula strings on the record, then collect
    # the record. `rec` and `subformla_list` are presumably meant to come from a
    # surrounding loop — verify.
    rec.set("pred_formula", formula)
    rec.set("subformula", subformla_list)

    noisy_set_added.append(rec)