### Clean library using matchms
This notebook filters the spectral libraries to obtain harmonized and cleaned spectra. It harmonizes metadata, normalizes peak intensities and keeps only spectra whose metadata fields are consistent with each other.
Please cite https://doi.org/10.26434/chemrxiv-2023-l44cm and https://doi.org/10.21105/joss.02411 if you reuse code below to process your data. 

## prepare workflow

In [27]:
from typing import List
import logging
import pandas as pd

logger = logging.getLogger("matchms")


In [91]:
def require_adduct_in_list(spectrum, allowed_adduct_list: List[str]):
    """Keep a spectrum only when its adduct is one of the allowed adducts.

    Parameters
    ----------
    spectrum
        Input spectrum, or None when an earlier filter already removed it.
    allowed_adduct_list
        Adduct strings that are accepted.

    Returns
    -------
    The unchanged spectrum when its "adduct" metadata entry is in
    allowed_adduct_list, otherwise None.
    """
    if spectrum is None:
        return None
    adduct = spectrum.get("adduct")
    if adduct in allowed_adduct_list:
        return spectrum
    logger.info("removed spectrum since adduct: %s is not in allowed_adduct_list %s", adduct, allowed_adduct_list)
    return None

In [95]:
def remove_charged_molecules(spectrum):
    """Remove spectra whose SMILES describes a molecule with a net formal charge.

    Parameters
    ----------
    spectrum
        Input spectrum, or None when an earlier filter already removed it.

    Returns
    -------
    The unchanged spectrum when the formal charges of all atoms sum to zero.
    None when the net charge is non-zero, or when the charge cannot be checked
    because the SMILES is missing or cannot be parsed by rdkit.
    """
    if spectrum is None:
        return None
    smiles = spectrum.get("smiles")
    if smiles is None:
        logger.info("removed spectrum since it has no smiles, so the charge could not be checked")
        return None
    mol = Chem.MolFromSmiles(smiles)
    # rdkit signals an unparsable SMILES by returning None instead of raising.
    if mol is None:
        logger.info("removed spectrum since smiles: %s could not be interpreted by rdkit", smiles)
        return None
    charge = sum(atom.GetFormalCharge() for atom in mol.GetAtoms())
    if charge == 0:
        return spectrum
    logger.info("removed spectrum since spectrum is charged")
    return None

In [111]:


def require_formula_match_parent_mass(spectrum, tolerance=0.1):
    """Keep a spectrum only when the mass of its formula matches its parent mass.

    Parameters
    ----------
    spectrum
        Input spectrum, or None when an earlier filter already removed it.
    tolerance
        Maximum allowed absolute difference between the mass calculated from
        the "formula" metadata field and the "parent_mass" metadata field.

    Returns
    -------
    The unchanged spectrum when both masses agree within the tolerance.
    None when they differ, or when the comparison is impossible because the
    formula is missing, the parent mass is missing or not numeric, or the mass
    of the formula cannot be calculated.
    """
    if spectrum is None:
        return None
    formula = spectrum.get("formula")
    if formula is None:
        logger.warning("removed spectrum since precursor formula is None")
        return None
    try:
        parent_mass = float(spectrum.get("parent_mass"))
    except (TypeError, ValueError):
        logger.warning("removed spectrum since parent mass: %s could not be converted to float",
                       spectrum.get("parent_mass"))
        return None
    formula = Formula(formula)
    formula_mass = formula.get_mass()
    # Formula.get_mass returns None when the formula contains an unknown element.
    if formula_mass is None:
        logger.warning("removed spectrum since the mass of formula %s could not be calculated", formula)
        return None
    if math.isclose(formula_mass, parent_mass, abs_tol=tolerance):
        return spectrum
    logger.info("formula = %s, parent mass %s, found mass %s", formula, parent_mass, formula_mass)
    logger.info("mass_diff = %s", parent_mass - formula_mass)
    return None


In [32]:
# Maps raw "instrument_type" metadata strings to harmonized instrument type labels.
# It is passed to harmonize_instrument_types, which leaves strings that are not a key
# of this dictionary untouched.
# NOTE(review): 'LC-ESIMS-qTOF' maps to 'LC-ESI-ITFT' although the other qTOF entries map
# to a *-QTOF label, and 'ESI-Orbitrap' maps to 'ESI-ITFT' while 'DI-ESI-Orbitrap' maps to
# 'ESI-QFT' -- confirm that these mappings are intentional.
conversions = {'-Maxis HD qTOF': 'ESI-QTOF', '-Q-Exactive Plus Orbitrap Res 14k': 'ESI-QFT', '-Q-Exactive Plus Orbitrap Res 70k': 'ESI-QFT',
 'APCI-Ion Trap': 'APCI-IT', 'APCI-Orbitrap': 'APCI-QFT', 'APCI-QQQ': 'APCI-QQ', 'APCI-qTof': 'APCI-QTOF', 'CI (MeOH)-IT/ion trap': 'CI-IT',
 'CI-IT/ion trap': 'CI-IT', 'DI-ESI-Hybrid FT': 'ESI-QFT', 'DI-ESI-Ion Trap': 'ESI-IT', 'DI-ESI-Orbitrap': 'ESI-QFT',
 'DI-ESI-Q-Exactive Plus': 'ESI-QFT', 'DI-ESI-QQQ': 'ESI-QQ', 'DI-ESI-qTof': 'ESI-QTOF', 
 'DIRECT INFUSION NANOESI-ION TRAP-DIRECT INFUSION NANOESI-ION TRAP': 'ESI-IT', 'ESI or APCI-IT/ion trap': 'ESI-IT',
 'ESI-APCI-ITFT': 'APCI-ITFT', 'ESI-ESI-FTICR': 'ESI-FT', 'ESI-ESI-ITFT': 'ESI-ITFT', 'ESI-FAB-EBEB': 'FAB-EBEB',
 'ESI-Flow-injection QqQ/MS': 'ESI-QQ', 'ESI-HCD': 'ESI-QFT', 'ESI-HPLC-ESI-TOF': 'LC-ESI-TOF', 'ESI-Hybrid FT': 'ESI-QFT',
 'ESI-IT-FT/ion trap with FTMS': 'ESI-ITFT', 'ESI-IT/ion trap': 'ESI-IT', 'ESI-Ion Trap': 'ESI-IT', 'ESI-LC-APPI-QQ': 'LC-APPI-QQ',
 'ESI-LC-ESI-IT': 'LC-ESI-IT', 'ESI-LC-ESI-ITFT': 'LC-ESI-ITFT', 'ESI-LC-ESI-ITTOF': 'LC-ESI-ITTOF', 'ESI-LC-ESI-Q': 'LC-ESI-Q',
 'ESI-LC-ESI-QFT': 'LC-ESI-QFT', 'ESI-LC-ESI-QQ': 'LC-ESI-QQ', 'ESI-LC-ESI-QTOF': 'LC-ESI-QTOF', 'ESI-LC-Q-TOF/MS': 'LC-ESI-QTOF',
 'ESI-Orbitrap': 'ESI-ITFT', 'ESI-Q-TOF': 'ESI-QTOF', 'ESI-QIT': 'ESI-QIT', 'ESI-QQQ': 'ESI-QQ', 'ESI-QqQ': 'ESI-QQ', 'ESI-UPLC-ESI-QTOF': 'LC-ESI-QTOF',
 'ESI-qTOF': 'ESI-QTOF', 'ESI-qToF': 'ESI-QTOF', 'ESI-qTof': 'ESI-QTOF', 'FAB-BEqQ/magnetic and electric sectors with quadrupole': 'FAB-BEQQ',
 'In-source CID-API': 'ESI-QQ', 'LC-APCI-qTof': 'LC-APCI-QTOF', 'LC-ESI- impact HD': 'LC-ESI-QTOF', 'LC-ESI-CID; Lumos': 'LC-ESI-ITFT',
 'LC-ESI-CID; Velos': 'LC-ESI-ITFT', 'LC-ESI-HCD; Lumos': 'LC-ESI-ITFT', 'LC-ESI-HCD; Velos': 'LC-ESI-ITFT', 'LC-ESI-Hybrid FT': 'LC-ESI-QFT',
 'LC-ESI-Hybrid Ft': 'LC-ESI-QFT', 'LC-ESI-ITFT-LC-ESI-ITFT': 'LC-ESI-ITFT', 'LC-ESI-ITTOF-LC-ESI-ITTOF': 'LC-ESI-ITTOF', 'LC-ESI-Ion Trap': 'LC-ESI-IT',
 'LC-ESI-LCQ': 'LC-ESI-IT', 'LC-ESI-Maxis HD qTOF': 'LC-ESI-QTOF', 'LC-ESI-Maxis II HD Q-TOF Bruker': 'LC-ESI-QTOF', 'LC-ESI-Orbitrap': 'LC-ESI-ITFT',
 'LC-ESI-Q-Exactive Plus': 'LC-ESI-QFT', 'LC-ESI-Q-Exactive Plus Orbitrap Res 14k': 'LC-ESI-QFT', 'LC-ESI-Q-Exactive Plus Orbitrap Res 70k': 'LC-ESI-QFT',
 'LC-ESI-QQ-LC-ESI-QQ': 'LC-ESI-QQ', 'LC-ESI-QQQ': 'LC-ESI-QQ', 'LC-ESI-QTOF-LC-ESI-QTOF': 'LC-ESI-QTOF', 'LC-ESI-qTOF': 'LC-ESI-QTOF',
 'LC-ESI-qToF': 'LC-ESI-QTOF', 'LC-ESI-qTof': 'LC-ESI-QTOF', 'LC-ESIMS-qTOF': 'LC-ESI-ITFT', 'N/A-ESI-QFT': 'ESI-QFT', 'N/A-ESI-QTOF': 'ESI-QTOF',
 'N/A-Linear Ion Trap': 'ESI-IT', 'N/A-N/A': 'ESI-QTOF', 'Negative-Quattro_QQQ:10eV': 'ESI-QQ', 'Negative-Quattro_QQQ:25eV': 'ESI-QQ',
 'Negative-Quattro_QQQ:40eV': 'ESI-QQ', 'Positive-Quattro_QQQ:10eV': 'ESI-QQ', 'Positive-Quattro_QQQ:25eV': 'ESI-QQ', 'Positive-Quattro_QQQ:40eV': 'ESI-QQ'}

def harmonize_instrument_types(spectrum, conversions: dict):
    """Replace a known raw instrument type by its harmonized label.

    Parameters
    ----------
    spectrum
        Input spectrum, or None when an earlier filter already removed it.
    conversions
        Mapping of raw "instrument_type" values to their harmonized label.

    Returns
    -------
    None for a None input. Otherwise a clone of the spectrum, in which
    "instrument_type" is replaced when its value is a key of conversions and
    left as it is when it is not.
    """
    if spectrum is None:
        return None
    harmonized = spectrum.clone()
    raw_type = harmonized.get("instrument_type")
    if raw_type not in conversions:
        return harmonized
    harmonized.set("instrument_type", conversions[raw_type])
    return harmonized



In [33]:
# Instrument type labels whose spectra are discarded by remove_instrument_types.
# In the pipeline that filter is added after harmonize_instrument_types, so these
# labels are compared against the already harmonized "instrument_type" value.
instrument_types_to_remove = ["LC-ESI-QQ", "Linear Ion Trap", "ESI-QQ", "ESI-IT", "LC-ESI-IT", "QIT", "LC-APPI-QQ",
                              "GC-APCI-QTOF", "QqQ", "LC-ESI-QIT", "FAB-EBEB", "APCI-ITFT", "LC-APCI-QFT",
                              "LC-APCI-ITFT", "LC-ESI-Q", "GC-EI-QQ", "MALDI-TOFTOF", "GC-EI-Q", "MALDI-QITTOF",
                              "LC-APCI-QTOF", "in source CID", "SYNAPT QTOF, Waters", "Q Exactive HF", "APCI-QFT",
                              "APCI-IT", "LIT", "CI-IT", "FAB-BEQQ", "APCI-QQ", "APCI-QTOF", "APCI-ITTOF",
                              "Q Exactive Focus Hybrid Quadrupole Orbitrap Ma...", "ESI-QIT", "MALDI-QIT", "SI-BE",
                              "FAB-BE", "QIT-FT", "Waters SYNAPT",
                              "Thermo LTQ"]

def remove_instrument_types(spectrum, instrument_types_to_remove: list):
    """Remove spectra that were measured on an unwanted instrument type.

    Parameters
    ----------
    spectrum
        Input spectrum, or None when an earlier filter already removed it.
    instrument_types_to_remove
        "instrument_type" values for which the spectrum is discarded.

    Returns
    -------
    None when the input is None or its "instrument_type" is listed in
    instrument_types_to_remove, otherwise the unchanged spectrum.
    """
    if spectrum is None:
        return None
    is_unwanted = spectrum.get("instrument_type") in instrument_types_to_remove
    return None if is_unwanted else spectrum

In [34]:
from rdkit import Chem
import re
from matchms.filtering.filter_utils.smile_inchi_inchikey_conversions import \
    is_valid_inchi
from matchms.filtering.filter_utils.interpret_unknown_adduct import get_ions_from_adduct, split_ion, get_charge_of_adduct
import math 
class Formula:
    """Elemental composition of a molecule, stored as atom counts per element symbol.

    The composition is kept in ``dict_representation`` (element symbol -> atom count).
    Formulas can be added, subtracted and multiplied by an integer, which is used to
    build the formula of a precursor ion from a molecular formula and an adduct.
    """

    def __init__(self, formula: str):
        # formula: a formula string such as "C6H12O6"; "" gives an empty formula.
        self.dict_representation = self.get_atom_and_counts(formula)

    def get_atom_and_counts(self, formula):
        """Parse a formula string such as "C6H12O6" into {"C": 6, "H": 12, "O": 6}.

        An element symbol is one upper-case letter, optionally followed by one
        lower-case letter. A number directly after a symbol is its count; the count
        is 1 when there is no number. Symbols that occur more than once are summed.
        Every other character in the string is ignored.
        """
        parts = re.findall("[A-Z][a-z]?|[0-9]+", formula)
        atoms_and_counts = {}
        for i, atom in enumerate(parts):
            if atom.isnumeric():
                # Counts are consumed together with the symbol in front of them.
                continue
            has_count = i + 1 < len(parts) and parts[i + 1].isnumeric()
            multiplier = int(parts[i + 1]) if has_count else 1
            atoms_and_counts[atom] = atoms_and_counts.get(atom, 0) + multiplier
        return atoms_and_counts

    def __add__(self, otherFormula: "Formula"):
        """Return a new Formula holding the atoms of both formulas."""
        new_formula = Formula("")
        new_formula.dict_representation = self.dict_representation.copy()
        for atom, value in otherFormula.dict_representation.items():
            new_formula.dict_representation[atom] = new_formula.dict_representation.get(atom, 0) + value
        return new_formula

    def __sub__(self, otherFormula: "Formula"):
        """Return a new Formula with the atoms of otherFormula removed.

        Returns None (after logging a warning) when otherFormula contains an
        element that is missing from this formula, or more atoms of an element
        than this formula has.
        """
        new_formula = Formula("")
        new_formula.dict_representation = self.dict_representation.copy()
        for atom, value in otherFormula.dict_representation.items():
            if atom not in new_formula.dict_representation or new_formula.dict_representation[atom] < value:
                logger.warning("Removing an atom %s that does not exist in the main formula %s",
                               otherFormula, self)
                return None
            new_formula.dict_representation[atom] -= value
        return new_formula

    def __mul__(self, multiplication):
        """Return a new Formula holding this formula `multiplication` times.

        A multiplication of zero or less gives an empty formula.
        """
        new_formula = Formula("")
        for _ in range(multiplication):
            new_formula += self
        return new_formula

    def __str__(self):
        """Return the formula as a string: C first, H second, other elements alphabetically.

        Elements with a count of 1 are written without a number. Elements whose count
        is zero (possible after a subtraction) are left out, so that for example
        C2H6O - H2O is written as "C2H4" and not as "C2H4O".
        Note that H is also written in front for formulas without carbon.
        """
        counts = self.dict_representation
        other_elements = sorted(atom for atom in counts if atom not in ("C", "H"))
        parts = []
        for atom in ["C", "H"] + other_elements:
            count = counts.get(atom, 0)
            if count <= 0:
                continue
            parts.append(atom if count == 1 else f"{atom}{count}")
        return "".join(parts)

    def get_mass(self):
        """Return the sum of the most common isotope masses of all atoms in the formula.

        Returns None (after logging a warning) when rdkit raises a RuntimeError
        for one of the element symbols.
        """
        mass = 0
        periodic_table = Chem.GetPeriodicTable()
        for atom, value in self.dict_representation.items():
            try:
                atom_mass = periodic_table.GetMostCommonIsotopeMass(atom)
            except RuntimeError:
                logger.warning("The atom: %s in the formula %s is not known", atom, self)
                return None
            mass += atom_mass * value
        return mass

def add_precursor_formula(spectrum):
    """Add the metadata field "precursor_formula": the formula of the precursor ion.

    The precursor formula is built from the "formula" metadata field, taken as often
    as get_ions_from_adduct reports for the adduct, plus or minus the formula of
    every ion that get_ions_from_adduct and split_ion report for the adduct.

    Parameters
    ----------
    spectrum
        Input spectrum, or None when an earlier filter already removed it.

    Returns
    -------
    None when the input is None or has no formula. A clone of the spectrum
    without "precursor_formula" when an ion has to be subtracted that is not
    contained in the formula. Otherwise a clone with "precursor_formula" set.
    """
    if spectrum is None:
        return None
    formula_str = spectrum.get('formula')
    # Check the formula first, so the adduct is only parsed when it can be used.
    if formula_str is None:
        logger.warning("removed spectrum since formula is None, so no precursor formula could be derived")
        return None
    spectrum = spectrum.clone()
    nr_of_parent_masses, ions_split = get_ions_from_adduct(spectrum.get("adduct"))

    new_precursor_formula = Formula(formula_str) * nr_of_parent_masses
    for ion in ions_split:
        sign, number, formula = split_ion(ion)
        ion_formula = Formula(formula)
        for _ in range(number):
            if sign == "+":
                new_precursor_formula += ion_formula
            elif sign == "-":
                new_precursor_formula -= ion_formula
            # Formula subtraction returns None when the ion is not part of the formula.
            if new_precursor_formula is None:
                return spectrum
    spectrum.set("precursor_formula", str(new_precursor_formula))
    return spectrum

In [35]:
# Metadata fields that are kept in the cleaned library; passed to
# store_relevant_metadata_only when that filter is added to the pipeline.
fields_to_keep = ("smiles", "inchi", "inchikey", "precursor_mz", "adduct", "parent_mass", "formula", "precursor_formula", "instrument_type", "collision_energy") 

def store_relevant_metadata_only(spectrum_in, fields_to_keep: list):
    """Reduce the metadata of a spectrum to a fixed set of fields.

    Parameters
    ----------
    spectrum_in
        Input spectrum, or None when an earlier filter already removed it.
    fields_to_keep
        Names of the metadata fields the result should contain.

    Returns
    -------
    None for a None input. Otherwise a clone of the spectrum whose metadata
    consists of exactly the fields in fields_to_keep; a field the input does
    not have is set to None.
    """
    if spectrum_in is None:
        return None
    spectrum = spectrum_in.clone()
    current_metadata = spectrum.metadata
    spectrum.metadata = {
        field: (current_metadata[field] if field in current_metadata else None)
        for field in fields_to_keep
    }
    return spectrum


## Filters now in matchms
Filters that are now also available in matchms 0.26.0

In [60]:
import logging
import math
from matchms.filtering.filter_utils.interpret_unknown_adduct import \
    get_multiplier_and_mass_from_adduct


logger = logging.getLogger("matchms")


def require_matching_adduct_precursor_mz_parent_mass(spectrum,
                                                     tolerance=0.1):
    """Remove spectra whose adduct, precursor m/z and parent mass contradict each other.

    The parent mass that follows from the precursor m/z and the adduct is
    (precursor_mz - correction_mass) / multiplier, with the multiplier and
    correction mass obtained from get_multiplier_and_mass_from_adduct.

    Parameters
    ----------
    spectrum
        Input spectrum, or None when an earlier filter already removed it.
    tolerance
        Maximum allowed absolute difference between the stored parent mass and
        the parent mass that follows from the adduct and the precursor m/z.

    Returns
    -------
    The unchanged spectrum when both parent masses agree within the tolerance,
    and also when precursor_mz or parent_mass cannot be converted to float.
    None when adduct, parent_mass or precursor_mz is missing, when the adduct
    cannot be parsed, or when the parent masses do not agree.
    """
    if spectrum is None:
        return None

    adduct = spectrum.get("adduct")
    raw_parent_mass = spectrum.get("parent_mass")
    raw_precursor_mz = spectrum.get("precursor_mz")

    if adduct is None:
        logger.info("Spectrum is removed since adduct is None")
        return None
    if raw_parent_mass is None:
        logger.info("Spectrum is removed since parent mass is None")
        return None
    if raw_precursor_mz is None:
        logger.info("Spectrum is removed since precursor mz is None")
        return None

    try:
        precursor_mz = float(raw_precursor_mz)
        parent_mass = float(raw_parent_mass)
    except (TypeError, ValueError):
        # Values that are not numeric are not judged here; the spectrum is kept.
        logger.warning("precursor_mz or parent mass could not be converted to float, "
                       "please run add_parent_mass and add_precursor_mz first")
        return spectrum

    multiplier, correction_mass = get_multiplier_and_mass_from_adduct(adduct)
    if multiplier is None:
        logger.info("Spectrum is removed since adduct: %s could not be parsed", adduct)
        return None

    expected_parent_mass = (precursor_mz - correction_mass) / multiplier
    if math.isclose(parent_mass, expected_parent_mass, abs_tol=tolerance):
        return spectrum
    logger.info("Spectrum is removed because the adduct : %s and precursor_mz: %s suggest a parent mass of %s, "
                "but parent mass %s is given",
                adduct, precursor_mz, expected_parent_mass, parent_mass)
    return None

In [58]:
import logging
from matchms.filtering.filter_utils.interpret_unknown_adduct import \
    get_charge_of_adduct


logger = logging.getLogger("matchms")


def require_matching_adduct_and_ionmode(spectrum):
    """Remove spectra whose ionmode contradicts the sign of the charge of their adduct.

    Parameters
    ----------
    spectrum
        Input spectrum, or None when an earlier filter already removed it.

    Returns
    -------
    None when the adduct is missing, when get_charge_of_adduct cannot determine
    its charge, when the charge is positive while "ionmode" is not "positive",
    or when the charge is negative while "ionmode" is not "negative".
    Otherwise the unchanged spectrum (this includes adducts with a charge of 0).
    """
    if spectrum is None:
        return None
    ionmode = spectrum.get("ionmode")
    adduct = spectrum.get("adduct")
    if adduct is None:
        logger.info("Spectrum is removed since adduct is None")
        return None
    charge_of_adduct = get_charge_of_adduct(adduct)
    if charge_of_adduct is None:
        # Log the removal, so it can be traced back like the other removals.
        logger.info("Spectrum is removed since the charge of adduct: %s could not be determined", adduct)
        return None
    if (charge_of_adduct > 0 and ionmode != "positive") or (charge_of_adduct < 0 and ionmode != "negative"):
        logger.warning("Ionmode: %s does not correspond to the charge of the adduct %s", ionmode, adduct)
        return None
    return spectrum

In [62]:
import logging
from rdkit import Chem
from rdkit.Chem.rdMolDescriptors import CalcMolFormula


logger = logging.getLogger("matchms")


def derive_formula_from_smiles(spectrum_in, overwrite=True):
    """Set the metadata field "formula" to the formula calculated from the SMILES.

    Parameters
    ----------
    spectrum_in
        Input spectrum, or None when an earlier filter already removed it.
    overwrite
        When False, a spectrum that already has a formula is returned without
        calculating a new one. When True (default), an existing formula is
        replaced if the formula derived from the SMILES is different.

    Returns
    -------
    None for a None input, otherwise a clone of the spectrum. The formula is
    left as it is when no formula could be derived from the SMILES.
    """
    if spectrum_in is None:
        return None
    spectrum = spectrum_in.clone()
    current_formula = spectrum.get("formula")
    if current_formula is not None and overwrite is False:
        return spectrum

    smiles = spectrum.get("smiles")
    formula = _get_formula_from_smiles(smiles)

    if formula is None:
        logger.warning("The smiles: %s could not be interpreted by rdkit, so no formula was set", smiles)
        return spectrum
    if current_formula is None:
        logger.info("Added formula from smiles. New Formula: %s", formula)
        spectrum.set("formula", formula)
    elif current_formula != formula:
        logger.info("Overwriting formula from smiles. Original formula: %s New formula: %s",
                    current_formula, formula)
        spectrum.set("formula", formula)
    return spectrum


def _get_formula_from_smiles(smiles):
    if smiles is None:
        return None
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return None
    return CalcMolFormula(mol)

In [64]:
import logging
from collections import Counter
from matchms.Fragments import Fragments
from matchms.Spectrum import Spectrum


logger = logging.getLogger("matchms")


def remove_noise_below_frequent_intensities(spectrum: Spectrum,
                                            min_count_of_frequent_intensities: int = 5,
                                            noise_level_multiplier: float = 2.0):
    """Remove noise peaks, using intensities that occur many times as the noise indicator.

    Spectra to which no noise filtering has been applied often contain many peaks
    with exactly the same intensity. Of all intensity values that occur at least
    min_count_of_frequent_intensities times, the highest one is selected. The noise
    level is this intensity * noise_level_multiplier, and only fragments with an
    intensity above the noise level are kept.
    This filter was suggested by Tytus Mak.

    Parameters
    ----------
    spectrum
        Input spectrum.
    min_count_of_frequent_intensities:
        Number of times an intensity value has to occur to be used for the noise level.
    noise_level_multiplier:
        Factor by which the highest frequent intensity is multiplied to get the noise level.

    Returns
    -------
    None for a None input, otherwise a clone of the spectrum. The clone is
    unfiltered when no intensity value occurs often enough.
    """
    if spectrum is None:
        return None
    spectrum = spectrum.clone()

    highest_frequent_peak = _select_highest_frequent_peak(spectrum.intensities, min_count_of_frequent_intensities)
    if highest_frequent_peak == -1:
        # -1 means that no intensity value occurs often enough to derive a noise level.
        return spectrum

    noise_threshold = highest_frequent_peak * noise_level_multiplier
    above_noise = spectrum.intensities > noise_threshold
    spectrum.peaks = Fragments(mz=spectrum.mz[above_noise],
                               intensities=spectrum.intensities[above_noise])
    logger.info("Fragments removed with intensity below %s", noise_threshold)
    return spectrum


def _select_highest_frequent_peak(intensities,
                                  min_count_of_frequent_intensities=5):
    counts = Counter(intensities)
    highest_value_to_remove = -1
    for value, count in counts.items():
        if count >= min_count_of_frequent_intensities:
            if value > highest_value_to_remove:
                highest_value_to_remove = value
    return highest_value_to_remove

In [66]:
import logging
from typing import Optional
from matchms.Spectrum import Spectrum


logger = logging.getLogger("matchms")


def require_number_of_peaks_below_maximum(spectrum_in: Spectrum,
                                          maximum_number_of_fragments: int = 1000) -> Optional[Spectrum]:
    """Spectrum will be set to None when it has more peaks than maximum_number_of_fragments.

    Parameters
    ----------
    spectrum_in:
        Input spectrum.
    maximum_number_of_fragments:
        Maximum number of peaks a spectrum may have. Spectra with more peaks
        will be set to 'None'.
    """
    if spectrum_in is None:
        return None

    number_of_peaks = spectrum_in.peaks.intensities.size
    if number_of_peaks <= maximum_number_of_fragments:
        return spectrum_in

    logger.info("Spectrum with %s (>%s) peaks was set to None.",
                str(number_of_peaks), str(maximum_number_of_fragments))
    return None

In [67]:
def remove_not_ms2_spectra(spectrum):
    """Keep only spectra that are annotated as MS2 spectra.

    The MS level is read from the metadata field "ms_level"; "ms_type" is only
    consulted when "ms_level" is not set. A value counts as MS2 when it is
    "MS2" or "2" (in any letter case, surrounding whitespace ignored) or the
    integer 2.

    Parameters
    ----------
    spectrum
        Input spectrum, or None when an earlier filter already removed it.

    Returns
    -------
    The unchanged spectrum when it is annotated as MS2, otherwise None.
    """
    if spectrum is None:
        return None

    def _is_ms2(value):
        # str() makes the check accept the integer 2 next to the string forms.
        return value is not None and str(value).strip().upper() in ("MS2", "2")

    ms_level = spectrum.get("ms_level")
    if ms_level is None:
        ms_level = spectrum.get("ms_type")
    return spectrum if _is_ms2(ms_level) else None

## Running matchms pipeline
The code below runs the pipeline. It consists of matchms filters that already existed and newly added filters above.

In [120]:
import os

import matchms.filtering as msfilters
from matchms.filtering.default_pipelines import DEFAULT_FILTERS, REQUIRE_COMPLETE_ANNOTATION
from matchms.Pipeline import Pipeline, create_workflow

results_folder = "./results_library_cleaning"
os.makedirs(results_folder, exist_ok=True)
# Only used when the yaml_file_name argument of create_workflow below is enabled.
yaml_file_name = os.path.join(results_folder, "metadata_cleaning.yaml")

# Filters that already exist in matchms; the filters defined in this notebook are added below.
workflow = create_workflow(
    # yaml_file_name=yaml_file_name,
    query_filters=DEFAULT_FILTERS + REQUIRE_COMPLETE_ANNOTATION + [
        (msfilters.repair_smiles_of_salts, {"mass_tolerance": 0.1}),
        (msfilters.repair_adduct_based_on_smiles, {"mass_tolerance": 0.1}),
        (msfilters.derive_annotation_from_compound_name,
         {"annotated_compound_names_file": os.path.join(results_folder, "annotated_compound_names.csv")}),
        msfilters.repair_not_matching_annotation,
        (msfilters.require_minimum_number_of_peaks, {"n_required": 1}),
        (msfilters.require_correct_ionmode, {"ion_mode_to_keep": "positive"}),
    ])

pipeline = Pipeline(workflow)
# filter_position=0 makes this the first filter that is run.
pipeline.processing_queries.parse_and_add_filter(remove_not_ms2_spectra, filter_position=0)

# Filters added without a filter_position are run after the matchms filters of the workflow.
pipeline.processing_queries.parse_and_add_filter((require_adduct_in_list, {"allowed_adduct_list": ["[M+H]+", "[M+Na]+"]}))
pipeline.processing_queries.parse_and_add_filter(remove_charged_molecules)

pipeline.processing_queries.parse_and_add_filter(require_matching_adduct_precursor_mz_parent_mass)
pipeline.processing_queries.parse_and_add_filter(require_matching_adduct_and_ionmode)

pipeline.processing_queries.parse_and_add_filter(derive_formula_from_smiles)
pipeline.processing_queries.parse_and_add_filter(require_formula_match_parent_mass)
pipeline.processing_queries.parse_and_add_filter(add_precursor_formula)

pipeline.processing_queries.parse_and_add_filter((harmonize_instrument_types, {"conversions": conversions}))
pipeline.processing_queries.parse_and_add_filter((remove_instrument_types, {"instrument_types_to_remove": instrument_types_to_remove}))

# filter_position=1 runs the noise removal directly after remove_not_ms2_spectra.
pipeline.processing_queries.parse_and_add_filter((remove_noise_below_frequent_intensities, {"min_count_of_frequent_intensities": 6}),
                                                 filter_position=1)
pipeline.processing_queries.parse_and_add_filter((require_number_of_peaks_below_maximum, {"maximum_number_of_fragments": 1000}))

pipeline.processing_queries.parse_and_add_filter((store_relevant_metadata_only, {"fields_to_keep": fields_to_keep}))

# Show the order in which the filters will be applied.
final_filter_order = [spectrum_filter.__name__ for spectrum_filter in pipeline.processing_queries.filters]
final_filter_order



['remove_not_ms2_spectra',
 'remove_noise_below_frequent_intensities',
 'make_charge_int',
 'add_compound_name',
 'derive_adduct_from_name',
 'derive_formula_from_name',
 'clean_compound_name',
 'interpret_pepmass',
 'add_precursor_mz',
 'add_retention_index',
 'add_retention_time',
 'derive_ionmode',
 'correct_charge',
 'require_precursor_mz',
 'harmonize_undefined_inchikey',
 'harmonize_undefined_inchi',
 'harmonize_undefined_smiles',
 'repair_inchi_inchikey_smiles',
 'clean_adduct',
 'add_parent_mass',
 'derive_annotation_from_compound_name',
 'derive_smiles_from_inchi',
 'derive_inchi_from_smiles',
 'derive_inchikey_from_inchi',
 'repair_smiles_of_salts',
 'repair_adduct_based_on_smiles',
 'repair_not_matching_annotation',
 'require_valid_annotation',
 'require_correct_ionmode',
 'require_parent_mass_match_smiles',
 'normalize_intensities',
 'require_minimum_number_of_peaks',
 'require_adduct_in_list',
 'remove_charged_molecules',
 'require_matching_adduct_precursor_mz_parent_mass'

In [None]:
# Run the cleaning pipeline on the merged, unprocessed libraries.
# The cleaned_query_file argument presumably makes the pipeline write the spectra
# that pass all filters to that file -- confirm against the matchms Pipeline documentation.
# NOTE(review): load_workflow_from_yaml_file is imported here but not used in this cell.
from matchms.yaml_file_functions import load_workflow_from_yaml_file

pipeline.logging_file = os.path.join(results_folder, "metadata_cleaning.log")  # file that receives the log messages of the pipeline
pipeline.logging_level = "WARNING"  # verbosity of the logging; presumably logger.info messages of the filters are not recorded at this level
processing_report = pipeline.run("unprocessed_libraries/merged_libraries.mgf",
                                 cleaned_query_file=os.path.join(results_folder, "cleaned_libraries.mgf"))

Processing spectrums: 126562it [1:48:24,  6.72it/s] 

In [122]:
# Show, per filter, how many spectra were removed and how many had their metadata or peaks changed.
print(processing_report)

----- Spectrum Processing Report -----
Number of spectrums processed: 1334962
Number of spectrums removed: 885241
Changes during processing:
                                                  removed spectra  changed metadata  changed mass spectrum
filter                                                                                                    
add_parent_mass                                                 0            699187                      0
add_precursor_formula                                           0            464082                      0
add_retention_index                                             0            706484                      0
add_retention_time                                              0            552948                      0
clean_adduct                                                    0              8102                      0
clean_compound_name                                             0            104201                      0
cor

In [123]:
# Number of spectra that remain after cleaning (processed minus removed in the report above).
# NOTE(review): this reads the private attribute _spectrums_queries, which may change
# between matchms versions -- confirm whether a public accessor is available.
print(len(pipeline._spectrums_queries))

449721
