## Import libraries

In [23]:
import os 
import logging
import collections
from tqdm import tqdm
from config import processed_data_folder
from matchms.importing import load_from_mgf, load_from_msp

# Suppress every log record of severity WARNING and below for this session.
logging.disable(logging.WARNING)

## Settings

In [97]:
instruments_mapping = {'-Maxis HD qTOF': 'ESI-QTOF', 'ESI-QTOF': 'ESI-QTOF', '-Q-Exactive Plus Orbitrap Res 14k': 'ESI-QFT', '-Q-Exactive Plus Orbitrap Res 70k': 'ESI-QFT',
                       'APCI-Ion Trap': 'APCI-IT', 'APCI-Orbitrap': 'APCI-QFT', 'APCI-QQQ': 'APCI-QQ', 'APCI-qTof': 'APCI-QTOF', 'CI (MeOH)-IT/ion trap': 'CI-IT',
                       'CI-IT/ion trap': 'CI-IT', 'DI-ESI-Hybrid FT': 'ESI-QFT', 'DI-ESI-Ion Trap': 'ESI-IT', 'DI-ESI-Orbitrap': 'ESI-QFT',
                       'DI-ESI-Q-Exactive Plus': 'ESI-QFT', 'DI-ESI-QQQ': 'ESI-QQ', 'DI-ESI-qTof': 'ESI-QTOF', 
                       'DIRECT INFUSION NANOESI-ION TRAP-DIRECT INFUSION NANOESI-ION TRAP': 'ESI-IT', 'ESI or APCI-IT/ion trap': 'ESI-IT',
                       'APCI-ITFT': 'APCI-ITFT', 'LC-APCI-ITFT': 'APCI-ITFT', 'ESI-APCI-ITFT': 'APCI-ITFT', 'ESI-ESI-FTICR': 'ESI-FT', 'ESI-ESI-ITFT': 'ESI-ITFT', 'ESI-FAB-EBEB': 'FAB-EBEB',
                       'ESI-Flow-injection QqQ/MS': 'ESI-QQ', 'ESI-HCD': 'ESI-QFT', 'ESI-HPLC-ESI-TOF': 'LC-ESI-TOF', 'ESI-Hybrid FT': 'ESI-QFT',
                       'ESI-IT-FT/ion trap with FTMS': 'ESI-ITFT', 'ESI-IT/ion trap': 'ESI-IT', 'ESI-Ion Trap': 'ESI-IT', 'ESI-LC-APPI-QQ': 'LC-APPI-QQ',
                       'LC-ESI-IT': 'LC-ESI-IT', 'ESI-LC-ESI-IT': 'LC-ESI-IT', 'ESI-LC-ESI-ITFT': 'LC-ESI-ITFT', 'LC-ESI-ITFT': 'LC-ESI-ITFT', 'ESI-LC-ESI-ITTOF': 'LC-ESI-ITTOF', 'ESI-LC-ESI-Q': 'LC-ESI-Q',
                       'LC-ESI-QFT':'LC-ESI-QFT', 'ESI-LC-ESI-QFT': 'LC-ESI-QFT', 'LC-ESI-QQ':'LC-ESI-QQ', 'ESI-LC-ESI-QQ': 'LC-ESI-QQ', 'ESI-LC-ESI-QTOF': 'LC-ESI-QTOF', 'LC-ESI-QTOF': 'LC-ESI-QTOF', 'ESI-LC-Q-TOF/MS': 'LC-ESI-QTOF',
                       'ESI-Orbitrap': 'ESI-ITFT', 'ESI-Q-TOF': 'ESI-QTOF', 'ESI-QIT': 'ESI-QIT', 'ESI-QQQ': 'ESI-QQ', 'ESI-QqQ': 'ESI-QQ', 'ESI-UPLC-ESI-QTOF': 'LC-ESI-QTOF',
                       'ESI-qTOF': 'ESI-QTOF', 'ESI-qToF': 'ESI-QTOF', 'ESI-qTof': 'ESI-QTOF', 'FAB-BEqQ/magnetic and electric sectors with quadrupole': 'FAB-BEQQ',
                       'In-source CID-API': 'ESI-QQ', 'LC-APCI-qTof': 'LC-APCI-QTOF', 'LC-ESI- impact HD': 'LC-ESI-QTOF', 'LC-ESI-CID; Lumos': 'LC-ESI-ITFT',
                       'LC-ESI-CID; Velos': 'LC-ESI-ITFT', 'LC-ESI-HCD; Lumos': 'LC-ESI-ITFT', 'LC-ESI-HCD; Velos': 'LC-ESI-ITFT', 'LC-ESI-Hybrid FT': 'LC-ESI-QFT',
                       'LC-ESI-Hybrid Ft': 'LC-ESI-QFT', 'LC-ESI-ITFT-LC-ESI-ITFT': 'LC-ESI-ITFT', 'LC-ESI-ITTOF-LC-ESI-ITTOF': 'LC-ESI-ITTOF', 'LC-ESI-Ion Trap': 'LC-ESI-IT',
                       'LC-ESI-LCQ': 'LC-ESI-IT', 'LC-ESI-Maxis HD qTOF': 'LC-ESI-QTOF', 'LC-ESI-Maxis II HD Q-TOF Bruker': 'LC-ESI-QTOF', 'LC-ESI-Orbitrap': 'LC-ESI-ITFT',
                       'LC-ESI-Q-Exactive Plus': 'LC-ESI-QFT', 'LC-ESI-Q-Exactive Plus Orbitrap Res 14k': 'LC-ESI-QFT', 'LC-ESI-Q-Exactive Plus Orbitrap Res 70k': 'LC-ESI-QFT',
                       'LC-ESI-QQ-LC-ESI-QQ': 'LC-ESI-QQ', 'LC-ESI-QQQ': 'LC-ESI-QQ', 'LC-ESI-QTOF-LC-ESI-QTOF': 'LC-ESI-QTOF', 'LC-ESI-qTOF': 'LC-ESI-QTOF',
                       'LC-ESI-qToF': 'LC-ESI-QTOF', 'LC-ESI-qTof': 'LC-ESI-QTOF', 'LC-ESIMS-qTOF': 'LC-ESI-ITFT', 'N/A-ESI-QFT': 'ESI-QFT', 'N/A-ESI-QTOF': 'ESI-QTOF',
                       'N/A-Linear Ion Trap': 'ESI-IT', 'N/A-N/A': 'ESI-QTOF', 'Negative-Quattro_QQQ:10eV': 'ESI-QQ', 'Negative-Quattro_QQQ:25eV': 'ESI-QQ',
                       'Negative-Quattro_QQQ:40eV': 'ESI-QQ', 'Positive-Quattro_QQQ:10eV': 'ESI-QQ', 'Positive-Quattro_QQQ:25eV': 'ESI-QQ', 'Positive-Quattro_QQQ:40eV': 'ESI-QQ'}

## Helper Functions

In [2]:
def get_all_spectra(path):
    """Load every spectrum from an ``.mgf`` or ``.msp`` file into a list.

    Parameters
    ----------
    path : str or os.PathLike
        Location of the spectrum file. The loader is chosen from the file
        extension, which is compared case-insensitively.

    Returns
    -------
    list
        The spectra in the order the loader yields them. An empty list is
        returned when the extension is neither ``.mgf`` nor ``.msp``.
    """
    # os.fspath accepts plain strings as well as pathlib.Path objects.
    path = os.fspath(path)
    lowered = path.lower()

    if lowered.endswith(".mgf"):
        spectrum_generator = load_from_mgf(path, metadata_harmonization=False)
    elif lowered.endswith(".msp"):
        spectrum_generator = load_from_msp(path, metadata_harmonization=False)
    else:
        return []

    # tqdm only reports progress; the spectra are collected unchanged.
    return list(tqdm(spectrum_generator))

In [112]:
def get_unique_values(generator):
    """Count the values of selected metadata fields over a stream of spectra.

    Parameters
    ----------
    generator : iterable
        Objects exposing a ``metadata`` mapping. The iterable is walked once.

    Returns
    -------
    tuple of collections.Counter
        ``(adducts, modes, instruments, levels, energies)``: occurrences of
        the ``adduct``, ``mode``, ``instrument_type``, ``level`` and
        ``collision_energy`` metadata values, in that order.

    Notes
    -----
    Each field is counted independently, so a spectrum that lacks one field
    still contributes its other fields. Previously a blanket ``except`` made
    the outcome depend on which field was missing and hid unrelated errors.
    """
    fields = ("adduct", "mode", "instrument_type", "level", "collision_energy")
    counters = {field: collections.Counter() for field in fields}

    for s in tqdm(generator):
        # Entries without usable metadata are skipped rather than aborting the scan.
        metadata = getattr(s, "metadata", None)
        if not metadata:
            continue

        for field in fields:
            if field in metadata:
                counters[field][metadata[field]] += 1

    return tuple(counters[field] for field in fields)

In [120]:
def has_numbers(inputString):
    """Report whether ``inputString`` contains at least one digit character."""
    for character in inputString:
        if character.isdigit():
            return True
    return False

## 1: Load data

In [113]:
def _open_processed_msp(file_name):
    """Return the matchms MSP reader for a file inside ``processed_data_folder``."""
    return load_from_msp(os.path.join(processed_data_folder, file_name))


# One reader per library. NOTE(review): the variable names suggest these are
# generators; if so each can be walked only once, so re-run this cell before
# iterating a dataset a second time.
extra_dataset_generator = _open_processed_msp("extra_MS.msp")
massbank_dataset_generator = _open_processed_msp("massbank.msp")
mona_dataset_generator = _open_processed_msp("mona.msp")
GNPS_dataset_generator = _open_processed_msp("GNPS.msp")

## 2: Get unique values

In [109]:
# Per-dataset "already counted" flags: the next cell only scans a dataset
# whose flag is False, and flips the flag to True once it has done so.
extra_dataset_check = True
massbank_dataset_check = True
mona_dataset_check = True
GNPS_dataset_check = False

In [114]:
# A dataset is scanned when its flag is False, or when its results are absent
# from the namespace (e.g. a fresh kernel where the flag was left True).
# Without the second condition the totals cell further down fails with a
# NameError after "Restart & Run All".
# NOTE(review): a reader that was already iterated yields nothing on a second
# pass; re-run the load cell before forcing a re-scan.

if extra_dataset_check is False or "extra_dataset_output" not in globals():
    print("Get extra dataset")
    extra_dataset_output = get_unique_values(extra_dataset_generator)
    (
        extra_dataset_unique_adducts,
        extra_dataset_unique_modes,
        extra_dataset_unique_instruments,
        extra_dataset_unique_levels,
        extra_dataset_unique_energies,
    ) = extra_dataset_output
    extra_dataset_check = True
    print()

if massbank_dataset_check is False or "massbank_dataset_output" not in globals():
    print("Get massbank dataset")
    massbank_dataset_output = get_unique_values(massbank_dataset_generator)
    (
        massbank_dataset_unique_adducts,
        massbank_dataset_unique_modes,
        massbank_dataset_unique_instruments,
        massbank_dataset_unique_levels,
        massbank_dataset_unique_energies,
    ) = massbank_dataset_output
    massbank_dataset_check = True
    print()

if mona_dataset_check is False or "mona_dataset_output" not in globals():
    print("Get mona dataset")
    mona_dataset_output = get_unique_values(mona_dataset_generator)
    (
        mona_dataset_unique_adducts,
        mona_dataset_unique_modes,
        mona_dataset_unique_instruments,
        mona_dataset_unique_levels,
        mona_dataset_unique_energies,
    ) = mona_dataset_output
    mona_dataset_check = True
    print()

if GNPS_dataset_check is False or "GNPS_dataset_output" not in globals():
    print("Get GNPS dataset")
    GNPS_dataset_output = get_unique_values(GNPS_dataset_generator)
    (
        GNPS_dataset_unique_adducts,
        GNPS_dataset_unique_modes,
        GNPS_dataset_unique_instruments,
        GNPS_dataset_unique_levels,
        GNPS_dataset_unique_energies,
    ) = GNPS_dataset_output
    GNPS_dataset_check = True
    print()

Get GNPS dataset


352057it [06:33, 894.53it/s] 







## 3: Look at the breakdown of unique values

In [115]:
# Merge the per-dataset counters field by field; adding Counters sums the
# counts of values that occur in more than one library.
total_unique_adducts = (
    extra_dataset_unique_adducts
    + massbank_dataset_unique_adducts
    + mona_dataset_unique_adducts
    + GNPS_dataset_unique_adducts
)
total_unique_modes = (
    extra_dataset_unique_modes
    + massbank_dataset_unique_modes
    + mona_dataset_unique_modes
    + GNPS_dataset_unique_modes
)
total_unique_instruments = (
    extra_dataset_unique_instruments
    + massbank_dataset_unique_instruments
    + mona_dataset_unique_instruments
    + GNPS_dataset_unique_instruments
)
total_unique_levels = (
    extra_dataset_unique_levels
    + massbank_dataset_unique_levels
    + mona_dataset_unique_levels
    + GNPS_dataset_unique_levels
)
total_unique_energies = (
    extra_dataset_unique_energies
    + massbank_dataset_unique_energies
    + mona_dataset_unique_energies
    + GNPS_dataset_unique_energies
)

In [66]:
# The 50 most frequent adduct strings across all four libraries, with counts.
total_unique_adducts.most_common(50)

[('[M+H]+', 290587),
 ('[M+Na]+', 115872),
 ('M+H', 65366),
 ('[M-H]-', 26492),
 ('M+NH4', 12170),
 ('M-H', 10623),
 ('M+Na', 10119),
 ('[M+NH4]+', 8157),
 ('[M+H]', 7389),
 ('[M]+', 4706),
 ('[M+H-H2O]+', 3441),
 ('M-H2O+H', 2512),
 ('2M-H', 1712),
 ('M+H-H2O', 1615),
 ('2M+H', 1606),
 ('M+formate', 1575),
 ('[M+HCOO]-', 1014),
 ('M+Cl', 971),
 ('2M+Na', 865),
 ('M-2H2O+H', 812),
 ('[M+H-2H2O]+', 681),
 ('[M+CH3COO]-', 604),
 ('[M+CH3COOH-H]-', 589),
 ('Unknown', 576),
 ('[M+CH3COO]-/[M-CH3]-', 564),
 ('[M-H2O+H]+', 532),
 ('M+K', 393),
 ('M+2H', 337),
 ('[M-2H]-', 311),
 ('2M-2H+Na', 273),
 ('M-e', 271),
 ('[2M+Na]+', 219),
 ('M', 189),
 ('M+2H]', 187),
 ('[M+K]+', 174),
 ('[M]+*', 161),
 ('[M-H]', 151),
 ('[M+Na]', 136),
 ('M-H+2Na', 122),
 ('M+', 117),
 ('[2M+H]+', 108),
 ('[M+2H]', 108),
 ('[M-2H2O+H]+', 101),
 ('[M-H2O]+', 92),
 ('[M+2H]++', 90),
 ('[2M+NH4]+', 90),
 ('M+ACN+H', 84),
 ('M+acetate', 76),
 ('2M+K', 74),
 ('[M-H]-/[M-C3H6NO2]-', 72)]

In [67]:
# Every distinct `mode` value across all four libraries, most frequent first.
total_unique_modes.most_common()

[('Positive', 430010),
 ('POSITIVE', 60496),
 ('P', 34287),
 ('NEGATIVE', 29470),
 ('Negative', 16011),
 ('positive', 6472),
 ('Positive-20eV', 2),
 ('negative', 1)]

In [68]:
# Every distinct `instrument_type` value across all four libraries, most frequent first.
total_unique_instruments.most_common()

[('ESI-Orbitrap', 244382),
 ('Orbitrap', 100439),
 ('LC-ESI-QTOF', 52820),
 ('LC-ESI-ITFT', 31299),
 ('LC-ESI-Orbitrap', 29630),
 ('LC-ESI-QFT', 25275),
 ('LC-ESI-qTof', 15239),
 ('LC-ESI-CID; Velos', 14754),
 ('LC-ESI-HCD; Velos', 14752),
 ('LC-ESI-QQ', 8584),
 ('ESI-qTof', 7091),
 ('LC-ESI-Maxis II HD Q-TOF Bruker', 5820),
 ('ESI-Flow-injection QqQ/MS', 4250),
 ('ESI-QTOF', 3309),
 ('ESI-qToF', 2307),
 ('ESI-QQQ', 1465),
 ('ESI-Ion Trap', 1456),
 ('ESI-LC-Q-TOF/MS', 1041),
 ('LC-ESI- impact HD', 904),
 ('LC-ESI-IT', 888),
 ('LC-ESI-qToF', 746),
 ('DI-ESI-qTof', 676),
 ('Positive-Quattro_QQQ:10eV', 642),
 ('Positive-Quattro_QQQ:25eV', 636),
 ('Positive-Quattro_QQQ:40eV', 628),
 ('LC-ESI-CID; Lumos', 538),
 ('LC-ESI-HCD; Lumos', 537),
 ('ESI-QIT', 490),
 ('LC-ESI-Q-Exactive Plus', 484),
 ('GC-APCI-QTOF', 475),
 ('LC-Q-TOF/MS', 444),
 ('ESI-QqQ', 417),
 ('LC-ESI-Ion Trap', 373),
 ('ESI-IT/ion trap', 267),
 ('LC-ESI-TOF', 262),
 ('LC-APPI-QQ', 256),
 ('ESI-Q-TOF', 211),
 ('DI-ESI-LTQ-FT-

In [100]:
# List each instrument label that has no entry in instruments_mapping along
# with the number of spectra carrying it, then report the combined total.
missing = 0
for instrument, n_spectra in total_unique_instruments.items():
    if instrument in instruments_mapping:
        continue
    print(instrument, n_spectra)
    missing += n_spectra
print(missing, "cannot be matched")

Orbitrap 100439
GC-APCI-QTOF 475
ESI-TOF 56
MALDI-TOFTOF 4
FAB-EBEB 172
LC-ESI-Q 32
LC-ESI-TOF 262
ESI-ITFT 95
ESI-QQ 6
LC-ESI-ITTOF 13
ESI-ITTOF 5
MALDI-QIT 1
SI-BE 1
APCI-ITTOF 2
MALDI-QITTOF 2
FAB-BE 1
LC-APPI-QQ 256
LC-ESI-FT 37
LC-Q-TOF/MS 444
LC-ESI-Ger 44
DI-ESI-Hybrid Ft 1
DI-ESI-LTQ-FT-ICR 175
LC-ESI-LTQ-FTICR 12
ESI-Maxis 1
DI-ESI-Q-Exactive 1
EI-QQQ 1
LC-ESI-LC-ESI-QTOF 2
ESI-QqIT 2
in source ESI-QqQ 6
ESI-LIT 59
ESI-in source CID 18
ESI-QIT-TOF 4
102629 cannot be matched


In [116]:
# Every distinct `level` value across all four libraries, most frequent first.
total_unique_levels.most_common()

[('2', 452496), ('MS2', 122929), ('MS3', 929), ('MS', 325), ('MS4', 70)]

In [127]:
# Print the collision-energy strings that contain a digit and the " V"
# marker, together with their counts, most frequent first.
for energy, occurrences in total_unique_energies.most_common():
    if not (has_numbers(energy) and " V" in energy):
        continue
    print(energy, occurrences)

30 V 705
20 V 536
40 V 508
10 V 505
50 V 454
Ramp 5-60 V 426
Ramp 5-45 V 365
30.0 V 65
40.0 V 44
25.0 V 41
50.0 V 36
20.0 V 30
35.0 V 23
15.0 V 18
60 V 11
25 V 8
35 V 8
80 V 7
70 V 3
10.0 V 3
15 V 1
22 V 1
