## Import libraries

In [1]:
import os 
import re
import logging
import collections
from tqdm import tqdm
from config import raw_data_folder, processed_data_folder, GNPS_url_list
from matchms.importing import load_from_mgf, load_from_msp

# Silence log records at WARNING level and below for the whole notebook session.
logging.disable(logging.WARNING)

## Settings

In [2]:
# Lookup table from the raw instrument-type strings found in the datasets to a
# normalised instrument label. Keys are matched exactly (case-sensitive), which
# is why several spellings of the same instrument (e.g. 'ESI-qTOF', 'ESI-qToF',
# 'ESI-qTof') are listed separately. The coverage cell further down counts how
# many observed instrument values are NOT keys of this dict.
# NOTE(review): 'LC-ESIMS-qTOF' maps to 'LC-ESI-ITFT' while every other qTOF
# spelling maps to a *-QTOF label -- confirm this is intentional.
# NOTE(review): 'ESI-Orbitrap' / 'LC-ESI-Orbitrap' map to *-ITFT but
# 'DI-ESI-Orbitrap' and 'APCI-Orbitrap' map to *-QFT -- confirm.
# NOTE(review): 'N/A-N/A' is mapped to 'ESI-QTOF'; presumably a chosen default
# for unknown instruments -- confirm.
instruments_mapping = {'-Maxis HD qTOF': 'ESI-QTOF', 'ESI-QTOF': 'ESI-QTOF', '-Q-Exactive Plus Orbitrap Res 14k': 'ESI-QFT', '-Q-Exactive Plus Orbitrap Res 70k': 'ESI-QFT',
                       'APCI-Ion Trap': 'APCI-IT', 'APCI-Orbitrap': 'APCI-QFT', 'APCI-QQQ': 'APCI-QQ', 'APCI-qTof': 'APCI-QTOF', 'CI (MeOH)-IT/ion trap': 'CI-IT',
                       'CI-IT/ion trap': 'CI-IT', 'DI-ESI-Hybrid FT': 'ESI-QFT', 'DI-ESI-Ion Trap': 'ESI-IT', 'DI-ESI-Orbitrap': 'ESI-QFT',
                       'DI-ESI-Q-Exactive Plus': 'ESI-QFT', 'DI-ESI-QQQ': 'ESI-QQ', 'DI-ESI-qTof': 'ESI-QTOF', 
                       'DIRECT INFUSION NANOESI-ION TRAP-DIRECT INFUSION NANOESI-ION TRAP': 'ESI-IT', 'ESI or APCI-IT/ion trap': 'ESI-IT',
                       'APCI-ITFT': 'APCI-ITFT', 'LC-APCI-ITFT': 'APCI-ITFT', 'ESI-APCI-ITFT': 'APCI-ITFT', 'ESI-ESI-FTICR': 'ESI-FT', 'ESI-ESI-ITFT': 'ESI-ITFT', 'ESI-FAB-EBEB': 'FAB-EBEB',
                       'ESI-Flow-injection QqQ/MS': 'ESI-QQ', 'ESI-HCD': 'ESI-QFT', 'ESI-HPLC-ESI-TOF': 'LC-ESI-TOF', 'ESI-Hybrid FT': 'ESI-QFT',
                       'ESI-IT-FT/ion trap with FTMS': 'ESI-ITFT', 'ESI-IT/ion trap': 'ESI-IT', 'ESI-Ion Trap': 'ESI-IT', 'ESI-LC-APPI-QQ': 'LC-APPI-QQ',
                       'LC-ESI-IT': 'LC-ESI-IT', 'ESI-LC-ESI-IT': 'LC-ESI-IT', 'ESI-LC-ESI-ITFT': 'LC-ESI-ITFT', 'LC-ESI-ITFT': 'LC-ESI-ITFT', 'ESI-LC-ESI-ITTOF': 'LC-ESI-ITTOF', 'ESI-LC-ESI-Q': 'LC-ESI-Q',
                       'LC-ESI-QFT':'LC-ESI-QFT', 'ESI-LC-ESI-QFT': 'LC-ESI-QFT', 'LC-ESI-QQ':'LC-ESI-QQ', 'ESI-LC-ESI-QQ': 'LC-ESI-QQ', 'ESI-LC-ESI-QTOF': 'LC-ESI-QTOF', 'LC-ESI-QTOF': 'LC-ESI-QTOF', 'ESI-LC-Q-TOF/MS': 'LC-ESI-QTOF',
                       'ESI-Orbitrap': 'ESI-ITFT', 'ESI-Q-TOF': 'ESI-QTOF', 'ESI-QIT': 'ESI-QIT', 'ESI-QQQ': 'ESI-QQ', 'ESI-QqQ': 'ESI-QQ', 'ESI-UPLC-ESI-QTOF': 'LC-ESI-QTOF',
                       'ESI-qTOF': 'ESI-QTOF', 'ESI-qToF': 'ESI-QTOF', 'ESI-qTof': 'ESI-QTOF', 'FAB-BEqQ/magnetic and electric sectors with quadrupole': 'FAB-BEQQ',
                       'In-source CID-API': 'ESI-QQ', 'LC-APCI-qTof': 'LC-APCI-QTOF', 'LC-ESI- impact HD': 'LC-ESI-QTOF', 'LC-ESI-CID; Lumos': 'LC-ESI-ITFT',
                       'LC-ESI-CID; Velos': 'LC-ESI-ITFT', 'LC-ESI-HCD; Lumos': 'LC-ESI-ITFT', 'LC-ESI-HCD; Velos': 'LC-ESI-ITFT', 'LC-ESI-Hybrid FT': 'LC-ESI-QFT',
                       'LC-ESI-Hybrid Ft': 'LC-ESI-QFT', 'LC-ESI-ITFT-LC-ESI-ITFT': 'LC-ESI-ITFT', 'LC-ESI-ITTOF-LC-ESI-ITTOF': 'LC-ESI-ITTOF', 'LC-ESI-Ion Trap': 'LC-ESI-IT',
                       'LC-ESI-LCQ': 'LC-ESI-IT', 'LC-ESI-Maxis HD qTOF': 'LC-ESI-QTOF', 'LC-ESI-Maxis II HD Q-TOF Bruker': 'LC-ESI-QTOF', 'LC-ESI-Orbitrap': 'LC-ESI-ITFT',
                       'LC-ESI-Q-Exactive Plus': 'LC-ESI-QFT', 'LC-ESI-Q-Exactive Plus Orbitrap Res 14k': 'LC-ESI-QFT', 'LC-ESI-Q-Exactive Plus Orbitrap Res 70k': 'LC-ESI-QFT',
                       'LC-ESI-QQ-LC-ESI-QQ': 'LC-ESI-QQ', 'LC-ESI-QQQ': 'LC-ESI-QQ', 'LC-ESI-QTOF-LC-ESI-QTOF': 'LC-ESI-QTOF', 'LC-ESI-qTOF': 'LC-ESI-QTOF',
                       'LC-ESI-qToF': 'LC-ESI-QTOF', 'LC-ESI-qTof': 'LC-ESI-QTOF', 'LC-ESIMS-qTOF': 'LC-ESI-ITFT', 'N/A-ESI-QFT': 'ESI-QFT', 'N/A-ESI-QTOF': 'ESI-QTOF',
                       'N/A-Linear Ion Trap': 'ESI-IT', 'N/A-N/A': 'ESI-QTOF', 'Negative-Quattro_QQQ:10eV': 'ESI-QQ', 'Negative-Quattro_QQQ:25eV': 'ESI-QQ',
                       'Negative-Quattro_QQQ:40eV': 'ESI-QQ', 'Positive-Quattro_QQQ:10eV': 'ESI-QQ', 'Positive-Quattro_QQQ:25eV': 'ESI-QQ', 'Positive-Quattro_QQQ:40eV': 'ESI-QQ'}

## Helper Functions

In [3]:
def get_all_spectra(path):
    """Read every spectrum from an ``.mgf`` or ``.msp`` file into a list.

    Parameters
    ----------
    path : str
        File path; the loader is picked from the (case-sensitive) suffix.

    Returns
    -------
    list
        All spectra yielded by the matching matchms loader, in file order.
        An empty list is returned for any other file extension.
    """
    if path.endswith(".mgf"):
        loader = load_from_mgf
    elif path.endswith(".msp"):
        loader = load_from_msp
    else:
        # Unsupported extension: nothing to load.
        return []

    # Harmonization is switched off so the metadata is kept as written in the file.
    progress = tqdm(enumerate(loader(path, metadata_harmonization=False)))
    return [spectrum for _, spectrum in progress]

In [4]:
def get_unique_values(generator):
    """Count the distinct metadata values found across an iterable of spectra.

    Each item must expose a ``metadata`` mapping. A spectrum is counted only
    when ALL of the keys ``adduct``, ``mode``, ``instrument_type``, ``level``
    and ``collision_energy`` can be read; otherwise the error is printed and
    the spectrum is skipped entirely, so the five counters always describe
    the same set of spectra.

    Parameters
    ----------
    generator : iterable
        Spectra to inspect (consumed once; a progress bar is shown via tqdm).

    Returns
    -------
    tuple of collections.Counter
        ``(adducts, modes, instruments, levels, energies)``.
    """
    fields = ("adduct", "mode", "instrument_type", "level", "collision_energy")
    counters = tuple(collections.Counter() for _ in fields)

    for spectrum in tqdm(generator):
        try:
            # Read the metadata once and resolve every field BEFORE counting,
            # so a missing key cannot leave a spectrum half-counted.
            metadata = spectrum.metadata
            values = [metadata[field] for field in fields]
        except Exception as e:
            # Best-effort scan: report the problem and move on to the next spectrum.
            print(e)
            continue

        for counter, value in zip(counters, values):
            counter[value] += 1

    adducts, modes, instruments, levels, energies = counters
    return adducts, modes, instruments, levels, energies

In [5]:
def has_numbers(inputString):
    """Return True if at least one character of ``inputString`` is a digit (per ``str.isdigit``)."""
    for character in inputString:
        if character.isdigit():
            return True
    return False

In [6]:
def is_float(s):
    """Return True if ``float(s)`` succeeds, False otherwise.

    Note that strings such as ``"nan"`` and ``"inf"`` are accepted by
    ``float`` and therefore return True.
    """
    try:
        float(s)
    except (TypeError, ValueError, OverflowError):
        # Only conversion failures count as "not a float"; a bare ``except``
        # would also hide KeyboardInterrupt / SystemExit.
        return False
    return True

## 1: Load data

In [10]:
# Open one matchms loader per processed library file (extra MS, massbank, mona, GNPS).
# The files are opened in the same order as they are listed here.
(
    extra_dataset_generator,
    massbank_dataset_generator,
    mona_dataset_generator,
    GNPS_dataset_generator,
) = (
    load_from_msp(os.path.join(processed_data_folder, file_name))
    for file_name in ("extra_MS.msp", "massbank.msp", "mona.msp", "GNPS.msp")
)

## 2: Get unique values

In [11]:
# One "already processed" flag per dataset; all start out unset.
extra_dataset_check = massbank_dataset_check = mona_dataset_check = GNPS_dataset_check = False

In [12]:
# Each dataset is scanned at most once per kernel session: its *_check flag is
# set after the scan so that re-running this cell skips it.
# (The loaders are presumably single-use generators -- TODO confirm.)
if not extra_dataset_check:
    print("Get extra dataset")
    extra_dataset_output = get_unique_values(extra_dataset_generator)
    (
        extra_dataset_unique_adducts,
        extra_dataset_unique_modes,
        extra_dataset_unique_instruments,
        extra_dataset_unique_levels,
        extra_dataset_unique_energies,
    ) = extra_dataset_output
    extra_dataset_check = True
    print()

if not massbank_dataset_check:
    print("Get massbank dataset")
    massbank_dataset_output = get_unique_values(massbank_dataset_generator)
    (
        massbank_dataset_unique_adducts,
        massbank_dataset_unique_modes,
        massbank_dataset_unique_instruments,
        massbank_dataset_unique_levels,
        massbank_dataset_unique_energies,
    ) = massbank_dataset_output
    massbank_dataset_check = True
    print()

if not mona_dataset_check:
    print("Get mona dataset")
    mona_dataset_output = get_unique_values(mona_dataset_generator)
    (
        mona_dataset_unique_adducts,
        mona_dataset_unique_modes,
        mona_dataset_unique_instruments,
        mona_dataset_unique_levels,
        mona_dataset_unique_energies,
    ) = mona_dataset_output
    mona_dataset_check = True
    print()

if not GNPS_dataset_check:
    print("Get GNPS dataset")
    GNPS_dataset_output = get_unique_values(GNPS_dataset_generator)
    (
        GNPS_dataset_unique_adducts,
        GNPS_dataset_unique_modes,
        GNPS_dataset_unique_instruments,
        GNPS_dataset_unique_levels,
        GNPS_dataset_unique_energies,
    ) = GNPS_dataset_output
    GNPS_dataset_check = True
    print()

Get extra dataset


100439it [01:20, 1250.64it/s]



Get massbank dataset


86740it [00:58, 1489.92it/s]



Get mona dataset


32747it [00:21, 1511.74it/s]



Get GNPS dataset


35051it [00:23, 1518.85it/s]







## 3: Look at the number of data points for each dataset

## 4: Look at the breakdown of unique values

In [13]:
# Merge the per-dataset counters: adding Counters sums the counts key by key.
total_unique_adducts = (
    extra_dataset_unique_adducts
    + massbank_dataset_unique_adducts
    + mona_dataset_unique_adducts
    + GNPS_dataset_unique_adducts
)
total_unique_modes = (
    extra_dataset_unique_modes
    + massbank_dataset_unique_modes
    + mona_dataset_unique_modes
    + GNPS_dataset_unique_modes
)
total_unique_instruments = (
    extra_dataset_unique_instruments
    + massbank_dataset_unique_instruments
    + mona_dataset_unique_instruments
    + GNPS_dataset_unique_instruments
)
total_unique_levels = (
    extra_dataset_unique_levels
    + massbank_dataset_unique_levels
    + mona_dataset_unique_levels
    + GNPS_dataset_unique_levels
)
total_unique_energies = (
    extra_dataset_unique_energies
    + massbank_dataset_unique_energies
    + mona_dataset_unique_energies
    + GNPS_dataset_unique_energies
)

In [17]:
# List adducts from most to least frequent, skipping every label that ends in "-"
# (presumably the negatively charged adducts -- confirm).
for adduct, adduct_count in total_unique_adducts.most_common():
    # endswith() also copes with an empty adduct string, where indexing [-1]
    # would raise IndexError.
    if adduct.endswith("-"):
        continue
    print(adduct, adduct_count)

[M+H]+ 187663
[M+Na]+ 8419
[M+NH4]+ 8060
[M]+ 4790
[M+H-H2O]+ 4252
[M+H-2H2O]+ 681
[Unknown]+ 576
[M-e]+ 271
[2M+H]+ 263
[M-H2O+H]+ 210
[M+2H]]+ 187
[M-H+2Na]+ 121
[M+K]+ 99
[M-H2O]+ 92
[M+2H]++ 90
[M+H-NH3]+ 32
[M-2H2O+H]+ 24
[M-H+Na]+ 21
[M+]+ 19
[2M+Na]+ 12
[M+H]]+ 12
[M]++ 10
[M+2Na]]+ 9
[M+2H]+ 3
[M-OH]+ 2
[M-H]+ 2
[M+H-C9H10O5]+ 2
[M-C6H10O5+H]+ 2
[M]+* 2
[M+H-C8H10O]+ 1


In [18]:
# Ion-mode values ordered by frequency. The recorded output shows the same mode
# under several spellings ('Positive', 'POSITIVE', 'P'; 'NEGATIVE', 'Negative').
total_unique_modes.most_common()

[('Positive', 124740),
 ('POSITIVE', 58450),
 ('P', 32747),
 ('NEGATIVE', 28290),
 ('Negative', 10750)]

In [19]:
# How many spectra carry an instrument value that is NOT a key of instruments_mapping?
total = sum(total_unique_instruments.values())
count = sum(
    occurrences
    for instrument, occurrences in total_unique_instruments.items()
    if instrument not in instruments_mapping
)

# Printed as: unmapped count, overall count, unmapped percentage.
print(count, total, count/total * 100)

764 254977 0.2996348690274025


In [20]:
# MS-level values ordered by frequency. The recorded output shows both '2' and
# 'MS2' among the values, alongside 'MS3' and 'MS'.
total_unique_levels.most_common()

[('2', 135490), ('MS2', 118787), ('MS3', 636), ('MS', 64)]

In [26]:
# Print each distinct collision-energy value (most frequent first) after dropping
# a "(max)" marker and truncating to a whole number.
for raw_energy, _ in total_unique_energies.most_common():
    energy = float(raw_energy.replace("(max)", "").strip())
    print(float(int(energy)))

60.0
20.0
30.0
40.0
40.0
60.0
6.0
10.0
15.0
45.0
50.0
90.0
75.0
70.0
80.0
100.0
35.0
110.0
120.0
130.0
140.0
150.0
25.0
5.0
55.0
65.0
0.0
0.0
0.0
0.0
0.0
0.0
85.0
42.0
21.0
10.0
18.0
59.0
10.0
21.0
24.0
42.0
28.0
52.0
35.0
45.0
17.0
14.0
31.0
56.0
49.0
38.0
29.0
63.0
17.0
16.0
14.0
16.0
26.0
32.0
5.0
34.0
24.0
37.0
40.0
29.0
8.0
0.0
39.0
9.0
9.0
8.0
13.0
18.0
25.0
14.0
0.0
7.0
42.0
66.0
31.0
37.0
44.0
70.0
28.0
39.0
27.0
24.0
16.0
41.0
21.0
65.0
8.0
43.0
57.0
37.0
19.0
13.0
13.0
16.0
15.0
19.0
13.0
15.0
13.0
4.0
4.0
40.0
7.0
4.0
20.0
18.0
15.0
33.0
12.0
23.0
13.0
34.0
6.0
39.0
20.0
47.0
50.0
27.0
24.0
6.0
12.0
27.0
14.0
22.0
38.0
22.0
18.0
30.0
7.0
9.0
31.0
24.0
9.0
6.0
20.0
15.0
13.0
30.0
26.0
48.0
34.0
32.0
28.0
43.0
27.0
16.0
16.0
40.0
22.0
16.0
12.0
14.0
13.0
18.0
19.0
14.0
14.0
13.0
19.0
17.0
10.0
19.0
14.0
13.0
10.0
21.0
15.0
8.0
12.0
46.0
50.0
22.0
7.0
30.0
8.0
33.0
18.0
19.0
15.0
36.0
12.0
32.0
25.0
15.0
37.0
54.0
31.0
6.0
45.0
25.0
20.0
10.0
42.0
41.0
21.0
38.0
30.0
10.0
16.0
