### Load mass spec data
Libraries were downloaded on 13/05/2024 from:
- GNPS: https://gnps-external.ucsd.edu/gnpslibrary
- Mona: https://mona.fiehnlab.ucdavis.edu/downloads
- MassBank: https://github.com/MassBank/MassBank-data/releases
- Public library created in https://doi.org/10.26434/chemrxiv-2024-l1tqh, downloaded from https://zenodo.org/records/11163381. From now on referred to as the Corinna Bruns library.

From GNPS the following libraries were downloaded:
'BERKELEY-LAB.mgf',
 'BILELIB19.mgf',
 'BIRMINGHAM-UHPLC-MS-NEG.mgf',
 'BIRMINGHAM-UHPLC-MS-POS.mgf',
 'BMDMS-NP.mgf',
 'CASMI.mgf',
 'DRUGS-OF-ABUSE-LIBRARY.mgf',
 'ECG-ACYL-AMIDES-C4-C24-LIBRARY.mgf',
 'ECG-ACYL-ESTERS-C4-C24-LIBRARY.mgf',
 'GNPS-COLLECTIONS-MISC.mgf',
 'GNPS-COLLECTIONS-PESTICIDES-NEGATIVE.mgf',
 'GNPS-COLLECTIONS-PESTICIDES-POSITIVE.mgf',
 'GNPS-D2-AMINO-LIPID-LIBRARY.mgf',
 'GNPS-EMBL-MCF.mgf',
 'GNPS-FAULKNERLEGACY.mgf',
 'GNPS-IOBA-NHC.mgf',
 'GNPS-LIBRARY.mgf',
 'GNPS-MSMLS.mgf',
 'GNPS-NIH-CLINICALCOLLECTION1.mgf',
 'GNPS-NIH-CLINICALCOLLECTION2.mgf',
 'GNPS-NIH-NATURALPRODUCTSLIBRARY.mgf',
 'GNPS-NIH-NATURALPRODUCTSLIBRARY_ROUND2_NEGATIVE.mgf',
 'GNPS-NIH-NATURALPRODUCTSLIBRARY_ROUND2_POSITIVE.mgf',
 'GNPS-NIH-SMALLMOLECULEPHARMACOLOGICALLYACTIVE.mgf',
 'GNPS-NIST14-MATCHES.mgf',
 'GNPS-NUTRI-METAB-FEM-NEG.mgf',
 'GNPS-NUTRI-METAB-FEM-POS.mgf',
 'GNPS-PRESTWICKPHYTOCHEM.mgf',
 'GNPS-SAM-SIK-KANG-LEGACY-LIBRARY.mgf',
 'GNPS-SCIEX-LIBRARY.mgf',
 'GNPS-SELLECKCHEM-FDA-PART1.mgf',
 'GNPS-SELLECKCHEM-FDA-PART2.mgf',
 'HCE-CELL-LYSATE-LIPIDS.mgf',
 'HMDB.mgf',
 'IQAMDB.mgf',
 'LDB_NEGATIVE.mgf',
 'LDB_POSITIVE.mgf',
 'MIADB.mgf',
 'MMV_NEGATIVE.mgf',
 'MMV_POSITIVE.mgf',
 'PNNL-LIPIDS-NEGATIVE.mgf',
 'PNNL-LIPIDS-POSITIVE.mgf',
 'PSU-MSMLS.mgf',
 'RESPECT.mgf',
 'SUMNER.mgf',
 'UM-NPDC.mgf'

From Mona the following libraries were downloaded: "LC-MS Spectra"

From MassBank, version 2023.11 was downloaded.

From https://zenodo.org/records/11163381 the files "20231031_nihnp_library_neg_all_lib_MSn.mgf", "20231130_mcescaf_library_neg_all_lib_MSn.mgf",
"20231130_otavapep_library_neg_all_lib_MSn.mgf", "20240411_mcebio_library_neg_all_lib_MSn.mgf", "20231031_nihnp_library_pos_all_lib_MSn.mgf", "20231130_mcescaf_library_pos_all_lib_MSn.mgf", "20231130_otavapep_library_pos_all_lib_MSn.mgf" and "20240411_mcebio_library_pos_all_lib_MSn.mgf" were downloaded.

## Remove merged spectra
Corinna Bruns library contained merged spectra, which were removed.

In [2]:
# Folder that holds the downloaded, unprocessed library files. The cells below
# read their input files from this folder and write their output files to it.
raw_library_folder = "unprocessed_libraries/"

In [None]:
from matchms.importing.load_from_mgf import load_from_mgf
from matchms.exporting.save_as_mgf import save_as_mgf
from tqdm import tqdm
import os

# The eight MGF files of the Corinna Bruns library ("neg" and "pos" variants).
corinna_file_names = (
    "20231031_nihnp_library_neg_all_lib_MSn.mgf",
    "20231130_mcescaf_library_neg_all_lib_MSn.mgf",
    "20231130_otavapep_library_neg_all_lib_MSn.mgf",
    "20240411_mcebio_library_neg_all_lib_MSn.mgf",
    "20231031_nihnp_library_pos_all_lib_MSn.mgf",
    "20231130_mcescaf_library_pos_all_lib_MSn.mgf",
    "20231130_otavapep_library_pos_all_lib_MSn.mgf",
    "20240411_mcebio_library_pos_all_lib_MSn.mgf",
)

corinna_bruns_library = []
for file_name in corinna_file_names:
    # Load the whole file into a list first, so tqdm knows the total number
    # of spectra and can draw a complete progress bar.
    spectra = list(load_from_mgf(os.path.join(raw_library_folder, file_name)))
    # Keep only spectra that have no "spectype" metadata entry.
    # NOTE(review): presumably "spectype" is what marks the merged spectra
    # that this section sets out to remove -- confirm against the library files.
    corinna_bruns_library.extend(
        spectrum for spectrum in tqdm(spectra) if spectrum.get("spectype") is None
    )


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 85569/85569 [00:05<00:00, 16038.92it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 75704/75704 [00:04<00:00, 15853.53it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 31955/31955 [00:02<00:00, 15686.76it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 130364/130364 [00:08<00:00, 15309.93it/s]
100%|███████████████████████████████████████████████████████████████

In [None]:
# Write the filtered Corinna Bruns spectra to one MGF file in the raw library
# folder; the merge cell loads it again as "ms2_spectra_corinna.mgf".
# NOTE(review): matchms' save_as_mgf appears to append to an existing file
# instead of overwriting it -- confirm, and delete the old file before
# re-running this cell to avoid duplicated spectra.
save_as_mgf(corinna_bruns_library, os.path.join(raw_library_folder, "ms2_spectra_corinna.mgf"))

## Download GNPS Library

In [16]:
import os

import requests

# Download every GNPS library file into raw_library_folder.
# A failed download is reported with its file name and the loop moves on to
# the next file, so one bad file does not abort the remaining downloads.
for file_name in ('BERKELEY-LAB.mgf', 'BILELIB19.mgf', 'BIRMINGHAM-UHPLC-MS-NEG.mgf', 'BIRMINGHAM-UHPLC-MS-POS.mgf', 'BMDMS-NP.mgf', 'CASMI.mgf', 'DRUGS-OF-ABUSE-LIBRARY.mgf', 'ECG-ACYL-AMIDES-C4-C24-LIBRARY.mgf', 'ECG-ACYL-ESTERS-C4-C24-LIBRARY.mgf', 'GNPS-COLLECTIONS-MISC.mgf', 'GNPS-COLLECTIONS-PESTICIDES-NEGATIVE.mgf', 'GNPS-COLLECTIONS-PESTICIDES-POSITIVE.mgf', 'GNPS-D2-AMINO-LIPID-LIBRARY.mgf', 'GNPS-EMBL-MCF.mgf', 'GNPS-FAULKNERLEGACY.mgf', 'GNPS-IOBA-NHC.mgf', 'GNPS-LIBRARY.mgf', 'GNPS-MSMLS.mgf', 'GNPS-NIH-CLINICALCOLLECTION1.mgf', 'GNPS-NIH-CLINICALCOLLECTION2.mgf', 'GNPS-NIH-NATURALPRODUCTSLIBRARY.mgf', 'GNPS-NIH-NATURALPRODUCTSLIBRARY_ROUND2_NEGATIVE.mgf', 'GNPS-NIH-NATURALPRODUCTSLIBRARY_ROUND2_POSITIVE.mgf', 'GNPS-NIH-SMALLMOLECULEPHARMACOLOGICALLYACTIVE.mgf', 'GNPS-NIST14-MATCHES.mgf', 'GNPS-NUTRI-METAB-FEM-NEG.mgf', 'GNPS-NUTRI-METAB-FEM-POS.mgf', 'GNPS-PRESTWICKPHYTOCHEM.mgf', 'GNPS-SAM-SIK-KANG-LEGACY-LIBRARY.mgf', 'GNPS-SCIEX-LIBRARY.mgf', 'GNPS-SELLECKCHEM-FDA-PART1.mgf', 'GNPS-SELLECKCHEM-FDA-PART2.mgf', 'HCE-CELL-LYSATE-LIPIDS.mgf', 'HMDB.mgf', 'IQAMDB.mgf', 'LDB_NEGATIVE.mgf', 'LDB_POSITIVE.mgf', 'MIADB.mgf', 'MMV_NEGATIVE.mgf', 'MMV_POSITIVE.mgf', 'PNNL-LIPIDS-NEGATIVE.mgf', 'PNNL-LIPIDS-POSITIVE.mgf', 'PSU-MSMLS.mgf', 'RESPECT.mgf', 'SUMNER.mgf', 'UM-NPDC.mgf'):
    gnps_url = "https://gnps-external.ucsd.edu/gnpslibrary/" + file_name
    try:
        # (connect, read) timeouts in seconds: without a timeout, requests
        # waits forever on a stalled connection and the cell never finishes.
        response = requests.get(gnps_url, timeout=(10, 300))
    except requests.RequestException as error:
        # Connection problems and timeouts are reported like HTTP failures.
        print(f'Failed to download {file_name}: {error}')
        continue
    # Check if the request was successful (status code 200)
    if response.status_code == 200:
        # Open the local file in write-binary mode and write the contents of the response
        with open(os.path.join(raw_library_folder, file_name), 'wb') as f:
            f.write(response.content)
        print(f'File downloaded successfully and saved as {file_name}')
    else:
        # Name the file, so a failure can be traced back to one of the many downloads.
        print(f'Failed to download {file_name}. HTTP Status Code: {response.status_code}')
        print(response)

File downloaded successfully and saved as BERKELEY-LAB.mgf
File downloaded successfully and saved as BILELIB19.mgf
File downloaded successfully and saved as BIRMINGHAM-UHPLC-MS-NEG.mgf
File downloaded successfully and saved as BIRMINGHAM-UHPLC-MS-POS.mgf
File downloaded successfully and saved as BMDMS-NP.mgf
File downloaded successfully and saved as CASMI.mgf
File downloaded successfully and saved as DRUGS-OF-ABUSE-LIBRARY.mgf
File downloaded successfully and saved as ECG-ACYL-AMIDES-C4-C24-LIBRARY.mgf
File downloaded successfully and saved as ECG-ACYL-ESTERS-C4-C24-LIBRARY.mgf
File downloaded successfully and saved as GNPS-COLLECTIONS-MISC.mgf
File downloaded successfully and saved as GNPS-COLLECTIONS-PESTICIDES-NEGATIVE.mgf
File downloaded successfully and saved as GNPS-COLLECTIONS-PESTICIDES-POSITIVE.mgf
File downloaded successfully and saved as GNPS-D2-AMINO-LIPID-LIBRARY.mgf
File downloaded successfully and saved as GNPS-EMBL-MCF.mgf
File downloaded successfully and saved as GNPS-

## Merge libraries

In [None]:
from matchms.importing.load_spectra import load_spectra
from matchms.exporting.save_as_mgf import save_as_mgf
from tqdm import tqdm

# Names of all GNPS library files that were downloaded into raw_library_folder.
gnps_file_names = ('BERKELEY-LAB.mgf', 'BILELIB19.mgf', 'BIRMINGHAM-UHPLC-MS-NEG.mgf', 'BIRMINGHAM-UHPLC-MS-POS.mgf', 'BMDMS-NP.mgf', 'CASMI.mgf', 'DRUGS-OF-ABUSE-LIBRARY.mgf', 'ECG-ACYL-AMIDES-C4-C24-LIBRARY.mgf', 'ECG-ACYL-ESTERS-C4-C24-LIBRARY.mgf', 'GNPS-COLLECTIONS-MISC.mgf', 'GNPS-COLLECTIONS-PESTICIDES-NEGATIVE.mgf', 'GNPS-COLLECTIONS-PESTICIDES-POSITIVE.mgf', 'GNPS-D2-AMINO-LIPID-LIBRARY.mgf', 'GNPS-EMBL-MCF.mgf', 'GNPS-FAULKNERLEGACY.mgf', 'GNPS-IOBA-NHC.mgf', 'GNPS-LIBRARY.mgf', 'GNPS-MSMLS.mgf', 'GNPS-NIH-CLINICALCOLLECTION1.mgf', 'GNPS-NIH-CLINICALCOLLECTION2.mgf', 'GNPS-NIH-NATURALPRODUCTSLIBRARY.mgf', 'GNPS-NIH-NATURALPRODUCTSLIBRARY_ROUND2_NEGATIVE.mgf', 'GNPS-NIH-NATURALPRODUCTSLIBRARY_ROUND2_POSITIVE.mgf', 'GNPS-NIH-SMALLMOLECULEPHARMACOLOGICALLYACTIVE.mgf', 'GNPS-NIST14-MATCHES.mgf', 'GNPS-NUTRI-METAB-FEM-NEG.mgf', 'GNPS-NUTRI-METAB-FEM-POS.mgf', 'GNPS-PRESTWICKPHYTOCHEM.mgf', 'GNPS-SAM-SIK-KANG-LEGACY-LIBRARY.mgf', 'GNPS-SCIEX-LIBRARY.mgf', 'GNPS-SELLECKCHEM-FDA-PART1.mgf', 'GNPS-SELLECKCHEM-FDA-PART2.mgf', 'HCE-CELL-LYSATE-LIPIDS.mgf', 'HMDB.mgf', 'IQAMDB.mgf', 'LDB_NEGATIVE.mgf', 'LDB_POSITIVE.mgf', 'MIADB.mgf', 'MMV_NEGATIVE.mgf', 'MMV_POSITIVE.mgf', 'PNNL-LIPIDS-NEGATIVE.mgf', 'PNNL-LIPIDS-POSITIVE.mgf', 'PSU-MSMLS.mgf', 'RESPECT.mgf', 'SUMNER.mgf', 'UM-NPDC.mgf')

# MassBank, MoNA and the filtered Corinna Bruns file come first, followed by
# every GNPS file; the spectra are merged in this order.
library_file_names = (
    "MassBank_NIST.msp",
    "MoNA-export-LC-MS_Spectra.msp",
    "ms2_spectra_corinna.mgf",
) + gnps_file_names

spectra = []
for file_name in library_file_names:
    library_path = os.path.join(raw_library_folder, file_name)
    # load_spectra is called with metadata harmonization switched off; tqdm
    # wraps the loader to show one progress counter per file.
    spectra.extend(tqdm(load_spectra(library_path, metadata_harmonization=False)))

# Write all collected spectra into a single MGF file.
# NOTE(review): matchms' save_as_mgf appears to append to an existing file
# instead of overwriting it -- confirm before re-running this cell.
save_as_mgf(spectra, os.path.join(raw_library_folder, "merged_libraries.mgf"))

117732it [01:13, 1603.60it/s]
160993it [11:36, 231.03it/s] 
675902it [03:29, 3230.71it/s]
25009it [00:05, 4334.92it/s]
5008it [00:03, 1445.71it/s]
7232it [00:01, 4592.55it/s]
7058it [00:01, 4342.81it/s]
227307it [00:55, 4082.86it/s]
562it [00:00, 4516.46it/s]
480it [00:00, 3085.45it/s]
3144it [00:00, 3441.31it/s]
496it [00:00, 3979.66it/s]
46it [00:00, 2908.63it/s]
76it [00:00, 3711.56it/s]
653it [00:00, 1979.92it/s]
54it [00:00, 2903.67it/s]
585it [00:00, 4133.56it/s]
127it [00:00, 3803.99it/s]
196it [00:00, 1814.08it/s]
14312it [00:10, 1384.05it/s]
863it [00:00, 4029.58it/s]
377it [00:00, 1084.91it/s]
195it [00:00, 1427.39it/s]
1267it [00:02, 464.49it/s]
1863it [00:00, 3728.18it/s]
7915it [00:04, 1826.42it/s]
1460it [00:04, 308.64it/s]
5763it [00:02, 2317.36it/s]
2239it [00:00, 3332.39it/s]
2543it [00:00, 2971.84it/s]
143it [00:00, 651.02it/s]
336it [00:00, 3417.52it/s]
314it [00:00, 4816.63it/s]
2388it [00:00, 2433.43it/s]
656it [00:00, 730.12it/s]
115it [00:00, 260.90it/s]
2235it [