In [1]:
import os

# Turn off HDF5 file locking for this process. This cell runs before every
# other import, presumably so the HDF5 library sees the setting when it loads.
os.environ.update({"HDF5_USE_FILE_LOCKING": "FALSE"})

In [2]:
import sys
from pathlib import Path

# Make the project root importable so that paths.py can be found.
# The root is taken to be the parent of the current working directory, so this
# assumes the notebook is run from a folder directly under the project root.
sys.path.append(str(Path().resolve().parent))
from paths import PROJECT_ROOT

# High-level imports
from dreams.utils.data import MSData
from dreams.api import dreams_embeddings
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import dreams.utils.spectra as su
import pandas as pd
from tqdm import tqdm

Determination of memory status is not supported on this 
 platform, measuring for memoryleaks will never fail


In [3]:
# 1) Define paths ----------------------------------

# --- Reference library ---
# MassSpecGym spectra in MGF format
mgf_path = PROJECT_ROOT.joinpath("data", "massspecgym", "MassSpecGym.mgf")
# HDF5 counterpart: same folder and stem, ".hdf5" extension
h5_path = mgf_path.with_suffix(".hdf5")
# Cache file for the library embeddings, stored beside the HDF5
lib_emb_path = h5_path.parent / f"{h5_path.stem}_dreams_emb.npy"

# --- Query sample ---
# mzML file holding the spectra to be matched against the library
mzml_path = PROJECT_ROOT.joinpath("data", "rawfiles", "202312_20_P09-Leaf-r1_1uL.mzML")
# Cache file for the query embeddings, stored beside the mzML
query_emb_path = mzml_path.parent / f"{mzml_path.stem}_dreams_emb.npy"

In [4]:
# 2) Convert the MGF library to HDF5 ----------------

# Parse the MGF library into an MSData object, reading precursor m/z values
# from the "PRECURSOR_MZ" column. from_mgf is expected to write
# MassSpecGym.hdf5 next to the .mgf as a side effect.
msdata_lib = MSData.from_mgf(
    str(mgf_path),
    prec_mz_col="PRECURSOR_MZ",
)
# Report the expected HDF5 location (h5_path is printed, not checked on disk)
# together with the number of library spectra.
print("Library HDF5:", h5_path, "| # spectra:", len(msdata_lib))

Loading dataset MassSpecGym into memory (231104 spectra)...
Library HDF5: /Users/macbook/CODE/DreaMS_MIMB/data/massspecgym/MassSpecGym.hdf5 | # spectra: 231104


In [10]:
# NOTE(review): leftover experiment, kept disabled. The ValueError in this
# cell's output complains that no "precursor_mz" column exists (the file's
# column is "PRECURSOR_MZ"); presumably it came from running the line below.
# TODO confirm, then delete this cell so the notebook runs top to bottom.
# msdata_lib = MSData.load(str(mgf_path))

ValueError: Column "precursor_mz" is not present in the dataframe. Available columns: Index(['spectrum', 'IDENTIFIER', 'smiles', 'INCHIKEY', 'FORMULA',
       'PRECURSOR_FORMULA', 'PARENT_MASS', 'PRECURSOR_MZ', 'adduct',
       'INSTRUMENT_TYPE', 'COLLISION_ENERGY', 'FOLD', 'SIMULATION_CHALLENGE'],
      dtype='object').

In [5]:
# 3) Compute or load library embeddings --------------

# Reuse the cached embeddings only when they hold exactly one row per library
# spectrum. A cache left over from a different version of the library would
# otherwise silently misalign embedding rows with msdata_lib indices in the
# later lookups (SMILES, identifiers, spectra).
embs_lib = None
if lib_emb_path.exists():
    cached_lib = np.load(str(lib_emb_path))
    # Comparing shape[:1] also rejects a 0-d array without raising IndexError
    if cached_lib.shape[:1] == (len(msdata_lib),):
        embs_lib = cached_lib
        print("Loaded saved library embeddings:", embs_lib.shape)
    else:
        print(
            "Cached library embeddings do not match the library:",
            cached_lib.shape, "vs", len(msdata_lib), "spectra; recomputing",
        )

if embs_lib is None:
    # No usable cache: embed the whole library and overwrite the cache file
    embs_lib = dreams_embeddings(msdata_lib)
    np.save(str(lib_emb_path), embs_lib)
    print("Computed & saved library embeddings:", embs_lib.shape)

Loaded saved library embeddings: (231104, 1024)


In [6]:
# 4) Compute or load your query embeddings ------------

# Load your spectra into MSData (writes .hdf5 automatically)
msdata_q = MSData.from_mzml(str(mzml_path))
print("Query HDF5 created:", mzml_path.with_suffix(".hdf5"), "| # spectra:", len(msdata_q))

# Reuse the cached embeddings only when they hold exactly one row per query
# spectrum. A cache produced from a different conversion of the mzML would
# otherwise silently misalign embedding rows with msdata_q indices.
embs_q = None
if query_emb_path.exists():
    cached_q = np.load(str(query_emb_path))
    # Comparing shape[:1] also rejects a 0-d array without raising IndexError
    if cached_q.shape[:1] == (len(msdata_q),):
        embs_q = cached_q
        print("Loaded saved query embeddings:", embs_q.shape)
    else:
        print(
            "Cached query embeddings do not match the query file:",
            cached_q.shape, "vs", len(msdata_q), "spectra; recomputing",
        )

if embs_q is None:
    # No usable cache: embed all query spectra and overwrite the cache file
    embs_q = dreams_embeddings(msdata_q)
    np.save(str(query_emb_path), embs_q)
    print("Computed & saved query embeddings:", embs_q.shape)

Loading dataset 202312_20_P09-Leaf-r1_1uL into memory (5681 spectra)...
Query HDF5 created: /Users/macbook/CODE/DreaMS_MIMB/data/rawfiles/202312_20_P09-Leaf-r1_1uL.hdf5 | # spectra: 5681


Computing DreaMS embedding: 100%|██████████| 5681/5681 [02:05<00:00, 45.28it/s]


Computed & saved query embeddings: (5681, 1024)


In [7]:
# 5) Compute similarities ----------------------------

# Pairwise cosine similarity between every query embedding (rows) and every
# library embedding (columns): a dense matrix of shape [n_query × n_lib].
sims = cosine_similarity(X=embs_q, Y=embs_lib)

In [8]:
# 6) Find top-5 matches per query --------------------

k = 5
# Never ask for more matches than there are library spectra
n_keep = min(k, sims.shape[1])

if n_keep == 0:
    # Empty library axis: nothing to rank, keep one empty row per query
    topk = np.empty((sims.shape[0], 0), dtype=np.intp)
else:
    # Pick the n_keep highest-scoring columns of each row (in arbitrary order)
    # with a partial partition instead of fully sorting every n_lib-long row.
    candidates = np.argpartition(sims, -n_keep, axis=1)[:, -n_keep:]
    # Sort only those few candidates, then reverse so the best match comes first
    candidate_sims = np.take_along_axis(sims, candidates, axis=1)
    best_first = np.argsort(candidate_sims, axis=1)[:, ::-1]
    # topk[i, r] is the library index of the (r+1)-th best match of query i
    topk = np.take_along_axis(candidates, best_first, axis=1)

In [9]:
# 7) Build results table ------------------------------

# One record per (query spectrum, ranked library hit) pair
rows = []
cos_sim = su.PeakListModifiedCosine()

for i_query, lib_hits in enumerate(tqdm(topk, desc="Matching spectra")):
    # The query spectrum and its precursor m/z are identical for every hit of
    # this query, so fetch them once per query rather than once per hit.
    # NOTE(review): assumes these getters are side-effect free — confirm.
    query_spec = msdata_q.get_spectra(i_query)
    query_prec_mz = msdata_q.get_prec_mzs(i_query)

    # lib_hits is ordered best match first, so rank 1 is the closest embedding
    for rank, i_lib in enumerate(lib_hits, start=1):
        rows.append({
            "query_index": i_query,
            "rank": rank,
            "library_index": int(i_lib),
            "library_SMILES": msdata_lib.get_smiles(i_lib),
            "library_ID": msdata_lib.get_values("IDENTIFIER", i_lib),
            "DreaMS_similarity": float(sims[i_query, i_lib]),
            # Spectrum-level score reported alongside the embedding similarity
            "Modified_cosine_similarity": cos_sim(
                spec1=query_spec,
                prec_mz1=query_prec_mz,
                spec2=msdata_lib.get_spectra(i_lib),
                prec_mz2=msdata_lib.get_prec_mzs(i_lib),
            )
        })

# Build the DataFrame once from the collected records
df = pd.DataFrame(rows)

Matching spectra: 100%|██████████| 5681/5681 [00:03<00:00, 1547.09it/s]


In [10]:
# 8) Save results -------------------------------------

# Output location: <PROJECT_ROOT>/results/library_matching_results.csv
results_folder = PROJECT_ROOT / "results"
out_csv = results_folder / "library_matching_results.csv"

# Create the results folder on first use; an existing folder is left as it is
results_folder.mkdir(exist_ok=True)

# Write the table without the DataFrame index column
df.to_csv(out_csv, index=False)
print("Library matching results saved to:", out_csv)

Library matching results saved to: /Users/macbook/CODE/DreaMS_MIMB/results/library_matching_results.csv
