In [1]:
import sys
from pathlib import Path

# Assumes the kernel's working directory is notebooks/ (Path() resolves to the CWD), so its .parent is the repo root
sys.path.append(str(Path().resolve().parent))
from paths import PROJECT_ROOT

import numpy as np
import pandas as pd
from tqdm import tqdm

from dreams.utils.data import MSData
from dreams.api import dreams_embeddings
from sklearn.metrics.pairwise import cosine_similarity
import dreams.utils.spectra as su

/Users/macbook/UTILS/anaconda3/envs/dreams_mimb/lib/python3.11/site-packages/lightning_fabric/__init__.py:41: pkg_resources is deprecated as an API. See https://setuptools.pypa.io/en/latest/pkg_resources.html. The pkg_resources package is slated for removal as early as 2025-11-30. Refrain from using this package or pin to Setuptools<81.
Determination of memory status is not supported on this 
 platform, measuring for memoryleaks will never fail


In [2]:
# 1) Define paths ----------------------------------------------------------
# 1a) MassSpecGym library: the raw .mgf, its .hdf5 sibling (same name, different
#     suffix) and the .npy file used to cache the library's DreaMS embeddings.
mgf_path      = PROJECT_ROOT.joinpath("data", "massspecgym", "MassSpecGym.mgf")
lib_h5        = mgf_path.with_suffix(".hdf5")
lib_emb_path  = lib_h5.with_name(f"{lib_h5.stem}_dreams_emb.npy")

# 1b) Query spectra: the deduplicated, high-quality .hdf5 produced in step 2,
#     plus the matching .npy cache for its embeddings.
query_h5       = PROJECT_ROOT.joinpath(
    "data", "rawfiles", "202312_20_P09-Leaf-r1_1uL_high_quality_dedup.hdf5"
)
query_emb_path = query_h5.with_name(f"{query_h5.stem}_dreams_emb.npy")

In [3]:
# 1c) Results folder: <repo root>/results, created on first run and reused afterwards
results_folder = PROJECT_ROOT.joinpath("results")
results_folder.mkdir(exist_ok=True)

### DreaMS requires mass spectra and their precursor m/z values as input

#### Furthermore, if you have not yet computed embeddings for MassSpecGym, they will be computed now. This can take approximately one hour, but the result is saved, so on later runs it is reused and loads in no time.

In [4]:
# 2) Load MassSpecGym library (or convert if needed) -----------------------
if not lib_h5.exists():
    # No converted file on disk yet: parse the raw .mgf instead
    # (presumably this also produces the .hdf5 next to it — TODO confirm).
    msdata_lib = MSData.from_mgf(str(mgf_path), prec_mz_col="PRECURSOR_MZ")
else:
    # Converted file is available: read it fully into memory.
    msdata_lib = MSData.from_hdf5(lib_h5, in_mem=True)

# Quick sanity report: number of spectra and the metadata columns on offer.
print(f"Library loaded: {len(msdata_lib)} spectra → {lib_h5.name}")
print("Columns:", msdata_lib.columns())

Loading dataset MassSpecGym into memory (231104 spectra)...
Library loaded: 231104 spectra → MassSpecGym.hdf5
Columns: ['COLLISION_ENERGY', 'FOLD', 'FORMULA', 'IDENTIFIER', 'INCHIKEY', 'INSTRUMENT_TYPE', 'PARENT_MASS', 'PRECURSOR_FORMULA', 'SIMULATION_CHALLENGE', 'adduct', 'precursor_mz', 'smiles', 'spectrum']


### Now we will compute (or load previously saved) embeddings for the MassSpecGym library

In [5]:
# 3) Compute or load library embeddings -----------------------------------
# The cached .npy is only trusted when it holds exactly one row per library
# spectrum. Row i of embs_lib is later used as the index of a spectrum in
# msdata_lib, so a stale cache with a different row count would pair hits with
# the wrong library entries (or index past the end of the library).
embs_lib = np.load(lib_emb_path) if lib_emb_path.exists() else None

if embs_lib is not None and embs_lib.shape[0] != len(msdata_lib):
    print(
        f"Cached library embeddings have {embs_lib.shape[0]} rows but the library "
        f"has {len(msdata_lib)} spectra; recomputing."
    )
    embs_lib = None  # discard the stale cache and fall through to recomputation

if embs_lib is not None:
    print("Loaded saved library embeddings:", embs_lib.shape)
else:
    # Expensive step: embed every library spectrum, then persist for reuse.
    embs_lib = dreams_embeddings(msdata_lib)
    np.save(lib_emb_path, embs_lib)
    print("Computed & saved library embeddings:", embs_lib.shape)

Loaded saved library embeddings: (231104, 1024)


#### Here we load our preprocessed data from tutorials 1 & 2. If you have not preprocessed your data yet but would still like to continue, you can do so: just call the corresponding method (TODO: `MSData....`).

In [6]:
# 4) Load query spectra & compute/load query embeddings -------------------
# Fail fast with a clear message when the deduplicated query file is missing.
if not query_h5.exists():
    raise FileNotFoundError(f"Expected dedup HDF5 at {query_h5}")

# File is present: read all query spectra into memory and report the count.
msdata_q = MSData.from_hdf5(query_h5, in_mem=True)
print(f"Query loaded:   {len(msdata_q)} spectra → {query_h5.name}")

Loading dataset 202312_20_P09-Leaf-r1_1uL_high_quality_dedup into memory (1558 spectra)...
Query loaded:   1558 spectra → 202312_20_P09-Leaf-r1_1uL_high_quality_dedup.hdf5


#### Now we compute embeddings for our own data. This step is required: without embeddings for the query spectra we cannot do similarity matching.

In [7]:
# Compute or load query embeddings.
# The cached .npy is only trusted when it holds exactly one row per query
# spectrum. Row i of embs_q is later treated as spectrum i of msdata_q, so a
# cache left over from a differently filtered query file would attach matches
# to the wrong spectra.
embs_q = np.load(query_emb_path) if query_emb_path.exists() else None

if embs_q is not None and embs_q.shape[0] != len(msdata_q):
    print(
        f"Cached query embeddings have {embs_q.shape[0]} rows but the query file "
        f"has {len(msdata_q)} spectra; recomputing."
    )
    embs_q = None  # discard the stale cache and fall through to recomputation

if embs_q is not None:
    print("Loaded saved query embeddings:", embs_q.shape)
else:
    # Embed every query spectrum, then persist the result for later runs.
    embs_q = dreams_embeddings(msdata_q)
    np.save(query_emb_path, embs_q)
    print("Computed & saved query embeddings:", embs_q.shape)

Computing DreaMS embedding: 100%|██████████| 1558/1558 [00:33<00:00, 45.98it/s]


Computed & saved query embeddings: (1558, 1024)


#### Now we compute pairwise similarity between our mass spectra and spectra from MassSpecGym

In [8]:
# 5) Compute all‐vs‐all cosine similarities -------------------------------
# Rows follow the query embeddings, columns follow the library embeddings,
# i.e. the result has shape (n_query, n_library).
sims = cosine_similarity(X=embs_q, Y=embs_lib)
print("Similarity matrix:", sims.shape)

Similarity matrix: (1558, 231104)


### Ranking the matches and building the results table

In [9]:
# 6) Find top-5 matches for each query spectrum ---------------------------
k = 5
# Never request more hits than there are library columns.
k_eff = min(k, sims.shape[1])

# Fully sorting every library column of every row only to keep k of them is
# wasteful. argpartition instead moves the k_eff highest-scoring columns of
# each row into the last k_eff positions (in no particular order) ...
top_unsorted = np.argpartition(sims, -k_eff, axis=1)[:, -k_eff:]

# ... so only those few candidates need a real sort, best match first.
# Note: the relative order of exactly tied scores may differ from a full sort.
top_scores = np.take_along_axis(sims, top_unsorted, axis=1)
best_first = np.argsort(-top_scores, axis=1)
topk = np.take_along_axis(top_unsorted, best_first, axis=1)  # (n_query, k_eff) library indices
print(f"Top-{k} matches per query computed.")


Top-5 matches per query computed.


In [10]:
# 7) Build results table ---------------------------------------------------
# One row per (query spectrum, library hit) pair: the hit's rank, its library
# index / SMILES / identifier, the DreaMS embedding similarity and, for
# comparison, the modified cosine similarity computed on the peak lists.
rows = []
cos_sim = su.PeakListModifiedCosine()

for i_q, top_inds in enumerate(tqdm(topk, desc="Assembling results")):
    # The query spectrum and its precursor m/z are identical for every hit of
    # this query, so fetch them once here rather than once per library match.
    query_spec = msdata_q.get_spectra(i_q)
    query_prec_mz = msdata_q.get_prec_mzs(i_q)

    # top_inds is ordered best match first, hence rank starts at 1.
    for rank, i_lib in enumerate(top_inds, start=1):
        rows.append({
            "query_index":                i_q,
            "rank":                       rank,
            "library_index":              int(i_lib),
            "library_SMILES":             msdata_lib.get_smiles(i_lib),
            "library_IDENTIFIER":         msdata_lib.get_values("IDENTIFIER", i_lib),
            "DreaMS_similarity":          float(sims[i_q, i_lib]),
            "Modified_cosine_similarity": cos_sim(
                spec1=query_spec,
                prec_mz1=query_prec_mz,
                spec2=msdata_lib.get_spectra(i_lib),
                prec_mz2=msdata_lib.get_prec_mzs(i_lib),
            )
        })

# Build the frame once from the collected records instead of growing it row by row.
df_matches = pd.DataFrame(rows)
print("Results table shape:", df_matches.shape)

Assembling results: 100%|██████████| 1558/1558 [00:02<00:00, 612.86it/s] 


Results table shape: (7790, 7)


#### Saving the results to CSV

In [11]:
# 8) Save results to CSV ---------------------------------------------------
# The table goes into the results folder; the DataFrame index is not written.
out_csv = results_folder.joinpath("library_matching_results.csv")
df_matches.to_csv(path_or_buf=out_csv, index=False)
print("Library matching results saved to:", out_csv.name)

Library matching results saved to: library_matching_results.csv
