In [22]:
import sys
from pathlib import Path
# Put the parent of the current working directory on sys.path so that the
# project-local `paths` module imported below can be found (presumably it
# lives one level above this notebook -- confirm against the repo layout).
sys.path.append(str(Path().resolve().parent))
from paths import PROJECT_ROOT

import pandas as pd
from matchms.importing import load_from_mgf
from matchms.exporting import save_as_mgf
from matchms import Spectrum

from rdkit import Chem
from rdkit.Chem import rdMolDescriptors

In [23]:
# 1) Input / output locations, all resolved relative to PROJECT_ROOT
#    csv_path    : annotation table (MZmine ID, Name, SMILES)
#    mgf_in      : spectra to be annotated
#    mgf_matched : output holding only spectra whose feature ID is in the CSV
#    mgf_all     : output holding every spectrum
csv_path    = PROJECT_ROOT.joinpath("data", "molecules", "Piper_data_smiles.csv")
mgf_in      = PROJECT_ROOT.joinpath("data", "sirius.mgf")
mgf_matched = PROJECT_ROOT.joinpath("data", "molecules", "Piper_sirius_matched_annotated.mgf")
mgf_all     = PROJECT_ROOT.joinpath("data", "molecules", "Piper_sirius_all_annotated.mgf")

In [24]:
# 2) Read the annotation table and index it by feature ID.
#    `lookup` maps each integer MZmine ID to {"Name": ..., "SMILES": ...}.
df = pd.read_csv(csv_path, index_col=0).astype({"MZmine ID": int})
lookup = (
    df.set_index("MZmine ID")
      .loc[:, ["Name", "SMILES"]]
      .to_dict(orient="index")
)

In [25]:
# 3) Materialise every spectrum from the input MGF as a list, so it can be
#    counted here and iterated again by later cells.
spectra = [*load_from_mgf(str(mgf_in))]
print(f"Total spectra loaded from MGF: {len(spectra)}")

Total spectra loaded from MGF: 6732


In [26]:
def _strip_charge_suffix(formula):
    """Return ``formula`` without a trailing charge marker.

    Handles the bare-sign form ("C5H12N+") as well as the sign-plus-magnitude
    form used for multiply charged species ("Ca+2", "C2O4-2"). A plain
    ``rstrip("+-")`` misses the latter because the string ends in a digit.
    Formulas without a charge marker are returned unchanged.
    """
    for sign in ("+", "-"):
        head, sep, tail = formula.rpartition(sign)
        # Only treat the sign as a charge marker if nothing but an optional
        # magnitude follows it.
        if sep and (tail == "" or tail.isdigit()):
            return head
    return formula


# Annotate every spectrum in place and sort it into two output lists:
#   matched_specs -- spectra whose feature_id occurs in the CSV lookup
#   all_specs     -- every spectrum, matched or not
matched_specs = []
all_specs     = []

for spec in spectra:
    # 1) Initialize all annotation fields to the literal string "None" so
    #    unmatched spectra still carry every field.
    spec.set("smiles",        "None")
    spec.set("compound_name", "None")
    spec.set("formula",       "None")
    spec.set("inchi",         "None")
    spec.set("inchikey",      "None")

    # 2) Pull feature_id; anything that cannot be read as an integer
    #    (missing, non-numeric, non-finite) is treated as "no ID".
    try:
        feat = int(spec.metadata.get("feature_id"))
    except (TypeError, ValueError, OverflowError):
        feat = None

    # 3) If in CSV, overwrite with real values
    if feat in lookup:
        entry  = lookup[feat]
        smiles = entry["SMILES"]
        name   = entry["Name"]

        # Empty CSV cells arrive as float NaN; keep the "None" placeholder
        # for those instead of writing NaN into the metadata.
        if pd.notna(name):
            spec.set("compound_name", name)

        mol = None
        if isinstance(smiles, str):
            spec.set("smiles", smiles)
            # MolFromSmiles returns None when the SMILES cannot be parsed.
            mol = Chem.MolFromSmiles(smiles)

        if mol is not None:
            raw_formula = rdMolDescriptors.CalcMolFormula(mol)
            spec.set("formula", _strip_charge_suffix(raw_formula))

            spec.set("inchi",    Chem.MolToInchi(mol))
            spec.set("inchikey", Chem.MolToInchiKey(mol))

        # A spectrum counts as matched as soon as its ID is in the CSV, even
        # if no structure-derived fields could be filled in.
        matched_specs.append(spec)

    # 4) Always collect into the full list
    all_specs.append(spec)


In [27]:
# save both MGFs
# Remove any output left over from a previous run before writing. Depending
# on the installed matchms version, save_as_mgf may append to (or refuse to
# replace) an existing file, which would make re-running this cell produce
# duplicated spectra or fail. Starting from a clean slate keeps the cell
# reproducible either way.
mgf_matched.unlink(missing_ok=True)
save_as_mgf(matched_specs, str(mgf_matched))
print(f"Wrote {len(matched_specs)} matched spectra to {mgf_matched}")

mgf_all.unlink(missing_ok=True)
save_as_mgf(all_specs, str(mgf_all))
print(f"Wrote {len(all_specs)} total spectra (with None for unmatched) to {mgf_all}")

Wrote 224 matched spectra to /Users/macbook/CODE/DreaMS_MIMB/data/molecules/Piper_sirius_matched_annotated.mgf
Wrote 6732 total spectra (with None for unmatched) to /Users/macbook/CODE/DreaMS_MIMB/data/molecules/Piper_sirius_all_annotated.mgf


In [28]:
# 6) Sanity check: does every MZmine ID listed in the CSV have at least one
#    spectrum in matched_specs?
csv_ids     = set(df["MZmine ID"].tolist())
matched_ids = set(int(s.metadata.get("feature_id")) for s in matched_specs)

# IDs present in the CSV for which no spectrum was found
missing_ids = csv_ids.difference(matched_ids)

print(f"Unique CSV IDs:       {len(csv_ids)}")
print(f"Unique IDs matched:   {len(matched_ids)}")
print(f"CSV entries missing:  {len(missing_ids)}")
if not missing_ids:
    print("✅ All CSV entries found at least one spectrum.")
else:
    print("Missing MZmine IDs:", sorted(missing_ids))

Unique CSV IDs:       23
Unique IDs matched:   23
CSV entries missing:  0
✅ All CSV entries found at least one spectrum.
