In [12]:
import pandas as pd
import numpy as np
import json
from tqdm import tqdm
tqdm.pandas()
from matchms import Spectrum
from matchms.exporting import save_as_mgf
from pathlib import Path

In [18]:
# Load the full MassSpecGym table (tab-separated), including the held-out test fold.
df = pd.read_csv(
    '../../data/data/MassSpecGym_with_test.tsv',
    delimiter='\t',
)
# Row count of the complete dataset, shown as the cell output.
len(df)

231104

In [17]:
# Keep every row whose fold is not 'test', then persist the public
# (test-free) version of the dataset next to the full one.
df = df.loc[df['fold'] != 'test']
df.to_csv(
    '../../data/data/MassSpecGym.tsv',
    sep='\t',
    index=False,
)

### Convert final dataset to `.mgf`

In [5]:
out_pth = Path('../../data/data/auxiliary/MassSpecGym.mgf')
# Alternative target when exporting the dataset that still contains the test fold:
# out_pth = Path('../../data/data/auxiliary/MassSpecGym_with_test.mgf')


def _as_float_array(values):
    """Return a peak list as a 1-D float array.

    A column read back from the TSV holds each peak list as a single string,
    so strings are split on commas/whitespace (surrounding brackets are
    dropped). Any other input (list, array, scalar) is converted directly.
    """
    if isinstance(values, str):
        values = values.strip('[]').replace(',', ' ').split()
    return np.atleast_1d(np.asarray(values, dtype=float))


# Make sure the export directory exists before writing into it.
out_pth.parent.mkdir(parents=True, exist_ok=True)

spectra = []
for i, row in tqdm(df.iterrows(), total=len(df)):
    spec = Spectrum(
        # Spectrum needs numeric arrays, not the serialized text from the TSV.
        mz=_as_float_array(row['mzs']),
        intensities=_as_float_array(row['intensities']),
        # Every remaining column is carried over as spectrum metadata.
        metadata={k: v for k, v in row.items() if k not in ['mzs', 'intensities']}
    )
    spectra.append(spec)
save_as_mgf(spectra, str(out_pth))

100%|██████████| 213548/213548 [00:19<00:00, 10688.26it/s]


### Convert final dataset to `.ms` files
The cell below was reproduced from https://github.com/samgoldman97/mist/blob/4c23d34fc82425ad5474a53e10b4622dcdbca479/src/mist/utils/parse_utils.py#L71

In [6]:
from typing import List, Optional, Tuple
import numpy as np

def spec_to_ms_str(
    spec: List[Tuple[str, np.ndarray]],
    essential_keys: dict,
    comments: Optional[dict] = None,
) -> str:
    """Serialize named peak lists and their metadata into `.ms` file text.

    Args:
        spec (List[Tuple[str, np.ndarray]]): ``(name, rows)`` pairs; each item
            of ``rows`` must unpack into two values, written as one
            ``"<first> <second>"`` line under a ``>name`` heading.
        essential_keys (dict): header fields, written as ``>key value`` lines
            in insertion order.
        comments (Optional[dict]): optional extra fields, written as
            ``#key value`` lines directly after the header. ``None`` or an
            empty dict adds nothing.

    Returns:
        str: header (plus comment lines, if any), a blank line, then the peak
        sections separated from each other by blank lines.
    """

    def pair_rows(rows):
        # One "first second" line per row.
        return "\n".join(f"{i} {j}" for i, j in rows)

    header_lines = [f">{k} {v}" for k, v in essential_keys.items()]
    # `comments` defaults to None rather than a shared mutable dict, and is
    # actually emitted instead of being silently dropped.
    if comments:
        header_lines.extend(f"#{k} {v}" for k, v in comments.items())
    header = "\n".join(header_lines)

    spec_str = "\n\n".join(f">{name}\n{pair_rows(ar)}" for name, ar in spec)
    return f"{header}\n\n{spec_str}"

In [7]:
# Root for auxiliary exports; the .ms export needs one folder for the
# per-spectrum files and one for the split table.
tgt_dir = Path('../../data/data/auxiliary')
for subdir in ('spec_files', 'splits'):
    (tgt_dir / 'MassSpecGym_ms_files' / subdir).mkdir(parents=True, exist_ok=True)

In [9]:
for i, row in tqdm(df.iterrows(), total=len(df)):
    # Peak columns read back from the TSV are text; turn each into a float
    # array (split on commas/whitespace, dropping any surrounding brackets)
    # so the two columns stack into an (n_peaks, 2) table.
    peak_columns = []
    for col in ('mzs', 'intensities'):
        values = row[col]
        if isinstance(values, str):
            values = values.strip('[]').replace(',', ' ').split()
        peak_columns.append(np.atleast_1d(np.asarray(values, dtype=float)))
    spec = np.stack(peak_columns).T

    # Name each spectrum by its dataset identifier so that file names agree
    # with the `name` column written to split.csv. If `identifier` is the
    # frame's index rather than a column, the index label already is the name.
    name = row['identifier'] if 'identifier' in row.index else i

    ms_str = spec_to_ms_str(
        [('ms2peaks', spec)],
        essential_keys={
            'compound': name,
            'formula': row['formula'],
            'parentmass': row['parent_mass'],
            # NOTE(review): every adduct other than '[M+H]+' is written as
            # '[M + Na]+' - confirm the dataset only contains these two.
            'ionization': '[M + H]+' if row['adduct'] == '[M+H]+' else '[M + Na]+',
            'InChIKey': row['inchikey'],
        },
    )

    with open(tgt_dir / 'MassSpecGym_ms_files' / 'spec_files' / f"{name}.ms", "w") as f:
        f.write(ms_str)

213548it [00:40, 5226.96it/s]


In [10]:
# Build the split table: one row per spectrum with its name and fold.
df_split = (
    df.reset_index()
    .loc[:, ['identifier', 'fold']]
    .rename(columns={'identifier': 'name'})
)
df_split.to_csv(
    tgt_dir / 'MassSpecGym_ms_files' / 'splits' / 'split.csv',
    index=False,
)

### Remove test SMILES from candidate lists

In [None]:
# NOTE(review): these are absolute, machine-specific paths, and `tgt_dir` is
# rebound here to a different location than the auxiliary export directory
# used by earlier cells - re-running those cells afterwards would write
# into this directory instead.
dev_dir = Path('/Users/roman/HuggingFaceHub/MassSpecGym_dev')
tgt_dir = Path('/Users/roman/HuggingFaceHub/MassSpecGym')

# SMILES present in the current `df`; built once instead of once per file.
smiles = set(df['smiles'])

# The output folder may not exist yet in the target checkout.
(tgt_dir / 'data/molecules').mkdir(parents=True, exist_ok=True)

for cands_suffix in ['formula', 'mass']:
    with open(dev_dir / f'data/molecules/MassSpecGym_retrieval_candidates_{cands_suffix}.json', 'r') as f:
        cands = json.load(f)
    print(len(cands.keys()))  # number of query SMILES before filtering
    # Keep only candidate lists whose query SMILES is still in `df`.
    cands = {k: v for k, v in cands.items() if k in smiles}
    print(len(cands.keys()))  # number of query SMILES after filtering
    with open(tgt_dir / f'data/molecules/MassSpecGym_retrieval_candidates_{cands_suffix}.json', 'w') as f:
        json.dump(cands, f)