In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
tqdm.pandas()
from matchms import Spectrum
from matchms.exporting import save_as_mgf
from pathlib import Path
import massspecgym.utils as utils

In [2]:
# Location of the final dataset table (tab-separated).
in_pth = Path('../../data/MassSpecGym_with_test') / 'MassSpecGym.tsv'
df = pd.read_csv(in_pth, delimiter='\t')
# Show the number of rows as the cell output.
df.shape[0]

231104

### Convert final dataset to `.mgf`

In [10]:
# The .mgf file is written next to the source table: same path, different suffix.
out_pth = in_pth.with_suffix('.mgf')

# Peak arrays are passed to Spectrum explicitly, so they are kept out of the metadata.
peak_cols = ('mzs', 'intensities')

spectra = []
for _, row in tqdm(df.iterrows(), total=len(df)):
    # Drop missing values from the metadata. `pd.notna` is used instead of the identity
    # check `v is not np.nan`, which only matches the np.nan singleton itself and lets
    # other NaN float objects through.
    metadata = {k: v for k, v in row.items() if k not in peak_cols and pd.notna(v)}
    spectra.append(
        Spectrum(
            mz=utils.parse_spec_array(row['mzs']),
            intensities=utils.parse_spec_array(row['intensities']),
            metadata=metadata,
        )
    )
save_as_mgf(spectra, str(out_pth))

100%|██████████| 231104/231104 [00:25<00:00, 8948.01it/s] 


### Convert final dataset to `.ms` files
The cell below was reproduced from https://github.com/samgoldman97/mist/blob/4c23d34fc82425ad5474a53e10b4622dcdbca479/src/mist/utils/parse_utils.py#L71

In [3]:
from typing import List, Tuple
import numpy as np

def spec_to_ms_str(
    spec: List[Tuple[str, np.ndarray]], essential_keys: dict, comments: dict = {}
) -> str:
    """Serialize named peak lists and header fields into `.ms`-style text.

    The result consists of one `>key value` line per entry of `essential_keys`,
    a blank line, and then one section per entry of `spec`. Each section is a
    `>name` line followed by one `first second` line per row of the array.
    Sections are separated from each other by a blank line.

    Args:
        spec (List[Tuple[str, np.ndarray]]): (section name, rows) pairs; every
            row must unpack into exactly two values.
        essential_keys (dict): header fields, written in iteration order.
        comments (dict): accepted for signature compatibility; not read.

    Returns:
        str: the assembled text, without a trailing newline.
    """
    header_lines = []
    for key, value in essential_keys.items():
        header_lines.append(f">{key} {value}")

    sections = []
    for name, rows in spec:
        peak_lines = []
        for first, second in rows:
            peak_lines.append(f"{first} {second}")
        # The section title is always followed by a newline, even with no rows.
        sections.append(f">{name}\n" + "\n".join(peak_lines))

    return "\n".join(header_lines) + "\n\n" + "\n\n".join(sections)

In [4]:
# Root folder for auxiliary exports. The `.ms` export uses two sibling subfolders,
# one for the per-spectrum files and one for the split table.
tgt_dir = Path('../../data/MassSpecGym_with_test/auxiliary')
for subdir in ('spec_files', 'splits'):
    (tgt_dir / 'MassSpecGym_ms_files' / subdir).mkdir(parents=True, exist_ok=True)

In [5]:
# Write one `.ms` file per dataset row, named after the row's identifier.
spec_dir = tgt_dir / 'MassSpecGym_ms_files' / 'spec_files'

# `total` is passed explicitly because `iterrows()` is a generator with no length;
# without it tqdm cannot show a percentage or ETA.
for _, row in tqdm(df.iterrows(), total=len(df)):
    # Two-column array: one (m/z, intensity) pair per row.
    spec = np.stack([
        utils.parse_spec_array(row['mzs']),
        utils.parse_spec_array(row['intensities'])
    ]).T

    essential_keys = {
        'compound': row['identifier'],
        'parentmass': row['parent_mass'],
        # NOTE: any adduct other than '[M+H]+' is written as '[M + Na]+'.
        'ionization': '[M + H]+' if row['adduct'] == '[M+H]+' else '[M + Na]+',
        'energy': row['collision_energy'],
        'instrument': row['instrument_type'],
        # Structure labels are written for every row regardless of its fold
        # (there is no exclusion of rows whose fold is 'test').
        'formula': row['formula'],
        'InChIKey': row['inchikey'],
        'smiles': row['smiles']
    }

    ms_str = spec_to_ms_str(
        [('ms2peaks', spec)],
        essential_keys=essential_keys,
    )

    with open(spec_dir / f"{row['identifier']}.ms", "w") as f:
        f.write(ms_str)

231104it [00:50, 4586.37it/s]


In [6]:
# Split table: one row per spectrum, with the identifier column exported as `name`.
split_csv_pth = tgt_dir / 'MassSpecGym_ms_files' / 'splits' / 'split.csv'
df_split = (
    df[['identifier', 'fold']]
    .reset_index(drop=True)
    .rename(columns={'identifier': 'name'})
)
df_split.to_csv(split_csv_pth, index=False)