In [1]:
import pandas as pd

import pytorch_lightning as pl
from pytorch_lightning import Trainer

from massspecgym.datasets import MassSpecDataModule
from massspecgym.preprocessors import SpecTokenizer, MolFingerprinter
from massspecgym.models.retrieval.deepsets import DeepSetsRetrieval
%load_ext autoreload
%autoreload 2

## Train and evaluate a simple Deep Sets model on the fingerprint retrieval task

In [2]:
# Seed the random number generators before any data or model object is
# created, so that a Restart Kernel -> Run All starts from the same random
# state every time. workers=True additionally asks Lightning to seed
# dataloader worker processes.
SEED = 0
pl.seed_everything(SEED, workers=True)

# Load and split data
# Spectra are preprocessed by a SpecTokenizer configured with n_peaks=60 and
# molecules by a MolFingerprinter with its default settings. Both file paths
# are relative, so they resolve against the notebook's working directory.
spec_preproc = SpecTokenizer(n_peaks=60)
mol_preproc = MolFingerprinter()
data_module = MassSpecDataModule(
    mgf_pth='../data/MassSpecGym_labeled_data.mgf',
    split_pth='../data/MassSpecGym_labeled_data_split.tsv',
    spec_preproc=spec_preproc,
    mol_preproc=mol_preproc,
    batch_size=2,
)

# Init model
# DeepSetsRetrieval is built with its default hyperparameters.
model = DeepSetsRetrieval()

# Init logger
# Runs are reported to Weights & Biases. The wandb CLI may have to be set up
# first (`wandb init`); to train without wandb, pass logger=None to the
# Trainer below instead of this object.
# NOTE(review): the project string is spelled "Retieval". It is left as-is
# because renaming it would change where runs are logged - confirm whether
# the spelling is intentional.
project = 'MassSpecGymRetieval'
name = 'DeepSets'
logger = pl.loggers.WandbLogger(project=project, name=name, tags=[], log_model=False)

# Init trainer
# Training is pinned to the CPU and capped at 200 epochs; log_every_n_steps=1
# requests logging on every training step.
trainer = Trainer(
    logger=logger,
    accelerator='cpu',
    log_every_n_steps=1,
    max_epochs=200,
)

GPU available: True (mps), used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
/Users/roman/miniconda/envs/massspecgym/lib/python3.11/site-packages/pytorch_lightning/trainer/setup.py:187: GPU available but not used. You can set it by doing `Trainer(accelerator='gpu')`.


In [3]:
# Train
# Fit the model, letting the data module supply the dataloaders.
trainer.fit(model=model, datamodule=data_module)