In [4]:
import pandas as pd

import pytorch_lightning as pl
from pytorch_lightning import Trainer

from massspecgym.datasets import MassSpecDataset, MassSpecDataModule
from massspecgym.transforms import SpecTokenizer, MolFingerprinter
from massspecgym.models.retrieval.deepsets import DeepSetsRetrieval
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Train and evaluate a simple Deep Sets model on the fingerprint retrieval task

In [5]:
# Build the preprocessing transforms first, then the dataset that uses them.
# The spectrum transform is configured with n_peaks=60
# (presumably the number of peaks kept per spectrum — TODO confirm).
spec_tokenizer = SpecTokenizer(n_peaks=60)
mol_fingerprinter = MolFingerprinter()

# Labeled spectra are read from the MGF file under ../data.
dataset = MassSpecDataset(
    mgf_pth='../data/MassSpecGym_labeled_data.mgf',
    spec_preproc=spec_tokenizer,
    mol_preproc=mol_fingerprinter,
)

# Wrap the dataset in a data module; the split assignment comes from the TSV
# file and batches contain 2 samples.
data_module_kwargs = {
    'dataset': dataset,
    'split_pth': '../data/MassSpecGym_labeled_data_split.tsv',
    'batch_size': 2,
}
data_module = MassSpecDataModule(**data_module_kwargs)

# Seed the Python, NumPy and PyTorch random number generators before the model
# is created, so that weight initialisation (and any later shuffling) is
# repeatable across "Restart & Run All". workers=True also makes Lightning
# derive per-worker seeds for dataloader workers.
pl.seed_everything(0, workers=True)

# Init model (default hyperparameters)
model = DeepSetsRetrieval()

# Init logger
# You may need to run `wandb init` first to use the wandb logger.
# Alternatively, pass logger=None to the Trainer below to not use wandb.
#
# NOTE: the project name previously read 'MassSpecGymRetieval' (typo). Runs
# already logged under the old spelling stay in that W&B project.
project = 'MassSpecGymRetrieval'
name = 'DeepSets'
logger = pl.loggers.WandbLogger(
    project=project,
    name=name,
    tags=[],
    log_model=False,  # do not upload model checkpoints to W&B
)

# Collect the trainer settings in one place, then build the Trainer from them.
# NOTE(review): training is pinned to the CPU although the recorded output
# reports an available MPS GPU — confirm this is intentional.
trainer_config = {
    'accelerator': 'cpu',
    'max_epochs': 200,
    'logger': logger,
    'log_every_n_steps': 1,  # log on every training step
}
trainer = Trainer(**trainer_config)

GPU available: True (mps), used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
/Users/anton/miniconda3/envs/massspecgym/lib/python3.11/site-packages/pytorch_lightning/trainer/setup.py:187: GPU available but not used. You can set it by doing `Trainer(accelerator='gpu')`.


In [6]:
# Fit the model; the data module supplies the dataloaders.
trainer.fit(model=model, datamodule=data_module)