In [1]:
from pathlib import Path
import torch
import pytorch_lightning as pl

# your package
from benchmark.utils import annotate_mgf_with_label
from benchmark.data.datasets import ChlorineDetectionDataset
from benchmark.data.data_module import BenchmarkDataModule
from benchmark.models.classifier import MLPClassifier
from benchmark.models.lit_module import LitClassifier

# transforms
from massspecgym.data.transforms import SpecBinner

/Users/macbook/UTILS/anaconda3/envs/dreams_mimb/lib/python3.11/site-packages/lightning_fabric/__init__.py:41: pkg_resources is deprecated as an API. See https://setuptools.pypa.io/en/latest/pkg_resources.html. The pkg_resources package is slated for removal as early as 2025-11-30. Refrain from using this package or pin to Setuptools<81.


In [2]:
# where your data lives
DATA_DIR = Path("../data/massspecgym")
ORIG_MGF = DATA_DIR / "MassSpecGym.mgf"
LABELED_MGF = DATA_DIR / "MassSpecGym_chlorine.mgf"

In [8]:
# define label: check 'FORMULA' metadata
label_fn = lambda md: "Cl" in md.get("formula", "")

# write out a new MGF with LABEL=<0.0|1.0> lines
annotate_mgf_with_label(ORIG_MGF, LABELED_MGF, label_fn)
print("Wrote:", LABELED_MGF.exists())

Wrote: True


In [3]:
# simple binning transform: 1 Da bins up to 1000
spec_transform = SpecBinner(max_mz=1000.0, bin_width=1.0)

ds = ChlorineDetectionDataset(
    pth=LABELED_MGF,
    spec_transform=spec_transform,
    dtype=torch.float32
)
print("Total spectra:", len(ds))
print(ds.metadata['label'].value_counts())

dm = BenchmarkDataModule(
    dataset=ds,
    batch_size=16,
    num_workers=0
)
dm.setup()  # splits into train/val/test

Total spectra: 231104
label
0.0    210799
1.0     20305
Name: count, dtype: int64


In [4]:
# Grab one training batch to inspect types & shapes
batch = next(iter(dm.train_dataloader()))
print("spec type:",  type(batch["spec"]))
print("spec shape: ", batch["spec"].shape)
print("label type:", type(batch["label"]))
print("label shape:", batch["label"].shape)

spec type: <class 'torch.Tensor'>
spec shape:  torch.Size([16, 1000])
label type: <class 'torch.Tensor'>
label shape: torch.Size([16])


In [5]:
# 1. MLP
input_dim = int(spec_transform.max_mz / spec_transform.bin_width)
mlp = MLPClassifier(input_dim=input_dim, hidden_dims=(32,), dropout=0.1)

# 2. Lightning wrapper
lit = LitClassifier(mlp, lr=1e-3)

In [6]:
# trainer = pl.Trainer(
#     max_epochs=3,
#     accelerator="cpu",  # or "gpu", devices=1
#     log_every_n_steps=10,
# )

trainer = pl.Trainer(
    max_epochs=1,
    limit_train_batches=5,     # run only 5 training batches
    limit_val_batches=3,       # run only 3 validation batches
    limit_test_batches=3,      # run only 3 test batches
    accelerator="cpu",         # or "gpu"
    devices=1,
)
# fit
trainer.fit(lit, datamodule=dm)

# final test
trainer.test(lit, datamodule=dm)

GPU available: True (mps), used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
/Users/macbook/UTILS/anaconda3/envs/dreams_mimb/lib/python3.11/site-packages/pytorch_lightning/trainer/setup.py:187: GPU available but not used. You can set it by doing `Trainer(accelerator='gpu')`.
/Users/macbook/UTILS/anaconda3/envs/dreams_mimb/lib/python3.11/site-packages/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py:75: Starting from v1.9.0, `tensorboardX` has been removed as a dependency of the `pytorch_lightning` package, due to potential conflicts with other packages in the ML ecosystem. For this reason, `logger=True` will use `CSVLogger` as the default logger, unless the `tensorboard` or `tensorboardX` packages are found. Please `pip install lightning[extra]` or one of them to enable TensorBoard support by default

  | Name      | Type           | Params
---------------------------------------------
0

Sanity Checking: |          | 0/? [00:00<?, ?it/s]

/Users/macbook/UTILS/anaconda3/envs/dreams_mimb/lib/python3.11/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:441: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=9` in the `DataLoader` to improve performance.
/Users/macbook/UTILS/anaconda3/envs/dreams_mimb/lib/python3.11/site-packages/pytorch_lightning/utilities/data.py:77: Trying to infer the `batch_size` from an ambiguous collection. The batch size we found is 16. To avoid any miscalculations, use `self.log(..., batch_size=batch_size)`.
/Users/macbook/UTILS/anaconda3/envs/dreams_mimb/lib/python3.11/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:441: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=9` in the `DataLoader` to improve performance.
/Users/macbook/UTILS/anaconda3/envs/dream

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=1` reached.
/Users/macbook/UTILS/anaconda3/envs/dreams_mimb/lib/python3.11/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:441: The 'test_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=9` in the `DataLoader` to improve performance.


Testing: |          | 0/? [00:00<?, ?it/s]

────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
       Test metric             DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
        test_acc                    0.0
        test_loss           0.7641429901123047
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────


[{'test_loss': 0.7641429901123047, 'test_acc': 0.0}]

In [7]:
# grab one batch from test loader
batch = next(iter(dm.test_dataloader()))
specs, labels = batch["spec"], batch["label"]
logits = lit(specs)
preds = torch.sigmoid(logits)
print("True:", labels[:8].tolist())
print("Pred:", preds[:8].detach().round().tolist())

True: [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
Pred: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]


In [11]:
batch = next(iter(dm.test_dataloader()))

In [12]:
batch

{'spec': tensor([[0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         ...,
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.]]),
 'label': tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]),
 'identifier': ['MassSpecGymID0000201',
  'MassSpecGymID0000202',
  'MassSpecGymID0000203',
  'MassSpecGymID0000204',
  'MassSpecGymID0000205',
  'MassSpecGymID0000206',
  'MassSpecGymID0000207',
  'MassSpecGymID0000208',
  'MassSpecGymID0000209',
  'MassSpecGymID0000210',
  'MassSpecGymID0000211',
  'MassSpecGymID0000212',
  'MassSpecGymID0000213',
  'MassSpecGymID0000214',
  'MassSpecGymID0000215',
  'MassSpecGymID0000216']}

## DreaMS model

In [3]:
from benchmark.models.lit_dreams_module import LitDreamsClassifier
from benchmark.data.data_module import BenchmarkDataModule
from massspecgym.data.transforms import SpecTokenizer
import pytorch_lightning as pl

Determination of memory status is not supported on this 
 platform, measuring for memoryleaks will never fail


In [4]:
# 1) Instantiate your Lightning module, pointing at the fine-tuned checkpoint you care about:
lit = LitDreamsClassifier(
    ckpt_path="/Users/macbook/CODE/DreaMS_MIMB/data/model_checkpoints/ssl_model.ckpt",
    n_highest_peaks=60,
    lr=1e-4,
    dropout=0.1,
    train_encoder=True  # if you want to fine-tune the whole encoder
)


In [5]:
# pick N peaks (e.g. 60), yields shape [batch, 61, 2] (including precursor)
spec_transform = SpecTokenizer(n_peaks=60)

ds = ChlorineDetectionDataset(
    pth=LABELED_MGF,
    spec_transform=spec_transform,
    dtype=torch.float32
)

In [6]:
# 2) Prepare your BenchmarkDataModule as before
dm = BenchmarkDataModule(dataset=ds, batch_size=16, num_workers=0)
dm.setup()

In [7]:
batch = next(iter(dm.test_dataloader()))

In [8]:
batch

{'spec': tensor([[[8.0640e+02, 1.1000e+00],
          [8.3049e+01, 1.6216e-01],
          [1.2304e+02, 4.7047e-02],
          ...,
          [0.0000e+00, 0.0000e+00],
          [0.0000e+00, 0.0000e+00],
          [0.0000e+00, 0.0000e+00]],
 
         [[8.0640e+02, 1.1000e+00],
          [3.8418e+02, 1.7918e-01],
          [5.4526e+02, 5.0050e-02],
          ...,
          [0.0000e+00, 0.0000e+00],
          [0.0000e+00, 0.0000e+00],
          [0.0000e+00, 0.0000e+00]],
 
         [[7.8442e+02, 1.1000e+00],
          [1.3410e+02, 1.0000e+00],
          [2.3415e+02, 7.9079e-02],
          ...,
          [0.0000e+00, 0.0000e+00],
          [0.0000e+00, 0.0000e+00],
          [0.0000e+00, 0.0000e+00]],
 
         ...,
 
         [[7.8442e+02, 1.1000e+00],
          [1.3410e+02, 4.5377e-01],
          [2.3415e+02, 9.0367e-02],
          ...,
          [0.0000e+00, 0.0000e+00],
          [0.0000e+00, 0.0000e+00],
          [0.0000e+00, 0.0000e+00]],
 
         [[7.8442e+02, 1.1000e+00],
    

In [9]:
# 3) Trainer configuration
# trainer = pl.Trainer(
#     max_epochs=5,
#     accelerator="gpu",    # or "mps"
#     devices=1,
#     num_sanity_val_steps=0,
# )

trainer = pl.Trainer(
    max_epochs=1,
    limit_train_batches=5,     # run only 5 training batches
    limit_val_batches=3,       # run only 3 validation batches
    limit_test_batches=3,      # run only 3 test batches
    accelerator="cpu",         # or "gpu"
    devices=1,
)

GPU available: True (mps), used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
/Users/macbook/UTILS/anaconda3/envs/dreams_mimb/lib/python3.11/site-packages/pytorch_lightning/trainer/setup.py:187: GPU available but not used. You can set it by doing `Trainer(accelerator='gpu')`.
/Users/macbook/UTILS/anaconda3/envs/dreams_mimb/lib/python3.11/site-packages/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py:75: Starting from v1.9.0, `tensorboardX` has been removed as a dependency of the `pytorch_lightning` package, due to potential conflicts with other packages in the ML ecosystem. For this reason, `logger=True` will use `CSVLogger` as the default logger, unless the `tensorboard` or `tensorboardX` packages are found. Please `pip install lightning[extra]` or one of them to enable TensorBoard support by default


In [10]:
# 4) Train
trainer.fit(lit, datamodule=dm)


  | Name      | Type             | Params
-----------------------------------------------
0 | model     | DreamsClassifier | 95.6 M
1 | train_acc | BinaryAccuracy   | 0     
2 | val_acc   | BinaryAccuracy   | 0     
3 | val_auc   | BinaryAUROC      | 0     
-----------------------------------------------
95.6 M    Trainable params
0         Non-trainable params
95.6 M    Total params
382.202   Total estimated model params size (MB)


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

/Users/macbook/UTILS/anaconda3/envs/dreams_mimb/lib/python3.11/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:441: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=9` in the `DataLoader` to improve performance.
/Users/macbook/UTILS/anaconda3/envs/dreams_mimb/lib/python3.11/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:441: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=9` in the `DataLoader` to improve performance.
/Users/macbook/UTILS/anaconda3/envs/dreams_mimb/lib/python3.11/site-packages/pytorch_lightning/loops/fit_loop.py:298: The number of training batches (5) is smaller than the logging interval Trainer(log_every_n_steps=50). Set a lower value for log_every_n_steps if you want to see logs for the training epoch.


Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=1` reached.


In [11]:
trainer.test(lit, datamodule=dm)

/Users/macbook/UTILS/anaconda3/envs/dreams_mimb/lib/python3.11/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:441: The 'test_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=9` in the `DataLoader` to improve performance.


Testing: |          | 0/? [00:00<?, ?it/s]

────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
       Test metric             DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
        test_acc                    1.0
        test_loss           0.15856797993183136
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────


[{'test_loss': 0.15856797993183136, 'test_acc': 1.0}]

In [12]:
# 5) Save checkpoint of your fine-tuned model
trainer.save_checkpoint("dreams_chlorine_finetuned.ckpt")

## Load predtrained DreaMS

In [14]:
# 1) Reload your tuned model (you already did this):
lit2 = LitDreamsClassifier.load_from_checkpoint("dreams_chlorine_finetuned.ckpt")
lit2.eval()

LitDreamsClassifier(
  (model): DreamsClassifier(
    (spec_encoder): DreaMS(
      (fourier_enc): FourierFeatures()
      (ff_fourier): FeedForward(
        (ff): Sequential(
          (0): Linear(in_features=11994, out_features=512, bias=True)
          (1): Dropout(p=0.1, inplace=False)
          (2): ReLU()
          (3): Linear(in_features=512, out_features=512, bias=True)
          (4): Dropout(p=0.1, inplace=False)
          (5): ReLU()
          (6): Linear(in_features=512, out_features=512, bias=True)
          (7): Dropout(p=0.1, inplace=False)
          (8): ReLU()
          (9): Linear(in_features=512, out_features=512, bias=True)
          (10): Dropout(p=0.1, inplace=False)
          (11): ReLU()
          (12): Linear(in_features=512, out_features=980, bias=True)
          (13): ReLU()
        )
      )
      (ff_peak): FeedForward(
        (ff): Sequential(
          (0): Linear(in_features=2, out_features=44, bias=True)
          (1): ReLU()
        )
      )
      (tr

In [15]:
# 2) (Re)create a Trainer for testing:
test_trainer = pl.Trainer(accelerator="cpu", devices=1)

GPU available: True (mps), used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
/Users/macbook/UTILS/anaconda3/envs/dreams_mimb/lib/python3.11/site-packages/pytorch_lightning/trainer/setup.py:187: GPU available but not used. You can set it by doing `Trainer(accelerator='gpu')`.


In [16]:
# 3) Run on your existing BenchmarkDataModule:
test_results = test_trainer.test(lit2, datamodule=dm)

/Users/macbook/UTILS/anaconda3/envs/dreams_mimb/lib/python3.11/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:441: The 'test_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=9` in the `DataLoader` to improve performance.


Testing: |          | 0/? [00:00<?, ?it/s]

────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
       Test metric             DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
        test_acc            0.8043404221534729
        test_loss           0.42470064759254456
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────


In [17]:
print(test_results)

[{'test_loss': 0.42470064759254456, 'test_acc': 0.8043404221534729}]
