In [4]:
#!/usr/bin/env python3
"""
Script to load the datamodule based on the given hydra config and input arguments.
"""
%load_ext autoreload
%autoreload 2
import argparse
import logging
import sys
from pathlib import Path
from typing import Optional
import os

import hydra
import hydra.core.global_hydra
import lightning.pytorch as pl
from omegaconf import DictConfig, OmegaConf

# Add the src directory to the Python path
#sys.path.insert(0, str(Path(__file__).parent.parent / "src"))

from lobster.data._molecule_improvement_datamodule import MoleculeImprovementLightningDataModule
from try_load_dataset import load_datamodule_from_config

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [7]:
# Load the Hydra config found at `config_path` and get back the datamodule plus
# the composed config; no overrides are applied.
# NOTE(review): the config echoed in this cell's output targets
# ChEMBLLightningDataModule, yet the later `datamodule` cells report
# "pairs"/"utility" output and tuple items -- presumably `datamodule` was rebound
# by a cell that is no longer in the notebook. Confirm on a fresh kernel.
config_path = '../src/lobster/hydra_config/train_chembl'
overrides = None
datamodule, cfg = load_datamodule_from_config(config_path, overrides=overrides)

INFO:try_load_dataset:Configuration loaded:
INFO:try_load_dataset:dryrun: false
run_test: true
compile: false
seed: 42
logger:
  project: null
  name: null
  entity: null
  _target_: lightning.pytorch.loggers.WandbLogger
  save_dir: .
  offline: false
  group: null
  notes: null
  tags: null
paths:
  root_dir: first_run
  output_dir: ${paths.root_dir}/${paths.timestamp}
  timestamp: ${now:%Y-%m-%d}T${now:%H-%M-%S.%f}
data:
  _target_: lobster.data.ChEMBLLightningDataModule
  root: $DATA_DIR/chembl
  download: true
  lengths:
  - 0.9
  - 0.05
  - 0.05
  batch_size: 64
  num_workers: 1
  seed: 0
  transform_fn:
    _target_: lobster.transforms.TokenizerTransform
    tokenizer:
      _target_: lobster.tokenization.SmilesTokenizer2Fast
    padding: max_length
    truncation: true
    max_length: ${model.max_length}
model:
  _target_: lobster.model.LobsterPCLM2
  model_name: CLM_mini
  lr: 0.0001
  max_length: 1024
  ckpt_path: null
  num_training_steps: ${trainer.max_steps}
  num_warmup_st

config_dir: ../src/lobster/hydra_config
config_name: train_chembl
changed to config_dir ../src/lobster/hydra_config new_cwd /homefs/home/lawrenh6/lobster/src/lobster/hydra_config
hydra initialized


In [19]:
# Call the datamodule's prepare_data() and then setup() for the chosen stage,
# announcing each step on stdout.
# NOTE(review): in the recorded output the 'val' and 'test' splits load the same
# 107922 pairs as 'train' and then drop to 0 after utility filtering -- worth
# checking how the datamodule assigns pairs to splits.
stage = "fit"

print("Preparing data...")
datamodule.prepare_data()

print(f"Setting up datamodule for stage: {stage}")
datamodule.setup(stage=stage)

Preparing data...
Setting up datamodule for stage: fit
Loaded 107922 pairs for split 'train' using utility 'gap'
After utility filtering (> 1): 107922 pairs
Loaded 107922 pairs for split 'val' using utility 'gap'
After utility filtering (> 1): 0 pairs
Loaded 107922 pairs for split 'test' using utility 'gap'
After utility filtering (> 1): 0 pairs


In [20]:
# Number of items held in the (private) `_train_dataset` attribute.
len(datamodule._train_dataset)

107922

In [22]:
# Check what kind of object a single training item is (a tuple in the recorded run).
type(datamodule._train_dataset[0])

tuple

In [24]:
# First element of the first training item: a dict of tokenized fields.
# TODO(review): the recorded dict has a 'labels' entry that looks like a copy of
# 'input_ids' (output is truncated), while the ChEMBL dataset items inspected
# further down have none -- find out where 'labels' gets added.
datamodule._train_dataset[0][0] # open question: why is a 'labels' entry present here?

{'input_ids': tensor([[ 0, 53, 16,  ...,  1,  1,  1]]),
 'labels': tensor([[ 0, 53, 16,  ...,  1,  1,  1]]),
 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0]])}

In [25]:
# Token ids of the first training item; the recorded tensor is nested one level
# deep, i.e. it has a leading dimension of size 1.
datamodule._train_dataset[0][0]['input_ids']

tensor([[ 0, 53, 16,  ...,  1,  1,  1]])

In [26]:
# len() of a tensor is the size of its first dimension, so this reports the
# leading singleton axis (1), not the number of tokens; inspect `.shape` to see
# the sequence length.
len(datamodule._train_dataset[0][0]['input_ids'])

1

In [23]:
# Second element of the first training item; an empty list in the recorded run.
# NOTE(review): what this slot is meant to hold is not visible from the notebook.
datamodule._train_dataset[0][1]

[]

In [10]:
# Load the train_chembl Hydra config under `chembl_`-prefixed names, then run
# prepare_data() and setup() on the resulting datamodule for the "fit" stage.
chembl_config_path = '../src/lobster/hydra_config/train_chembl'
overrides = None
chembl_datamodule, chembl_cfg = load_datamodule_from_config(
    chembl_config_path,
    overrides=overrides,
)

stage = "fit"

print("Preparing data...")
chembl_datamodule.prepare_data()

print(f"Setting up datamodule for stage: {stage}")
chembl_datamodule.setup(stage=stage)

INFO:try_load_dataset:Configuration loaded:
INFO:try_load_dataset:dryrun: false
run_test: true
compile: false
seed: 42
logger:
  project: null
  name: null
  entity: null
  _target_: lightning.pytorch.loggers.WandbLogger
  save_dir: .
  offline: false
  group: null
  notes: null
  tags: null
paths:
  root_dir: first_run
  output_dir: ${paths.root_dir}/${paths.timestamp}
  timestamp: ${now:%Y-%m-%d}T${now:%H-%M-%S.%f}
data:
  _target_: lobster.data.ChEMBLLightningDataModule
  root: $DATA_DIR/chembl
  download: true
  lengths:
  - 0.9
  - 0.05
  - 0.05
  batch_size: 64
  num_workers: 1
  seed: 0
  transform_fn:
    _target_: lobster.transforms.TokenizerTransform
    tokenizer:
      _target_: lobster.tokenization.SmilesTokenizer2Fast
    padding: max_length
    truncation: true
    max_length: ${model.max_length}
model:
  _target_: lobster.model.LobsterPCLM2
  model_name: CLM_mini
  lr: 0.0001
  max_length: 1024
  ckpt_path: null
  num_training_steps: ${trainer.max_steps}
  num_warmup_st

config_dir: ../src/lobster/hydra_config
config_name: train_chembl
changed to config_dir ../src/lobster/hydra_config new_cwd /homefs/home/lawrenh6/lobster/src/lobster/hydra_config
hydra initialized


In [11]:
# Display the repr to confirm which datamodule class was instantiated.
chembl_datamodule

<lobster.data._chembl_datamodule.ChEMBLLightningDataModule at 0x7efc38c9f830>

In [13]:
# First item of the ChEMBL training split.
# NOTE(review): the recorded run raised AttributeError because `_train_dataset`
# did not exist on the instance yet -- presumably this cell was executed before
# setup() had run; re-run the notebook top to bottom to clear the stale error.
chembl_datamodule._train_dataset[0]

AttributeError: 'ChEMBLLightningDataModule' object has no attribute '_train_dataset'

Preparing data...


  self._data = pandas.read_csv(path, sep=None)


Setting up datamodule for stage: fit


In [15]:
# Number of items in the ChEMBL training split.
len(chembl_datamodule._train_dataset)

1747265

In [31]:
# 1747265 is the training-split length reported by the previous cell and 64 is the
# configured batch_size, so this is presumably the size of the final partial batch.
1747265 % 64

1

In [34]:
# NOTE(review): scratch arithmetic -- 64 matches the configured batch_size, but
# what the 15 stands for is not evident from the notebook; name it or drop the cell.
64*15

960

In [16]:
# First item of the ChEMBL training split: a dict with 'input_ids' and
# 'attention_mask' only (per the recorded output).
chembl_datamodule._train_dataset[0] # unlike the `datamodule` items inspected earlier, there is no 'labels' entry here

{'input_ids': tensor([[ 0,  9, 13,  ...,  1,  1,  1]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0]])}

In [35]:
# Follow `.dataset.transform.tokenizer` from the training split to display the
# tokenizer object (special-token ids are listed in its repr).
chembl_datamodule._train_dataset.dataset.transform.tokenizer

SmilesTokenizer2Fast(name_or_path='', vocab_size=1226, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'eos_token': '<eos>', 'unk_token': '<unk>', 'sep_token': '<sep>', 'pad_token': '<pad>', 'cls_token': '<cls>', 'mask_token': '<mask>'}, clean_up_tokenization_spaces=False, added_tokens_decoder={
	0: AddedToken("<cls>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("<pad>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("<eos>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	3: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	4: AddedToken("<mask>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	5: AddedToken("<sep>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),

In [25]:
# Decode the first training sample token by token and pair each decoded token
# with its attention-mask value, to check that padding positions are masked out.
# The sample and the tokenizer are looked up once up front: the original
# one-liner indexed `_train_dataset[0]` twice (presumably re-running the
# tokenizer transform each time) and re-resolved the tokenizer attribute chain
# on every iteration.
first_sample = chembl_datamodule._train_dataset[0]
smiles_tokenizer = chembl_datamodule._train_dataset.dataset.transform.tokenizer
[
    (smiles_tokenizer.decode(token_id), mask_value)
    for token_id, mask_value in zip(
        first_sample['input_ids'][0],
        first_sample['attention_mask'][0],
    )
]





[('<cls>', tensor(1)),
 ('c', tensor(1)),
 ('1', tensor(1)),
 ('c', tensor(1)),
 ('c', tensor(1)),
 ('c', tensor(1)),
 ('(', tensor(1)),
 ('C', tensor(1)),
 ('N', tensor(1)),
 ('2', tensor(1)),
 ('C', tensor(1)),
 ('C', tensor(1)),
 ('C', tensor(1)),
 ('3', tensor(1)),
 ('(', tensor(1)),
 ('C', tensor(1)),
 ('C', tensor(1)),
 ('2', tensor(1)),
 (')', tensor(1)),
 ('C', tensor(1)),
 ('C', tensor(1)),
 ('3', tensor(1)),
 ('C', tensor(1)),
 ('N', tensor(1)),
 ('c', tensor(1)),
 ('2', tensor(1)),
 ('c', tensor(1)),
 ('c', tensor(1)),
 ('c', tensor(1)),
 ('c', tensor(1)),
 ('c', tensor(1)),
 ('2', tensor(1)),
 ('N', tensor(1)),
 ('2', tensor(1)),
 ('C', tensor(1)),
 ('C', tensor(1)),
 ('C', tensor(1)),
 ('C', tensor(1)),
 ('C', tensor(1)),
 ('2', tensor(1)),
 (')', tensor(1)),
 ('c', tensor(1)),
 ('c', tensor(1)),
 ('1', tensor(1)),
 ('<eos>', tensor(1)),
 ('<pad>', tensor(0)),
 ('<pad>', tensor(0)),
 ('<pad>', tensor(0)),
 ('<pad>', tensor(0)),
 ('<pad>', tensor(0)),
 ('<pad>', tensor(0)),

In [26]:
# Ask the datamodule for its training dataloader.
train_dl = chembl_datamodule.train_dataloader()

In [28]:
# Pull a single batch from the training dataloader for inspection.
# next(iter(...)) fetches exactly one batch and raises StopIteration if the
# loader yields nothing, whereas `for ...: break` would silently leave `batch`
# unbound (or stale from an earlier run) in that case.
# NOTE(review): the tokenizers fork warning recorded below this cell can be
# avoided by setting the TOKENIZERS_PARALLELISM environment variable before the
# dataloader workers are started, as the warning itself suggests.
batch = next(iter(train_dl))

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [30]:
# Shape of the batched token ids. The recorded value is [64, 1, 1024]: the
# configured batch_size, the singleton axis each item already carries, and the
# configured max_length.
# NOTE(review): confirm that whatever consumes the batch squeezes or otherwise
# expects that middle axis.
batch['input_ids'].shape

torch.Size([64, 1, 1024])

In [39]:
# Print every attention-mask entry of the first training sample, one per line.
first_attention_mask = chembl_datamodule._train_dataset[0]['attention_mask'][0]
for mask_value in first_attention_mask:
    print(mask_value)

tensor(1)
tensor(1)
tensor(1)
tensor(1)
tensor(1)
tensor(1)
tensor(1)
tensor(1)
tensor(1)
tensor(1)
tensor(1)
tensor(1)
tensor(1)
tensor(1)
tensor(1)
tensor(1)
tensor(1)
tensor(1)
tensor(1)
tensor(1)
tensor(1)
tensor(1)
tensor(1)
tensor(1)
tensor(1)
tensor(1)
tensor(1)
tensor(1)
tensor(1)
tensor(1)
tensor(1)
tensor(1)
tensor(1)
tensor(1)
tensor(1)
tensor(1)
tensor(1)
tensor(1)
tensor(1)
tensor(1)
tensor(1)
tensor(1)
tensor(1)
tensor(1)
tensor(1)
tensor(0)
tensor(0)
tensor(0)
tensor(0)
tensor(0)
tensor(0)
tensor(0)
tensor(0)
tensor(0)
tensor(0)
tensor(0)
tensor(0)
tensor(0)
tensor(0)
tensor(0)
tensor(0)
tensor(0)
tensor(0)
tensor(0)
tensor(0)
tensor(0)
tensor(0)
tensor(0)
tensor(0)
tensor(0)
tensor(0)
tensor(0)
tensor(0)
tensor(0)
tensor(0)
tensor(0)
tensor(0)
tensor(0)
tensor(0)
tensor(0)
tensor(0)
tensor(0)
tensor(0)
tensor(0)
tensor(0)
tensor(0)
tensor(0)
tensor(0)
tensor(0)
tensor(0)
tensor(0)
tensor(0)
tensor(0)
tensor(0)
tensor(0)
tensor(0)
tensor(0)
tensor(0)
tensor(0)
tensor(0)


In [42]:
import torch

# Count the positions of the first training sample whose attention-mask value is 1.
first_attention_mask = chembl_datamodule._train_dataset[0]['attention_mask'][0]
torch.sum(first_attention_mask == 1)

tensor(45)

In [44]:
# Print every token id of the first training sample, one per line.
first_input_ids = chembl_datamodule._train_dataset[0]['input_ids'][0]
for token_id in first_input_ids:
    print(token_id)

tensor(0)
tensor(9)
tensor(13)
tensor(9)
tensor(9)
tensor(9)
tensor(10)
tensor(8)
tensor(16)
tensor(14)
tensor(8)
tensor(8)
tensor(8)
tensor(17)
tensor(10)
tensor(8)
tensor(8)
tensor(14)
tensor(11)
tensor(8)
tensor(8)
tensor(17)
tensor(8)
tensor(16)
tensor(9)
tensor(14)
tensor(9)
tensor(9)
tensor(9)
tensor(9)
tensor(9)
tensor(14)
tensor(16)
tensor(14)
tensor(8)
tensor(8)
tensor(8)
tensor(8)
tensor(8)
tensor(14)
tensor(11)
tensor(9)
tensor(9)
tensor(13)
tensor(2)
tensor(1)
tensor(1)
tensor(1)
tensor(1)
tensor(1)
tensor(1)
tensor(1)
tensor(1)
tensor(1)
tensor(1)
tensor(1)
tensor(1)
tensor(1)
tensor(1)
tensor(1)
tensor(1)
tensor(1)
tensor(1)
tensor(1)
tensor(1)
tensor(1)
tensor(1)
tensor(1)
tensor(1)
tensor(1)
tensor(1)
tensor(1)
tensor(1)
tensor(1)
tensor(1)
tensor(1)
tensor(1)
tensor(1)
tensor(1)
tensor(1)
tensor(1)
tensor(1)
tensor(1)
tensor(1)
tensor(1)
tensor(1)
tensor(1)
tensor(1)
tensor(1)
tensor(1)
tensor(1)
tensor(1)
tensor(1)
tensor(1)
tensor(1)
tensor(1)
tensor(1)
tensor(1)
ten

In [46]:
# First 46 token ids of the first sample: the 45 unmasked positions counted
# earlier (<cls> = 0 through <eos> = 2) followed by the first <pad> (id 1).
chembl_datamodule._train_dataset[0]['input_ids'][0][0:46]

tensor([ 0,  9, 13,  9,  9,  9, 10,  8, 16, 14,  8,  8,  8, 17, 10,  8,  8, 14,
        11,  8,  8, 17,  8, 16,  9, 14,  9,  9,  9,  9,  9, 14, 16, 14,  8,  8,
         8,  8,  8, 14, 11,  9,  9, 13,  2,  1])