In [62]:
%load_ext autoreload
%autoreload 2
    
# Lobster Model Inference Notebook
import torch
from lobster.model import LobsterPCLM2, LobsterPCLM
import lobster
from try_load_dataset import find_latest_ckpt_file, load_datamodule_from_config, deep_compare
import os

# Define the test input molecule
test_mol = "CN(C)C[C@@H](O)[C@@H](c1ccccc1)c1ccc(Cl)cc1"

# Determine the device
device = 'cuda' if torch.cuda.is_available() else 'cpu'
from atomic_datasets.utils import is_molecule_sane
#import qm9_pair_gen
import time
import matplotlib.pyplot as plt
from rdkit import Chem
import numpy as np


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Load datamodule, which includes the tokenizer

In [3]:
overrides = None
config_path = '../src/lobster/hydra_config/train_molecule_improvement.yaml' #train_chembl.yaml'
datamodule, cfg, transform_fn = load_datamodule_from_config(config_path, overrides=overrides)

print("Preparing data...")
datamodule.prepare_data()

stage = "fit"
print(f"Setting up datamodule for stage: {stage}")
datamodule.setup(stage=stage)

config_dir: ../src/lobster/hydra_config
config_name: train_molecule_improvement
original_cwd /homefs/home/lawrenh6/lobster/notebooks
hydra initialized


INFO:try_load_dataset:Configuration loaded:
INFO:try_load_dataset:dryrun: false
run_test: null
compile: false
seed: 42
logger:
  project: null
  name: null
  entity: null
  _target_: lightning.pytorch.loggers.WandbLogger
  save_dir: .
  offline: false
  group: null
  notes: null
  tags: null
paths:
  root_dir: first_run
  output_dir: ${paths.root_dir}/${paths.timestamp}
  timestamp: ${now:%Y-%m-%d}T${now:%H-%M-%S.%f}
data:
  _target_: lobster.data._molecule_improvement_datamodule.MoleculeImprovementLightningDataModule
  root: /data/lawrenh6/cache/test_new_code/qm9_pairs_per_mol_5_full
  train_pair_filename: pairs_train.parquet
  val_pair_filename: pairs_val.parquet
  test_pair_filename: pairs_test.parquet
  utility_key: gap
  delta: None
  epsilon: 0.001
  batch_size: 64
  shuffle: true
  num_workers: 4
  pin_memory: true
  drop_last: true
  max_train_samples: null
  transform_fn:
    _target_: lobster.transforms.TokenizerTransform
    tokenizer:
      _target_: lobster.tokenization.Sm

Preparing data...
Setting up datamodule for stage: fit
Loaded 5480540 pairs for split 'train' using utility 'gap'
After utility filtering (> 0.001): 5204656 pairs
Loaded 87450 pairs for split 'val' using utility 'gap'
After utility filtering (> 0.001): 897 pairs
Loaded 64375 pairs for split 'test' using utility 'gap'
After utility filtering (> 0.001): 688 pairs


# Check out the train, val, and test sets

In [70]:
len(datamodule._train_dataset)

def check_out_dset(dset):
    print(f'Length: {len(dset)}\n')
    tokenizer = dset.transform.tokenizer
    num_check = 4
    inds = np.random.choice(range(10), size=3, replace=False)
    for ind in inds:
        input_ids = dset[ind]['input_ids']
        mask = dset[ind]['attention_mask']
        to_decode = input_ids[mask != 0]
        decoded = tokenizer.decode(to_decode.tolist())
        print(f'Decoded {ind}: {decoded} \n')

dsets = {'train': datamodule._train_dataset, 'val': datamodule._val_dataset, 'test': datamodule._test_dataset}
for ky, dset in dsets.items():
    print(f'\n\n{ky}\n\n')
    check_out_dset(dset)



train


Length: 5204656

Decoded 7: <cls> [H] C 1 ( [H] ) C ( [H] ) ( [H] ) C ( [H] ) ( [H] ) C ( [H] ) ( [H] ) C ( [H] ) ( [H] ) C 1 ( [H] ) [H] <sep> [H] C ( [H] ) ( [H] ) [H] <eos> 

Decoded 8: <cls> [H] [C@] 1 2 [C@] 3 ( [H] ) [C@@] 1 ( [H] ) [C@@] 1 ( [H] ) [C@] 3 ( [H] ) [C@@] 2 1 [H] <sep> [H] C ( [H] ) ( [H] ) [H] <eos> 

Decoded 1: <cls> [H] c 1 c ( [H] ) c ( [H] ) c ( [H] ) c ( [H] ) c 1 [H] <sep> [H] C ( [H] ) ( [H] ) [H] <eos> 



val


Length: 897

Decoded 2: <cls> [H] C ( [H] ) ( [H] ) [C@] 1 2 [C@] 3 ( [H] ) [C@@] 1 ( [H] ) [C@@] 1 ( [H] ) [C@] 3 ( [H] ) [C@] 1 2 C ( [H] ) ( [H] ) [H] <sep> [H] C ( [H] ) ( [H] ) [H] <eos> 

Decoded 4: <cls> [H] C ( [H] ) ( [H] ) C ( [H] ) ( [H] ) [H] <sep> [H] C ( [H] ) ( [H] ) [H] <eos> 

Decoded 1: <cls> [H] c 1 c ( [H] ) c ( [H] ) c ( [H] ) c ( [H] ) c 1 [H] <sep> [H] C ( [H] ) ( [H] ) [H] <eos> 



test


Length: 688

Decoded 6: <cls> [H] C 1 ( [H] ) C ( [H] ) ( [H] ) C ( [H] ) ( [H] ) C ( [H] ) ( [H] ) C 1 ( [H] ) [H] <sep> [H] C 

In [67]:

np.random.choice(range(10), size=3, replace=False)

array([1, 3, 2])

In [66]:
datamodule._train_dataset[100]['input_ids'].tolist()[0]
input_ids = datamodule._train_dataset[100]['input_ids']
mask = datamodule._train_dataset[100]['attention_mask']
input_ids[mask != 0]

tensor([   0,   53,    8,   13,   10,   53,   11,   12,    8,   10,   53,   11,
          10,   53,   11,   34,   14,   10,   53,   11, 1184,   17,   35,   21,
          10,   53,   11,   34,   14,   10,   53,   11,   35,   21,   10,   53,
          11,   34,   17,   13,   53,    5,   53,   16,   10,   53,   11,   53,
           2])

In [52]:
datamodule._train_dataset[100]['input_ids'].tolist()[0]

[0,
 53,
 8,
 13,
 10,
 53,
 11,
 12,
 8,
 10,
 53,
 11,
 10,
 53,
 11,
 34,
 14,
 10,
 53,
 11,
 1184,
 17,
 35,
 21,
 10,
 53,
 11,
 34,
 14,
 10,
 53,
 11,
 35,
 21,
 10,
 53,
 11,
 34,
 17,
 13,
 53,
 5,
 53,
 16,
 10,
 53,
 11,
 53,
 2,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1

In [40]:
tok = datamodule._train_dataset.transform.tokenizer

In [53]:
tok.decode(datamodule._train_dataset[100]['input_ids'].tolist()[0])

'<cls> [H] C 1 ( [H] ) O C ( [H] ) ( [H] ) [C@@] 2 ( [H] ) [N@@H+] 3 [C@] 4 ( [H] ) [C@@] 2 ( [H] ) [C@] 4 ( [H] ) [C@@] 3 1 [H] <sep> [H] N ( [H] ) [H] <eos> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <

In [269]:
cfg.model

{'_target_': 'lobster.model.LobsterPCLM2', 'model_name': 'CLM_mini', 'lr': 0.0001, 'max_length': 1024, 'ckpt_path': None, 'num_training_steps': '${trainer.max_steps}', 'num_warmup_steps': 1000, 'tokenizer_dir': 'garbage_test', 'transform_fn': '${data.transform_fn}', 'model_kwargs': {'embedding_layer': 'linear_pos', 'hidden_act': 'gelu'}, 'scheduler_kwargs': None}

# Load the checkpoint

In [71]:
# Load the LobsterPMLM model

ckpt_path = find_latest_ckpt_file('/data/lawrenh6/lobster_runs/val_initialized_propen_4', use_val_loss=True) # serious_chembl_large2
print(ckpt_path)
#ckpt_path = "/homefs/home/lawrenh6/lobster/first_run/2025-07-14T17-49-53.473790/last.ckpt"
#ckpt_path = "/data/lawrenh6/lobster_runs/ignore/2025-08-01T21-34-23.837619/last.ckpt"
#ckpt_path = "/data/lawrenh6/lobster_runs/ignore/2025-08-04T18-41-43.437182/last.ckpt"

# oops, should be getting transform_fn from elsewhere??
model = LobsterPCLM2("CLM_150M", transform_fn=transform_fn).to(device) # CLM_mini # CLM_150M  # CLM_mini
model.eval()

ckpt = torch.load(ckpt_path, map_location="cpu", weights_only=False)
model.load_state_dict(ckpt['state_dict'])


/data/lawrenh6/lobster_runs/val_initialized_propen_4/2025-08-12T06-51-27.224517/epoch=0-step=20330-val_loss=0.5613.ckpt


<All keys matched successfully>

In [72]:
print(ckpt_path)

/data/lawrenh6/lobster_runs/val_initialized_propen_4/2025-08-12T06-51-27.224517/epoch=0-step=20330-val_loss=0.5613.ckpt


In [6]:
model.tokenizer._tokenizer.post_processor = None  # changes by reference, seemingly -- so 

In [7]:
model.tokenizer.decode(model.tokenizer.encode("<cls>"))

'<cls>'

In [8]:
model.num_trainable_parameters

155572848

In [73]:
# do we need to add cls token or eos token?? also does it make sense to use cls token?
model.eval()
seed_seq = "<cls>"
print('Tokenization of seed_seq:')
print(model.tokenizer.encode(seed_seq))
print(model.tokenizer.decode(model.tokenizer.encode(seed_seq)))
out = model.sample(seed_seq=seed_seq, temperature=0.95, max_length=300) #CN(C)C[C@@H](O)[C@@H](c1") #, max_length=256) # CN(C)C[C@@H](O)[C@@H](c1
# how to add start token to this??
out[0]

Device set to use cuda:0
Both `max_new_tokens` (=256) and `max_length`(=300) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


Tokenization of seed_seq:
[0]
<cls>


'<cls>[H] O C 1 ( [H] ) C ( = O ) C ( [H] ) ( [H] ) C ( [H] ) ( [H] ) C 1 = O [H] O [C@] 1 2 C ( [H] ) ( [H] ) C ( = O ) [C@@] 1 ( [H] ) [N@H+] 1 C ( [H] ) ( [H] ) [C@] 1 2 [H]'

In [349]:
print(model.tokenizer.decode(model.tokenizer.encode(f"<cls>{test_mol}")))

<cls> C N ( C ) C [C@@H] ( O ) [C@@H] ( c 1 c c c c c 1 ) c 1 c c c ( Cl ) c c 1


In [350]:
model.model.generate(torch.tensor(model.tokenizer.encode(f"<cls>{test_mol}")).reshape(1,-1).to(device))

tensor([[ 0,  8, 16, 10,  8, 11,  8, 20, 10, 12, 11, 20, 10,  9, 13,  9,  9,  9,
          9,  9, 13, 11,  9, 13,  9,  9,  9, 10, 26, 11,  9,  9, 13,  2]],
       device='cuda:0')

In [11]:
test_mol

'CN(C)C[C@@H](O)[C@@H](c1ccccc1)c1ccc(Cl)cc1'

In [28]:
input_ids = torch.tensor(model.tokenizer.encode(f"<cls>{test_mol}")).reshape(1,-1).to(device)
print(input_ids)
generated = model.model.generate(input_ids)
print(generated)
print(model.tokenizer.decode([int(g) for g in generated.reshape(-1)]))

temp = model.tokenizer.decode([int(g) for g in generated.reshape(-1)])
temp = temp.split(' ')
seps = [i for (i, val) in enumerate(temp) if val == '<sep>']
print(seps)


tensor([[ 0,  8, 16, 10,  8, 11,  8, 20, 10, 12, 11, 20, 10,  9, 13,  9,  9,  9,
          9,  9, 13, 11,  9, 13,  9,  9,  9, 10, 26, 11,  9,  9, 13]],
       device='cuda:0')
tensor([[ 0,  8, 16, 10,  8, 11,  8, 20, 10, 12, 11, 20, 10,  9, 13,  9,  9,  9,
          9,  9, 13, 11,  9, 13,  9,  9,  9, 10, 26, 11,  9,  9, 13, 53,  5, 53,
          8, 10, 15, 12, 11, 16, 10, 53, 11,  8, 10, 53, 11, 10, 53, 11,  8]],
       device='cuda:0')
<cls> C N ( C ) C [C@@H] ( O ) [C@@H] ( c 1 c c c c c 1 ) c 1 c c c ( Cl ) c c 1 [H] <sep> [H] C ( = O ) N ( [H] ) C ( [H] ) ( [H] ) C
[34]


In [343]:
type(generated)

torch.Tensor

In [5]:
'a | b  c d e | c'.split('|')[0].split(' ')

['a', '']

In [341]:
[int(g) for g in generated]

ValueError: only one element tensors can be converted to Python scalars

In [16]:
# looped test for non-paired dataset
num_samples = 1 #50 #300 #300 #500
num_sane = 0.0
seed_seq = "<cls>" #<cls>CN(C)C[C@@H](O)" # no cls, just to see
print('Tokenization of seed_seq:')
print(model.tokenizer.encode(seed_seq))
print(model.tokenizer.decode(model.tokenizer.encode(seed_seq)))

import time
start = time.time()
all_smiles = []
is_sane = []
for i in range(num_samples):
    out = model.sample(seed_seq=seed_seq, temperature=0.95, max_length=300)
    temp = out[0].split(" ")
    temp[0] = temp[0][5:]
    smiles = ''.join(temp)
    all_smiles.append(smiles)
    mol = Chem.MolFromSmiles(smiles)
    try:
        res = is_molecule_sane(mol)
        is_sane.append(res)
        if res:
            num_sane += 1
    except Exception as e:
        1+1; #print(f'not sane due to {e}')
    if i % 20 == 0:
        print(f'Fraction sane so far: {num_sane / (i+1)}, {num_sane} out of {(i+1)}')
print(f'Final fraction sane: {num_sane / num_samples}, {num_sane} out of {num_samples}')
end = time.time()


Device set to use cuda:0
Both `max_new_tokens` (=256) and `max_length`(=300) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


Tokenization of seed_seq:
[0]
<cls>
Fraction sane so far: 0.0, 0.0 out of 1
Final fraction sane: 0.0, 0.0 out of 1


[17:36:06] Explicit valence for atom # 9 H, 2, is greater than permitted


In [None]:
# what about conditioning with seed seq?

In [1]:
tokenizer

NameError: name 'tokenizer' is not defined

In [31]:
# looped test for paired dataset
# looped test
num_samples = 50 #300 #300 #500
num_sane = 0.0
seed_seq = "<cls>" #<cls>CN(C)C[C@@H](O)" # no cls, just to see
print('Tokenization of seed_seq:')
print(model.tokenizer.encode(seed_seq))
print(model.tokenizer.decode(model.tokenizer.encode(seed_seq)))

import time
start = time.time()
all_smiles1, all_smiles2 = [], []
is_sane1 = []
is_sane2 = []
num_sane1, num_sane2 = 0, 0
for i in range(num_samples):
    out = model.sample(seed_seq=seed_seq, temperature=0.95, max_length=300)
    temp = out[0].split(" ")
    temp[0] = temp[0][5:]
    seps = [i for (i, val) in enumerate(temp) if val == '<sep>']
    if len(seps) == 0:
        print('no separator found')
        continue
    else:
        print('separator found')
    smiles1 = ''.join(temp[:seps[0]])
    smiles2 = ''.join(temp[seps[0]:])
    print('smiles1', smiles1, 'smiles2', smiles2)
    all_smiles1.append(smiles1)
    all_smiles2.append(smiles2)
    mol1 = Chem.MolFromSmiles(smiles1)
    mol2 = Chem.MolFromSmiles(smiles2)
    try:
        res = is_molecule_sane(mol1)
        is_sane1.append(res)
        if res:
            num_sane1 += 1
    except Exception as e:
        1+1; #print(f'not sane due to {e}')
    try:
        res = is_molecule_sane(mol2)
        is_sane2.append(res)
        if res:
            num_sane2 += 1
    except Exception as e:
        1+1; #print(f'not sane due to {e}')
    if i % 20 == 0:
        print(f'Fraction sane so far: {num_sane / (i+1)}, {num_sane} out of {(i+1)}')
print(f'Final fraction sane: {num_sane / num_samples}, {num_sane} out of {num_samples}')
end = time.time()


Device set to use cuda:0
Both `max_new_tokens` (=256) and `max_length`(=300) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


Tokenization of seed_seq:
[0]
<cls>


Device set to use cuda:0
Both `max_new_tokens` (=256) and `max_length`(=300) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


no separator found


Device set to use cuda:0
Both `max_new_tokens` (=256) and `max_length`(=300) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


no separator found


Device set to use cuda:0
Both `max_new_tokens` (=256) and `max_length`(=300) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


no separator found


Device set to use cuda:0
Both `max_new_tokens` (=256) and `max_length`(=300) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


no separator found


Device set to use cuda:0
Both `max_new_tokens` (=256) and `max_length`(=300) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


no separator found


Device set to use cuda:0
Both `max_new_tokens` (=256) and `max_length`(=300) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


no separator found


Device set to use cuda:0
Both `max_new_tokens` (=256) and `max_length`(=300) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


no separator found


Device set to use cuda:0
Both `max_new_tokens` (=256) and `max_length`(=300) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


no separator found


Device set to use cuda:0
Both `max_new_tokens` (=256) and `max_length`(=300) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


no separator found


Device set to use cuda:0
Both `max_new_tokens` (=256) and `max_length`(=300) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


no separator found


Device set to use cuda:0
Both `max_new_tokens` (=256) and `max_length`(=300) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


no separator found


Device set to use cuda:0
Both `max_new_tokens` (=256) and `max_length`(=300) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


no separator found


Device set to use cuda:0
Both `max_new_tokens` (=256) and `max_length`(=300) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


no separator found


Device set to use cuda:0
Both `max_new_tokens` (=256) and `max_length`(=300) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


no separator found


Device set to use cuda:0
Both `max_new_tokens` (=256) and `max_length`(=300) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


no separator found


KeyboardInterrupt: 

In [17]:
out

['<cls>[H] O N C 1 = C ( [H] ) [C-] ( [H] ) [C@@] 2 ( C ( [H] ) ( [H] ) [H] ) C ( [H] ) ( [H] ) [C@@] 1 2 [H] [H] O N [C@@H] 1 [C@@] ( [H] ) ( C ( [H] ) ( [H] ) [H] ) [C@@] 2 ( [H] ) C ( [H] ) ( [H] ) [C@@] 1 2 C ( [H] ) ( [H] ) [H]']

In [27]:
seps = [i for (i, val) in enumerate(temp) if val == '<sep>']
print(seps)

[]


In [29]:
len(seps)

1

In [362]:
print(f'Final fraction sane: {num_sane / num_samples}, {num_sane} out of {num_samples}')
print(f'Elapsed time: {((end - start)/ 60.0):.2f} min, or {((end - start)/ (num_samples)):.4f} sec per sample')

Final fraction sane: 0.84, 42.0 out of 50
Elapsed time: 0.67 min, or 0.7996 sec per sample


In [363]:
all_smiles

['CCOC(=O)Nc1sc2c(c1C#N)CCC(C)C2',
 'CN1C(=O)CS/C=C/COc2cc(N3CCOCC3)c(N3CCOCC3)cc21',
 'O=C(Nc1nc2ccccc2c(=O)s1)Nc1ccccc1F',
 'COc1ccc(CCNc2ccnc3oc4ccccc4c(=O)c23)cc1OC',
 'COc1cc2ncnc(NCc3ccc(C)o3)c2cc1OC',
 'COc1ccc(NC(=O)[C@@H]2CCCCNC(=O)CCCCCCC(=O)N[C@H](C(=O)N[C@@H](Cc3ccccc3)C(=O)NC(Cc3ccccc3)C(N)=O)C(C)C)cc1',
 'CC(F)(F)c1cn[nH]c1C1CCN(c2cc(N)ncn2)CC1',
 'CC(=O)NC1(c2cccc(C(F)(F)F)c2)CCN(c2ncnc3c2cc(C)n3C)CC1',
 'CCCCN(CCCC)c1nccc2[nH]c3ccccc3c12',
 'O=C(CCN1CCCCC1)Nc1nncs1',
 'CN1[C@H](C(=O)NCc2ccc(F)c(F)c2)CCS1(=O)=O',
 'CN(Cc1cnc2nc(N)nc(N)c2n1)c1cccCCC(=O)NC(CCC(=O)N[C@@H](CCC(=O)N[C@@H](CCC(=O)N[C@@H](CCC(=O)N[C@@H](Cc2ccccc2F)C(=O)OC(C)(C)C)C(=O)O)C(=O)O)C(=O)O',
 'NC(=O)NC(=O)CCC1C2CC3OC(=O)C1C3C2',
 'O=S(=O)(c1cccc(Oc2ncc(Cl)cc2NS(=O)(=O)C(F)(F)F)c1)N1CCc2ccccc2C1',
 'COCC(=N)Nc1nnc(SCC(=O)Nc2c(C)cccc2C)s1.Cl',
 'CN[C@@H](C)C(=O)N[C@H](C(=O)N[C@@H](CC(C)C)C(=O)Nc1nccc2ccccc12)C(C)C.O=CO',
 'O=C1C=C(/C=C/c2ccccc2)O[C@H](COCc2ccccc2)C1',
 'C[S+]([O-])c1nc(-c2ccccc2)c(Cc2cc

In [314]:
tokenizer.cls_token

'<cls>'

In [316]:
tokenizer.sep_token

'<sep>'

# Check tokenizers: look at vocabulary, compare tokenizers, test

In [175]:
mask = datamodule._train_dataset[-1]['attention_mask'] != 0
' '.join([tokenizer.decode(i) for i in datamodule._train_dataset[-1]['input_ids'][mask]])
# note: the mask mlay be very small piece of data, which indicates should change padding

'<cls> [H] C 1 ( [H] ) [N@H+] 2 [C@@] 3 ( [H] ) [C@@] 1 4 O [C@@] 1 ( [H] ) [C@@] ( [H] ) ( [C@] 2 4 [H] ) [C@] 1 3 [H] <sep> [H] C 1 ( [H] ) [C@@] 2 ( [H] ) [C@@] 3 ( [H] ) [C@@] 1 4 O [C@] 1 ( [H] ) [C@@] 3 ( [H] ) [C@] 1 ( [H] ) [C@] 2 4 [H] <eos>'

In [157]:
#tokenizer.convert_ids_to_tokens(out['input_ids'])

In [None]:
tokenizer = model._transform_fn.tokenizer
tokenizer2 = datamodule._train_dataset.transform.tokenizer
deep_compare(tokenizer2, tokenizer, fail_fast=False)

In [50]:
#model.tokenizer._tokenizer.post_processor=None

In [158]:
#ckpt['hyper_parameters']

# look at next probs for debugging

In [95]:
model.eval()
test_nll, test_logits = model.get_nll_and_logits(test_mol)
next_probs = test_logits.softmax(-1)[:, -1]
import matplotlib.pyplot as plt

plt.bar(torch.arange(1226), next_probs.cpu().squeeze(0))
