In [1]:
import random
from collections import defaultdict
from functools import partial

import torch
from torch.optim import AdamW
from torch.utils.data import DataLoader
from tqdm.auto import tqdm
from transformers import (
    AutoModel,
    AutoTokenizer,
    DataCollatorWithPadding,
    get_constant_schedule_with_warmup,
    get_cosine_schedule_with_warmup,
)

import wandb
from config import TrainArgs
from data import (
    MultiContrastiveDS,
    load_kasa_regression,
    multiple_contrastive_collate,
    tokenize,
)
from model import MultiContrastiveModel

# Open the Weights & Biases run that this notebook session is associated with.
wandb.init(
    name="init_v10",
    project="delta",
)

# Training configuration object; the dataloader worker count is overridden to 0 here.
args = TrainArgs(
    num_dataloader_workers=0,
)


Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mrahul-e-dev[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [2]:
# Load the dataset splits and record the size of the full training split.
ds = load_kasa_regression()
n_train = len(ds["train"])

# Undersample the training split to `train_undersample_ratio` of its rows.
# Indices are drawn WITHOUT replacement so the subset holds no duplicate rows
# (random.randint in a loop repeats indices), and from a dedicated seeded
# generator so Restart & Run All selects the same subset without touching the
# global `random` state.
UNDERSAMPLE_SEED = 0
# Clamp to [0, n_train]: random.sample raises ValueError outside that range.
n_keep = max(0, min(int(args.train_undersample_ratio * n_train), n_train))
rand_idxs = random.Random(UNDERSAMPLE_SEED).sample(range(n_train), n_keep)

ds["train"] = ds["train"].select(rand_idxs)

# Tokenize every split with the ChemBERTa tokenizer, then drop the raw text
# columns that were passed through `tokenize`.
tokenizer = AutoTokenizer.from_pretrained("DeepChem/ChemBERTa-77M-MTR")
tok_func = partial(tokenize, tokenizer=tokenizer)
ds = ds.map(tok_func, num_proc=8).remove_columns(["smiles", "inchi"])

# NOTE(review): pad_token is overridden only AFTER ds.map has run, so it affects
# the collator below but not `tokenize` -- confirm `tokenize` does no padding.
# Also confirm the override is needed at all: presumably this tokenizer already
# ships its own pad token.
tokenizer.pad_token = tokenizer.eos_token
# Pads each batch dynamically at collate time.
padding_collator = DataCollatorWithPadding(tokenizer)

c:\Users\rahul\mambaforge\envs\bio\lib\site-packages\astartes\samplers\extrapolation\scaffold.py:44: NoMatchingScaffold: No matching scaffold was found for the 3 molecules corresponding to indices {23234, 36709, 11879}


Map (num_proc=8):   0%|          | 0/3977 [00:00<?, ? examples/s]

Map (num_proc=8):   0%|          | 0/4972 [00:00<?, ? examples/s]

Map (num_proc=8):   0%|          | 0/4972 [00:00<?, ? examples/s]

In [3]:
from train import MultiContrastiveTrainer, initialize_train_dataloader

# Wrap the pretrained ChemBERTa encoder in the project's contrastive model and
# move it to the GPU.
model = MultiContrastiveModel(
    AutoModel.from_pretrained("DeepChem/ChemBERTa-77M-MTR")
).cuda()

train_dl = initialize_train_dataloader(ds['train'], model, padding_collator, args)

optimizer = AdamW(model.parameters(), lr=args.lr)
n_epochs = args.n_epochs
# Total number of optimizer steps -- assumes the trainer steps the scheduler
# once per batch; TODO confirm against MultiContrastiveTrainer.
n_steps = len(train_dl) * n_epochs
warmup_ratio = args.warmup_ratio
# Step counts must be integers.
n_warmup_steps = int(n_steps * warmup_ratio)
# `num_training_steps` is the TOTAL step count, warmup included. Passing
# n_steps * (1 - warmup_ratio) makes the cosine hit zero before training ends,
# after which the learning rate climbs again for the remaining steps.
scheduler = get_cosine_schedule_with_warmup(
    optimizer,
    num_warmup_steps=n_warmup_steps,
    num_training_steps=n_steps,
)

# NOTE(review): ds['test'] is handed to the trainer as its second dataset --
# confirm a held-out validation split should not be used here instead.
trainer = MultiContrastiveTrainer(model, optimizer, scheduler, train_dl, ds['test'], args)

Some weights of the model checkpoint at DeepChem/ChemBERTa-77M-MTR were not used when initializing RobertaModel: ['norm_mean', 'regression.out_proj.bias', 'norm_std', 'regression.dense.bias', 'regression.dense.weight', 'regression.out_proj.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at DeepChem/ChemBERTa-77M-MTR and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.

In [4]:
# Run training with the `trainer` built in the previous cell; that cell must
# have been executed in this kernel first.
trainer.train()

Epoch:   0%|          | 0/10 [00:00<?, ?it/s]

Train:   0%|          | 0/994 [00:00<?, ?it/s]

Train:   0%|          | 0/994 [00:00<?, ?it/s]