In [31]:
import numpy as np
import pandas as pd
import torch
import seaborn as sns

from chemberta.utils.molnet_dataloader import load_molnet_dataset
from datasets import load_metric
from transformers import RobertaTokenizerFast, RobertaForSequenceClassification, RobertaConfig, Trainer, TrainingArguments

from chemberta.utils.roberta_regression import RobertaForRegression

In [12]:
# Alternative (disabled): the Tox21 dataset restricted to its twelve listed tasks.
# tasks, (train_df, valid_df, test_df), transformers = load_molnet_dataset("tox21", tasks_wanted=['NR-AR', 'NR-AR-LBD', 'NR-AhR', 'NR-Aromatase', 'NR-ER', 'NR-ER-LBD', 'NR-PPAR-gamma', 'SR-ARE', 'SR-ATAD5', 'SR-HSE', 'SR-MMP', 'SR-p53'], df_format="chemprop")

# Load the "delaney" dataset with a scaffold split, returned as train/valid/test
# dataframes in "chemprop" format (a `smiles` column followed by the task column).
# NOTE(review): this cell's output reports "'split' is deprecated. Use 'splitter'
# instead." -- presumably raised by a library that load_molnet_dataset calls;
# confirm the wrapper accepts `splitter` before renaming the keyword here.
tasks, (train_df, valid_df, test_df), transformers = load_molnet_dataset("delaney", split="scaffold", df_format="chemprop")

'split' is deprecated.  Use 'splitter' instead.


Using tasks ['measured log solubility in mols per litre'] from available tasks for delaney: ['measured log solubility in mols per litre']


In [13]:
# Display the training split to check its columns (SMILES string + target value).
train_df

Unnamed: 0,smiles,measured log solubility in mols per litre
0,CC(C)=CCCC(C)=CC=O,0.390413
1,C=CCCC,0.090421
2,CCCCCCCCCCCCCC,-2.464346
3,CC(C)Cl,0.704920
4,CCC(C)CO,1.159746
...,...,...
897,CC(=O)OCC(=O)C1(O)CCC2C3CCC4=CC(=O)CCC4(C)C3C(...,-0.649881
898,c1ccc2nc3ccccc3cc2c1,-0.388598
899,Nc1cccc2nc3ccccc3cc12,-0.654719
900,C1CCCCCC1,-0.311180


In [14]:
# Load the pretrained SMILES tokenizer by its Hugging Face Hub identifier.
# `model_max_length` is the documented keyword for the sequence-length cap that
# `truncation=True` enforces; `max_len` is only a legacy alias for it.
tokenizer = RobertaTokenizerFast.from_pretrained('seyonec/SMILES_tokenized_PubChem_shard00_160k', model_max_length=512)

Special tokens have been added in the vocabulary, make sure the associated word embedding are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embedding are fine-tuned or trained.


In [15]:
# Tokenize the SMILES strings of every split the same way: truncate to the
# tokenizer's maximum length and pad each split to its own longest sequence.
train_encodings, valid_encodings, test_encodings = (
    tokenizer(split_df["smiles"].tolist(), truncation=True, padding=True)
    for split_df in (train_df, valid_df, test_df)
)

In [16]:
# Sanity check: number of tokenized training examples.
len(train_encodings["input_ids"])

902

In [17]:
# Load the configuration stored alongside the pretrained masked-LM checkpoint
# and display it for inspection.
# NOTE(review): this is a machine-specific absolute path; the notebook will not
# run elsewhere unless the checkpoint exists at exactly this location.
config = RobertaConfig.from_pretrained("/home/ubuntu/chemberta_models/mlm/sm_015/")
config

RobertaConfig {
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "is_gpu": true,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 515,
  "model_type": "roberta",
  "num_attention_heads": 2,
  "num_hidden_layers": 2,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.5.1",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 600
}

In [18]:
# Everything after the first (`smiles`) column is a target value; flatten the
# resulting 2-D block into a 1-D array of labels for each split.
train_labels, valid_labels, test_labels = (
    split_df.iloc[:, 1:].to_numpy().flatten()
    for split_df in (train_df, valid_df, test_df)
)

In [19]:
class MolNetDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with per-example labels.

    Args:
        encodings: Mapping from field name (e.g. ``input_ids``) to a sequence
            holding one entry per example.
        labels: Sequence of labels, indexed in step with ``encodings``.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """Return the number of examples, taken from the label sequence."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors, including ``labels``."""
        example = {}
        for field, values in self.encodings.items():
            example[field] = torch.tensor(values[idx])
        example['labels'] = torch.tensor(self.labels[idx])
        return example

# Wrap each split's encodings and labels in a MolNetDataset.
train_dataset, valid_dataset, test_dataset = (
    MolNetDataset(split_encodings, split_labels)
    for split_encodings, split_labels in (
        (train_encodings, train_labels),
        (valid_encodings, valid_labels),
        (test_encodings, test_labels),
    )
)

In [20]:
# Sanity check: the labels should form a flat 1-D array, one value per example.
train_labels.shape

(902,)

In [21]:
# Single regression target. The mean and standard deviation of the training
# labels are attached to the config as one-element lists.
config.num_labels = 1
config.norm_mean = [np.array(train_labels).mean(axis=0)]
config.norm_std = [np.array(train_labels).std(axis=0)]

In [40]:
# Initialise the regression model from the pretrained masked-LM checkpoint, using
# the config modified above. Per this cell's output, the checkpoint's `lm_head.*`
# weights are not used and the regression head / normalisation entries are newly
# initialised.
# NOTE(review): machine-specific absolute path; must match the path the config
# was loaded from.
model = RobertaForRegression.from_pretrained("/home/ubuntu/chemberta_models/mlm/sm_015/", config=config)

Some weights of the model checkpoint at /home/ubuntu/chemberta_models/mlm/sm_015/ were not used when initializing RobertaForRegression: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.decoder.bias']
- This IS expected if you are initializing RobertaForRegression from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForRegression from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForRegression were not initialized from the model checkpoint at /home/ubuntu/chemberta_models/mlm/sm_015/ and are newly initialized: ['norm_mean', 'norm_std', 'regression.dense.weight', 'regression

In [41]:
# Fine-tuning settings: one epoch, batches of 64 for both training and evaluation.
training_args = TrainingArguments(
    num_train_epochs=1,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    logging_steps=10,        # report the training loss every 10 steps
    logging_dir='./logs',    # where log files are written
    output_dir='./results',  # where run outputs are written
)

In [49]:
metric = load_metric("pearsonr")

def compute_metrics(eval_pred):
    """Compute the Pearson correlation between model outputs and labels.

    Args:
        eval_pred: A ``(predictions, labels)`` pair of arrays.

    Returns:
        The result of ``metric.compute`` on the flattened predictions and labels.
    """
    logits, labels = eval_pred
    # The metric scores one scalar per example, so both inputs must be flat 1-D
    # arrays; reshaping to (N, 1) columns hands it a list per example instead.
    return metric.compute(predictions=logits.reshape(-1), references=labels.reshape(-1))

Couldn't find file locally at pearsonr/pearsonr.py, or remotely at https://raw.githubusercontent.com/huggingface/datasets/1.6.2/metrics/pearsonr/pearsonr.py.
The file was picked from the master branch on github instead at https://raw.githubusercontent.com/huggingface/datasets/master/metrics/pearsonr/pearsonr.py.


In [58]:
# Assemble the Trainer from the model, the training arguments and the
# train/validation datasets defined above. Metric computation is left disabled;
# correlation is computed separately from the predictions later on.
trainer = Trainer(
    args=training_args,
    model=model,
    eval_dataset=valid_dataset,
    train_dataset=train_dataset,
#     compute_metrics=compute_metrics,
)

In [44]:
# Run fine-tuning; the returned TrainOutput summarises steps, loss and runtime.
# NOTE(review): the execution counts show this cell (In [44]) ran before the
# Trainer cell above was last executed (In [58]); restart and run all cells in
# order to confirm the recorded output is reproducible.
trainer.train()

Step,Training Loss
10,0.6453


TrainOutput(global_step=15, training_loss=0.5636265595753988, metrics={'train_runtime': 35.8605, 'train_samples_per_second': 0.418, 'total_flos': 8372078089452.0, 'epoch': 1.0, 'init_mem_cpu_alloc_delta': 0, 'init_mem_cpu_peaked_delta': 0, 'train_mem_cpu_alloc_delta': 467853312, 'train_mem_cpu_peaked_delta': 1497989120})

In [76]:
# Run the trained model over the validation split; the result's `.predictions`
# and `.label_ids` are used below.
predictions = trainer.predict(valid_dataset)

In [77]:
from scipy.stats import pearsonr

In [78]:
# Pearson correlation between validation predictions and true labels. Both arrays
# are flattened to 1-D first; the result is displayed as a two-value tuple.
pearsonr(predictions.predictions.flatten(), predictions.label_ids.flatten())

(0.5578841183758808, 1.369558395877492e-10)