# Train a RoBERTa model

In [1]:
import sys

# Add the parent directory to the module search path. Presumably this is what
# lets the `adna` package imported below resolve when the notebook is run from
# its own folder -- confirm against the repository layout.
sys.path.append("..")

In [2]:
import sqlite3

import torch
from tokenizers import ByteLevelBPETokenizer
from torch import nn
from tqdm import tqdm
from transformers import (
    DataCollatorWithPadding,
    RobertaConfig,
    RobertaForSequenceClassification,
    RobertaTokenizerFast,
    Trainer,
    TrainingArguments,
)

from adna.pylib import consts
from adna.pylib.datasets import ADnaDataset

In [3]:
# --- Optimisation schedule ---
TRAIN_EPOCHS = 20  # full passes over the training split
LEARNING_RATE = 1e-4  # learning rate handed to the Trainer
WEIGHT_DECAY = 0.01  # weight decay handed to the Trainer

# --- Per-device batch sizes (training and validation use the same size) ---
TRAIN_BATCH_SIZE = VALID_BATCH_SIZE = 128

# NOTE(review): not referenced by any cell visible in this notebook section.
SUMMARY_LEN = 7

In [4]:
# Pick the device at runtime. `torch.cuda.is_available()` checks that a usable
# GPU/driver is actually present, whereas the deprecated `torch.has_cuda` flag
# only reports whether this PyTorch build was compiled with CUDA support.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

## Build the tokenizer

In [5]:
# The tokenizer files are read from the directory named by `consts.SUB_DIR`;
# the value is converted to a plain string before being handed to `from_pretrained`.
tokenizer_path = str(consts.SUB_DIR)
# Load the previously saved fast RoBERTa tokenizer from that directory.
tokenizer = RobertaTokenizerFast.from_pretrained(tokenizer_path)

## Build the training datasets

In [6]:
# Training split: the two rate arguments come from the project constants and are
# only passed for this split (presumably augmentation rates -- confirm in
# `ADnaDataset`).
train_dataset = ADnaDataset(
    "train",
    tokenizer,
    rev_comp_rate=consts.REV_COMP_RATE,
    to_n_rate=consts.TO_N_RATE,
)

# Validation split: built with the dataset's default settings.
eval_dataset = ADnaDataset("val", tokenizer)

In [7]:
# Sanity check: display one encoded training example (input ids, attention mask, label).
train_dataset[1]

{'input_ids': [0, 39, 312, 351, 617, 3186, 261, 346, 2994, 279, 527, 638, 370, 903, 2793, 1130, 296, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'label': tensor(1)}

In [8]:
# Number of examples in the training split.
len(train_dataset)

287233

## Build the model

In [9]:
# Model configuration: every setting not listed here keeps the RobertaConfig default.
config = RobertaConfig(
    num_hidden_layers=6,  # depth of the encoder stack
    type_vocab_size=1,  # a single token-type id
    vocab_size=consts.VOCAB_SIZE,  # vocabulary size taken from the project constants
)

In [10]:
# Build the classifier directly from the config (no `from_pretrained` call), so
# the weights start from a fresh initialisation rather than a checkpoint.
model = RobertaForSequenceClassification(config=config)

In [11]:
# Report the size of the model as a parameter count.
model.num_parameters()

46660610

In [12]:
# Batch collation: every example is padded to the fixed project-wide length
# (`consts.MAX_LENGTH`) rather than to the longest sequence in the batch.
data_collator = DataCollatorWithPadding(
    tokenizer=tokenizer,
    max_length=consts.MAX_LENGTH,
    padding="max_length",
)

## Create a trainer that handles class imbalance

In [13]:
# Class weights supplied by the training dataset, converted to a tensor and
# moved to the training device in one step.
weights = torch.tensor(train_dataset.weights()).to(DEVICE)

# Cross-entropy loss that scales each class's contribution by its weight.
LOSS_FN = nn.CrossEntropyLoss(weight=weights)

In [14]:
class WeightedTrainer(Trainer):
    """Trainer that scores batches with the class-weighted loss in ``LOSS_FN``."""

    def compute_loss(
        self, model, inputs, return_outputs=False, num_items_in_batch=None
    ):
        """Compute the weighted cross-entropy loss for one batch.

        Args:
            model: The model to run the forward pass with.
            inputs: Mapping of batch tensors; must contain a ``"labels"`` entry.
            return_outputs: When true, return ``(loss, outputs)`` instead of
                just the loss.
            num_items_in_batch: Accepted only so the signature stays compatible
                with newer ``transformers`` releases that pass this argument;
                it is ignored here.

        Returns:
            The loss tensor, or a ``(loss, outputs)`` tuple when
            ``return_outputs`` is true.
        """
        labels = inputs.get("labels")

        # Keep the labels out of the forward pass: otherwise the model also
        # computes its own unweighted loss, which is never used. A filtered
        # copy is built so the caller's ``inputs`` mapping is left untouched.
        features = {key: value for key, value in inputs.items() if key != "labels"}
        outputs = model(**features)
        logits = outputs.get("logits")

        # Flatten to (N, num_labels) logits against (N,) targets for the loss.
        loss = LOSS_FN(
            logits.view(-1, self.model.config.num_labels), labels.view(-1)
        )
        return (loss, outputs) if return_outputs else loss

In [15]:
training_args = TrainingArguments(
    # Checkpoint location; existing contents may be overwritten.
    output_dir=consts.SUB_DIR / "models",
    overwrite_output_dir=True,
    # Evaluate and save a checkpoint once per epoch.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    # Optimisation settings taken from the configuration cell.
    num_train_epochs=TRAIN_EPOCHS,
    learning_rate=LEARNING_RATE,
    weight_decay=WEIGHT_DECAY,
    # Batch sizes are per device.
    per_device_train_batch_size=TRAIN_BATCH_SIZE,
    per_device_eval_batch_size=VALID_BATCH_SIZE,
    # Fixed seed so that runs are repeatable.
    seed=23,
)

In [16]:
# Wire the model, data and batching together using the weighted-loss trainer.
trainer = WeightedTrainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)

In [17]:
# Run the full training loop. This is the long-running cell: the run recorded in
# the output below reports a train_runtime of about 34,000 seconds (~9.4 hours).
trainer.train()

***** Running training *****
  Num examples = 287233
  Num Epochs = 20
  Instantaneous batch size per device = 128
  Total train batch size (w. parallel, distributed & accumulation) = 128
  Gradient Accumulation steps = 1
  Total optimization steps = 44900


Epoch,Training Loss,Validation Loss
1,0.203,0.193107
2,0.1681,0.313494
3,0.2354,0.182642
4,0.1437,0.190099
5,0.1372,0.159235
6,0.1307,0.210067
7,0.1327,0.271269
8,0.1256,0.201695
9,0.1193,0.251597
10,0.1121,0.203365


***** Running Evaluation *****
  Num examples = 95744
  Batch size = 128
Saving model checkpoint to ../data/UF46992/models/checkpoint-2245
Configuration saved in ../data/UF46992/models/checkpoint-2245/config.json
Model weights saved in ../data/UF46992/models/checkpoint-2245/pytorch_model.bin
tokenizer config file saved in ../data/UF46992/models/checkpoint-2245/tokenizer_config.json
Special tokens file saved in ../data/UF46992/models/checkpoint-2245/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 95744
  Batch size = 128
Saving model checkpoint to ../data/UF46992/models/checkpoint-4490
Configuration saved in ../data/UF46992/models/checkpoint-4490/config.json
Model weights saved in ../data/UF46992/models/checkpoint-4490/pytorch_model.bin
tokenizer config file saved in ../data/UF46992/models/checkpoint-4490/tokenizer_config.json
Special tokens file saved in ../data/UF46992/models/checkpoint-4490/special_tokens_map.json
***** Running Evaluation *****
  Num examples 

TrainOutput(global_step=44900, training_loss=0.12180995303963234, metrics={'train_runtime': 33989.284, 'train_samples_per_second': 169.014, 'train_steps_per_second': 1.321, 'total_flos': 1.189031509942464e+17, 'train_loss': 0.12180995303963234, 'epoch': 20.0})

### It looks like the model isn't training

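Of the epochs shown above, epoch 5 has the lowest validation loss (0.159), so it looks like the best checkpoint so far. Path = `../data/UF46992/models/checkpoint-11225`

After that the training loss generally keeps decreasing while the validation loss never gets back below the epoch 5 value, i.e. we start to overfit.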