# Train a RoBERTa model

In [1]:
import sys

sys.path.append('..')

In [2]:
import sqlite3

import pandas as pd
import torch
from torch.utils.data import Dataset
from tqdm import tqdm
from tokenizers import ByteLevelBPETokenizer
from tokenizers.processors import BertProcessing
from transformers import (
    DataCollatorForLanguageModeling,
    RobertaConfig,
    RobertaForMaskedLM,
    Trainer,
    TrainingArguments,
)

from adna.pylib import consts

## Build the tokenizer

In [3]:
tokenizer = ByteLevelBPETokenizer(
    str(consts.SUB_DIR / "vocab.json"),
    str(consts.SUB_DIR / "merges.txt"),
    lowercase=True,
)

In [4]:
tokenizer.post_processor = BertProcessing(
    ("</s>", tokenizer.token_to_id("</s>")),
    ("<s>", tokenizer.token_to_id("<s>")),
)

## Build the training datasets

In [5]:
def get_split(split):
    with sqlite3.connect(consts.SQL) as cxn:
        sql = 'select * from seqs where split = ?'
        df = pd.read_sql(sql, cxn, params=[split])
    return df.seq.tolist()

In [6]:
class ADnaDataset(Dataset):
    step = 1024

    def __init__(self, seqs, tokenizer):
        self.input_ids = []

        for i in tqdm(range(0, len(seqs), self.step)):
            batch = tokenizer.encode_batch(seqs[i:i+self.step])
            for tokens in batch:
                self.input_ids.append(tokens.ids)

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return torch.tensor(self.input_ids[idx])

In [7]:
train_seqs = get_split('train')
eval_seqs = get_split('val')

In [8]:
train_dataset = ADnaDataset(train_seqs, tokenizer)
eval_dataset = ADnaDataset(eval_seqs, tokenizer)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6444/6444 [01:05<00:00, 97.91it/s]
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2148/2148 [00:21<00:00, 98.40it/s]


## Build the model

In [9]:
TRAIN_EPOCHS = 10
LEARNING_RATE = 1e-4
WEIGHT_DECAY = 0.01
TRAIN_BATCH_SIZE = 16
VALID_BATCH_SIZE = 8
SUMMARY_LEN = 7

In [10]:
config = RobertaConfig(
    vocab_size=consts.VOCAB_SIZE,
    max_position_embeddings=consts.SEQ_LENGTH,
    num_hidden_layers=6,
    type_vocab_size=1,
)

In [11]:
model = RobertaForMaskedLM(config=config)

In [12]:
model.num_parameters()

47028104

In [13]:
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15,
)

AttributeError: 'ByteLevelBPETokenizer' object has no attribute 'mask_token'

In [None]:
training_args = TrainingArguments(
    output_dir=consts.SUB_DIR,
    overwrite_output_dir=True,
    evaluation_strategy='epoch',
    num_train_epochs=TRAIN_EPOCHS,
    learning_rate=LEARNING_RATE,
    weight_decay=WEIGHT_DECAY,
    per_device_train_batch_size=TRAIN_BATCH_SIZE,
    per_device_eval_batch_size=VALID_BATCH_SIZE,
    save_steps=8192,
    save_total_limit=1,
)

In [None]:
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)

In [None]:
trainer.train()