# Train a RoBERTa model

In [1]:
import sys

# Put the parent directory on the module search path — presumably so that the
# `adna` package imported in the next cell resolves when the notebook is run
# from its own folder (TODO confirm against the repository layout).
sys.path.append('..')

In [2]:
import sqlite3
from contextlib import closing

import pandas as pd
import torch
from tokenizers import ByteLevelBPETokenizer
from tokenizers.processors import BertProcessing
from torch.utils.data import Dataset
from tqdm import tqdm
from transformers import (
    DataCollatorWithPadding,
    RobertaConfig,
    RobertaForSequenceClassification,
    RobertaTokenizerFast,
    Trainer,
    TrainingArguments,
)

from adna.pylib import consts

In [3]:
# Optimisation schedule.
TRAIN_EPOCHS = 20
LEARNING_RATE = 0.0001
WEIGHT_DECAY = 0.01

# Per-device batch sizes; training and validation use the same value.
TRAIN_BATCH_SIZE = VALID_BATCH_SIZE = 128

# NOTE(review): not referenced by any cell visible in this notebook.
SUMMARY_LEN = 7

## Load the tokenizer

In [4]:
# Load the fast RoBERTa tokenizer from the files stored under consts.SUB_DIR.
tokenizer_path = str(consts.SUB_DIR)
tokenizer = RobertaTokenizerFast.from_pretrained(
    pretrained_model_name_or_path=tokenizer_path,
)

## Build the training and validation datasets

In [5]:
def get_split(split):
    """Load every row of the `seqs` table that belongs to one data split.

    Args:
        split: Value matched against the `split` column (this notebook passes
            'train' and 'val').

    Returns:
        A pandas DataFrame holding all columns of the matching rows.
    """
    sql = 'select * from seqs where split = ?'
    # A bare `with sqlite3.connect(...)` only commits or rolls back the
    # transaction on exit; it never closes the connection. `closing` makes
    # sure the database handle is released once the frame has been read.
    with closing(sqlite3.connect(consts.SQL)) as cxn:
        return pd.read_sql(sql, cxn, params=[split])

In [6]:
def get_dataset(df, tokenizer):
    """Tokenize every sequence in a data frame and attach its label.

    Args:
        df: DataFrame with a `seq` column (the text to encode) and a `label`
            column.
        tokenizer: Tokenizer object exposing `encode_plus`.

    Returns:
        A list with one encoding per row, in row order. Each encoding carries
        an extra 'label' entry holding that row's label as a torch tensor.
    """
    encoded = []
    # Walk the two needed columns directly instead of `iterrows`, which builds
    # a full Series object for every row.
    for seq, label in zip(df['seq'], df['label']):
        data = tokenizer.encode_plus(
            seq,
            padding='max_length',
            # `padding='max_length'` only pads short inputs. Without explicit
            # truncation a longer sequence keeps its full length, producing
            # examples of different sizes.
            truncation=True,
            max_length=consts.MAX_LENGTH,
        )
        data['label'] = torch.tensor(label)
        encoded.append(data)
    return encoded

In [7]:
# Read the training rows first, then the validation rows, from the database.
train_seqs, eval_seqs = (get_split(name) for name in ('train', 'val'))

In [8]:
# Encode both splits with the same tokenizer (training first, then validation).
train_dataset, eval_dataset = (
    get_dataset(seqs, tokenizer) for seqs in (train_seqs, eval_seqs)
)

# Display one encoded training example as a sanity check.
train_dataset[1]

{'input_ids': [0, 261, 3750, 3180, 276, 2906, 3553, 713, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'label': tensor(0)}

## Build the model

In [9]:
# Model configuration; every setting not listed keeps the RobertaConfig default.
config = RobertaConfig(
    num_hidden_layers=6,  # number of encoder layers
    type_vocab_size=1,  # a single token-type id
    vocab_size=consts.VOCAB_SIZE,  # project-defined vocabulary size
)

In [10]:
# The model is built from a config rather than from pretrained weights, so all
# of its parameters are randomly initialised on this line. The Trainer applies
# its own seed only when it is constructed, which is after this point. Seed
# torch here so that a fresh "Restart & Run All" starts from the same weights.
# 42 mirrors the default `seed` of TrainingArguments.
torch.manual_seed(42)
model = RobertaForSequenceClassification(config=config)

In [11]:
# Show the model's parameter count as the cell output.
model.num_parameters()

46660610

In [12]:
# Batch collator that pads every example out to consts.MAX_LENGTH using the
# tokenizer loaded earlier in the notebook.
data_collator = DataCollatorWithPadding(
    max_length=consts.MAX_LENGTH,
    padding='max_length',
    tokenizer=tokenizer,
)

In [13]:
# Training configuration, driven by the hyperparameter constants defined in
# the configuration cell near the top of the notebook.
training_args = TrainingArguments(
    # The tokenizer cell converts consts.SUB_DIR with str() before handing it
    # to transformers; do the same here so `output_dir` is always a plain
    # string (a no-op if SUB_DIR is already a str).
    output_dir=str(consts.SUB_DIR),
    overwrite_output_dir=True,
    evaluation_strategy='epoch',  # evaluate once at the end of every epoch
    num_train_epochs=TRAIN_EPOCHS,
    learning_rate=LEARNING_RATE,
    weight_decay=WEIGHT_DECAY,
    per_device_train_batch_size=TRAIN_BATCH_SIZE,
    per_device_eval_batch_size=VALID_BATCH_SIZE,
    save_steps=8192,  # checkpoint interval, in optimisation steps
)

In [14]:
# Wire the model, training arguments, datasets, tokenizer and collator into a
# single Trainer instance.
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)

In [15]:
# Run the training loop using the Trainer configured in the previous cell.
trainer.train()

***** Running training *****
  Num examples = 10000
  Num Epochs = 2
  Instantaneous batch size per device = 256
  Total train batch size (w. parallel, distributed & accumulation) = 256
  Gradient Accumulation steps = 1
  Total optimization steps = 80


RuntimeError: CUDA out of memory. Tried to allocate 240.00 MiB (GPU 0; 7.80 GiB total capacity; 5.61 GiB already allocated; 196.44 MiB free; 5.67 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF