In [1]:
import sys

# Add the parent directory to the module search path; presumably this is what
# lets the `adna` package imported in the next cell resolve. The entry is a
# relative path, so it depends on the kernel's working directory.
sys.path.append("..")

In [2]:
from transformers import (
    DataCollatorWithPadding,
    RobertaForSequenceClassification,
    RobertaTokenizerFast,
    TrainingArguments,
)

from adna.pylib import consts
from adna.pylib.bpe_dataset import BPEDataset
from adna.pylib.weighted_trainer import WeightedTrainer

## The model to fine-tune

In [3]:
# Checkpoint directory that the fine-tuning run starts from; it is handed to
# `RobertaForSequenceClassification.from_pretrained` in the "Get the model" cell.
MODEL_PATH = consts.MT_DIR / "train" / "checkpoint-65868"

## Data augmentation parameters

In [4]:
# Augmentation rates. They are passed only to the training BPEDataset; the
# validation dataset is built without them.
# NOTE(review): presumably the per-sequence probability of reverse-complementing
# and the rate at which bases are replaced by "N" -- confirm against BPEDataset.
REV_COMP_RATE = 0.5
TO_N_RATE = 0.02

## Training parameters

In [5]:
# Hyper-parameters forwarded to TrainingArguments in the "Build the trainer" cell.
TRAIN_EPOCHS = 50
LEARNING_RATE = 3e-5
TRAIN_BATCH_SIZE = 192  # used as per_device_train_batch_size
EVAL_BATCH_SIZE = 192  # used as per_device_eval_batch_size

# NOTE(review): the recorded training output and the final checkpoint note in
# this notebook refer to ".../finetune2/", not ".../finetune/". Confirm which
# directory is intended before re-running: the trainer is configured with
# overwrite_output_dir=True, so an existing run in this directory may be
# overwritten.
MODEL_DIR = "finetune"  # Save checkpoints to this sub-directory of consts.MT_DIR

## Get the tokenizer

In [6]:
# The tokenizer is loaded from the top-level MT_DIR, not from the model
# checkpoint directory (MODEL_PATH). The path is converted to a plain string
# before being handed to `from_pretrained`.
tokenizer_path = str(consts.MT_DIR)
tokenizer = RobertaTokenizerFast.from_pretrained(tokenizer_path)

## Get the datasets

In [7]:
# Augmentation is applied to the training split only; the validation split is
# built with BPEDataset's defaults so evaluation sees unmodified sequences.
augmentation = {"rev_comp_rate": REV_COMP_RATE, "to_n_rate": TO_N_RATE}

train_dataset = BPEDataset("train", tokenizer, **augmentation)
eval_dataset = BPEDataset("val", tokenizer)

## Adjust weights

In [8]:
# Start from the weights computed by the training dataset (presumably per-class
# loss weights -- they are passed to WeightedTrainer below) and override them by
# hand. Work on a copy: assigning into `train_dataset.weights` directly would
# mutate the dataset's own array, so re-running this cell would print the
# already-overridden values instead of the computed ones.
WEIGHTS = train_dataset.weights.copy()
print(WEIGHTS)  # computed weights, shown before the manual override
WEIGHTS[0] = 1.0
# NOTE(review): the output recorded in this notebook shows array([1., 3.]),
# i.e. the saved run used 3.0 here, not 2.0. Confirm which value is intended.
WEIGHTS[1] = 2.0
WEIGHTS

[0.60151492 2.96269211]


array([1., 3.])

## Get the model

In [9]:
# Load the classifier from the local checkpoint at MODEL_PATH.
# local_files_only=True restricts the load to files already on disk (no
# download attempt).
model = RobertaForSequenceClassification.from_pretrained(
    MODEL_PATH, local_files_only=True
)

## Build the data collator

In [10]:
# Collate samples into batches, padding every sequence to the fixed
# consts.MAX_LENGTH (padding="max_length") rather than to the longest sequence
# in each batch.
data_collator = DataCollatorWithPadding(
    tokenizer=tokenizer,
    padding="max_length",
    max_length=consts.MAX_LENGTH,
)

## Build the trainer

In [11]:
# Training configuration: evaluate, log and save a checkpoint once per epoch so
# the best epoch can be picked afterwards from the per-epoch validation loss.
training_args = TrainingArguments(
    # `output_dir` is documented as a `str`; convert the Path explicitly, as is
    # already done for the tokenizer path.
    output_dir=str(consts.MT_DIR / MODEL_DIR),
    # Allow writing into an output directory that already has content.
    overwrite_output_dir=True,
    num_train_epochs=TRAIN_EPOCHS,
    learning_rate=LEARNING_RATE,
    per_device_train_batch_size=TRAIN_BATCH_SIZE,
    per_device_eval_batch_size=EVAL_BATCH_SIZE,
    evaluation_strategy="epoch",
    logging_strategy="epoch",
    save_strategy="epoch",
    seed=23,  # fixed seed so runs are repeatable
)

In [12]:
# WeightedTrainer takes the manually adjusted WEIGHTS as its first positional
# argument; the remaining keyword arguments are the model, training
# configuration, datasets, tokenizer and collator built in the cells above.
# NOTE(review): presumably WEIGHTS are used as class weights in the loss --
# confirm in adna.pylib.weighted_trainer.
trainer = WeightedTrainer(
    WEIGHTS,
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

## Train

In [13]:
# Run the fine-tuning loop. With the "epoch" strategies set in training_args,
# every epoch is evaluated, logged and checkpointed. The returned TrainOutput is
# displayed as the cell result.
trainer.train()

***** Running training *****
  Num examples = 287233
  Num Epochs = 50
  Instantaneous batch size per device = 192
  Total train batch size (w. parallel, distributed & accumulation) = 192
  Gradient Accumulation steps = 1
  Total optimization steps = 74850


Epoch,Training Loss,Validation Loss
1,0.1719,0.159248
2,0.1702,0.16212
3,0.1698,0.16504
4,0.1692,0.167536
5,0.1679,0.159004
6,0.1659,0.164339
7,0.1657,0.159902
8,0.163,0.15794
9,0.1633,0.160411
10,0.1614,0.160616


***** Running Evaluation *****
  Num examples = 95744
  Batch size = 192
Saving model checkpoint to ../data/UF46992/finetune2/checkpoint-1497
Configuration saved in ../data/UF46992/finetune2/checkpoint-1497/config.json
Model weights saved in ../data/UF46992/finetune2/checkpoint-1497/pytorch_model.bin
tokenizer config file saved in ../data/UF46992/finetune2/checkpoint-1497/tokenizer_config.json
Special tokens file saved in ../data/UF46992/finetune2/checkpoint-1497/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 95744
  Batch size = 192
Saving model checkpoint to ../data/UF46992/finetune2/checkpoint-2994
Configuration saved in ../data/UF46992/finetune2/checkpoint-2994/config.json
Model weights saved in ../data/UF46992/finetune2/checkpoint-2994/pytorch_model.bin
tokenizer config file saved in ../data/UF46992/finetune2/checkpoint-2994/tokenizer_config.json
Special tokens file saved in ../data/UF46992/finetune2/checkpoint-2994/special_tokens_map.json
***** Running Ev

TrainOutput(global_step=74850, training_loss=0.14911928960459983, metrics={'train_runtime': 36307.6046, 'train_samples_per_second': 395.555, 'train_steps_per_second': 2.062, 'total_flos': 6.91879669575e+16, 'train_loss': 0.14911928960459983, 'epoch': 50.0})

### It looks like epoch 46 is best

Checkpoint path: `../data/UF46992/finetune2/checkpoint-68862` (step 68862 = 46 epochs × 1497 steps per epoch).
