In [1]:
import os
import random
import time
from datetime import timedelta

import numpy as np
import torch
from torch.utils.data import DataLoader, TensorDataset, random_split
from transformers import RobertaConfig, RobertaForMaskedLM

# Set TOKENIZERS_PARALLELISM to "false" for this process (the variable is
# consulted by the Hugging Face `tokenizers` library).
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Raise the element-count threshold (to 2000) at which NumPy arrays and torch
# tensors switch from a full printout to a summarized repr.
np.set_printoptions(threshold=2000)
torch.set_printoptions(threshold=2000)

# Seed every RNG this notebook touches (torch, Python's `random`, NumPy) from
# a single value so that a fresh Restart & Run All is repeatable.
seed = 42
torch.manual_seed(seed)
random.seed(seed)
np.random.seed(seed)

# Request deterministic cuDNN kernels and turn off the cuDNN benchmark mode.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

In [2]:
# Report whether PyTorch can see a CUDA device before the model and Trainer are set up.
print(torch.cuda.is_available())

True


In [3]:
# SECURITY: torch.load unpickles Python objects, so only load dataset files
# that come from a trusted source.
# weights_only=False is passed explicitly because the file holds a pickled
# dataset object (it is indexed, measured with len() and passed to
# random_split later in this notebook; presumably a TensorDataset), which the
# restricted weights-only loader that is the default since PyTorch 2.6
# refuses to unpickle.
tds = torch.load("data/tensor_dataset_12M.pth", weights_only=False)

In [4]:
# Peek at the first sample to check its layout: a tuple of three tensors, which
# CustomDataCollator reads as (input_ids, attention_mask, labels) by position.
tds[0]

(tensor([   0,    4,  142, 1951, 1394,  877, 5032, 2466, 2762, 3065,    2,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1]),
 tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
 tensor([-100,   16, -100, -100, -100, -100, -100, 

In [5]:
class CustomDataCollator:
    """Collate positional (input_ids, attention_mask, labels) samples into a batch dict."""

    # Output keys, listed in the order the fields appear inside each sample.
    _FIELD_NAMES = ("input_ids", "attention_mask", "labels")

    def __call__(self, features):
        """Stack each field across all samples.

        Args:
            features: sequence of samples; element 0, 1 and 2 of every sample
                are the tensors for input_ids, attention_mask and labels.

        Returns:
            dict mapping "input_ids", "attention_mask" and "labels" to the
            result of torch.stack over that field of every sample.
        """
        return {
            name: torch.stack([sample[position] for sample in features])
            for position, name in enumerate(self._FIELD_NAMES)
        }


# Single collator instance, passed to the Trainer as its data_collator.
custom_dc = CustomDataCollator()

In [6]:
# Total number of samples in the loaded dataset.
print(len(tds))

12710183


In [7]:
# Fraction of samples used for training; the remainder is held out for evaluation.
TRAIN_FRACTION = 0.8

train_size = int(TRAIN_FRACTION * len(tds))
eval_size = len(tds) - train_size

# Split with a dedicated, explicitly seeded generator instead of the global
# torch RNG: the partition then does not depend on how many random numbers
# were drawn earlier in the session, so re-running this cell yields the same
# train/eval membership.
split_generator = torch.Generator().manual_seed(seed)
train_dataset, eval_dataset = random_split(
    tds, [train_size, eval_size], generator=split_generator
)

# Every sample must land in exactly one of the two subsets.
assert len(train_dataset) + len(eval_dataset) == len(tds)
print(len(train_dataset), len(eval_dataset))

10168146 2542037


In [8]:
# NOTE: despite its name, EMBED_SIZE is passed to RobertaConfig as
# max_position_embeddings in this notebook; it is not the hidden size.
EMBED_SIZE = 128

# Checkpoint directory, tagged with EMBED_SIZE.
model_out_dir = f"./mlm_checkpoints/CoV-RoBERTa_{EMBED_SIZE}"

# exist_ok=True makes the cell safe to re-run and avoids the check-then-create
# race; it still raises if the path exists but is not a directory.
os.makedirs(model_out_dir, exist_ok=True)

In [9]:
# RobertaConfig reference:
# https://huggingface.co/docs/transformers/model_doc/roberta#transformers.RobertaConfig

# Architecture hyper-parameters; each trailing note gives the library default
# for comparison.
roberta_config_kwargs = {
    "vocab_size": 10000,                    # defaults to 50265
    "hidden_size": 768,                     # defaults to 768
    "max_position_embeddings": EMBED_SIZE,  # defaults to 512
    "num_attention_heads": 12,              # defaults to 12
    "num_hidden_layers": 6,                 # defaults to 12
    "type_vocab_size": 1,                   # defaults to 2
}
config = RobertaConfig(**roberta_config_kwargs)

In [10]:
# Build the masked-LM model directly from `config` (constructed from the config
# object here rather than loaded through from_pretrained).
model = RobertaForMaskedLM(config=config)

In [11]:
# Parameter count of the model, as reported by its num_parameters() method.
print(model.num_parameters())

50909968


In [12]:
from transformers import Trainer, TrainingArguments

# https://huggingface.co/docs/transformers/v4.35.0/en/main_classes/trainer#transformers.TrainingArguments

# One per-device batch size shared by training and evaluation
# (original note: about 43 GB of GPU memory).
PER_DEVICE_BATCH_SIZE = 848

# Step counts in the comments below assume a single device and use the split
# sizes printed by the train/eval split cell (10168146 train, 2542037 eval).
training_args = TrainingArguments(
    report_to='tensorboard',
    optim='adamw_torch',
    output_dir=model_out_dir,
    # No eval_steps is given; the recorded training log shows an evaluation
    # every 1000 steps, i.e. at the logging_steps interval.
    evaluation_strategy='steps',
    overwrite_output_dir=True,
    num_train_epochs=1,
    per_device_train_batch_size=PER_DEVICE_BATCH_SIZE,  # ceil(10168146 / 848) = 11991 steps per epoch
    per_device_eval_batch_size=PER_DEVICE_BATCH_SIZE,   # ceil(2542037 / 848) = 2998 steps per evaluation
    save_steps=1000,
    save_total_limit=2,
    logging_steps=1000,
    prediction_loss_only=True,
    push_to_hub=False,
    # Reuse the notebook-wide `seed` so the Trainer cannot drift from the
    # seeding done in the first cell.
    seed=seed,
    data_seed=seed,
)

# Wire the model, the arguments, the positional-tuple collator and both
# dataset splits into a Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=custom_dc,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)

In [13]:
# Time the whole training run. A monotonic clock is used because the run lasts
# hours and wall-clock time (time.time) can jump when the system clock is
# adjusted, which would distort the measured duration.
start_time = time.monotonic()

trainer.train()

elapsed_time = time.monotonic() - start_time
# timedelta's string form renders the duration as H:MM:SS.ffffff.
formatted_time = str(timedelta(seconds=elapsed_time))

print(f"Elapsed time: {formatted_time}")

Step,Training Loss,Validation Loss
1000,0.8462,0.045501
2000,0.0441,0.031457
3000,0.0329,0.02657
4000,0.0282,0.024655
5000,0.0261,0.022686
6000,0.0242,0.021051
7000,0.0226,0.020579
8000,0.0209,0.019163
9000,0.0202,0.01814
10000,0.0192,0.017714


Elapsed time: 8:48:42.985185


In [14]:
# Run a final evaluation after training (presumably over the eval_dataset given
# to the Trainer); the returned metrics dict is this cell's displayed output.
trainer.evaluate()

{'eval_loss': 0.016910668462514877,
 'eval_runtime': 1437.048,
 'eval_samples_per_second': 1768.93,
 'eval_steps_per_second': 2.086,
 'epoch': 1.0}

In [15]:
# Persist the trained model. No path is passed, so this presumably writes to the
# Trainer's configured output_dir (model_out_dir) — confirm against the Trainer docs.
trainer.save_model()