# Loading libraries

In [None]:
import torch
import pandas as pd
import numpy as np

# Preparing the training data

In [2]:
train=pd.read_csv("dataset.csv",header=None)

In [3]:
from tqdm.auto import tqdm      #Dividing the data into small txt files

text_data = []
file_count = 0

for sample in tqdm(train[0].astype(str)):
    text_data.append(sample)
    if len(text_data) == 10_000:
        # once we git the 10K mark, save to file
        with open(f'data/text_{file_count}.txt', 'w', encoding='utf-8') as fp:
            fp.write('\n'.join(text_data))
        text_data = []
        file_count += 1

with open(f'data/text_{file_count}.txt', 'w', encoding='utf-8') as fp:
    fp.write('\n'.join(text_data))

  0%|          | 0/89847 [00:00<?, ?it/s]

# Training the tokenizer from scratch

In [None]:
from pathlib import Path
from tokenizers import ByteLevelBPETokenizer

paths = [str(x) for x in Path("data").glob("*")]

# Initialize a tokenizer
tokenizer = ByteLevelBPETokenizer()

# Customize training
tokenizer.train(files=paths, min_frequency=2, special_tokens=["<s>", "<pad>","</s>","<unk>","<mask>",])

# Save files to disk
tokenizer.save_model("./model")

# Testing the tokenizer

In [5]:
from tokenizers.implementations import ByteLevelBPETokenizer
from tokenizers.processors import BertProcessing


tokenizer = ByteLevelBPETokenizer("./model/vocab.json","./model/merges.txt",)

tokenizer._tokenizer.post_processor = BertProcessing(("</s>", tokenizer.token_to_id("</s>")),("<s>", tokenizer.token_to_id("<s>")))

tokenizer.enable_truncation(max_length=512)

print(tokenizer.encode("slm sahbi").tokens)

['<s>', 'slm', 'Ġsahbi', '</s>']


In [6]:
from transformers import RobertaTokenizer

# initialize the tokenizer using the tokenizer we initialized and saved to file
tokenizer = RobertaTokenizer.from_pretrained('model', max_len=512)

file model\config.json not found


# Creating the needed classes

In [7]:
from torch.utils.data import Dataset

class dariBERToDataset(Dataset):
    def __init__(self, evaluate: bool = False):
        tokenizer = ByteLevelBPETokenizer(
            "./models/EsperBERTo-small/vocab.json",
            "./models/EsperBERTo-small/merges.txt",
        )
        tokenizer._tokenizer.post_processor = BertProcessing(
            ("</s>", tokenizer.token_to_id("</s>")),
            ("<s>", tokenizer.token_to_id("<s>")),
        )
        tokenizer.enable_truncation(max_length=512)
        # or use the RobertaTokenizer from `transformers` directly.

        self.examples = []

        src_files = Path("./data/").glob("*-eval.txt") if evaluate else Path("./data/").glob("*-train.txt")
        for src_file in src_files:
            print("🔥", src_file)
            lines = src_file.read_text(encoding="utf-8").splitlines()
            self.examples += [x.ids for x in tokenizer.encode_batch(lines)]

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, i):
        # We’ll pad at the batch level.
        return torch.tensor(self.examples[i])

# Preparing the batch, labels, masks and inputs for the training

In [8]:
batch = tokenizer(list(train[0].astype(str)), max_length=512, padding='max_length', truncation=True)

In [9]:
import torch

labels = torch.tensor([x for x in dict(batch)["input_ids"]])
mask = torch.tensor([ x for x in dict(batch)["attention_mask"]])

In [10]:
# make copy of labels tensor, this will be input_ids
input_ids = labels.detach().clone()

# create random array of floats with equal dims to input_ids
rand = torch.rand(input_ids.shape)

# mask random 15% where token is not 0 [PAD], 1 [CLS], or 2 [SEP]
mask_arr = (rand < .15) * (input_ids != 0) * (input_ids != 1) * (input_ids != 2)

# loop through each row in input_ids tensor (cannot do in parallel)
for i in range(input_ids.shape[0]):
    # get indices of mask positions from mask array
    selection = torch.flatten(mask_arr[i].nonzero()).tolist()
    # mask input_ids
    input_ids[i, selection] = 3  # our custom [MASK] token == 3

	nonzero()
Consider using one of the following signatures instead:
	nonzero(*, bool as_tuple) (Triggered internally at  ..\torch\csrc\utils\python_arg_parser.cpp:882.)
  selection = torch.flatten(mask_arr[i].nonzero()).tolist()


In [11]:
class Dataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        # store encodings internally
        self.encodings = encodings

    def __len__(self):
        # return the number of samples
        return self.encodings['input_ids'].shape[0]

    def __getitem__(self, i):
        # return dictionary of input_ids, attention_mask, and labels for index i
        return {key: tensor[i] for key, tensor in self.encodings.items()}

In [12]:
encodings = {'input_ids': input_ids, 'attention_mask': mask, 'labels': labels}
dataset = Dataset(encodings)

In [13]:
loader = torch.utils.data.DataLoader(dataset, batch_size=2, shuffle=True)

# Initialising a Roberta model with parameters

In [1]:
from transformers import RobertaConfig

config = RobertaConfig(
    #vocab_size=10_000,  # we align this to the tokenizer vocab_size
    max_position_embeddings=512,
    hidden_size=768,
    num_attention_heads=12,
    num_hidden_layers=6,
    type_vocab_size=1
)

In [15]:
from transformers import RobertaForMaskedLM

model = RobertaForMaskedLM(config)

# Moving the package to the GPU if present

In [16]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# and move our model over to the selected device
model.to(device)

RobertaForMaskedLM(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=1)
      (position_embeddings): Embedding(512, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNor

# Activating the training mode and optimization

In [17]:
from transformers import AdamW

# activate training mode
model.train()
# initialize optimizer
optim = AdamW(model.parameters(), lr=1e-4)

# Finally ... the training (1h20mn per epoch using a 6gb rtx 3060 and about 100000 rows of data)

In [18]:
from tqdm.auto import tqdm

epochs = 3

for epoch in range(epochs):
    # setup loop with TQDM and dataloader
    loop = tqdm(loader, leave=True)
    for batch in loop:
        # initialize calculated gradients (from prev step)
        optim.zero_grad()
        
        # pull all tensor batches required for training
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        
        # process
        outputs= model(input_ids, attention_mask=attention_mask,labels=labels)
        # extract loss
        loss = outputs[0]
        
        # calculate loss for every parameter that needs grad update
        loss.backward()
        
        # update parameters
        optim.step()
        
        # print relevant info to progress bar
        loop.set_description(f'Epoch {epoch}')
        loop.set_postfix(loss=loss.item())

  0%|          | 0/44924 [00:00<?, ?it/s]

  0%|          | 0/44924 [00:00<?, ?it/s]

  0%|          | 0/44924 [00:00<?, ?it/s]

# Save the model and parameters

In [39]:
torch.save({
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optim.state_dict(),
            'loss': loss,
            }, 'model.pt')

# Load the model and tokenizer to continue training

In [7]:
from transformers import RobertaForMaskedLM
from transformers import AdamW
import torch

checkpoint = torch.load('model.pt',map_location=torch.device('cpu'))


model = RobertaForMaskedLM(config)
model.load_state_dict(checkpoint['model_state_dict'])
model.train()   #this activates the training mode

optim = AdamW(model.parameters(), lr=1e-4)
optim.load_state_dict(checkpoint['optimizer_state_dict'])

epoch = checkpoint['epoch']
loss = checkpoint['loss']

In [11]:
from transformers import RobertaTokenizer

tokenizer = RobertaTokenizer.from_pretrained('model', max_len=512)

file model\config.json not found


# Preparing the model for prediction

In [12]:
from transformers import pipeline

modelz = model.cpu()
fill_mask = pipeline("fill-mask", model=modelz, tokenizer=tokenizer)

In [13]:
result = fill_mask("wesh bik ya <mask>")
print(result)

[{'sequence': 'wesh bik ya', 'score': 0.9851618409156799, 'token': 1, 'token_str': '<pad>'}, {'sequence': 'wesh bik ya', 'score': 0.0018590508261695504, 'token': 0, 'token_str': '<s>'}, {'sequence': 'wesh bik ya', 'score': 0.0017879877705127, 'token': 2, 'token_str': '</s>'}, {'sequence': 'wesh bik ya ya', 'score': 0.00020806446264032274, 'token': 285, 'token_str': ' ya'}, {'sequence': 'wesh bik yarabi', 'score': 0.00012213866284582764, 'token': 340, 'token_str': 'rabi'}]
