In [1]:
import os
import sys
import warnings
warnings.filterwarnings("ignore")

sys.path.append(f"{os.getcwd()}/../")

In [2]:
from pathlib import Path
import pandas as pd
from tokenizers.implementations import ByteLevelBPETokenizer
from tokenizers.processors import BertProcessing

vocab_size = 52_000

In [3]:
paths = [str(x) for x in Path(f"{os.getcwd()}/../data/embedding").glob("**/corpus.txt")]
print(paths)

# Initialize a tokenizer
tokenizer = ByteLevelBPETokenizer()

# Customize training
tokenizer.train(files=paths, vocab_size=vocab_size, min_frequency=2, special_tokens=[
    "<s>",
    "<pad>",
    "</s>",
    "<unk>",
    "<mask>",
])

['/ssd/programas/WordEmbeddingPortugues/notebooks/../data/embedding/corpus.txt']


In [4]:
!mkdir BR_BERTo
# Save files to disk
tokenizer.save_model("BR_BERTo")

mkdir: não foi possível criar o diretório “BR_BERTo”: Arquivo existe


['BR_BERTo/vocab.json', 'BR_BERTo/merges.txt']

In [5]:
tokenizer = ByteLevelBPETokenizer(
    "./BR_BERTo/vocab.json",
    "./BR_BERTo/merges.txt",
)
tokenizer._tokenizer.post_processor = BertProcessing(
    ("</s>", tokenizer.token_to_id("</s>")),
    ("<s>", tokenizer.token_to_id("<s>")),
)
tokenizer.enable_truncation(max_length=512)

print(tokenizer.encode("gostei muito dessa ideia".lower()).tokens)

['<s>', 'gostei', 'Ġmuito', 'Ġdessa', 'Ġideia', '</s>']


In [6]:
# Check that PyTorch sees it
import torch
torch.cuda.is_available()

True

In [7]:
from transformers import RobertaConfig

config = RobertaConfig(
    vocab_size=vocab_size,
    max_position_embeddings=514,
    num_attention_heads=12,
    num_hidden_layers=6,
    type_vocab_size=1,
)

In [8]:
from transformers import RobertaForMaskedLM

model = RobertaForMaskedLM(config=config)

In [9]:
model.num_parameters()

84095008

In [10]:
# from nlp import load_dataset

# dataset = load_dataset("text", data_files=[f"{os.getcwd()}/../data/embedding/corpus.txt"])

In [11]:
# len(dataset["train"])
# dataset["train"][0]["text"]

In [12]:
# dt = pd.read_csv(f"{os.getcwd()}/../data/embedding/colab.txt0001", header=None, chunksize=1)
# len(dt)
# next(dt).to_numpy()[0][0]
# next(dt).to_numpy()[0][0]

In [13]:
from nlp import load_dataset
from transformers import RobertaTokenizerFast
from torch.utils.data import Dataset

def stream(file_path):
    with open(file_path, encoding="utf-8") as fh:
        for line in fh.readlines():
            yield line.strip()

class EsperantoDataset(Dataset):
    def __init__(self, tokenizer, file_path: str):
        self.tokenizer = tokenizer
        self.file_path = file_path
        self.dataset = pd.read_csv(self.file_path, header=None, iterator=True)

    def __len__(self):
        return 5258624 # fazer um wc -l para ver a qtde de linhas

    def preprocess(self, text):
        batch_encoding = self.tokenizer(str(text).strip(), add_special_tokens=True, truncation=True, max_length=64)
        return torch.tensor(batch_encoding["input_ids"])
    
    def __getitem__(self, i):
        example = self.preprocess(self.dataset.get_chunk(1).to_numpy()[0][0])
        return example

In [14]:
tokenizer = RobertaTokenizerFast.from_pretrained("./BR_BERTo", max_len=512)
dataset = EsperantoDataset(
    tokenizer=tokenizer,
    file_path=f"{os.getcwd()}/../data/embedding/corpus.txt"
)

# from transformers import LineByLineTextDataset

# dataset = LineByLineTextDataset(
#     tokenizer=tokenizer,
#     file_path=f"{os.getcwd()}/../data/embedding/colab.txt0001",
#     block_size=32,
# )

In [15]:
from transformers import DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15
)

In [16]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir="./BR_BERTo",
    overwrite_output_dir=True,
    num_train_epochs=1,
    per_gpu_train_batch_size=32,
    save_steps=10_000,
    save_total_limit=2,
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset,
    prediction_loss_only=True,
)

In [None]:
%%time
trainer.train()

Using deprecated `--per_gpu_train_batch_size` argument which will be removed in a future version. Using `--per_device_train_batch_size` is preferred.
Using deprecated `--per_gpu_train_batch_size` argument which will be removed in a future version. Using `--per_device_train_batch_size` is preferred.


HBox(children=(FloatProgress(value=0.0, description='Epoch', max=1.0, style=ProgressStyle(description_width='i…

HBox(children=(FloatProgress(value=0.0, description='Iteration', max=164332.0, style=ProgressStyle(description…

In [None]:
trainer.save_model("./BR_BERTo")

In [None]:
from transformers import pipeline

fill_mask = pipeline(
    "fill-mask",
    model="./BR_BERTo",
    tokenizer="./BR_BERTo", topk=7
)

In [None]:
fill_mask("eu gosto muito de <mask>")