<a href="https://colab.research.google.com/github/rjzevallos/bert-quechua/blob/main/notebooks/LlamaRoBERTa.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [40]:
# Install `transformers` from master
!pip install git+https://github.com/huggingface/transformers
!pip list | grep -E 'transformers|tokenizers'

Collecting git+https://github.com/huggingface/transformers
  Cloning https://github.com/huggingface/transformers to /tmp/pip-req-build-6dx6ukdd
  Running command git clone -q https://github.com/huggingface/transformers /tmp/pip-req-build-6dx6ukdd
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
    Preparing wheel metadata ... [?25l[?25hdone
tokenizers                    0.10.3
transformers                  4.13.0.dev0


In [41]:
!git clone https://github.com/Llamacha/bert-quechua/

fatal: destination path 'bert-quechua' already exists and is not an empty directory.


In [42]:
%%time
from pathlib import Path

from tokenizers import ByteLevelBPETokenizer

#paths = "./cc100-quechua.txt"
paths = [str(x) for x in Path("./bert-quechua/data/").glob("**/*.txt")]

# Initialize a tokenizer
tokenizer = ByteLevelBPETokenizer()

# Customize training
tokenizer.train(files=paths, vocab_size=52_000, min_frequency=2, special_tokens=[
    "<s>",
    "<pad>",
    "</s>",
    "<unk>",
    "<mask>",
])

CPU times: user 50 s, sys: 1.19 s, total: 51.2 s
Wall time: 27 s


In [43]:
!mkdir tokenization
tokenizer.save_model("tokenization")

mkdir: cannot create directory ‘tokenization’: File exists


['tokenization/vocab.json', 'tokenization/merges.txt']

In [44]:
from tokenizers.implementations import ByteLevelBPETokenizer
from tokenizers.processors import BertProcessing


# Rebuild the byte-level BPE tokenizer from the two files saved under ./tokenization.
tokenizer = ByteLevelBPETokenizer(
    vocab="./tokenization/vocab.json",
    merges="./tokenization/merges.txt",
)

In [45]:
# Look up the (token, id) pairs used to wrap each encoded sequence:
# "</s>" is passed first and "<s>" second, matching the original call order.
sep_pair = ("</s>", tokenizer.token_to_id("</s>"))
cls_pair = ("<s>", tokenizer.token_to_id("<s>"))

# Install the post-processor on the underlying tokenizer object.
tokenizer._tokenizer.post_processor = BertProcessing(sep_pair, cls_pair)

# Cap encoded sequences at 512 tokens.
tokenizer.enable_truncation(max_length=512)

In [46]:
# Encode one sample sentence and display the resulting token strings.
sample_encoding = tokenizer.encode("allinllachu manan allinlla huk wasipita")
sample_encoding.tokens

['<s>',
 'allin',
 'llachu',
 'Ġmanan',
 'Ġallinlla',
 'Ġhuk',
 'Ġwasipi',
 'ta',
 '</s>']

In [47]:
import torch

# Ask PyTorch whether a CUDA device is usable in this runtime and display the answer.
cuda_ready = torch.cuda.is_available()
cuda_ready

True

In [48]:
from transformers import RobertaConfig

# Architecture settings for the model built later in the notebook.
# vocab_size repeats the 52,000 used when the tokenizer was trained.
config = RobertaConfig(
    num_hidden_layers=6,
    num_attention_heads=12,
    max_position_embeddings=514,
    type_vocab_size=1,
    vocab_size=52_000,
)

In [49]:
import json

# Minimal tokenizer configuration stored next to the vocabulary files.
tokenizer_config = dict(max_len=512)

# Serialize it to ./tokenization/tokenizer_config.json.
with open("./tokenization/tokenizer_config.json", "w") as fp:
    fp.write(json.dumps(tokenizer_config))

In [50]:
from transformers import RobertaTokenizerFast

# Load the files in ./tokenization through the transformers fast tokenizer.
# From this cell on, `tokenizer` refers to this object rather than the
# ByteLevelBPETokenizer created earlier.
tokenizer = RobertaTokenizerFast.from_pretrained(
    "./tokenization",
    max_len=512,
)

file ./tokenization/config.json not found
file ./tokenization/config.json not found


In [51]:
from transformers import RobertaForMaskedLM

# Construct the masked-language model directly from `config` (not from a checkpoint).
model = RobertaForMaskedLM(config)

In [52]:
# Display the number of parameters in the freshly built model.
parameter_count = model.num_parameters()
parameter_count

83504416

In [53]:
print(paths)

# Read every corpus file, putting a newline in front of each file's text.
# The pieces are collected in a list and joined once, which avoids re-copying an
# ever-growing string on each iteration. The explicit UTF-8 encoding keeps the
# result independent of the platform's default text encoding.
corpus_parts = []
for t in paths:
    with open(t, encoding="utf-8") as datasets:
        corpus_parts.append("\n" + datasets.read())
corpus = "".join(corpus_parts)

# Write the merged text to the single file the training dataset is built from.
with open("./corpus.txt", "w", encoding="utf-8") as cp:
    cp.write(corpus)

['bert-quechua/data/focus_2007.txt', 'bert-quechua/data/fundacion_aypanankuna_2008.txt', 'bert-quechua/data/normatividad_ana_2013.txt', 'bert-quechua/data/unesco_2020.txt', 'bert-quechua/data/microsoft_2021.txt', 'bert-quechua/data/Lectura-favorita-quechua-cusco-2019.txt', 'bert-quechua/data/lecturas-favoritas-quechua-chanka-2019.txt', 'bert-quechua/data/nanotecnologia_2016.txt', 'bert-quechua/data/poder_jucial_peru.txt', 'bert-quechua/data/fondo_monetario_internacional_2010.txt', 'bert-quechua/data/tatoeba.txt', 'bert-quechua/data/oscar-quz.txt', 'bert-quechua/data/wikimedia.txt', 'bert-quechua/data/camara_comercio_2008.txt', 'bert-quechua/data/tierra_vive_religion.txt', 'bert-quechua/data/dw_2019.txt', 'bert-quechua/data/amerindia_1999.txt', 'bert-quechua/data/gregorio_condori_mamani.txt', 'bert-quechua/data/focus_2008.txt', 'bert-quechua/data/cosude2009-2011.txt', 'bert-quechua/data/cc100-quechua.txt', 'bert-quechua/data/acuerdo_nacional_2014.txt', 'bert-quechua/data/que_community_2

In [54]:

from transformers import LineByLineTextDataset

# Build the training dataset from the merged corpus file, using the tokenizer
# loaded above and a block size of 128.
dataset = LineByLineTextDataset(
    file_path="./corpus.txt",
    block_size=128,
    tokenizer=tokenizer,
)



In [55]:
from transformers import DataCollatorForLanguageModeling

# Collator for the masked-language-modeling objective (mlm=True) with a
# masking probability of 0.15.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=0.15,
)

In [56]:
!mkdir LlamaRoBERTa

In [78]:
from transformers import Trainer, TrainingArguments

# Training run settings: one epoch, batch size 32 per device, output written to
# ./LlamaRoBERTa (overwriting what is there), a checkpoint every 10,000 steps and
# at most two checkpoints kept.
training_args = TrainingArguments(
    output_dir="./LlamaRoBERTa",
    overwrite_output_dir=True,
    per_device_train_batch_size=32,
    num_train_epochs=1,
    prediction_loss_only=True,
    save_total_limit=2,
    save_steps=10_000,
)

# Tie together the model, the run settings, the dataset, and the MLM collator.
trainer = Trainer(
    args=training_args,
    model=model,
    train_dataset=dataset,
    data_collator=data_collator,
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [79]:
import gc

# Force a garbage-collection pass before training and display the number
# reported by gc.collect().
unreachable_count = gc.collect()
unreachable_count

167

In [80]:
%%time
# Run training with the Trainer configured above. The call's return value is the
# cell's displayed output, and %%time reports how long the cell took.
trainer.train()

***** Running training *****
  Num examples = 144666
  Num Epochs = 1
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 4521


Step,Training Loss
500,8.0965
1000,7.4746
1500,7.2378
2000,7.0047
2500,6.7444
3000,6.6439
3500,6.5326
4000,6.4497
4500,6.4522




Training completed. Do not forget to share your model on huggingface.co/models =)




CPU times: user 55min 24s, sys: 24.9 s, total: 55min 49s
Wall time: 55min 45s


TrainOutput(global_step=4521, training_loss=6.957051061787612, metrics={'train_runtime': 3345.9186, 'train_samples_per_second': 43.237, 'train_steps_per_second': 1.351, 'total_flos': 2766187607838720.0, 'train_loss': 6.957051061787612, 'epoch': 1.0})

In [81]:
# Display the second-to-last entry of the trainer's log history.
log_history = trainer.state.log_history
log_history[-2]

{'epoch': 1.0,
 'learning_rate': 2.3224950232249503e-07,
 'loss': 6.4522,
 'step': 4500}

In [84]:
# Save the trained model into ./LlamaRoBERTa: that is the Trainer's output_dir and
# the directory the fill-mask pipeline loads its model from. Previously the model
# was written only to ./tokenization, so on a fresh run ./LlamaRoBERTa held no
# saved model for the pipeline to load.
trainer.save_model("./LlamaRoBERTa")
# Keep the original copy next to the tokenizer files as well; this also leaves a
# config.json in ./tokenization.
# NOTE(review): that config.json is presumably what lets the pipeline resolve the
# tokenizer class from ./tokenization — confirm before dropping this second save.
trainer.save_model("./tokenization")

Saving model checkpoint to ./tokenization
Configuration saved in ./tokenization/config.json
Model weights saved in ./tokenization/pytorch_model.bin


In [85]:
from transformers import pipeline

# Fill-mask inference pipeline: model read from ./LlamaRoBERTa,
# tokenizer read from ./tokenization.
fill_mask = pipeline(
    "fill-mask",
    tokenizer="./tokenization",
    model="./LlamaRoBERTa",
)

loading configuration file ./LlamaRoBERTa/config.json
Model config RobertaConfig {
  "_name_or_path": "./LlamaRoBERTa",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 6,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.13.0.dev0",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 52000
}

loading configuration file ./LlamaRoBERTa/config.json
Model config RobertaConfig {
  "_name_or_path": "./LlamaRoBERTa",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classi

In [87]:
# Ask the pipeline for candidate fillers of the <mask> position and display them.
mask_predictions = fill_mask("Allinllachu <mask>")
mask_predictions

[{'score': 0.391947478055954,
  'sequence': 'Allinllachu.',
  'token': 18,
  'token_str': '.'},
 {'score': 0.08064036071300507,
  'sequence': 'Allinllachu?',
  'token': 35,
  'token_str': '?'},
 {'score': 0.07059300690889359,
  'sequence': 'Allinllachu pruwinsya',
  'token': 416,
  'token_str': ' pruwinsya'},
 {'score': 0.02550419233739376,
  'sequence': 'Allinllachu:',
  'token': 30,
  'token_str': ':'},
 {'score': 0.022942623123526573,
  'sequence': 'Allinllachu,',
  'token': 16,
  'token_str': ','}]