In [None]:
# Remove TensorFlow; it is not needed in this notebook
!pip uninstall -y tensorflow
# Install `transformers` from the tip of its GitHub repository (unpinned)
!pip install git+https://github.com/huggingface/transformers
# Show which transformers / tokenizers versions ended up installed
!pip list | grep -E 'transformers|tokenizers'
# NOTE(review): the versions previously recorded here (transformers 2.11.0,
# tokenizers 0.8.0rc1) are stale. The saved output of this run shows
# tokenizers 0.10.3 being downloaded and a model config stamped with
# transformers 4.10.0.dev0. Consider pinning the install for reproducibility.

Found existing installation: tensorflow 2.5.0
Uninstalling tensorflow-2.5.0:
  Successfully uninstalled tensorflow-2.5.0
Collecting git+https://github.com/huggingface/transformers
  Cloning https://github.com/huggingface/transformers to /tmp/pip-req-build-y7hckfty
  Running command git clone -q https://github.com/huggingface/transformers /tmp/pip-req-build-y7hckfty
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
    Preparing wheel metadata ... [?25l[?25hdone
Collecting huggingface-hub==0.0.12
  Downloading huggingface_hub-0.0.12-py3-none-any.whl (37 kB)
Collecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 10.8 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.45-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 36.6 MB/

In [None]:
%%time
from pathlib import Path

from tokenizers import ByteLevelBPETokenizer

paths = "./CorpusUnidosQU.txt"

# Initialize a tokenizer
tokenizer = ByteLevelBPETokenizer()

# Customize training
tokenizer.train(files=paths, vocab_size=52_000, min_frequency=2, special_tokens=[
    "<s>",
    "<pad>",
    "</s>",
    "<unk>",
    "<mask>",
])

CPU times: user 46.1 s, sys: 1.02 s, total: 47.1 s
Wall time: 24.6 s


In [None]:
!mkdir quechuaBERT
tokenizer.save_model("quechuaBERT")

['quechuaBERT/vocab.json', 'quechuaBERT/merges.txt']

In [None]:
from tokenizers.implementations import ByteLevelBPETokenizer
from tokenizers.processors import BertProcessing

# Rebuild the tokenizer from the vocabulary and merge rules saved in ./quechuaBERT.
vocab_file = "./quechuaBERT/vocab.json"
merges_file = "./quechuaBERT/merges.txt"
tokenizer = ByteLevelBPETokenizer(vocab_file, merges_file)

In [None]:
# Look up the (token, id) pairs for the closing and opening markers.
sep_pair = ("</s>", tokenizer.token_to_id("</s>"))
cls_pair = ("<s>", tokenizer.token_to_id("<s>"))

# Wrap every encoded sequence as <s> ... </s> and cap it at 512 tokens.
tokenizer._tokenizer.post_processor = BertProcessing(sep_pair, cls_pair)
tokenizer.enable_truncation(max_length=512)

In [None]:
# Sanity check: show how a sample sentence is split into tokens by the trained tokenizer.
tokenizer.encode("allinllachu manan allinlla huk wasipita").tokens

['<s>',
 'allin',
 'llachu',
 'Ġmanan',
 'Ġallinlla',
 'Ġhuk',
 'Ġwasipi',
 'ta',
 '</s>']

In [None]:
# Check that a GPU is attached to this runtime (prints the driver's device table)
!nvidia-smi

Fri Aug  6 04:24:45 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.42.01    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   39C    P8     9W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
# Check that PyTorch can see the GPU; the cell displays the boolean result
import torch
torch.cuda.is_available()

True

In [None]:
from transformers import RobertaConfig

# Architecture hyper-parameters for the model to be trained. The vocabulary
# size matches the `vocab_size` the tokenizer was trained with.
model_hparams = dict(
    vocab_size=52_000,
    max_position_embeddings=514,
    num_attention_heads=12,
    num_hidden_layers=6,
    type_vocab_size=1,
)

config = RobertaConfig(**model_hparams)

In [None]:
import json

# Settings stored next to vocab.json / merges.txt in the tokenizer directory.
tokenizer_config = {"max_len": 512}

# Serialize first, then write the JSON text out in one go.
with open("./quechuaBERT/tokenizer_config.json", "w") as config_file:
    config_file.write(json.dumps(tokenizer_config))

In [None]:
from transformers import RobertaTokenizerFast

# Reload the trained tokenizer through the transformers API.
# `model_max_length` is the documented name of the length limit; the former
# `max_len` keyword is only honoured as a backward-compatibility fallback.
tokenizer = RobertaTokenizerFast.from_pretrained("./quechuaBERT", model_max_length=512)

file ./quechuaBERT/config.json not found
file ./quechuaBERT/config.json not found


In [None]:
from transformers import RobertaForMaskedLM

# Build the masked-LM model from `config` alone (no pretrained checkpoint is loaded here).
model = RobertaForMaskedLM(config=config)

In [None]:
model.num_parameters()
# => about 83.5 million parameters (83,504,416 in the recorded run)

83504416

In [None]:
%%time
from transformers import LineByLineTextDataset

# Build the training dataset from the corpus file using the tokenizer loaded above.
# NOTE(review): `block_size=128` presumably caps each example at 128 tokens —
# confirm against the LineByLineTextDataset implementation of the installed version.
dataset = LineByLineTextDataset(
    tokenizer=tokenizer,
    file_path="./CorpusUnidosQU.txt",
    block_size=128,
)



CPU times: user 18.5 s, sys: 529 ms, total: 19 s
Wall time: 12.2 s


In [None]:
from transformers import DataCollatorForLanguageModeling

# Collator that prepares batches for masked language modeling (`mlm=True`);
# `mlm_probability=0.15` is the masking rate handed to the collator.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15
)

In [None]:
from transformers import Trainer, TrainingArguments

# Training hyper-parameters. Output goes to ./quechuaBERT, the same directory
# that already holds the tokenizer files.
training_args = TrainingArguments(
    output_dir="./quechuaBERT",
    overwrite_output_dir=True,
    num_train_epochs=1,
    # `per_gpu_train_batch_size` is deprecated (the trainer warns about it on
    # every run); `per_device_train_batch_size` is its documented replacement
    # and yields the same batch size of 64 on a single GPU.
    per_device_train_batch_size=64,
    save_steps=10_000,       # checkpoint interval, in optimization steps
    save_total_limit=2,      # upper bound on checkpoints kept on disk
    prediction_loss_only=True,
)

# Wire the model, arguments, collator and dataset together. No evaluation
# dataset is supplied, so this trainer is set up for training only.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset,
)

In [None]:
%%time
# Run the training loop configured in `training_args`; `%%time` reports how long it takes.
trainer.train()

Using deprecated `--per_gpu_train_batch_size` argument which will be removed in a future version. Using `--per_device_train_batch_size` is preferred.
Using deprecated `--per_gpu_train_batch_size` argument which will be removed in a future version. Using `--per_device_train_batch_size` is preferred.
***** Running training *****
  Num examples = 202979
  Num Epochs = 1
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 1
  Total optimization steps = 3172
Using deprecated `--per_gpu_train_batch_size` argument which will be removed in a future version. Using `--per_device_train_batch_size` is preferred.


Step,Training Loss
500,8.1229
1000,7.2216
1500,6.8542
2000,6.5846
2500,6.4291
3000,6.3195




Training completed. Do not forget to share your model on huggingface.co/models =)




CPU times: user 49min 7s, sys: 37.6 s, total: 49min 45s
Wall time: 49min 55s


TrainOutput(global_step=3172, training_loss=6.884871108835612, metrics={'train_runtime': 2995.2851, 'train_samples_per_second': 67.766, 'train_steps_per_second': 1.059, 'total_flos': 4785535231586304.0, 'train_loss': 6.884871108835612, 'epoch': 1.0})

In [None]:
# Inspect the second-to-last log record; in the recorded run this is the
# step-3000 training-loss entry.
# NOTE(review): presumably the last record is the end-of-training summary — confirm.
trainer.state.log_history[-2]

{'epoch': 0.95,
 'learning_rate': 2.711223203026482e-06,
 'loss': 6.3195,
 'step': 3000}

In [None]:
# Write the trained model into ./quechuaBERT, the directory that already holds the tokenizer files.
trainer.save_model("./quechuaBERT")

Saving model checkpoint to ./quechuaBERT
Configuration saved in ./quechuaBERT/config.json
Model weights saved in ./quechuaBERT/pytorch_model.bin


In [None]:
from transformers import pipeline

# The trained weights and the tokenizer files live in the same directory,
# so the pipeline loads both from it.
model_dir = "./quechuaBERT"
fill_mask = pipeline("fill-mask", model=model_dir, tokenizer=model_dir)

loading configuration file ./quechuaBERT/config.json
Model config RobertaConfig {
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 6,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.10.0.dev0",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 52000
}

loading configuration file ./quechuaBERT/config.json
Model config RobertaConfig {
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,

In [None]:
# Ask the model for candidate tokens at the <mask> position.
fill_mask("allinllachu <mask> allinlla huk wasipita.")

[{'score': 0.23992203176021576,
  'sequence': 'allinllachu nisqaqa allinlla huk wasipita.',
  'token': 334,
  'token_str': ' nisqaqa'},
 {'score': 0.061005301773548126,
  'sequence': 'allinllachu, allinlla huk wasipita.',
  'token': 16,
  'token_str': ','},
 {'score': 0.028720015659928322,
  'sequence': "allinllachu' allinlla huk wasipita.",
  'token': 11,
  'token_str': "'"},
 {'score': 0.012927944771945477,
  'sequence': 'allinllachu kay allinlla huk wasipita.',
  'token': 377,
  'token_str': ' kay'},
 {'score': 0.01230092253535986,
  'sequence': 'allinllachu. allinlla huk wasipita.',
  'token': 18,
  'token_str': '.'}]

In [None]:
# Reference sentence: Runap wiñarquypa puriyninmantam rikuchin.
# Translation (from the Spanish gloss in the original notebook):
#   "It presents the degree of human development."
# The first word of the reference sentence ("Runap") is replaced by <mask> below.
fill_mask("<mask> wiñarquypa puriyninmantam rikuchin.")

[{'score': 0.04872283339500427,
  'sequence': 'Kay wiñarquypa puriyninmantam rikuchin.',
  'token': 664,
  'token_str': 'Kay'},
 {'score': 0.009029297158122063,
  'sequence': 'Chay wiñarquypa puriyninmantam rikuchin.',
  'token': 920,
  'token_str': 'Chay'},
 {'score': 0.007983014918863773,
  'sequence': 'San wiñarquypa puriyninmantam rikuchin.',
  'token': 817,
  'token_str': 'San'},
 {'score': 0.007923364639282227,
  'sequence': 'Huk wiñarquypa puriyninmantam rikuchin.',
  'token': 1267,
  'token_str': 'Huk'},
 {'score': 0.006985992658883333,
  'sequence': 'Kamasqa wiñarquypa puriyninmantam rikuchin.',
  'token': 1204,
  'token_str': 'Kamasqa'}]

In [None]:
# Connect to Google Drive from the Colab runtime (mounted at /gdrive)
from google.colab import drive
drive.mount('/gdrive')

Mounted at /gdrive
