## Finetuning BETO

In this notebook, we check what happens when we fine-tune BETO with the masked language modeling (MLM) objective on the TASS tweets.

In [1]:
import os
from glob import glob
import pandas as pd

# Expose only GPUs 0 and 1 to this process. This is set here, before `torch` is
# imported in the next cell.
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1"


In [2]:
import torch
from transformers import BertForMaskedLM, BertTokenizerFast

# Base checkpoint: uncased BETO (Spanish BERT, whole-word masking).
model_name = 'dccuchile/bert-base-spanish-wwm-uncased'

device = "cuda" if torch.cuda.is_available() else "cpu"

# `num_labels` is a classification-head setting and has no role in a masked-LM model,
# so it is not passed here.
model = BertForMaskedLM.from_pretrained(model_name, return_dict=True)
model = model.to(device)

tokenizer = BertTokenizerFast.from_pretrained(model_name)
# Cap sequences at 128 tokens; the tokenization step truncates/pads to this length.
tokenizer.model_max_length = 128

In [3]:
from glob import glob

# Total number of tweet files to use, and how many of them are held out for evaluation.
num_files = 100
num_dev_files = 10

# glob() returns paths in arbitrary, filesystem-dependent order. Sorting first makes the
# selected subset and the train/dev split reproducible across runs and machines.
tweet_files = sorted(glob("../data/spanish_tweets/*.txt"))[:num_files]

# The last `num_dev_files` files form the dev set; everything before them is training data.
train_files = tweet_files[:-num_dev_files]
dev_files = tweet_files[-num_dev_files:]

In [4]:
%%time
from datasets import load_dataset, Features, Value


# Schema of the loaded data: a single string column named "text".
features = Features({"text": Value("string")})

# Which files feed each split.
data_files = {"train": train_files, "test": dev_files}

train_dataset, test_dataset = load_dataset(
    "text",
    data_files=data_files,
    split=["train", "test"],
    features=features,
)


Using custom data configuration default-51b94953e03ae680
Reusing dataset text (/home/jmperez/.cache/huggingface/datasets/text/default-51b94953e03ae680/0.0.0/e16f44aa1b321ece1f87b07977cc5d70be93d69b20486d6dacd62e12cf25c9a5)


CPU times: user 164 ms, sys: 44 ms, total: 208 ms
Wall time: 885 ms


In [5]:
batch_size = 32


def tokenize(batch):
    """Tokenize the ``text`` field of a batch of examples.

    Sequences are truncated and padded to the tokenizer's maximum length.
    """
    texts = batch['text']
    return tokenizer(texts, truncation=True, padding='max_length')


# Tokenize both splits eagerly, `batch_size` examples at a time.
map_kwargs = dict(batched=True, batch_size=batch_size)
train_dataset = train_dataset.map(tokenize, **map_kwargs)
test_dataset = test_dataset.map(tokenize, **map_kwargs)


HBox(children=(FloatProgress(value=0.0, max=4108.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=557.0), HTML(value='')))




What about the length of the tweets?

A maximum sequence length of 128 tokens should do the trick.

In [6]:
from transformers import DataCollatorForLanguageModeling

# Probability passed to the collator as `mlm_probability` (masked-LM objective).
mlm_probability = 0.15

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=mlm_probability,
)

In [8]:
from transformers import Trainer, TrainingArguments

# Name and output directory of the fine-tuned model.
model_name = "TwiBETO-general"
model_path = f"./{model_name}"

# Shared cadence for evaluation, checkpointing and logging.
# NOTE(review): with 131444 training examples (see `len(train_dataset)` in this notebook)
# and 4096 sequences per update, two epochs appear to give far fewer than 200 optimizer
# updates, so evaluation/logging/checkpointing may never trigger. Confirm and lower this
# value if intermediate metrics are wanted.
eval_steps = 200

lr = 1e-3

# Number of sequences accumulated (per device) before each optimizer update.
sequences_per_update = 4096

training_args = TrainingArguments(
    output_dir=model_path,
    overwrite_output_dir=True,
    num_train_epochs=2,
    evaluation_strategy="steps",
    eval_steps=eval_steps,
    learning_rate=lr,
    warmup_ratio=0.05,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    # Integer division: this is a step count and must be an int (`/` yielded 128.0).
    gradient_accumulation_steps=sequences_per_update // batch_size,
    save_steps=eval_steps,
    logging_steps=eval_steps,
    do_eval=True,
    save_total_limit=2,
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

In [9]:
# Number of examples in the training split.
len(train_dataset)

131444

In [10]:
%%time
# Run the MLM fine-tuning; the cell is timed by the `%%time` magic above.
trainer.train()



Step,Training Loss,Validation Loss


In [None]:
# Persist the fine-tuned model and its tokenizer side by side in `model_path`.
trainer.save_model(model_path)
tokenizer.save_pretrained(model_path)

## Checking the fill-mask predictions


In [None]:
from transformers import pipeline

# Baseline: the same *uncased* BETO checkpoint the fine-tuning started from. Using the
# cased checkpoint here would compare against a different model, and pairing its tokenizer
# with the fine-tuned weights would feed token ids from a different vocabulary.
model_name = 'dccuchile/bert-base-spanish-wwm-uncased'

# Directory this notebook writes the fine-tuned model and its tokenizer to.
# NOTE(review): adjust if the model has been moved elsewhere (e.g. ../models/).
twibeto_path = "./TwiBETO-general"

# Original BETO.
fill_mask_beto = pipeline(
    "fill-mask",
    model=model_name,
    tokenizer=model_name,
)

# Fine-tuned model, loaded together with the tokenizer that was saved alongside it.
fill_mask_twibeto = pipeline(
    "fill-mask",
    model=twibeto_path,
    tokenizer=twibeto_path,
)

In [None]:
# Probe sentence with a single masked position for the models to fill in.
phrase = "la capital de Rusia es [MASK]"

# No explicit candidate list is supplied to the pipelines.
targets = None
# Predictions of the fine-tuned model.
fill_mask_twibeto(phrase, targets=targets)

In [None]:
# Predictions of the baseline model on the same phrase, for comparison.
fill_mask_beto(phrase, targets=targets)

In [None]:
fill_mask_beto?