In [11]:
# Impordid
import torch
from estnltk import Text
from estnltk.corpus_processing.parse_enc import parse_enc_file_iterator
from src.transformers import BertTokenizer, BertForMaskedLM, BertConfig, Trainer, TrainingArguments
import numpy as np
import os
from tqdm import tqdm

In [12]:
# Tokeniseerija
tokenizer = BertTokenizer(vocab_file = "vocab_final.txt", vocab_file_form = "vocab_form.txt")

In [13]:
%%time
# Korpus
# Loeme tekstid sisse, laseme estnltk-l laused leida ning moodustame lausetest treening- ja testhulg
# https://github.com/estnltk/estnltk/blob/main/tutorials/corpus_processing/importing_text_objects_from_corpora.ipynb

input_folder = "korpus"
korpus = []

for filename in os.listdir(input_folder):
    file_path = os.path.join(input_folder, filename)

    n = 1 # Mitu teksti korpusesse lugeda
    l = 0
    for text_obj in parse_enc_file_iterator(file_path):
        korpus.append(text_obj)
        if l > n:
            break
        l += 1

train =  korpus[:int(0.6*len(korpus))]
val =  korpus[int(0.6*len(korpus)):int(0.8*len(korpus))]
test = korpus[int(0.8*len(korpus)):]

train_laused = []
for tekst in train:
    for span in tekst.original_sentences:
        train_laused.append(tekst.text[span.start:span.end])

val_laused = []
for tekst in val:
    for span in tekst.original_sentences:
        val_laused.append(tekst.text[span.start:span.end])

test_laused = []
for tekst in test:
    for span in tekst.original_sentences:
        test_laused.append(tekst.text[span.start:span.end])

CPU times: total: 9.75 s
Wall time: 9.84 s


In [19]:
korpus

[Text(text='Tallinna linn algatab Paldiski maantee ääres Hotell Tallinna kõrval asuva suure vundamendiaugu ja tühermaa detailplaneeringu koostamise, ehitustööde alustamist takistavad aga ala segased omandisuhted. Piirkonnaarhitekt Alice Laanemägi ütles, et OÜ Maranello Vara on taotlenud linnalt selle ala detailplaneeringu koostamist, et ehitada sinna tulevikus Grand Hotel Tallinna laiendus, veepark ja parkla. Ehitisi planeeritakse kaks, korruseid 20 ja krundi täisehitamise protsendiks 80. Laanemägi lisas, et taotlus on küll olemas, kuid planeeringu koostamist ei ole veel jõutud alustada. OÜ Maranello Vara esindaja Olev Kasak ütles, et nad palusid küll linnal detailplaneering algatada, kuid võimalikust ehitamisest on veel ennatlik rääkida. Krundil on väga palju omanikke ja nende kõigiga tuleb ehitamise või maa ostmise üle läbi rääkida. Paraku ei ole firmal õnnestunud kõiki maaomanikke veel üles leida. AS Amerest Hotels kavatses sellele krundile 10 aastat tagasi ehitada Sheraton hotelli.

In [14]:
train_laused[0]

'Tallinna linn algatab Paldiski maantee ääres Hotell Tallinna kõrval asuva suure vundamendiaugu ja tühermaa detailplaneeringu koostamise, ehitustööde alustamist takistavad aga ala segased omandisuhted.'

In [15]:
# Dataseti loomine dataloaderi jaoks
class Dataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings
    def __len__(self):
        return self.encodings["input_ids"].shape[0]
    def __getitem__(self, i):
        return {key: tensor[i] for key, tensor in self.encodings.items()}

In [16]:
# Maskimine ja labelid
def mlm(tensor):
    labels = tensor.detach().clone()
    rand = torch.rand(tensor[:, :, 0].shape)
    mask_arr = (rand < 0.15) * (tensor[:, :, 0] > 5)
    for i in range(tensor[:, :, 0].shape[0]):
        selection = torch.flatten(mask_arr[i].nonzero()).tolist()
        selection_masked = torch.where(mask_arr[i] == 0)[0].tolist()
        tensor[i, selection] = 4 # [MASK] tokeni ID mõlemas vocabis
        labels[i, selection_masked] = -100 # Et mudel arvutaks lossi ainult masked tokenite pealt
        
    return tensor, labels

In [17]:
%%time
# Andmestiku ettevalmistamine treenimiseks või mudelis kasutamiseks
def prepare_data(data):
    input_ids = []
    mask = []
    labels = []
    
    sample = tokenizer(data, max_length = 8, padding = "max_length", truncation = True, return_tensors = "pt")
    new_ids, new_labels = mlm(sample.input_ids.detach().clone())
    input_ids.append(new_ids)
    mask.append(sample.attention_mask)
    labels.append(new_labels)
    
    input_ids = torch.cat(input_ids)
    mask = torch.cat(mask)
    labels = torch.cat(labels)
    
    encodings = {
    "input_ids" : input_ids,
    "attention_mask" : mask,
    "label_ids" : labels
    }
    
    dataset = Dataset(encodings)
    return dataset


# Dataloaderi loomine mudeli treenimiseks
train_dataset = prepare_data(train_laused)
val_dataset = prepare_data(val_laused)
test_dataset = prepare_data(test_laused)

CPU times: total: 35.6 s
Wall time: 35.7 s


In [18]:
train_dataset[0]

{'input_ids': tensor([[    2,     2],
         [13226,     1],
         [  183,    34],
         [ 1545,    53],
         [    1,     1],
         [    4,     4],
         [ 1465,     1],
         [    3,     3]]),
 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1]),
 'label_ids': tensor([[-100, -100],
         [-100, -100],
         [-100, -100],
         [-100, -100],
         [-100, -100],
         [1324,   30],
         [-100, -100],
         [-100, -100]])}

In [89]:
from sklearn.metrics import accuracy_score
def compute_metrics(p):
    pred, labels = p
    
    indeksid = np.where(labels[:, :, 0].flatten() != -100)[0]
    
    labels_lemma = labels[:, :, 0].flatten()[indeksid]
    labels_vorm = labels[:, :, 1].flatten()[indeksid]

    pred_lemma = np.take(np.argmax(pred[0], axis = 2).flatten(), indeksid)
    pred_vorm = np.take(np.argmax(pred[1], axis = 2).flatten(), indeksid)

    accuracy_lemma = accuracy_score(y_true=labels_lemma, y_pred=pred_lemma)
    accuracy_vorm = accuracy_score(y_true=labels_vorm, y_pred=pred_vorm)

    return {"accuracy_lemma": accuracy_lemma, "accuracy_vorm": accuracy_vorm}


In [93]:
%%time
# Mudeli treenimine

config = BertConfig(
    vocab_size = tokenizer.vocab_size,
    vocab_size_form = tokenizer.vocab_size_form,
    tie_word_embeddings = False
)

model = BertForMaskedLM(config)

training_args = TrainingArguments(
    output_dir='./train_results',
    per_device_train_batch_size=32,
    max_steps=100,
    learning_rate=1e-4,
    logging_steps=50,
    warmup_steps=10,
    save_steps=50,
    logging_dir='./train_logs',
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
)

trainer.train()

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
max_steps is given, it will override any value given in num_train_epochs
***** Running training *****
  Num examples = 6246
  Num Epochs = 1
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 100
  Number of trainable parameters = 166687180


Step,Training Loss
50,13.1093
100,11.8307


Saving model checkpoint to ./train_results\checkpoint-50
Configuration saved in ./train_results\checkpoint-50\config.json
Model weights saved in ./train_results\checkpoint-50\pytorch_model.bin
Saving model checkpoint to ./train_results\checkpoint-100
Configuration saved in ./train_results\checkpoint-100\config.json
Model weights saved in ./train_results\checkpoint-100\pytorch_model.bin


Training completed. Do not forget to share your model on huggingface.co/models =)




CPU times: total: 34min 2s
Wall time: 4min 54s


TrainOutput(global_step=100, training_loss=12.470031127929687, metrics={'train_runtime': 292.0066, 'train_samples_per_second': 10.959, 'train_steps_per_second': 0.342, 'total_flos': 39074543616000.0, 'train_loss': 12.470031127929687, 'epoch': 0.51})

In [63]:
dataloader_test = torch.utils.data.DataLoader(train_dataset, batch_size = 16)
kokku_labelid_vorm = np.array([-1])
kokku_labelid_lemma = np.array([-1])
kokku_ennustused_vorm = np.array([-1])
kokku_ennustused_lemma = np.array([-1])
loop = tqdm(dataloader_test, leave = True)

for batch in loop:
    input_ids = batch["input_ids"]
    labels = batch["label_ids"]
    
    output_lemma, output_vorm = model(input_ids).logits
    
    indeksid = np.where(input_ids[:, :, 0].flatten() == 4)
    labels_vorm = labels[:, :, 1].flatten()[indeksid].detach().numpy()
    labels_lemma = labels[:, :, 0].flatten()[indeksid].detach().numpy()
    ennustused_vorm = np.argmax(output_vorm.flatten(end_dim = 1)[indeksid].detach().numpy(), axis = 1)
    ennustused_lemma = np.argmax(output_lemma.flatten(end_dim = 1)[indeksid].detach().numpy(), axis = 1)
    
    kokku_labelid_vorm = np.concatenate((kokku_labelid_vorm, labels_vorm))
    kokku_labelid_lemma = np.concatenate((kokku_labelid_lemma, labels_lemma))
    
    kokku_ennustused_vorm = np.concatenate((kokku_ennustused_vorm, ennustused_vorm))
    kokku_ennustused_lemma = np.concatenate((kokku_ennustused_lemma, ennustused_lemma))

100%|████████████████████████████████████████████████████████████████████████████████| 391/391 [02:08<00:00,  3.04it/s]


In [67]:
kokku_labelid_vorm
kokku_labelid_lemma

array([  -1, 1324,   77, ..., 4915,  143,   51], dtype=int64)

In [68]:
kokku_ennustused_vorm
kokku_ennustused_lemma

array([-1,  7,  7, ...,  7,  7,  7], dtype=int64)

In [69]:
kokku_labelid_vorm == kokku_ennustused_vorm
kokku_labelid_lemma == kokku_ennustused_lemma

array([ True, False, False, ..., False, False, False])

In [72]:
sum(kokku_labelid_vorm == kokku_ennustused_vorm)/len(kokku_labelid_vorm)
sum(kokku_labelid_lemma == kokku_ennustused_lemma)/len(kokku_labelid_lemma)

0.07107790821771612