In [1]:
# Impordid
from src.transformers.models.bert.tokenization_bert import BertTokenizer
from src.transformers.models.bert.modeling_bert import BertEmbeddings, BertForPreTraining, BertForMaskedLM
from src.transformers.data.datasets import TextDatasetForNextSentencePrediction
import torch
from estnltk import Text
from estnltk.corpus_processing.parse_enc import parse_enc_file_iterator
from src.transformers.models.bert.configuration_bert import BertConfig
from transformers import AdamW
from tqdm.auto import tqdm
import numpy as np

In [2]:
# Tokeniseerija
tokenizer = BertTokenizer(vocab_file = "vocab_final.txt", vocab_file_form = "vocab_form.txt")

In [3]:
%%time
# Korpus
# Loeme tekstid sisse, laseme estnltk-l laused leida ning moodustame lausetest treening- ja testhulg
# https://github.com/estnltk/estnltk/blob/main/tutorials/corpus_processing/importing_text_objects_from_corpora.ipynb
input_file = "korpus/estonian_nc17.vert.01"
n = 2 # Mitu teksti korpusesse lugeda
korpus = []
l = 0
for text_obj in parse_enc_file_iterator(input_file):
    korpus.append(text_obj)
    if l > n:
        break
    l += 1

    
train =  korpus[:int(0.8*len(korpus))]
test = korpus[int(0.8*len(korpus)):]

laused = []
for tekst in train:
    for span in tekst.original_sentences:
        laused.append(tekst.text[span.start:span.end])

CPU times: total: 78.1 ms
Wall time: 53 ms


In [4]:
# https://towardsdatascience.com/how-to-train-bert-aaad00533168
# NSP lausete loomine
laused_len = len(laused)

import random

lause_a = []
lause_b = []
label_nsp = []

for tekst in train:
    laused_tekst = [tekst.text[span.start:span.end] for span in tekst.original_sentences]
    laused_tekst_len = len(laused_tekst)
    if laused_tekst_len > 1:
        start = random.randint(0, laused_tekst_len-2)
        # 50/50 whether is IsNextSentence or NotNextSentence
        if random.random() >= 0.5:
            # this is IsNextSentence
            lause_a.append(laused_tekst[start])
            lause_b.append(laused_tekst[start+1])
            label_nsp.append(0)
        else:
            index = random.randint(0, laused_len-1)
            # this is NotNextSentence
            lause_a.append(laused_tekst[start])
            lause_b.append(laused[index])
            label_nsp.append(1)

In [5]:
lause_a

['Seejärel tekkisid ehitajal kohtuvaidlused ühe kinnistu õigusjärgse omanikuga ja kogu ehitus jäi seisma.',
 'Kinnistu suurus on 8745 ruutmeetrist, maa sihtotstarve on seni olnud riigikaitsemaa.',
 'EFÜ korraldab muusikaauhindade jagamist juba neljandat aastat.']

In [15]:
# Tokeniseerimine ja maskimine
inputs = tokenizer(lause_a, lause_b, max_length=128, truncation=True, padding='max_length', return_tensors='pt')
inputs['next_sentence_label'] = torch.LongTensor([label_nsp]).T
labels_mlm = inputs.input_ids.detach().clone()[:, :, 0]
rand = torch.rand(inputs.input_ids[:, :, 0].shape)
mask_arr = (rand < 0.15) * (inputs.input_ids[:, :, 0] > 5)
for i in range(inputs.input_ids[:, :, 0].shape[0]):
    selection = torch.flatten(mask_arr[i].nonzero()).tolist()
    selection_masked = torch.where(mask_arr[i] == 0)[0].tolist()
    inputs.input_ids[i, selection] = 4 # [MASK] tokeni ID mõlemas vocabis
    labels_mlm[i, selection_masked] = -100 # Et mudel arvutaks lossi ainult masked tokenite pealt
inputs["labels"] = labels_mlm

In [16]:
inputs.labels

tensor([[ -100,   603,  -100,  -100,  -100,  -100,  2301,  -100,   531,  -100,
          -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
          -100,  -100,  -100,  -100,  7296,  -100,  -100,  -100,  -100,  -100,
          -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
          -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
          -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
          -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
          -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
          -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
          -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
          -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
          -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
          -100,  -100,  -100,  -100,  -100,  -100,  

In [17]:
# Dataseti loomine dataloaderi jaoks
class UusDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings
    def __getitem__(self, i):
        return {key: tensor[i] for key, tensor in self.encodings.items()}
    def __len__(self):
        return len(self.encodings.input_ids)

In [18]:
dataloader_train = torch.utils.data.DataLoader(UusDataset(inputs), batch_size = 32, shuffle = True)

In [19]:
%%time
# Mudeli treenimine

config = BertConfig(
    vocab_size = tokenizer.vocab_size,
    vocab_size_form = tokenizer.vocab_size_form,
    tie_word_embeddings = False
)

model = BertForPreTraining(config)
CE = torch.nn.CrossEntropyLoss()
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

model.to(device)
optim = AdamW(model.parameters(), lr = 1e-4)
epochs = 5

for epoch in range(epochs):
    loop = tqdm(dataloader_train, leave = True)
    for batch in loop:
        optim.zero_grad()
        input_ids = batch['input_ids']
        token_type_ids = batch['token_type_ids']
        attention_mask = batch['attention_mask']
        next_sentence_label = batch['next_sentence_label']
        labels = batch['labels']
        outputs = model(input_ids, attention_mask=attention_mask,
                        token_type_ids=token_type_ids,
                        next_sentence_label=next_sentence_label,
                        labels=labels)
        loss = outputs.loss
        loss.backward()
        optim.step()
        loop.set_description(f"Epoch: {epoch}")
        loop.set_postfix(loss = loss.item())

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

CPU times: total: 2min 17s
Wall time: 18.8 s


In [None]:
output_lemma.logits.size()

In [None]:
# Testimine
dataloader_test = torch.utils.data.DataLoader(prepare_data(test), batch_size = 16)
kokku_labelid_vorm = np.array([-1])
kokku_labelid_lemma = np.array([-1])
kokku_ennustused_vorm = np.array([-1])
kokku_ennustused_lemma = np.array([-1])
#loop2 = tqdm(dataloader_test, leave = True) # Testandmed
loop2 = tqdm(dataloader_train, leave = True) # Treeningandmetel võiks accuracy tulla 1 kui epocheid piisavalt
for batch in loop2:
    input_ids = batch["input_ids"]
    labels_vorm = batch["labels_vorm"]
    labels_lemma = batch["labels_lemma"]
    
    output_lemma = model(input_ids).logits
    output_vorm = linear(output_lemma)
    indeksid = np.where(input_ids[:, :, 0].flatten() == 4)
    labels_vorm = labels_vorm.flatten()[indeksid].detach().numpy()
    labels_lemma = labels_lemma.flatten()[indeksid].detach().numpy()
    ennustused_vorm = np.argmax(output_vorm.flatten(end_dim = 1)[indeksid].detach().numpy(), axis = 1)
    ennustused_lemma = np.argmax(output_lemma.flatten(end_dim = 1)[indeksid].detach().numpy(), axis = 1)
    
    kokku_labelid_vorm = np.concatenate((kokku_labelid_vorm, labels_vorm))
    kokku_labelid_lemma = np.concatenate((kokku_labelid_lemma, labels_lemma))
    
    kokku_ennustused_vorm = np.concatenate((kokku_ennustused_vorm, ennustused_vorm))
    kokku_ennustused_lemma = np.concatenate((kokku_ennustused_lemma, ennustused_lemma))

In [None]:
kokku_labelid_vorm
kokku_labelid_lemma

In [None]:
kokku_ennustused_vorm
kokku_ennustused_lemma

In [None]:
sum(kokku_ennustused_vorm[1:] == kokku_labelid_vorm[1:])/len(kokku_labelid_vorm[1:])
sum(kokku_ennustused_lemma[1:] == kokku_labelid_lemma[1:])/len(kokku_labelid_lemma[1:])

In [None]:
#torch.save(model, "mudel.pth")