In [257]:
# Impordid
from src.transformers.models.bert.tokenization_bert import BertTokenizer
from src.transformers.models.bert.modeling_bert import BertEmbeddings, BertModel, BertForMaskedLM
import torch
from estnltk import Text
from estnltk.corpus_processing.parse_enc import parse_enc_file_iterator
from src.transformers.models.bert.configuration_bert import BertConfig
from transformers import AdamW
from tqdm.auto import tqdm
from transformers import pipeline
import numpy as np

In [258]:
# Tokeniseerija
tokenizer = BertTokenizer(vocab_file = "vocab2.txt", vocab_file_form = "vocab_form.txt")

In [259]:
%%time
# Korpus
# Loeme tekstid sisse, laseme estnltk-l laused leida ning moodustame lausetest treening- ja testhulga
input_file = "estonian_nc17.vert"
n = 100 # Mitu teksti korpusesse lugeda
korpus = []
l = 0
for text_obj in parse_enc_file_iterator(input_file):
    korpus.append(text_obj.text)
    if l > n:
        break
    l += 1

tekst = Text(" ".join(korpus)).tag_layer()
laused = []
for span in tekst.sentences:
    laused.append(tekst.text[span.start:span.end])
train =  laused[:int(0.8*len(laused))]
test = laused[int(0.8*len(laused)):]

CPU times: total: 11.2 s
Wall time: 11.2 s


In [260]:
# Dataseti loomine dataloaderi jaoks
class Dataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings
    def __len__(self):
        return self.encodings["input_ids"].shape[0]
    def __getitem__(self, i):
        return {key: tensor[i] for key, tensor in self.encodings.items()}

In [261]:
sample = tokenizer(train[0:2], max_length = 8, padding = "max_length", truncation = True, return_tensors = "pt")
tensor = sample.input_ids.detach().clone()
labels = tensor.detach().clone()
rand = torch.rand(tensor[:, :, 0].shape)
mask_arr = (rand < 0.15) * (tensor[:, :, 0] > 5)
for i in range(tensor[:, :, 0].shape[0]):
    selection = torch.flatten(mask_arr[i].nonzero()).tolist()
    selection_masked = torch.where(mask_arr[i] == 0)[0].tolist()
    tensor[i, selection] = 4 # [MASK] tokeni ID mõlemas vocabis
    labels[i, selection_masked] = -100 # Et mudel arvutaks lossi ainult masked tokenite pealt

In [262]:
# Maskimine ja labelid
def mlm(tensor):
    labels = tensor.detach().clone()
    rand = torch.rand(tensor[:, :, 0].shape)
    mask_arr = (rand < 0.15) * (tensor[:, :, 0] > 5)
    for i in range(tensor[:, :, 0].shape[0]):
        selection = torch.flatten(mask_arr[i].nonzero()).tolist()
        selection_masked = torch.where(mask_arr[i] == 0)[0].tolist()
        tensor[i, selection] = 4 # [MASK] tokeni ID mõlemas vocabis
        labels[i, selection_masked] = -100 # Et mudel arvutaks lossi ainult masked tokenite pealt
        
    return tensor, labels

In [263]:
# Andmestiku ettevalmistamine treenimiseks või mudelis kasutamiseks
def prepare_data(data):
    input_ids = []
    mask = []
    labels = []
    
    sample = tokenizer(data, max_length = 8, padding = "max_length", truncation = True, return_tensors = "pt")
    new_ids, new_labels = mlm(sample.input_ids.detach().clone())
    input_ids.append(new_ids)
    mask.append(sample.attention_mask)
    labels.append(new_labels[:, :, 1]) # Labeliks võtame sõnavormi id
    
    
    input_ids = torch.cat(input_ids)
    mask = torch.cat(mask)
    labels = torch.cat(labels)
    
    encodings = {
    "input_ids" : input_ids,
    "attention_mask" : mask,
    "labels" : labels
    }
    
    dataset = Dataset(encodings)
    return dataset


# Dataloaderi loomine mudeli treenimiseks
dataloader_train = torch.utils.data.DataLoader(prepare_data(train), batch_size = 16, shuffle = True)

In [269]:
%%time
# Mudeli treenimine

config = BertConfig(
    vocab_size = tokenizer.vocab_size,
    vocab_size_form = tokenizer.vocab_size_form,
    tie_word_embeddings = False
)

model = BertForMaskedLM(config)
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

model.to(device)
optim = AdamW(model.parameters(), lr = 1e-4)
epochs = 10

for epoch in range(epochs):
    loop = tqdm(dataloader_train, leave = True)
    for batch in loop:
        optim.zero_grad()
        input_ids = batch["input_ids"].to(device)
        mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)
        outputs = model(input_ids, attention_mask = mask, labels = labels)
        loss = outputs.loss
        loss.backward()
        optim.step()

        loop.set_description(f"Epoch: {epoch}")
        loop.set_postfix(loss = loss.item())

  0%|          | 0/55 [00:00<?, ?it/s]

  0%|          | 0/55 [00:00<?, ?it/s]

  0%|          | 0/55 [00:00<?, ?it/s]

  0%|          | 0/55 [00:00<?, ?it/s]

  0%|          | 0/55 [00:00<?, ?it/s]

  0%|          | 0/55 [00:00<?, ?it/s]

  0%|          | 0/55 [00:00<?, ?it/s]

  0%|          | 0/55 [00:00<?, ?it/s]

  0%|          | 0/55 [00:00<?, ?it/s]

  0%|          | 0/55 [00:00<?, ?it/s]

CPU times: total: 1h 56min 38s
Wall time: 14min 36s


In [270]:
# Testimine
dataloader_test = torch.utils.data.DataLoader(prepare_data(train), batch_size = 16)
kokku_labelid = np.array([-1])
kokku_ennustused = np.array([-1])
loop2 = tqdm(dataloader_test, leave = True)
for batch in loop2:
    input_ids = batch["input_ids"].to(device)
    labels = batch["labels"]
    outputs = model(input_ids)
    indeksid = np.where(input_ids[:, :, 0].flatten() == 4)
    labelid = labels.flatten()[indeksid].detach().numpy()
    ennustused = np.argmax(outputs.logits.flatten(end_dim = 1)[indeksid].detach().numpy(), axis = 1)
    kokku_labelid = np.concatenate((labelid, kokku_labelid))
    kokku_ennustused = np.concatenate((ennustused, kokku_ennustused))

  0%|          | 0/55 [00:00<?, ?it/s]

In [271]:
sum(kokku_ennustused[:-1] == kokku_labelid[:-1])/len(kokku_labelid)

0.24963072378138848

In [268]:
batch

{'input_ids': tensor([[[    2,     2],
          [    1,    34],
          [    7,    90],
          [  309,    30],
          [    1,    30],
          [ 6357,    34],
          [    4,     4],
          [    3,     3]],
 
         [[    2,     2],
          [    1,    32],
          [  478,    82],
          [    1,    34],
          [   24,     1],
          [ 1953,     5],
          [   23,     1],
          [    3,     3]],
 
         [[    2,     2],
          [    1,    34],
          [  704,    90],
          [    1,    25],
          [ 1189,    56],
          [    1,     5],
          [  562,    35],
          [    3,     3]],
 
         [[    2,     2],
          [    1,    34],
          [  235,    53],
          [  544,    32],
          [27474,    30],
          [    8,     1],
          [ 4956,    30],
          [    3,     3]],
 
         [[    2,     2],
          [ 1969,    34],
          [   28,    90],
          [  477,     1],
          [    1,    35],
          [  

In [86]:
query = 'Mul [MASK] und'
inp = tokenizer(query,return_tensors='pt')
mask_loc = np.where(inp.input_ids.numpy()[0] == 4)[0].tolist()[0]
out = model(inp["input_ids"]).logits[0].detach().numpy()
predicted_tokens = np.argmax(out[mask_loc]).tolist()
tokenizer.decode(predicted_tokens, return_form = True)
# tokenizer.decode praegu


'b'

In [12]:
tokenizer.added_tokens_decoder

{}

In [None]:
model(inp["input_ids"]).logits[0]

In [None]:
sisend = [id[0] for id in tokenizer('Tallinna [MASK] algatab Paldiski maantee ääres Hotell Tallinna kõrval asuva vundamendiaugu.').input_ids]
tokenizer.decode(sisend)
tokenizer("Tere mina olen keegi")

# Küsimused

1) Masked LM - õige lähenemine? jah
2) Kas peaks vocabi ka ise sisendi põhjal välja töötama? Ise arvan et jah, praegu päris paljud [UNK] teee oma sõnastik  jah

fikseerida tühi vorm
 - tokeniseerija võiks suvalise lemma võtta
lausealgus... mida ta sellega teeb? ega ta seda tavalise sõnana ei käsitle?
vormi ja lemmat korraga
softmax/CE - ennustus tõenäosusteks
samas treenida vormi enne ja lemmat pärast samas mudelis
loss = CE(V) + CE(L) see ümber kirjutada
- attention_mask, mis see on? V: sisuliselt [CLS] lause [SEP] saavad 1 ja [PAD] saab 0
teha notebook, mis teeb kogu asja ühe lause peal...
stack tracing - iga meetodi erroriga välja kutsuda, et teada kus see tuli

claudia kittask, meelis perli, ilmselt pole abi...
teises branchis stack tracimine...