In [1]:
# Impordid
from src.transformers.models.bert.tokenization_bert import BertTokenizer
from src.transformers.models.bert.modeling_bert import BertEmbeddings, BertModel, BertForMaskedLM
import torch
from estnltk import Text
from estnltk.corpus_processing.parse_enc import parse_enc_file_iterator
from src.transformers.models.bert.configuration_bert import BertConfig
from transformers import AdamW
from tqdm.auto import tqdm
from transformers import pipeline
import numpy as np

In [2]:
# Tokeniseerija
tokenizer = BertTokenizer(vocab_file = "vocab.txt", vocab_file_form = "vocab_form.txt")

In [3]:
%%time
# Korpus
# Loeme tekstid sisse, laseme estnltk-l laused leida ning moodustame lausetest treening- ja testhulga
input_file = "estonian_nc17.vert"
n = 100 # Mitu teksti korpusesse lugeda
korpus = []
l = 0
for text_obj in parse_enc_file_iterator(input_file):
    korpus.append(text_obj.text)
    if l > n:
        break
    l += 1

tekst = Text(" ".join(korpus)).tag_layer()
laused = []
for span in tekst.sentences:
    laused.append(tekst.text[span.start:span.end])
train =  laused[:int(0.8*len(laused))]
test = laused[int(0.8*len(laused)):]

CPU times: total: 11.2 s
Wall time: 11.5 s


In [4]:
# Dataseti loomine dataloaderi jaoks
class Dataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings
    def __len__(self):
        return self.encodings["input_ids"].shape[0]
    def __getitem__(self, i):
        return {key: tensor[i] for key, tensor in self.encodings.items()}

In [5]:
# Maskimine
def mlm(tensor):
    rand = torch.rand(tensor[:, :, 0].shape)
    mask_arr = (rand < 0.15) * (tensor[:, :, 0] > 5) # 15% sõnadest maskitakse
    for i in range(tensor[:, :, 0].shape[0]):
        selection = torch.flatten(mask_arr[i].nonzero()).tolist()
        tensor[i, selection] = 4 # [MASK] tokeni ID mõlemas vocabis
    return tensor

In [6]:
# Andmestiku ettevalmistamine treenimiseks või mudelis kasutamiseks
def prepare_data(data):
    input_ids = []
    mask = []
    labels = []
    
    sample = tokenizer(data, max_length = 8, padding = "max_length", truncation = True, return_tensors = "pt")
    labels.append(sample.input_ids[:, :, 1]) # Labeliks võtame sõnavormi id
    mask.append(sample.attention_mask)
    input_ids.append(mlm(sample.input_ids.detach().clone()))
    
    input_ids = torch.cat(input_ids)
    mask = torch.cat(mask)
    labels = torch.cat(labels)
    
    encodings = {
    "input_ids" : input_ids,
    "attention_mask" : mask,
    "labels" : labels
    }
    
    dataset = Dataset(encodings)
    return dataset


# Dataloaderi loomine mudeli treenimiseks
dataloader_train = torch.utils.data.DataLoader(prepare_data(train), batch_size = 16, shuffle = True)

In [7]:
%%time
# Mudeli treenimine

config = BertConfig(
    vocab_size = tokenizer.vocab_size,
    vocab_size_form = tokenizer.vocab_size_form,
    tie_word_embeddings = False
)

model = BertForMaskedLM(config)
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

model.to(device)
optim = AdamW(model.parameters(), lr = 1e-4)
epochs = 3

for epoch in range(epochs):
    loop = tqdm(dataloader_train, leave = True)
    for batch in loop:
        optim.zero_grad()
        input_ids = batch["input_ids"].to(device)
        mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)
        outputs = model(input_ids, attention_mask = mask, labels = labels)
        loss = outputs.loss
        loss.backward()
        optim.step()

        loop.set_description(f"Epoch: {epoch}")
        loop.set_postfix(loss = loss.item())



  0%|          | 0/55 [00:00<?, ?it/s]

  0%|          | 0/55 [00:00<?, ?it/s]

  0%|          | 0/55 [00:00<?, ?it/s]

CPU times: total: 33min 10s
Wall time: 4min 10s


In [8]:
# Testimine

dataloader_test = torch.utils.data.DataLoader(prepare_data(test), batch_size = 16)

loop2 = tqdm(dataloader_test, leave = True)
for batch in loop2:
    input_ids = batch["input_ids"].to(device)
    outputs = model(input_ids)
    # Implementeerida täpsuse leidmine, kui muu kõik töötab

  0%|          | 0/14 [00:00<?, ?it/s]

In [9]:
outputs.logits[0]

tensor([[ 0.3600,  2.9466, 14.8121,  ...,  0.0407, -0.6803, -0.3687],
        [ 1.7117,  2.6505,  0.2391,  ..., -1.2733, -0.9795, -1.7471],
        [ 4.9232,  2.9751, -0.6083,  ..., -0.8428, -0.2051, -1.3303],
        ...,
        [ 2.6288,  3.2083,  2.5195,  ..., -0.9176, -1.8177, -1.1489],
        [ 2.1902,  2.3576, -0.0580,  ..., -1.1242, -0.4858, -0.7187],
        [ 2.4641,  1.6400,  0.1693,  ..., -0.6845, -1.1551, -0.1845]],
       grad_fn=<SelectBackward0>)

In [19]:
query = 'Tallinna [MASK] algatab Paldiski maantee ääres Hotell Tallinna kõrval asuva vundamendiaugu.'
inp = tokenizer(query,return_tensors='pt')
mask_loc = np.where(inp.input_ids.numpy()[0] == 4)[0].tolist()[0]
out = model(inp["input_ids"]).logits[0].detach().numpy()
predicted_tokens = np.argmax(out[mask_loc]).tolist()
tokenizer.decode(predicted_tokens)
# tokenizer.decode praegu 

'# # l a'

In [24]:
sisend = [id[0] for id in tokenizer('Tallinna [MASK] algatab Paldiski maantee ääres Hotell Tallinna kõrval asuva vundamendiaugu.').input_ids]
tokenizer.decode(sisend)

'[CLS] tallinna [MASK] [UNK] [UNK] maantee ääres hotell tallinna kõrval asuv [UNK]. [SEP]'

# Küsimused

1) Masked LM - õige lähenemine?
2) Kas peaks vocabi ka ise sisendi põhjal välja töötama? Ise arvan et jah, praegu päris paljud [UNK]
