In [None]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torchtext
from torchtext import data
from torchtext import datasets
from torchtext.vocab import Vectors
import spacy


import time
import random

In [None]:
def read_dataset(corpus, fields):
    with open(corpus, encoding='utf-8') as corpus:
        examples = []
        words = []
        pos_tags = []
        for line in corpus:
            if line[0] == '#': 
                continue            
            line = line.strip()
            if not line:
                examples.append(torchtext.data.Example.fromlist([words, pos_tags], fields))
                words = []
                pos_tags = []
            else:
                columns = line.split('\t')
                if '.' in columns[0] or '-' in columns[0]:
                    continue
                words.append(columns[1])
                pos_tags.append(columns[3])
        return torchtext.data.Dataset(examples, fields)

In [None]:
class BiLSTMPOSTagger(nn.Module):
    def __init__(self, 
                 input_dim, 
                 embedding_dim, 
                 hidden_dim, 
                 output_dim, 
                 n_layers, 
                 bidirectional, 
                 dropout, 
                 pad_idx):
        
        super().__init__()
        
        self.embedding = nn.Embedding(input_dim, embedding_dim, padding_idx = pad_idx)
        
        self.lstm = nn.LSTM(embedding_dim, 
                            hidden_dim, 
                            num_layers = n_layers, 
                            bidirectional = bidirectional,
                            dropout = dropout if n_layers > 1 else 0)
        
        self.fc = nn.Linear(hidden_dim * 2 if bidirectional else hidden_dim, output_dim)
        
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, text):
        
        embedded = self.dropout(self.embedding(text))
        
        outputs, (hidden, cell) = self.lstm(embedded)
        
        predictions = self.fc(self.dropout(outputs))
                
        return predictions

In [None]:
def init_weights(m):
    for name, param in m.named_parameters():
        nn.init.normal_(param.data, mean = 0, std = 0.1)

In [None]:
def categorical_accuracy(preds, y, tag_pad_idx):
    max_preds = preds.argmax(dim = 1, keepdim = True) 
    non_pad_elements = (y != tag_pad_idx).nonzero()
    correct = max_preds[non_pad_elements].squeeze(1).eq(y[non_pad_elements])
    return correct.sum() / torch.FloatTensor([y[non_pad_elements].shape[0]])

In [None]:
def train(model, iterator, optimizer, criterion, tag_pad_idx):
    epoch_loss = 0
    epoch_acc = 0
    model.train()
    for batch in iterator:
        text = batch.text
        tags = batch.udtags
        optimizer.zero_grad()
        predictions = model(text)
        predictions = predictions.view(-1, predictions.shape[-1])
        tags = tags.view(-1)
        loss = criterion(predictions, tags)     
        acc = categorical_accuracy(predictions, tags, tag_pad_idx)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
        epoch_acc += acc.item()  
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [None]:
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [None]:
def evaluate(model, iterator, criterion, tag_pad_idx):
    epoch_loss = 0
    epoch_acc = 0
    model.eval()
    with torch.no_grad():
        for batch in iterator:
            text = batch.text
            tags = batch.udtags
            predictions = model(text)
            predictions = predictions.view(-1, predictions.shape[-1])
            tags = tags.view(-1)
            loss = criterion(predictions, tags)
            acc = categorical_accuracy(predictions, tags, tag_pad_idx)
            epoch_loss += loss.item()
            epoch_acc += acc.item()
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [None]:
def build_model(dataset, embedding, text, tags, pretrained=None):
    
    text.build_vocab(dataset, 
                 min_freq = 2,
                 vectors = embedding,
                 unk_init = torch.Tensor.normal_)
    tags.build_vocab(dataset)

    model = BiLSTMPOSTagger(len(text.vocab), 
                        300, 
                        100, 
                        len(tags.vocab), 
                        2, 
                        bidirectional=True, 
                        dropout=0.25, 
                        pad_idx=text.vocab.stoi[text.pad_token])
    model.embedding.weight.data.copy_(text.vocab.vectors)
    model.embedding.weight.data[text.vocab.stoi[text.pad_token]] = torch.zeros(300)

    if pretrained:
        weights = torch.load(pretrained)
        del weights['embedding.weight']
        model.load_state_dict(weights, strict=False)
    else:
        model.apply(init_weights)
    
    return model

In [None]:
def train_model(model, name, data_train, data_dev, optimizer, criterion, batch_size=128, n_epochs=10):
    train_iterator, valid_iterator = data.BucketIterator.splits((data_train, data_dev), 
                                                                batch_size = batch_size, 
                                                                sort=False)
    best_valid_loss = float('inf')

    for epoch in range(n_epochs):

        start_time = time.time()
        
        train_loss, train_acc = train(model, train_iterator, optimizer, criterion, TAG_PAD_IDX)
        valid_loss, valid_acc = evaluate(model, valid_iterator, criterion, TAG_PAD_IDX)
        
        end_time = time.time()

        epoch_mins, epoch_secs = epoch_time(start_time, end_time)
        
        if valid_loss < best_valid_loss:
            best_valid_loss = valid_loss
            torch.save(model.state_dict(), name + '.pt')
        
        print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
        print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
        print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')    



---



---



In [None]:
!gunzip /content/drive/My\ Drive/diploma/cc.uk.300.vec.gz

In [None]:
SEED = 111

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

In [None]:
ru_vec_emb = '/content/drive/My Drive/diploma/cc.ru.300.vec'
uk_vec_emb = '/content/drive/My Drive/diploma/cc.uk.300.vec'
uk_vec = Vectors(name=uk_vec_emb)
ru_vec = Vectors(name=ru_vec_emb)

In [None]:
TEXT = data.Field(lower = True)
UD_TAGS = data.Field(unk_token = None)
fields = (("text", TEXT), ("udtags", UD_TAGS))

In [None]:
uk_train = read_dataset('/content/drive/My Drive/diploma/uk_iu-ud-train.conllu', fields)
uk_dev = read_dataset('/content/drive/My Drive/diploma/uk_iu-ud-dev.conllu.txt', fields)
uk_test = read_dataset('/content/drive/My Drive/diploma/uk_iu-ud-test.conllu.txt', fields)

ru_train = read_dataset('/content/drive/My Drive/diploma/ru_syntagrus-ud-train.conllu.txt', fields)
ru_dev = read_dataset('/content/drive/My Drive/diploma/ru_syntagrus-ud-dev.conllu.txt', fields)
ru_test = read_dataset('/content/drive/My Drive/diploma/ru_syntagrus-ud-test.conllu.txt', fields)

In [None]:
uk_train_low, _ = torchtext.data.Dataset.split(uk_train, 0.25, random_state=random.seed(SEED))

In [None]:
len(uk_train_low)

1374

## Train on ukrainian dataset

In [None]:
uk_model = build_model(uk_train, uk_vec, TEXT, UD_TAGS)

In [None]:
optimizer_uk = optim.Adam(uk_model.parameters())

TAG_PAD_IDX = UD_TAGS.vocab.stoi[UD_TAGS.pad_token]

criterion_uk = nn.CrossEntropyLoss(ignore_index = TAG_PAD_IDX)

In [None]:
train_model(uk_model, "ukrainian_full", uk_train, uk_dev, optimizer_uk, criterion_uk)

Epoch: 01 | Epoch Time: 1m 5s
	Train Loss: 2.125 | Train Acc: 33.06%
	 Val. Loss: 1.497 |  Val. Acc: 53.66%
Epoch: 02 | Epoch Time: 0m 51s
	Train Loss: 1.151 | Train Acc: 63.33%
	 Val. Loss: 0.778 |  Val. Acc: 73.96%
Epoch: 03 | Epoch Time: 0m 48s
	Train Loss: 0.660 | Train Acc: 78.47%
	 Val. Loss: 0.557 |  Val. Acc: 81.19%
Epoch: 04 | Epoch Time: 0m 46s
	Train Loss: 0.462 | Train Acc: 85.38%
	 Val. Loss: 0.481 |  Val. Acc: 83.90%
Epoch: 05 | Epoch Time: 0m 56s
	Train Loss: 0.376 | Train Acc: 88.13%
	 Val. Loss: 0.447 |  Val. Acc: 84.87%
Epoch: 06 | Epoch Time: 0m 52s
	Train Loss: 0.332 | Train Acc: 89.31%
	 Val. Loss: 0.458 |  Val. Acc: 84.58%
Epoch: 07 | Epoch Time: 0m 46s
	Train Loss: 0.301 | Train Acc: 90.27%
	 Val. Loss: 0.440 |  Val. Acc: 85.07%
Epoch: 08 | Epoch Time: 0m 47s
	Train Loss: 0.275 | Train Acc: 91.04%
	 Val. Loss: 0.440 |  Val. Acc: 85.28%
Epoch: 09 | Epoch Time: 0m 57s
	Train Loss: 0.251 | Train Acc: 91.83%
	 Val. Loss: 0.450 |  Val. Acc: 85.14%
Epoch: 10 | Epoch Ti

In [None]:
test_iterator = data.BucketIterator(uk_test, batch_size = 128, sort=False)

test_loss, test_acc = evaluate(uk_model, test_iterator, criterion_uk, TAG_PAD_IDX)

print(f'Test Loss: {test_loss:.3f} |  Test Acc: {test_acc*100:.2f}%')

Test Loss: 0.478 |  Test Acc: 83.73%


## Train on low-resource scenario, Ukrainian dataset

In [None]:
uk_model_low = build_model(uk_train_low, uk_vec, TEXT, UD_TAGS)

In [None]:
optimizer_uk_l = optim.Adam(uk_model_low.parameters())

TAG_PAD_IDX = UD_TAGS.vocab.stoi[UD_TAGS.pad_token]

criterion_uk_l = nn.CrossEntropyLoss(ignore_index = TAG_PAD_IDX)

In [None]:
train_model(uk_model_low, "ukrainian_low", uk_train_low, uk_dev, optimizer_uk_l, criterion_uk_l)

Epoch: 01 | Epoch Time: 0m 14s
	Train Loss: 2.531 | Train Acc: 23.07%
	 Val. Loss: 2.180 |  Val. Acc: 31.85%
Epoch: 02 | Epoch Time: 0m 17s
	Train Loss: 2.172 | Train Acc: 31.25%
	 Val. Loss: 1.966 |  Val. Acc: 38.90%
Epoch: 03 | Epoch Time: 0m 18s
	Train Loss: 1.929 | Train Acc: 38.32%
	 Val. Loss: 1.690 |  Val. Acc: 45.46%
Epoch: 04 | Epoch Time: 0m 16s
	Train Loss: 1.634 | Train Acc: 47.16%
	 Val. Loss: 1.395 |  Val. Acc: 57.45%
Epoch: 05 | Epoch Time: 0m 15s
	Train Loss: 1.346 | Train Acc: 57.72%
	 Val. Loss: 1.171 |  Val. Acc: 62.98%
Epoch: 06 | Epoch Time: 0m 13s
	Train Loss: 1.125 | Train Acc: 63.58%
	 Val. Loss: 1.015 |  Val. Acc: 66.93%
Epoch: 07 | Epoch Time: 0m 14s
	Train Loss: 0.963 | Train Acc: 68.47%
	 Val. Loss: 0.912 |  Val. Acc: 69.98%
Epoch: 08 | Epoch Time: 0m 12s
	Train Loss: 0.850 | Train Acc: 71.91%
	 Val. Loss: 0.841 |  Val. Acc: 71.31%
Epoch: 09 | Epoch Time: 0m 13s
	Train Loss: 0.768 | Train Acc: 74.57%
	 Val. Loss: 0.792 |  Val. Acc: 73.33%
Epoch: 10 | Epoch T

In [None]:
test_iterator = data.BucketIterator(uk_test, batch_size = 128, sort=False)

test_loss, test_acc = evaluate(uk_model_low, test_iterator, criterion_uk_l, TAG_PAD_IDX)

print(f'Test Loss: {test_loss:.3f} |  Test Acc: {test_acc*100:.2f}%')

Test Loss: 0.787 |  Test Acc: 74.06%


## Train on Russian, test on Ukrainian

In [None]:
ru_model = build_model(ru_train, ru_vec, TEXT, UD_TAGS)

In [None]:
optimizer_ru = optim.Adam(ru_model.parameters())

TAG_PAD_IDX = UD_TAGS.vocab.stoi[UD_TAGS.pad_token]

criterion_ru = nn.CrossEntropyLoss(ignore_index = TAG_PAD_IDX)

In [None]:
train_model(ru_model, "russian_full", ru_train, ru_dev, optimizer_ru, criterion_ru)

Epoch: 01 | Epoch Time: 6m 25s
	Train Loss: 0.643 | Train Acc: 79.24%
	 Val. Loss: 0.218 |  Val. Acc: 92.33%
Epoch: 02 | Epoch Time: 6m 17s
	Train Loss: 0.147 | Train Acc: 95.04%
	 Val. Loss: 0.184 |  Val. Acc: 93.44%
Epoch: 03 | Epoch Time: 6m 24s
	Train Loss: 0.108 | Train Acc: 96.28%
	 Val. Loss: 0.181 |  Val. Acc: 93.64%
Epoch: 04 | Epoch Time: 6m 19s
	Train Loss: 0.087 | Train Acc: 97.01%
	 Val. Loss: 0.184 |  Val. Acc: 93.83%
Epoch: 05 | Epoch Time: 6m 24s
	Train Loss: 0.070 | Train Acc: 97.63%
	 Val. Loss: 0.193 |  Val. Acc: 93.73%
Epoch: 06 | Epoch Time: 6m 19s
	Train Loss: 0.056 | Train Acc: 98.11%
	 Val. Loss: 0.207 |  Val. Acc: 93.76%
Epoch: 07 | Epoch Time: 6m 21s
	Train Loss: 0.044 | Train Acc: 98.52%
	 Val. Loss: 0.229 |  Val. Acc: 93.62%
Epoch: 08 | Epoch Time: 6m 11s
	Train Loss: 0.035 | Train Acc: 98.82%
	 Val. Loss: 0.253 |  Val. Acc: 93.46%
Epoch: 09 | Epoch Time: 6m 12s
	Train Loss: 0.028 | Train Acc: 99.06%
	 Val. Loss: 0.267 |  Val. Acc: 93.45%
Epoch: 10 | Epoch T

accuracy on russian

In [None]:
test_iterator = data.BucketIterator(ru_test, batch_size = 128, sort=False)

test_loss, test_acc = evaluate(ru_model, test_iterator, criterion_ru, TAG_PAD_IDX)

print(f'Test Loss: {test_loss:.3f} |  Test Acc: {test_acc*100:.2f}%')

Test Loss: 0.291 |  Test Acc: 93.38%


In [None]:
ru_uk_model = build_model(uk_train, uk_vec, TEXT, UD_TAGS, pretrained='russian_full.pt')

In [None]:
test_iterator = data.BucketIterator(uk_test, batch_size = 128, sort=False)

test_loss, test_acc = evaluate(ru_uk_model, test_iterator, criterion_uk, TAG_PAD_IDX)

print(f'Test Loss: {test_loss:.3f} |  Test Acc: {test_acc*100:.2f}%')

Test Loss: 6.325 |  Test Acc: 13.37%


## Train Russian model on ukrainian, low-resource scenario

In [None]:
ru_uk_model = build_model(uk_train_low, uk_vec, TEXT, UD_TAGS, pretrained='russian_full.pt')

In [None]:
optimizer_ru_uk = optim.Adam(ru_uk_model.parameters(), lr=0.0007)

TAG_PAD_IDX = UD_TAGS.vocab.stoi[UD_TAGS.pad_token]

criterion_ru_uk = nn.CrossEntropyLoss(ignore_index = TAG_PAD_IDX)

In [None]:
train_model(ru_uk_model, "ru+low_uk", uk_train_low, uk_dev, optimizer_ru_uk, criterion_ru_uk)

Epoch: 01 | Epoch Time: 0m 15s
	Train Loss: 3.605 | Train Acc: 25.37%
	 Val. Loss: 1.719 |  Val. Acc: 49.78%
Epoch: 02 | Epoch Time: 0m 17s
	Train Loss: 1.582 | Train Acc: 51.99%
	 Val. Loss: 1.254 |  Val. Acc: 59.09%
Epoch: 03 | Epoch Time: 0m 17s
	Train Loss: 1.190 | Train Acc: 62.17%
	 Val. Loss: 1.040 |  Val. Acc: 68.19%
Epoch: 04 | Epoch Time: 0m 16s
	Train Loss: 0.954 | Train Acc: 71.13%
	 Val. Loss: 0.895 |  Val. Acc: 72.95%
Epoch: 05 | Epoch Time: 0m 14s
	Train Loss: 0.791 | Train Acc: 76.58%
	 Val. Loss: 0.800 |  Val. Acc: 75.74%
Epoch: 06 | Epoch Time: 0m 13s
	Train Loss: 0.686 | Train Acc: 79.65%
	 Val. Loss: 0.739 |  Val. Acc: 77.00%
Epoch: 07 | Epoch Time: 0m 14s
	Train Loss: 0.619 | Train Acc: 81.25%
	 Val. Loss: 0.709 |  Val. Acc: 77.54%
Epoch: 08 | Epoch Time: 0m 12s
	Train Loss: 0.575 | Train Acc: 82.49%
	 Val. Loss: 0.689 |  Val. Acc: 77.98%
Epoch: 09 | Epoch Time: 0m 13s
	Train Loss: 0.544 | Train Acc: 82.97%
	 Val. Loss: 0.670 |  Val. Acc: 78.10%
Epoch: 10 | Epoch T

In [None]:
test_iterator = data.BucketIterator(uk_test, batch_size = 128, sort=False)

test_loss, test_acc = evaluate(ru_uk_model, test_iterator, criterion_ru_uk, TAG_PAD_IDX)

print(f'Test Loss: {test_loss:.3f} |  Test Acc: {test_acc*100:.2f}%')

Test Loss: 0.696 |  Test Acc: 77.12%


## Train Russian model on ukrainian, under-resource scenario

In [None]:
ru_uk_model_f = build_model(uk_train, uk_vec, TEXT, UD_TAGS, pretrained='russian_full.pt')

In [None]:
optimizer_ru_uk_f = optim.Adam(ru_uk_model_f.parameters(), lr=0.0007)

TAG_PAD_IDX = UD_TAGS.vocab.stoi[UD_TAGS.pad_token]

criterion_ru_uk_f = nn.CrossEntropyLoss(ignore_index = TAG_PAD_IDX)

In [None]:
train_model(ru_uk_model_f, "ru+f_uk", uk_train, uk_dev, optimizer_ru_uk_f, criterion_ru_uk_f)

Epoch: 01 | Epoch Time: 0m 49s
	Train Loss: 1.895 | Train Acc: 50.57%
	 Val. Loss: 0.839 |  Val. Acc: 75.23%
Epoch: 02 | Epoch Time: 0m 49s
	Train Loss: 0.655 | Train Acc: 80.80%
	 Val. Loss: 0.541 |  Val. Acc: 83.31%
Epoch: 03 | Epoch Time: 0m 49s
	Train Loss: 0.435 | Train Acc: 87.03%
	 Val. Loss: 0.479 |  Val. Acc: 84.27%
Epoch: 04 | Epoch Time: 0m 58s
	Train Loss: 0.364 | Train Acc: 88.73%
	 Val. Loss: 0.452 |  Val. Acc: 85.02%
Epoch: 05 | Epoch Time: 0m 46s
	Train Loss: 0.323 | Train Acc: 89.81%
	 Val. Loss: 0.442 |  Val. Acc: 85.11%
Epoch: 06 | Epoch Time: 0m 47s
	Train Loss: 0.295 | Train Acc: 90.59%
	 Val. Loss: 0.442 |  Val. Acc: 85.09%
Epoch: 07 | Epoch Time: 0m 46s
	Train Loss: 0.273 | Train Acc: 91.14%
	 Val. Loss: 0.441 |  Val. Acc: 84.97%
Epoch: 08 | Epoch Time: 1m 0s
	Train Loss: 0.253 | Train Acc: 91.79%
	 Val. Loss: 0.446 |  Val. Acc: 85.25%
Epoch: 09 | Epoch Time: 0m 47s
	Train Loss: 0.233 | Train Acc: 92.47%
	 Val. Loss: 0.445 |  Val. Acc: 84.97%
Epoch: 10 | Epoch Ti

In [None]:
test_iterator = data.BucketIterator(uk_test, batch_size = 128, sort=False)

test_loss, test_acc = evaluate(ru_uk_model_f, test_iterator, criterion_ru_uk_f, TAG_PAD_IDX)

print(f'Test Loss: {test_loss:.3f} |  Test Acc: {test_acc*100:.2f}%')

Test Loss: 0.467 |  Test Acc: 84.53%


In [None]:
!cp ru* /content/drive/My\ Drive/diploma

In [None]:
!cp uk* /content/drive/My\ Drive/diploma

In [None]:
def draw_heatmap(model, iterator, criterion, tag_pad_idx):
    real = []
    predicted = []
    model.eval()
    with torch.no_grad():
        for batch in iterator:
            text = batch.text
            tags = batch.udtags
            predictions = model(text)
            predictions = predictions.view(-1, predictions.shape[-1])
            tags = tags.view(-1)
            real.append(tags)
            predicted.append(predictions)
    return real[0], predicted[0]

In [None]:
m = '/content/drive/My Drive/diploma/ru+low_uk.pt'
m = build_model(uk_train_low, uk_vec, TEXT, UD_TAGS, pretrained=m)

In [None]:
tir = data.BucketIterator(uk_test, batch_size = 128, sort=False)
TAG_PAD_IDX = UD_TAGS.vocab.stoi[UD_TAGS.pad_token]
crit= nn.CrossEntropyLoss(ignore_index = TAG_PAD_IDX)
test_loss, test_acc = evaluate(m, tir, crit, TAG_PAD_IDX)

print(f'Test Loss: {test_loss:.3f} |  Test Acc: {test_acc*100:.2f}%')

Test Loss: 1.324 |  Test Acc: 60.59%


In [None]:
a, b = draw_heatmap(m, tir, crit, TAG_PAD_IDX)