In [116]:
import time
from bz2 import BZ2File
from collections import OrderedDict

from conllu import parse, parse_tree

import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn import metrics
from tqdm import tqdm

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset

In [2]:
!ls *.bz2

fiction.cased.lemmatized.300d.bz2
fiction.cased.lemmatized.word2vec.300d.bz2


In [3]:
def load_vecs(bz2_path):
    map2id = {}
    map2word = {}
    weights = None
    i = 0
    with BZ2File(bz2_path) as archive:
        line = archive.readline()
        while line:
            line = line.decode('utf-8')
            if i == 0:
                sizes = [int(s.strip()) for s in line.split(' ')]
                print('input sizes: ', sizes)
                weights = np.zeros(sizes)
            else:
                chunks = line.split(' ', maxsplit=1)
                word, num_string = chunks[0], chunks[1]
                nums = np.fromstring(num_string, sep=' ')
                weights[i - 1, :] = nums
                map2id[word] = i - 1
                map2word[i - 1] = word
            line = archive.readline()
            i += 1
            
    print(f'read {i} total lines')
    return weights, map2id, map2word

In [4]:
def read_conllu_file(filename):
    with open(filename) as input_file:
        text = input_file.read()
        result = parse(text)
    return result
    
    
train = read_conllu_file('../../../../UD_Ukrainian-IU/uk_iu-ud-train.conllu')
test = read_conllu_file('../../../../UD_Ukrainian-IU/uk_iu-ud-dev.conllu')
val = read_conllu_file('../../../../UD_Ukrainian-IU/uk_iu-ud-test.conllu')

In [5]:
deprels = set()
for sent in train:
    for tok in sent:
        deprels.update([tok['deprel']])
deprels = list(deprels)

In [6]:
OUTPUT_SIZE = len(deprels)
labeler = LabelEncoder()
labeler.fit(deprels)

LabelEncoder()

In [7]:
embedding_matrix, map2id, map2word = load_vecs("fiction.cased.lemmatized.300d.bz2")
# embedding_matrix, map2id, map2word = load_vecs("fiction.cased.lemmatized.word2vec.300d.bz2")

input sizes:  [59181, 300]
read 59182 total lines


In [8]:
ROOT = OrderedDict([('id', 0), ('form', 'ROOT'), ('lemma', 'ROOT'), ('upostag', 'ROOT'),
                    ('xpostag', None), ('feats', None), ('head', None), ('deprel', None),
                    ('deps', None), ('misc', None)])

def unwrap_gold_relations(tree):
    return [(tok['id'], head(tok)) for tok in tree]

def head(tok):
    return tok['head'] if 'head' in tok else 0

def vectorize(sentences, map2id, labeler):
    output = pd.DataFrame()
    for sentence in sentences:
        rels = unwrap_gold_relations(sentence)
        toks = [ROOT] + sentence
        for (child, head) in rels:
            output = output.append({'head': map2id[toks[head]['lemma']] if toks[head]['lemma'] in map2id else 0,
                                    'child': map2id[toks[child]['lemma']] if toks[child]['lemma'] in map2id else 0,
                                    'deprel': toks[child]['deprel']}, ignore_index=True)
    
    return output[['child', 'head']].astype(int), labeler.transform(output['deprel'].astype(str))

In [9]:
class RelationsData(Dataset):
    def __init__(self, Xs, ys):
        super().__init__()
        
        self.Xs = Xs
        self.ys = ys
        self.size = len(Xs)
        

    def __getitem__(self, index):
        x = self.Xs[index:index+1].values[0]
        target = self.ys[index]
        return x, target

    def __len__(self):
        return self.size

In [68]:
class RelationModel(nn.Module):
    def __init__(self, embedding_matrix, output_size):
        super(RelationModel, self).__init__()
        
        vocab_size = embedding_matrix.shape[0]
        vector_size = embedding_matrix.shape[1]
        
        self.embeddings = nn.Embedding(vocab_size, vector_size)
        self.embeddings.weight.data.copy_(torch.from_numpy(embedding_matrix))
        self.dense1 = nn.Linear(vector_size * 2, 1000)
        self.activation1 = nn.ReLU()
        self.dense2 = nn.Linear(1000, output_size)
        self.activation2 = nn.Sigmoid()
    def forward(self, in1, in2):
        emb1 = self.embeddings(in1)
        emb2 = self.embeddings(in2)
        in_cat = torch.cat([emb1, emb2], 1)
        out = self.dense1(in_cat)
        out = self.activation1(out)
        out = self.dense2(out)
        y_pred = F.softmax(out, dim=1)
        return y_pred

In [11]:
start = time.time()
X_train, y_train = vectorize(train, map2id, labeler)
print(f'finished in {time.time() - start:.2f}s')

finished in 307.47s


In [12]:
start = time.time()
X_test, y_test = vectorize(test, map2id, labeler)
print(f'finished in {time.time() - start:.2f}s')

finished in 28.34s


In [105]:
batch_size = 512
num_workers = 8
lr = 0.000001
num_epochs = 20
print_freq = 10000

In [118]:
ds_train = RelationsData(X_train, y_train)
loader_train = DataLoader(ds_train, batch_size=batch_size, num_workers=num_workers, drop_last=True)

ds_test = RelationsData(X_test, y_test)
loader_test = DataLoader(ds_test, batch_size=batch_size, num_workers=num_workers)

In [62]:
def create_model(embedding_matrix, output_size):
    return RelationModel(embedding_matrix, output_size)

In [69]:
model = create_model(embedding_matrix, output_size=OUTPUT_SIZE)
criterion = nn.CrossEntropyLoss()

In [102]:
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

In [106]:
print_freq = 50
for epoch in range(num_epochs):
    for step, (x, y) in enumerate(tqdm(loader_train)):
        x1 = x[:, 0].type(torch.LongTensor)
        x2 = x[:, 1].type(torch.LongTensor)

        y_pred = model(x1, x2)
        loss = criterion(y_pred, y)
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        if (step % print_freq) == 0 and step > 0:
            print(step, loss.item())
print('last loss: ', loss.item())

 35%|███▍      | 51/146 [00:13<00:23,  4.01it/s]

50 3.117732286453247


 69%|██████▉   | 101/146 [00:26<00:11,  4.02it/s]

100 3.1844699382781982


100%|██████████| 146/146 [00:37<00:00,  4.10it/s]
 35%|███▍      | 51/146 [00:13<00:23,  3.99it/s]

50 3.1176395416259766


 69%|██████▉   | 101/146 [00:25<00:11,  4.04it/s]

100 3.1843268871307373


100%|██████████| 146/146 [00:36<00:00,  4.14it/s]
 35%|███▍      | 51/146 [00:12<00:23,  4.07it/s]

50 3.1175529956817627


 69%|██████▉   | 101/146 [00:25<00:11,  4.07it/s]

100 3.184185266494751


100%|██████████| 146/146 [00:36<00:00,  4.12it/s]
 35%|███▍      | 51/146 [00:13<00:23,  4.08it/s]

50 3.1174721717834473


 69%|██████▉   | 101/146 [00:25<00:11,  4.08it/s]

100 3.1840474605560303


100%|██████████| 146/146 [00:36<00:00,  4.09it/s]
 35%|███▍      | 51/146 [00:13<00:23,  4.10it/s]

50 3.1173946857452393


 69%|██████▉   | 101/146 [00:25<00:11,  3.94it/s]

100 3.1839089393615723


100%|██████████| 146/146 [00:36<00:00,  4.13it/s]
 35%|███▍      | 51/146 [00:13<00:23,  4.06it/s]

50 3.117323160171509


 69%|██████▉   | 101/146 [00:25<00:11,  3.98it/s]

100 3.18377423286438


100%|██████████| 146/146 [00:36<00:00,  4.11it/s]
 35%|███▍      | 51/146 [00:13<00:23,  4.01it/s]

50 3.117255449295044


 69%|██████▉   | 101/146 [00:25<00:11,  3.94it/s]

100 3.183642625808716


100%|██████████| 146/146 [00:36<00:00,  4.17it/s]
 35%|███▍      | 51/146 [00:13<00:23,  4.09it/s]

50 3.117192268371582


 69%|██████▉   | 101/146 [00:25<00:11,  4.08it/s]

100 3.183511972427368


100%|██████████| 146/146 [00:37<00:00,  3.84it/s]
 35%|███▍      | 51/146 [00:14<00:25,  3.68it/s]

50 3.117131233215332


 69%|██████▉   | 101/146 [00:27<00:12,  3.73it/s]

100 3.183382987976074


100%|██████████| 146/146 [00:40<00:00,  3.44it/s]
 35%|███▍      | 51/146 [00:13<00:23,  3.98it/s]

50 3.117074728012085


 69%|██████▉   | 101/146 [00:26<00:10,  4.11it/s]

100 3.183255910873413


100%|██████████| 146/146 [00:38<00:00,  4.04it/s]
 35%|███▍      | 51/146 [00:13<00:26,  3.63it/s]

50 3.117020606994629


 69%|██████▉   | 101/146 [00:26<00:13,  3.46it/s]

100 3.183130979537964


100%|██████████| 146/146 [00:38<00:00,  4.03it/s]
 35%|███▍      | 51/146 [00:15<00:29,  3.22it/s]

50 3.1169705390930176


 69%|██████▉   | 101/146 [00:27<00:10,  4.12it/s]

100 3.1830079555511475


100%|██████████| 146/146 [00:38<00:00,  4.13it/s]
 35%|███▍      | 51/146 [00:12<00:23,  4.11it/s]

50 3.1169216632843018


 69%|██████▉   | 101/146 [00:25<00:11,  3.88it/s]

100 3.182884454727173


100%|██████████| 146/146 [00:37<00:00,  3.90it/s]
 35%|███▍      | 51/146 [00:14<00:24,  3.84it/s]

50 3.1168763637542725


 69%|██████▉   | 101/146 [00:26<00:10,  4.14it/s]

100 3.182762622833252


100%|██████████| 146/146 [00:38<00:00,  3.80it/s]
 35%|███▍      | 51/146 [00:14<00:25,  3.67it/s]

50 3.1168341636657715


 69%|██████▉   | 101/146 [00:27<00:11,  3.83it/s]

100 3.182644844055176


100%|██████████| 146/146 [00:39<00:00,  4.17it/s]
 35%|███▍      | 51/146 [00:13<00:25,  3.71it/s]

50 3.1167924404144287


 69%|██████▉   | 101/146 [00:27<00:12,  3.70it/s]

100 3.1825308799743652


100%|██████████| 146/146 [00:39<00:00,  4.03it/s]
 35%|███▍      | 51/146 [00:14<00:24,  3.88it/s]

50 3.116751194000244


 69%|██████▉   | 101/146 [00:26<00:11,  4.02it/s]

100 3.182420492172241


100%|██████████| 146/146 [00:38<00:00,  3.76it/s]
 35%|███▍      | 51/146 [00:12<00:22,  4.15it/s]

50 3.116711378097534


 69%|██████▉   | 101/146 [00:25<00:10,  4.15it/s]

100 3.182313919067383


100%|██████████| 146/146 [00:35<00:00,  4.19it/s]
 35%|███▍      | 51/146 [00:13<00:25,  3.66it/s]

50 3.116673469543457


 69%|██████▉   | 101/146 [00:27<00:12,  3.50it/s]

100 3.182209014892578


100%|██████████| 146/146 [00:38<00:00,  4.16it/s]
 35%|███▍      | 51/146 [00:13<00:23,  4.04it/s]

50 3.1166341304779053


 69%|██████▉   | 101/146 [00:26<00:11,  3.81it/s]

100 3.182103157043457


100%|██████████| 146/146 [00:38<00:00,  3.89it/s]

last loss:  3.1147189140319824





In [123]:
# Ran this multiple time, reducing learning rate by 10 each time
# Possible additional feaures / steps
# grandchild / sibling count / how deep from head
# more layers / dropout

In [119]:
def predict_on(loader, model, criterion):
    res = []
    # switch to evaluate mode
    model.eval()

    with torch.no_grad():
        end = time.time()
        for (x, target) in tqdm(loader):
            # compute output
            x1 = x[:, 0].type(torch.LongTensor)
            x2 = x[:, 1].type(torch.LongTensor)

            output = model(x1, x2).data.cpu().numpy()
            classes = np.argmax(output, axis=1)
            res = np.append(res, classes)
    return res


In [120]:
len(X_test) // batch_size

20

In [121]:
result = predict_on(loader_test, model, criterion)
result = [int(x) for x in result]

100%|██████████| 21/21 [00:00<00:00, 11.19it/s]


In [122]:
print(metrics.classification_report(y_test, result, target_names=labeler.classes_))

                     precision    recall  f1-score   support

                acl       0.00      0.00      0.00       184
              advcl       0.00      0.00      0.00       109
           advcl:sp       0.00      0.00      0.00         4
          advcl:svc       0.00      0.00      0.00         6
             advmod       0.87      0.90      0.89       484
               amod       0.81      0.76      0.78       846
              appos       0.00      0.00      0.00        71
                aux       0.00      0.00      0.00        19
               case       0.98      1.00      0.99       945
                 cc       0.91      0.96      0.94       358
              ccomp       0.00      0.00      0.00        50
           compound       0.00      0.00      0.00        64
       compound:svc       0.20      0.43      0.27       475
               conj       0.00      0.00      0.00        57
           conj:svc       0.00      0.00      0.00        52
                cop    

  .format(len(labels), len(target_names))
  'precision', 'predicted', average, warn_for)


Welp, got to some level (which partially could be explained by the fact that not all labels were present in training data) – so be it.

In [129]:
# Given we are using gold parsing trees, we can just do TP ratio on words
# Other possible thing would be doing LAS by sentences and averaging
def LAS_words(y_gold, y_pred):
    total = 0
    correct = 0
    for pair in zip(y_gold, y_pred):
        total += 1
        if pair[0] == pair[1]:
            correct += 1
    return correct / total
print(LAS_words(y_test, result))

0.657795776684987
