In [1]:
%load_ext autoreload

In [2]:
%autoreload 1

In [27]:
import keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding, Input, Dense, Flatten, Conv1D, concatenate, Activation
from keras.utils import to_categorical
from keras.preprocessing.sequence import pad_sequences
from keras.models import load_model
from sklearn import metrics
from keras import Model, Sequential
import numpy as np
from conllu import parse, parse_tree
from pathlib import Path
import os
from collections import OrderedDict
import bz2
import json
import dill

In [4]:
%aimport parser

In [5]:
from parser import Parser

In [6]:
class Metrics(keras.callbacks.Callback):
    def on_train_begin(self, logs={}):
        self.f1s = []
        
    def on_epoch_end(self, batch, logs={}):
        probas = np.asarray(self.model.predict(self.validation_data[0]))
        targ = np.argmax(self.validation_data[1], axis=1)
        predict = np.argmax(probas, axis=1)
        self.f1s.append(metrics.f1_score(targ, predict, average="weighted"))
        return

metrs = Metrics()

In [9]:
vec_filename = "ubercorpus.lowercased.tokenized.300d.bz2"

In [10]:
def read_embeddings(filename=vec_filename, word_index=None):
    word_2_vec = {}
    with bz2.open(filename, "rt") as f:
        words, ndim = map(int, f.readline().strip().split())
        for line in f:
            values = line.split()
            word = values[0]
            if words:
                if word in word_index:
                    vec = np.asarray(values[1:], dtype=np.float32)
                    word_2_vec[word] = vec
            else:
                vec = np.asarray(values[1:], dtype=np.float32)
                word_2_vec[word] = vec
    return word_2_vec, ndim, words

In [16]:
ROOT = OrderedDict([('id', 0), ('form', 'ROOT'), ('lemma', 'ROOT'), ('upostag', "ROOT"),
                    ('xpostag', None), ('feats', None), ('head', None), ('deprel', None),
                    ('deps', None), ('misc', None)])

In [13]:
data_dir = Path.home() / "repos/UD_Ukrainian-IU"

with list(data_dir.glob("*train*"))[0].open() as f:
    data = f.read()
trees = parse(data)

with list(data_dir.glob("*test*"))[0].open() as f:
    test_data = f.read()
test_trees = parse(test_data)

In [14]:
tree = trees[0]
for node in tree:
    head = node["head"]
    print("{} {} <- {}".format(node["form"], node["id"], tree[head-1]["form"] if head>0 else "root"))

У 1 <- домі
домі 2 <- була
римського 3 <- патриція
патриція 4 <- домі
Руфіна 5 <- патриція
була 6 <- root
прегарна 7 <- фреска
фреска 8 <- була
, 9 <- зображення
зображення 10 <- фреска
Венери 11 <- зображення
та 12 <- Адоніса
Адоніса 13 <- Венери
. 14 <- була


In [7]:
def get_childs(word, tree):
    return [w for w in tree if w["head"]==word["id"]]

In [8]:
def get_heights(tree, res=None, word=None, k=0):
    if res is None:
        res = {0: 0}
        k = 1
    if word is None:
        word = [w for w in tree if not w["head"]][0]
        res[word["id"]] = k
    for child in get_childs(word, tree):
        res[child["id"]] = k + 1
        get_heights(tree, res, child, k+1)
    return res

In [9]:
def build_vocabulary(trees, form="form"):
    word_index = {}
    pos_index = {}
    label_index = {}
    labels = []
    for tree in trees:
        for word in tree:
            deprel = word["deprel"]
            word_id = len(word_index)+1
            pos_id = len(pos_index)+1
            word_t = word[form].lower()
            word_pos = word["upostag"]
            word_index[word_t] = word_index.get(word_t, word_id)
            pos_index[word_pos] = pos_index.get(word_pos, pos_id)
            if deprel not in label_index:
                label_id = len(label_index)
                label_index[deprel] = label_id
            else:
                label_id = label_index.get(deprel)
            labels.append(label_id)
    word_index[ROOT["form"]] = len(word_index)+1
    pos_index[ROOT["upostag"]] = len(pos_index)+1
    return word_index, pos_index, label_index, labels

In [10]:
def build_features(tree, word_index, pos_index, form="form"):
    label_index = {}
    records = []
    labels = []
    heights = get_heights(tree)
    for word in tree:
        deprel = word["deprel"]
        word_t = word[form].lower()
        head = tree[word["head"]-1] if word["head"] else ROOT
        head_t = head[form].lower() if word["head"] else "ROOT"
        word_pos = word["upostag"]
        head_pos = head["upostag"]

        dist = word["id"] - head["id"]
        height_diff = heights.get(word["id"], 0) - heights.get(head["id"], 0)
        n_l_child_word = len([_ for w in tree if w["head"]==word["id"] and w["id"]<word["id"]])
        n_r_child_word = len([_ for w in tree if w["head"]==word["id"] and w["id"]>word["id"]])
        n_l_child_head = len([_ for w in tree if w["head"]==head["id"] and w["id"]<head["id"]])
        n_r_child_head = len([_ for w in tree if w["head"]==head["id"] and w["id"]>head["id"]])

        # children
        w_children = [w for w in tree if w["head"]==word["id"]]
        if w_children:
            lm = min(w_children, key=lambda x: x["id"])
            rm = max(w_children, key=lambda x: x["id"])
            w_lr = [word_index.get(lm[form].lower()), word_index.get(rm[form].lower())]
            pos_lr = [pos_index.get(lm["upostag"]), pos_index.get(rm["upostag"])]
        else:
            w_lr = [0, 0]
            pos_lr = [0, 0]
               
        # construct final feature vector
        num_features = [dist, height_diff, heights.get(word["id"], 0), 
                        n_l_child_word, n_r_child_word, 
                        n_r_child_head, n_l_child_head
                       ]
        words = [word_index.get(word_t), word_index.get(head_t), *w_lr]
        tags = [pos_index.get(word_pos), pos_index.get(head_pos), *pos_lr]
        records.append((*words, *tags, *num_features))
    return records, len(words), len(tags), len(num_features)

In [11]:
def process_data(trees, word_index, pos_index, form="form"):
    records = []
    for tree in trees:
        feats, n_w, n_t, n_f = build_features(tree, word_index, pos_index, form)
        records.extend(feats)
    return records, n_w, n_t, n_f

In [19]:
n_train = np.sum([len(tree) for tree in trees])

In [17]:
word_index, pos_index, label_index, labels = build_vocabulary(trees+test_trees)
labels = to_categorical(np.asarray(labels))

In [18]:
idx_2_label = {v:k for k,v in label_index.items()}

In [22]:
records, n_words, n_tags, n_feat = process_data(trees+test_trees, word_index, pos_index)
X = np.asarray(records)

In [23]:
X_train, X_test = X[:n_train], X[n_train:]
y_train, y_test = labels[:n_train], labels[n_train:]

In [24]:
word_2_vec, ndim, _ = read_embeddings(word_index=word_index)

In [25]:
DEFAULT_VEC = np.zeros(ndim, np.float32)

In [40]:
embedding_matrix = np.zeros((len(word_index)+1, ndim))
for word, i in word_index.items():
    embedding_matrix[i] = word_2_vec.get(word, DEFAULT_VEC)

In [48]:
word_embedding_layer = Embedding(len(word_index)+1,
                            ndim,
                            weights=[embedding_matrix],
                            input_length=n_words,
                            trainable=0
                           )

In [49]:
pos_embedding_layer = Embedding(len(pos_index)+1,
                                ndim,
                                input_length=n_tags,
                                trainable=1
                               )

In [60]:
word_sequence_input = Input(shape=(n_words,), dtype='int32')
word_embedded_sequences = word_embedding_layer(word_sequence_input)

In [61]:
pos_sequence_input = Input(shape=(n_tags,), dtype='int32')
pos_embedded_sequences = pos_embedding_layer(pos_sequence_input)

In [62]:
features = Input(shape=(n_feat,))

In [63]:
left = Flatten()(word_embedded_sequences)
right = Flatten()(pos_embedded_sequences)
x = concatenate(inputs=[left, right, features])
x = Dense(200)(x)
x = Dense(100, activation='relu')(x)
preds = Dense(len(label_index), activation='softmax')(x)

In [64]:
model = Model(inputs=[word_sequence_input, pos_sequence_input, features], outputs=preds)
model.compile(loss='categorical_crossentropy',
              optimizer='rmsprop',
              metrics=['acc'])

In [65]:
model.fit([X_train[:, :n_words], X_train[:, n_words:n_words+n_tags], X_train[:, n_words+n_tags:]], y_train, 
          validation_data=([X_test[:, :n_words], X_test[:, n_words:n_words+n_tags], X_test[:, n_words+n_tags:]], y_test), 
          epochs=6, 
          batch_size=128, 
          verbose=1)

Train on 75098 samples, validate on 14939 samples
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6


<keras.callbacks.History at 0x7fdc6cc23320>

##### Save the FNN model

In [66]:
model_name = "FNN_deprel.h5"
model.save(filepath=model_name)

In [67]:
del model

##### Load vectorizer and parser model

In [19]:
with open("parser.dill", "rb") as f:
    clf = dill.load(f)

In [20]:
with open("vectorizer.dill", "rb") as f:
    vectorizer = dill.load(f)

In [21]:
model = load_model("FNN_deprel.h5")

In [22]:
parser = Parser()

In [23]:
def construct_tree(tree, predicted):
    output = []
    for child, head in predicted:
        elem = tree[child-1].copy()
        elem["head"] = head
        output.append(elem)
    return output

In [24]:
def add_arc_to_root(tree, predicted):
    children = set([child for child, parent in predicted])
    words = set([w["id"] for w in tree])
    for word in words-children:
        predicted.append((word, 0))
    return predicted

In [25]:
def LUAS(trees, oracle=None, vectorizer=None):
    total, tpL, tpU, failed = 0, 0, 0, 0
    for tree in trees:
        golden = [(node["id"], node["head"], node["deprel"]) for node in tree]
        try:
            _, _, predicted = parser.parse(tree, oracle=oracle, vectorizer=vectorizer)
            predicted = add_arc_to_root(tree, predicted)
            predicted_tree = construct_tree(tree, predicted)
            feats, n_w, n_t, n_f = build_features(predicted_tree, word_index, pos_index, "form")
            feats = np.asarray(feats)
            preds = np.argmax(model.predict([feats[:, :n_w], 
                                             feats[:, n_w:n_w+n_t], 
                                             feats[:, n_w+n_t:]]), axis=1)
            pred_labels = (idx_2_label[p] for p in preds)
            predicted_labeled = [(child, head, label) for (child, head), label in zip(predicted, pred_labels)]
            total += len(golden)
            tpL += len(set(golden).intersection(set(predicted_labeled)))
            tpU += len(set([(c,h) for c,h,_ in golden]).intersection(predicted))
        except:
            failed += 1
    return total, tpL, tpU, failed

In [26]:
total, tpL, tpU, failed = LUAS(test_trees, clf, vectorizer)
print("Failed:", failed)
print("Total:", total)
print("Correctly defined (unlabeled):", tpU)
print("Correctly defined (labeled):", tpL)
print("UAS:", round(tpU / total, 3))
print("LAS:", round(tpL / total, 3))

Failed: 4
Total: 14781
Correctly defined (unlabeled): 11560
Correctly defined (labeled): 9705
UAS: 0.782
LAS: 0.657
