In [1]:
from conllu import parse, parse_tree
from pathlib import Path
import os
from collections import OrderedDict
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
import pandas as pd
import xgboost as xgb
import numpy as np
import dill
import pymorphy2
from tokenize_uk.tokenize_uk import tokenize_words

In [5]:
def calc_metrics(y_test, pred, proba=None, labels=None, print_=True, mode="weighted"):
    output = {}
    if proba is not None:
        roc_auc = metrics.roc_auc_score(y_test, proba)
        output["AUC"] = roc_auc
    output["Recall"] = metrics.recall_score(y_test, pred, average=mode)
    output["Precision"] = metrics.precision_score(y_test, pred, average=mode)
    output["F1"] = metrics.f1_score(y_test, pred, average=mode)
    output["Accuracy"] = metrics.accuracy_score(y_test, pred)
    if labels is not None:
        index = labels
        columns = ["pred_" + str(el) for el in index]
    else:
        columns = None
        index = None
    conf_matrix = pd.DataFrame(metrics.confusion_matrix(y_test, pred, labels=labels), 
                               columns=columns, index=index)
    report = metrics.classification_report(y_true=y_test, y_pred=pred, labels=labels)
    if print_:
        for key, value in output.items():
            print(f"{key}: {value:0.3f}")
        print("\nConfusion matrix:")
        print(conf_matrix)
        print("\nReport:")
        print(report)
    return output, report, conf_matrix

In [2]:
ROOT = OrderedDict([('id', 0), ('form', 'ROOT'), ('lemma', 'ROOT'), ('upostag', "ROOT"),
                    ('xpostag', None), ('feats', None), ('head', None), ('deprel', None),
                    ('deps', None), ('misc', None)])

In [3]:
data_dir = Path.home() / "repos/UD_Ukrainian-IU"

In [4]:
with list(data_dir.glob("*train*"))[0].open() as f:
    data = f.read()

In [7]:
trees = parse(data)

In [9]:
tree = trees[0]
tree[-1]

OrderedDict([('id', 14),
             ('form', '.'),
             ('lemma', '.'),
             ('upostag', 'PUNCT'),
             ('xpostag', 'U'),
             ('feats', None),
             ('head', 6),
             ('deprel', 'punct'),
             ('deps', None),
             ('misc', OrderedDict([('Id', '000g')]))])

In [78]:
for node in tree:
    head = node["head"]
    print("{} <- {}".format(node["form"], tree[head-1]["form"] if head>0 else "root"))

У <- домі
домі <- була
римського <- патриція
патриція <- домі
Руфіна <- патриція
була <- root
прегарна <- фреска
фреска <- була
, <- зображення
зображення <- фреска
Венери <- зображення
та <- Адоніса
Адоніса <- Венери
. <- була


In [11]:
def get_parse_context(word, deps, data):
    if not word or word == -1:
        return 0, "", ""
    deps = deps[word["id"]]
    num = len(deps)
    if not num:
        return num, "", ""
    elif num==1:
        return num, data[deps[-1]-1], ""
    else:
        return num, data[deps[-1]-1], data[deps[-1]-1]

In [12]:
def extract_features(stack, queue, tree, parse):
    features = {}
    stack_depth = len(stack)
    s0 = stack[-1] if stack_depth else ""
    q0 = queue[0] if queue else ""
    
    # Features for stack
    if stack:
        features["s0-form"] = s0["form"]
        features["s0-tag"] = s0["upostag"]
        features["s0-lemma"] = s0["lemma"]
        features["s0-word-tag"] = s0["form"] + s0["upostag"]
        if s0.get("feats"):
            for k, v in s0["feats"].items():
                features[f"s0-{k}"] = v
    if stack_depth > 1:
        features["s1-tag"] = stack[-2]["upostag"]
        features["s1-word-tag"] = stack[-2]["form"] + stack[-2]["upostag"]
    
    # Features for queue
    if queue:
        features["q0-form"] = q0["form"]
        features["q0-tag"] = q0["upostag"]
        features["q0-lemma"] = q0["lemma"]
        features["q0-word-tag"] = q0["form"] + q0["upostag"]
        if q0.get("feats"):
            for k, v in q0["feats"].items():
                features[f"q0-{k}"] = v 
    if len(queue) > 1:
        features["q1-form"] = queue[1]["form"]
        features["q1-tag"] = queue[1]["upostag"]
        features["q1-word-tag"] = queue[1]["form"] + queue[1]["upostag"]
        features["q0q1"] = q0["form"] + queue[1]["form"]
    if len(queue) > 2:
        features["q2-tag"] = queue[2]["upostag"]
        #features["q2-word-tag"] = queue[2]["form"] + queue[2]["upostag"]
    if len(queue) > 3:
        features["q3-tag"] = queue[3]["upostag"]
        
    if queue and stack:
        Ds0q0 = q0["id"] - s0["id"]
        features["distance"] = Ds0q0
        features["q0-dist"] = q0["form"] + "-{}".format(Ds0q0)
        features["s0-dist"] = s0["form"] + "-{}".format(Ds0q0)
        features["s0q0-dist"] = s0["lemma"] + q0["lemma"] + "-{}".format(Ds0q0)
        features["s0-tag-dist"] = s0["upostag"] + "-{}".format(Ds0q0)
        features["q0-tag-dist"] = q0["upostag"] + "-{}".format(Ds0q0)
        features["s0q0-tag-dist"] = s0["upostag"] + q0["upostag"] + "-{}".format(Ds0q0)
        # Add bigrams
        features["s0q0"] = s0["form"] + q0["form"]
        features["s0q0-tag"] = s0["upostag"] + q0["upostag"]
        features["q0_q0-tag_s0"] = q0["form"] + q0["upostag"] + s0["form"]
        features["q0_q0-tag_s0-tag"] = q0["form"] + q0["upostag"] + s0["upostag"]
        features["s0_s0-tag_q0"] = s0["form"] + s0["upostag"] + q0["form"]
        features["s0_s0-tag_q0-tag"] = s0["form"] + s0["upostag"] + q0["upostag"]
        features["s0_s0-tag_q0_q0-tag"] = s0["form"] + s0["upostag"] + q0["form"] + q0["upostag"]
        
        
    
    # Left two child for top stack
    Ns0l, s0l1, s0l2 = get_parse_context(s0, parse.lefts, tree) 
    if s0l1:
        features["s0l1"] = s0l1["form"]
        features["s0l1-tag"] = s0l1["upostag"]   
    if s0l2:
        features["s0l2"] = s0l2["form"]
        features["s0l2-tag"] = s0l2["upostag"]
    
    # Right two child for top stack
    Ns0r, s0r1, s0r2 = get_parse_context(s0, parse.rights, tree)
    if s0r1:
        features["s0r1"] = s0r1["form"]
        features["s0r1-tag"] = s0r1["upostag"] 
    if s0r2:
        features["s0r2"] = s0r2["form"]
        features["s0r2-tag"] = s0r2["upostag"]
    
    # Left two child for top queue
    Nq0l, q0l1, q0l2 = get_parse_context(q0, parse.lefts, tree)
    if q0l1:
        features["q0l1"] = q0l1["form"]
        features["q0l1-tag"] = q0l1["upostag"]  
    if q0l2:
        features["q0l2"] = q0l2["form"]
        features["q0l2-tag"] = q0l2["upostag"]
    
    if stack:
        features["s0l-N"] = s0["form"] + f"-{Ns0l}"
        features["s0r-N"] = s0["form"] + f"-{Ns0r}"
        features["s0l-tag-N"] = s0["upostag"] + f"-{Ns0l}"
        features["s0r-tag-N"] = s0["upostag"] + f"-{Ns0r}"
    if queue:
        features["q0l-N"] = q0["form"] + f"-{Nq0l}"
        features["q0l-tag-N"] = q0["upostag"] + f"-{Nq0l}"
    return features

In [13]:
class Parse(object):
    
    def __init__(self, n):
        self.n = n
        self.relations = []
        self.lefts = []
        self.rights = []
        # we need n+1 coz examples in the training data are indexed from 1
        for k in range(n+1):
            self.lefts.append([])
            self.rights.append([])
    
    def add_relation(self, child, head):
        self.relations.append((child, head))
        if child < head:
            self.lefts[head].append(child)
        else:
            self.rights[head].append(child)

In [84]:
class Parser(object):
    
    def __init__(self):
        pass
    
    def get_action(self, stack, q, parse):
        if stack and not q:
            return "reduce"
        if stack[-1]["head"] == q[0]["id"]:
            return "left"
        elif q[0]["head"] == stack[-1]["id"]:
            return "right"
        elif (stack[-1]["head"] in [parent for _, parent in parse.relations] 
              and q[0]["head"] < stack[-1]["id"]):
            return "reduce"
        else:
            return "shift" 
        
    def parse(self, tree, oracle=None, vectorizer=None, log=False):
        q = tree.copy()
        parse = Parse(len(q))
        stack = [ROOT]
        labels = []
        features = []
        while q or stack:
            if log:
                print("Stack:", [el["form"] for el in stack])
                print("Q:", [el["form"] for el in q])
            feature_set = extract_features(stack, q, tree, parse)
            
            if oracle is not None:
                v_features = vectorizer.transform(feature_set)
                action = oracle.predict(v_features)[0]
            else:
                action = self.get_action(stack or None, q or None, parse)
            
            if action == "left":
                parse.add_relation(stack[-1]["id"], q[0]["id"])
                stack.pop()
            elif action == "right":
                parse.add_relation(q[0]["id"], stack[-1]["id"])
                stack.append(q.pop(0))
            elif action == "reduce":
                stack.pop()
            elif action == "shift":
                stack.append(q.pop(0))              
            labels.append(action)
            features.append(feature_set)
        return labels, features, parse.relations

In [85]:
parser = Parser()
labels, features, _ = parser.parse(tree)
print(len(labels), len(features))

29 29


In [16]:
def get_data(trees, parser):
    o_labels = []
    o_features = []
    for tree in trees:
        labels, features, _ = parser.parse(tree)
        o_labels.extend(labels)
        o_features.extend(features)
    return o_labels, o_features

##### Prepare train / test data

In [17]:
with list(data_dir.glob("*test*"))[0].open() as f:
    test_data = f.read()
test_trees = parse(test_data)

In [18]:
y_train, features_train = get_data(trees, parser)
print(len(features_train), len(y_train))

154709 154709


In [19]:
y_test, features_test = get_data(test_trees, parser)
print(len(features_test), len(y_test))

30661 30661


##### Vectorize features

In [23]:
vectorizer = DictVectorizer(sparse=True)
v_train = vectorizer.fit_transform(features_train)
v_test = vectorizer.transform(features_test)

#### Train models

In [21]:
clf = LogisticRegression(random_state=25)
clf.fit(v_train, y_train)
y_pred = clf.predict(v_test)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=25, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [22]:
output, report, conf_matrix = calc_metrics(y_test, y_pred, labels=clf.classes_)

Recall: 0.897
Precision: 0.897
F1: 0.896
Accuracy: 0.897

Confusion matrix:
        pred_left  pred_reduce  pred_right  pred_shift
left         6987          141          52         186
reduce        367         6972         826         191
right          63          586        6357         211
shift         196          132         214        7180

Report:
             precision    recall  f1-score   support

       left       0.92      0.95      0.93      7366
     reduce       0.89      0.83      0.86      8356
      right       0.85      0.88      0.87      7217
      shift       0.92      0.93      0.93      7722

avg / total       0.90      0.90      0.90     30661



##### Add UAS calculation

In [175]:
def UAS(trees, oracle=None, vectorizer=None):
    total, tp, failed = 0, 0, 0
    for tree in trees:
        try:
            golden = [(node["id"], node["head"]) for node in tree]
            _, _, predicted = parser.parse(tree, oracle=oracle, vectorizer=vectorizer)
            total += len(golden)
            tp += len(set(golden).intersection(set(predicted))) 
        except:
            failed += 1
    return total, tp, failed

In [176]:
total, tp, failed = UAS(test_trees, clf, vectorizer)
print("Failed:", failed)
print("Total:", total)
print("Correctly defined:", tp)
print("UAS:", round(tp / total, 2))

Failed: 4
Total: 14781
Correctly defined: 11501
UAS: 0.78


##### Save the model

In [26]:
with open("model.dill", "wb+") as f:
    dill.dump(clf, f)

#### Parse sentence using trained parser

In [69]:
mapping = {"ADJF": "ADJ", "ADJS": "ADJ", "COMP": "ADJ", "PRTF": "ADJ",
           "PRTS": "ADJ", "GRND": "VERB", "NUMR": "NUM", "ADVB": "ADV",
           "NPRO": "PRON", "PRED": "ADV", "PREP": "ADP", "PRCL": "PART"}

CONJ_COORD = ["а", "або", "але", "ані", "все", "все-таки", "втім", "ж", "же",
              "зате", "і", "й", "ніже", "однак", "одначе", "прецінь", "проте",
              "та", "так", "також", "усе", "усе-таки", "утім", "чи"]

DET = ['інакший', 'його', 'тамтой', 'чий', 'їх', 'інш.', 'деякий', 'ввесь', 'ваш', 
     'ніякий', 'весь', 'інший', 'чийсь', 'жадний', 'другий', 'кожний', 
     'такий', 'оцей', 'скілька', 'цей', 'жодний', 'все', 'кілька', 'увесь', 
     'кожній', 'те', 'сей', 'ін.', 'отакий', 'котрий', 'усякий', 'самий', 
     'наш', 'усілякий', 'будь-який', 'сам', 'свій', 'всілякий', 'всенький', 'її', 
     'всякий', 'отой', 'небагато', 'який', 'їхній', 'той', 'якийсь', 'ин.', 'котрийсь', 
     'твій', 'мій', 'це', 'цей']

def normalize_pos(word):
    if word.normal_form in DET:
            return "DET"
    if word.tag._POS == "PNCT":
        return "PUNCT"
    if word.tag.POS == "CONJ":
        if word.word in CONJ_COORD:
            return "CCONJ"
        else:
            return "SCONJ"
    else:
        return mapping.get(word.tag.POS, word.tag.POS)

In [87]:
def create_example(sentence):
    output = []
    for k,word in enumerate(sentence):
        p = morph.parse(word)[0]
        word_dict = {"id": k+1, 
                     "upostag": normalize_pos(p) or "", 
                     "form": word, 
                     "lemma": p.normal_form or word}
        output.append(word_dict)
    return output

In [160]:
# def print_deprel(relations, sentence):
#     for child, head in relations:
#         print("{} <- {}".format(sentence[child-1]["form"], sentence[head-1]["form"] if head > 0 else "root"))

In [161]:
def print_deprel(relations, sentence):
    for child, head in relations:
        sentence[child-1]["head"] = head
    for word in sentence:
        head = word.get('head', 0)
        print("{} <- {}".format(word["form"], sentence[head-1]["form"] if head > 0 else "root"))

##### Load the model

In [None]:
with open("model.dill", "rb") as f:
    clf = dill.load(f)

In [172]:
# Throw away punctuations inside sentences
sents = ["Ці речі були дуже прекрасні але я не міг зосередитися на них через чудовий захід сонця.",
         "От вони нагодували його й напоїли і під крильця насипали пшона.",
         "Поселення на острові існували ще за часів енеоліту і бронзового віку про це свідчать знайдені археологами предмети побуту знаряддя праці рештки посуду та прикраси."
        ]
morph = pymorphy2.MorphAnalyzer(lang='uk')

In [173]:
for sent in sents:
    words = tokenize_words(sent)
    example = create_example(words)
    _, _, relations = parser.parse(example, oracle=clf, vectorizer=vectorizer)#, log=True)
    print("Sentence: {}\n".format(sent))
    print_deprel(relations, example)
    print()

Sentence: Ці речі були дуже прекрасні але я не міг зосередитися на них через чудовий захід сонця.

Ці <- речі
речі <- були
були <- root
дуже <- прекрасні
прекрасні <- root
але <- міг
я <- міг
не <- міг
міг <- були
зосередитися <- міг
на <- root
них <- зосередитися
через <- захід
чудовий <- захід
захід <- зосередитися
сонця <- захід
. <- були

Sentence: От вони нагодували його й напоїли і під крильця насипали пшона.

От <- нагодували
вони <- нагодували
нагодували <- root
його <- нагодували
й <- напоїли
напоїли <- нагодували
і <- насипали
під <- крильця
крильця <- насипали
насипали <- нагодували
пшона <- насипали
. <- нагодували

Sentence: Поселення на острові існували ще за часів енеоліту і бронзового віку про це свідчать знайдені археологами предмети побуту знаряддя праці рештки посуду та прикраси.

Поселення <- існували
на <- root
острові <- Поселення
існували <- root
ще <- root
за <- root
часів <- існували
енеоліту <- часів
і <- свідчать
бронзового <- віку
віку <- свідчать
про <- roo

#### Try XGBoost

In [22]:
def encode_list(l):
    return np.unique(l, return_inverse=True)

In [28]:
params = {}
params['learning_rate'] = 0.1
params['n_estimators'] = 1000
params['max_depth'] = 5
params['min_child_weight'] = 100
params['gamma'] = 0
params['subsample'] = 0.8
params['colsample_bytree'] = 0.8
params['objective'] = "multi:softmax" 
params['seed'] = 27
params['n_jobs'] = -1
params["eval_metric"] = ["merror", "mlogloss"]
params["early_stopping_rounds"] = 50
params["num_class"] = len(np.unique(y_train))

In [24]:
encode_map_train, y_train_num = encode_list(y_train)
encode_map_test, y_test_num = encode_list(y_test)
dtrain = xgb.DMatrix(v_train, y_train_num)
dtest = xgb.DMatrix(v_test, y_test_num)
eval_set = [(dtrain, "train"), (dtest, "eval")]

In [29]:
model = xgb.train(dtrain=dtrain, num_boost_round=params.get("n_estimators"), 
                  early_stopping_rounds=params.get("early_stopping_rounds"), 
                  params=params, evals=eval_set, verbose_eval=50)

[0]	train-merror:0.204875	train-mlogloss:1.27807	eval-merror:0.227096	eval-mlogloss:1.28283
Multiple eval metrics have been passed: 'eval-mlogloss' will be used for early stopping.

Will train until eval-mlogloss hasn't improved in 50 rounds.
[50]	train-merror:0.15153	train-mlogloss:0.445531	eval-merror:0.164737	eval-mlogloss:0.475694
[100]	train-merror:0.133173	train-mlogloss:0.368257	eval-merror:0.146114	eval-mlogloss:0.394882
[150]	train-merror:0.124964	train-mlogloss:0.336257	eval-merror:0.135449	eval-mlogloss:0.362005
[200]	train-merror:0.116419	train-mlogloss:0.315891	eval-merror:0.127784	eval-mlogloss:0.341344
[250]	train-merror:0.11161	train-mlogloss:0.302556	eval-merror:0.123055	eval-mlogloss:0.32783
[300]	train-merror:0.107214	train-mlogloss:0.29202	eval-merror:0.118848	eval-mlogloss:0.317335
[350]	train-merror:0.10353	train-mlogloss:0.28343	eval-merror:0.114445	eval-mlogloss:0.308679
[400]	train-merror:0.100304	train-mlogloss:0.275616	eval-merror:0.111118	eval-mlogloss:0.301

In [39]:
pred_xgb = model.predict(dtest)
y_pred_xgb = encode_map_test[pred_xgb.astype(np.int)]
xgb_metrics = calc_metrics(y_test, y_pred_xgb, labels=encode_map_test)

Recall: 0.901
Precision: 0.900
F1: 0.900
Accuracy: 0.901

Confusion matrix:
        pred_left  pred_reduce  pred_right  pred_shift
left         6947          162          51         206
reduce        325         7240         600         191
right          64          596        6252         305
shift         231          149         169        7173

Report:
             precision    recall  f1-score   support

       left       0.92      0.94      0.93      7366
     reduce       0.89      0.87      0.88      8356
      right       0.88      0.87      0.88      7217
      shift       0.91      0.93      0.92      7722

avg / total       0.90      0.90      0.90     30661



In [63]:
class SKBooster(object):

    def __init__(self, booster=xgb.Booster(), params={}, verbose=False, encode_map=None):
        self.booster = booster
        self.params = params
        self.verbose = verbose
        self.encode_map = encode_map

    def _predict(self, X, feature_names=None):
        X = xgb.DMatrix(X, feature_names=feature_names)
        return self.booster.predict(X)

    def predict(self, X, feature_names=None):
        pred = self._predict(X, feature_names=feature_names)
        y_pred = encode_map_test[pred.astype(np.int)]
        return y_pred

    def fit(self, X, y, deval=None):
        dtrain = xgb.DMatrix(X, y)
        evals = [(dtrain, 'train')]
        if deval is not None:
            evals.append((deval, 'eval'))
        self.booster = xgb.train(params=self.params,
                                 dtrain=dtrain,
                                 num_boost_round=self.params.get("n_estimators"),
                                 early_stopping_rounds=self.params.get("early_stopping_rounds"),
                                 evals=evals,
                                 verbose_eval=self.verbose)

    def set_params(self, **kwargs):
        self.params.update(kwargs)

In [74]:
skbooster = SKBooster(booster=model, encode_map=encode_map_test)
skbooster.predict(v_test[0])

array(['shift'], dtype='<U6')

In [None]:
total, tp, failed = UAS(test_trees, skbooster, vectorizer)
print("Failed:", failed)
print("Total:", total)
print("Correctly defined:", tp)
print("UAS:", round(tp / total, 3))