# Ukrainian Dependency Parser

In [1]:
actions = ['shift', 'right-arc', 'left-arc', 'reduce']
elemnt = ('parent', 'child')

In [2]:
from collections import OrderedDict
from conllu import parse
from enum import Enum


def get_data(path):
    with open(path, "r") as f:
        data = f.read()

    trees = parse(data)
    return trees

# debug mode
trees = get_data('./corpus/uk_iu-ud-train.conllu')

for i, tree in enumerate(trees):
    for node in tree:
        head = node["head"]
        try:
            print("{} <-- {}".format(node["form"],
                                     tree[head - 1]["form"]
                                     if head > 0 else "root"))
        except TypeError:
            pass
    
    if i > 1:
        break



У <-- домі
домі <-- була
римського <-- патриція
патриція <-- домі
Руфіна <-- патриція
була <-- root
прегарна <-- фреска
фреска <-- була
, <-- зображення
зображення <-- фреска
Венери <-- зображення
та <-- Адоніса
Адоніса <-- Венери
. <-- була
Якось <-- зібралися
зібралися <-- root
у <-- нього
нього <-- зібралися
, <-- ховаючися
ховаючися <-- зібралися
від <-- переслідувань
переслідувань <-- ховаючися
, <-- ховаючися
одновірці <-- зібралися
дружини <-- одновірці
– <-- християнки
християнки <-- дружини
. <-- зібралися
Й <-- узялися
одразу <-- узялися
ж <-- одразу
узялися <-- root
замазувати <-- узялися
стіну <-- замазувати
, <-- певні
певні <-- узялися
свого <-- права
права <-- певні
негайно <-- знищити
знищити <-- права
гріховне <-- мальовидло
, <-- погляд
як <-- погляд
на <-- погляд
їх <-- погляд
погляд <-- гріховне
, <-- погляд
мальовидло <-- знищити
. <-- узялися


In [3]:
trees[1][1]

OrderedDict([('id', 2),
             ('form', 'зібралися'),
             ('lemma', 'зібратися'),
             ('upostag', 'VERB'),
             ('xpostag', 'Vmeis-p'),
             ('feats',
              OrderedDict([('Aspect', 'Perf'),
                           ('Mood', 'Ind'),
                           ('Number', 'Plur'),
                           ('Tense', 'Past'),
                           ('VerbForm', 'Fin')])),
             ('head', 0),
             ('deprel', 'root'),
             ('deps', [('root', 0)]),
             ('misc',
              OrderedDict([('Id', '000j'),
                           ('LTranslit', 'zibratyśа'),
                           ('Translit', 'zibralyśа')]))])

In [4]:
print(trees[1][2])

OrderedDict([('id', 3), ('form', 'у'), ('lemma', 'у'), ('upostag', 'ADP'), ('xpostag', 'Spsg'), ('feats', OrderedDict([('Case', 'Gen')])), ('head', 4), ('deprel', 'case'), ('deps', [('case', 4)]), ('misc', OrderedDict([('Id', '000k'), ('LTranslit', 'u'), ('Translit', 'u')]))])


In [5]:
from pprint import pprint as pp

def shift(stack, queue):
    stack.append(queue.pop(0))
    return stack, queue

def right_arc(stack, queue, dep_arcs):
    dep_arcs.append((stack[-1]['id'], queue[0]['id']))
    stack, queue = shift(stack, queue)
    return stack, queue, dep_arcs

def left_arc(stack, queue, dep_arcs):
    dep_arcs.append((queue[0]['id'], stack[-1]['id']))
    stack.pop(-1)
    return stack, queue, dep_arcs

def reduce(stack):
    stack.pop()
    return stack

def oracle_det(stack, queue, dep_arcs):
    """Deterministic oracle for trainiang. Requires a fully annotated tree."""
    global ROOT
    
    if stack[-1] and not queue[0]:
        return 'reduce'
    elif stack[-1]['head'] == queue[0]['id']:
        return 'left_arc'
    elif queue[0]['head'] == stack[-1]['id']:
        return 'right_arc'
    elif stack[-1]["id"] in [i[0] for i in dep_arcs] and \
         (queue[0]["head"] < stack[-1]["id"] or \
         [s for s in stack if s["head"] == queue[0]["id"]]):
        return 'reduce'    
    else:
        return 'shift'

def feature_extract(stack, queue, dep_arcs):
    features = {}
    
    # stk_0: form, lemma, postag, feats
    features['stk_0_form'] = stack[-1]['form']
    features['stk_0_lemma'] = stack[-1]['lemma']
    features['stk_0_postag'] = stack[-1]['upostag']
    
    if stack[-1]['feats'] != None:
        for feat in stack[-1]['feats'].keys():
            features['stk_0_'+feat] = stack[-1]['feats'][feat]
    
    # queue_0: form, lemma, postag, feats
    features['que_0_form'] = queue[0]['form']
    features['que_0_lemma'] = queue[0]['lemma']
    features['que_0_postag'] = queue[0]['upostag']
    
    if queue[0]['feats'] != None:
        for feat in queue[0]['feats'].keys():
            features['que_0_'+feat] = queue[0]['feats'][feat]
    
    # queue_1: form, postag
    try:
        features['que_1_form'] = queue[1]['form']
        features['que_1_postag'] = queue[1]['upostag']
    except IndexError:
        pass

    # queue_2: postag
    try:
        features['que_2_postag'] = queue[2]['upostag']
    except IndexError:
        pass
    
    # queue_3: postag
    try:
        features['que_3_postag'] = queue[3]['upostag']
    except IndexError:
        pass
    
    return features
      
    
    
ROOT = OrderedDict([('id', 0), ('form', 'ROOT'), ('lemma', 'ROOT'), ('upostag', 'ROOT'),
                    ('xpostag', None), ('feats', None), ('head', None), ('deprel', None),
                    ('deps', None), ('misc', None)])

def dep_parse(tree, ret='xy', orcl='det'):
    """Parse dependencies for one sentence (tree)"""
    
    global ROOT
    stack = [ROOT]
    queue = tree[:]
    dep_arcs = []
    
    x, y = [], []
    
    while len(stack) > 0 and len(queue) > 0:
        
        features = feature_extract(stack, queue, dep_arcs)        
        
        try:
            if orcl == 'det':
                action = oracle_det(stack, queue, dep_arcs)
            elif orcl == 'ml':
                action = oracle_ml(features)
        except TypeError:
            print(stack)
            print(queue)
            break
        
        x.append(features)
        y.append(action)
        
        if action == 'reduce':
            stack = reduce(stack)
        elif action == 'left_arc':
            stack, queue, dep_arcs = left_arc(stack, queue, dep_arcs)
        elif action == 'right_arc':
            stack, queue, dep_arcs = right_arc(stack, queue, dep_arcs)
        elif action == 'shift':
            stack, queue = shift(stack, queue)

    if ret == 'xy':
        return x, y    
    elif ret == 'arcs':
        return dep_arcs


def filter_trees(trees): 
    """Delete nodes from a tree where id is not an integer"""
    return [[token for token in tree if type(token['id']) == int] for tree in trees]


def prepare_data(path):

    X, Y = [], []
    trees = filter_trees(get_data(path))
    
    for tree in trees:
        x, y = dep_parse(tree)
        X.extend(x)
        Y.extend(y)

    assert len(X) == len(Y)
    
    return X, Y
    
train_path = "./corpus/uk_iu-ud-train.conllu"
test_path = "./corpus/uk_iu-ud-test.conllu"

X_train, Y_train = prepare_data(train_path)
X_test, Y_test = prepare_data(test_path)


In [6]:
# Vectorize features

from sklearn.feature_extraction import DictVectorizer

def vectorize(X_train, X_test):
    
    print('\nVectorizing...')
    v = DictVectorizer(sparse=True)
    
    vectorizer = v.fit(X_train)
    v_train = vectorizer.transform(X_train)
    v_test = vectorizer.transform(X_test)
    
    return v_train, v_test, vectorizer

X_train, X_test, vectorizer = vectorize(X_train, X_test)


Vectorizing...


In [7]:
# Try a different classifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

rfc = RandomForestClassifier(n_estimators=20, criterion='entropy', max_depth=None, n_jobs=-1, verbose=False)
rfc.fit(X_train, Y_train)

predicted = rfc.predict(X_test)
print(classification_report(Y_test, predicted))

              precision    recall  f1-score   support

    left_arc       0.85      0.93      0.89      7346
      reduce       0.62      0.42      0.50      2552
   right_arc       0.80      0.82      0.81      5935
       shift       0.86      0.86      0.86     10336

   micro avg       0.83      0.83      0.83     26169
   macro avg       0.78      0.76      0.76     26169
weighted avg       0.82      0.83      0.82     26169



# Trying our Parser on New Text

* First of all, we need to transform our text into a `conll`-like format.
* To do that, we shall tokenize it with `tokenize_uk`;
* Then, for each token, we will extract the necessary features using `pymorphy`.

In [9]:
from collections import OrderedDict

from tokenize_uk import tokenize_words, tokenize_sents
from pymorphy2 import MorphAnalyzer

from pprint import pprint as pp

# Just take a random paragraph from Kaydasheva Simya
text = """Яр в'ється гадюкою мiж крутими горами, мiж зеленими терасами; од яру на всi боки розбiглись, неначе гiлки дерева, глибокi рукави й поховались десь далеко в густих лiсах."""

def build_conll_trees(text):
    """
    :param text: str - any input text in Ukrainian.
    :rtype trees: list of dicts: {id, form, lemma, upostag, xpostag, feats, head, deprel, deps, misc}
    """

    DET = ['інакший', 'його', 'тамтой', 'чий', 'їх', 'інш.', 'деякий', 'ввесь', 'ваш', 
           'ніякий', 'весь', 'інший', 'чийсь', 'жадний', 'другий', 'кожний', 
           'такий', 'оцей', 'скілька', 'цей', 'жодний', 'все', 'кілька', 'увесь', 
           'кожній', 'те', 'сей', 'ін.', 'отакий', 'котрий', 'усякий', 'самий', 
           'наш', 'усілякий', 'будь-який', 'сам', 'свій', 'всілякий', 'всенький', 'її', 
           'всякий', 'отой', 'небагато', 'який', 'їхній', 'той', 'якийсь', 'ин.', 'котрийсь', 
           'твій', 'мій', 'це']

    PREP = ["до", "на"]

    mapping = {"ADJF": "ADJ", "ADJS": "ADJ", "COMP": "ADJ", "PRTF": "ADJ",
               "PRTS": "ADJ", "GRND": "VERB", "NUMR": "NUM", "ADVB": "ADV",
               "NPRO": "PRON", "PRED": "ADV", "PREP": "ADP", "PRCL": "PART"}

    def normalize_pos(word):
        if word.tag.POS == "CONJ":
            if "coord" in word.tag:
                return "CCONJ"
            else:
                return "SCONJ"
        elif "PNCT" in word.tag:
            return "PUNCT"
        elif word.normal_form in PREP:
            return "PREP"
        else:
            return mapping.get(word.tag.POS, word.tag.POS)
    
    morph = MorphAnalyzer(lang='uk')
    trees = []
    
    # Tokenize text into sentences
    sentences = tokenize_sents(text)
    for i, sentence in enumerate(sentences):
        tree = []
        
        # Tokenize sentences into words
        words = tokenize_words(sentence)
        for j, word in enumerate(words):
            
            # We assume the pymorphy does a decent job and 
            # take the top word from the rank
            token = morph.parse(word)[0]
            
            # Construct tree node:
            
            # Populate features dict
            feat_names = [
                'animacy', 'aspect', 'case', 'gender', 'involvement', 'mood', 'number',
                'person', 'tense', 'transitivity', 'voice'
            ]
            feats = OrderedDict({})
            for feat_name in feat_names:
                exec(f"""if token.tag.{feat_name}: feats['{feat_name.title()}'] = str(token.tag.{feat_name}).title()""")

            node = OrderedDict({
                'id': j,
                'form': word,
                'lemma': token.normal_form,
                'upostag': normalize_pos(token),
                'xpostag': '',
                'feats': feats,
                'head': '',
                'deprel': '',
                'deps': '',
                'misc': ''
            })
            
            tree.append(node)
        
        trees.append(tree)
    
    return trees
            
def oracle_ml(features):
    """A machine learning classifier for untagged sentences.
    :param features: dict - output of feature_extract(stack, queue, deps)
    :rtype action: str - an action for the dependency parser"""
    
    global vectorizer
    global rfc
    
    X = vectorizer.transform(features)
    Y = rfc.predict(X)
    
    return Y[0]
    
    
def parse_pipeline(text):
    """
    :param text: str - any input text in Ukrainian.
    :rtype dep_arcs: list of tuples: (parent_id, child_id) - list of arcs produced by the parser.
    """
    trees = build_conll_trees(text)
    arcs = []
    
    for tree in trees:
        arcs.extend(dep_parse(tree, ret='arcs', orcl='ml'))
        
    return arcs
    
arcs = parse_pipeline(text)


In [11]:
# Write results into a separate file

import json 

with open('output.json', 'w+') as f:
    f.write(json.dumps(arcs))
