In [57]:
from conllu import parse, parse_tree
from collections import OrderedDict
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
import re, glob, pdb, pandas as pd

In [2]:
glob.glob('UD*/*')

['UD_Ukrainian-IU/eval.log',
 'UD_Ukrainian-IU/LICENSE.txt',
 'UD_Ukrainian-IU/stats.xml',
 'UD_Ukrainian-IU/uk_iu-ud-test.conllu',
 'UD_Ukrainian-IU/uk_iu-ud-dev.conllu',
 'UD_Ukrainian-IU/uk_iu-ud-train.conllu',
 'UD_Ukrainian-IU/README.md']

In [3]:
with open('UD_Ukrainian-IU/uk_iu-ud-train.conllu') as f:
    trees = re.sub(r" +", r"\t", f.read())
    trees = parse(trees)

## Features


In [4]:
def get_features(stack, queue, relations, tree):
    feature_dict = {}
    if len(queue) > 0:
        feature_dict['q0-form'] = queue[0]['form']
        feature_dict['q0-lemma'] = queue[0]['lemma']
        feature_dict['q0-pos'] = queue[0]['upostag']
        if isinstance(queue[0]['feats'], OrderedDict):
            for k, v in queue[0]['feats'].items():
                feature_dict[f'q0-{k}'] = v

    if len(queue) > 1:
        feature_dict['q1-form'] = queue[1]['form']
        feature_dict['q1-lemma'] = queue[1]['lemma']
        feature_dict['q1-pos'] = queue[1]['upostag']

    if len(queue) > 2:
        feature_dict['q2-form'] = queue[2]['form']
        feature_dict['q2-lemma'] = queue[2]['lemma']
        feature_dict['q2-pos'] = queue[2]['upostag']

    if len(stack) > 1:
        feature_dict['st0-form'] = stack[-1]['form']
        feature_dict['st0-lemma'] = stack[-1]['lemma']
        feature_dict['st0-pos'] = stack[-1]['upostag']
        if isinstance(stack[-1]['feats'], OrderedDict):
            for k, v in stack[-1]['feats'].items():
                feature_dict[f'q0-{k}'] = v

    if len(stack) > 2:
        feature_dict['st1-form'] = stack[-2]['form']
        feature_dict['st1-lemma'] = stack[-2]['lemma']
        feature_dict['st1-pos'] = stack[-2]['upostag']

    if len(stack) > 3:
        feature_dict['st2-form'] = stack[-3]['form']
        feature_dict['st2-lemma'] = stack[-3]['lemma']
        feature_dict['st2-pos'] = stack[-3]['upostag']

    if len(queue) > 0 and len(stack) > 0:
        feature_dict['distance'] = queue[0]["id"] - stack[-1]["id"]

    return feature_dict

In [5]:
from enum import Enum

class Actions(str, Enum):
    SHIFT = "shift"
    REDUCE = "reduce"
    RIGHT = "right"
    LEFT = "left"

def get_action(top_stack, top_queue, relations):
    # check if both stack and queue are non-empty
    if top_stack and not top_queue:
        return Actions.REDUCE
    # check if there are any clear dependencies
    
    elif top_queue["head"] == top_stack["id"]:
        return Actions.RIGHT
    
    elif top_stack["head"] == top_queue["id"]:
        return Actions.LEFT
    # check if we can reduce the top of the stack

    elif top_stack["id"] in [i[0] for i in relations] and \
         top_queue["head"] < top_stack["id"]:
        return Actions.REDUCE

    # default option
    else:
        return Actions.SHIFT

ROOT = OrderedDict([('id', 0),
                    ('form', 'ROOT'),
                    ('lemma', 'ROOT'),
                    ('upostag', None),
                    ('xpostag', None),
                    ('feats', None),
                    ('head', None),
                    ('deprel', None),
                    ('deps', None),
                    ('misc', None)])


In [33]:
def get_data(tree, estimator=None):
    datum = []
    
    stack, queue, relations = [ROOT], tree[:], []

    while queue or stack:
        
        features = get_features(stack, queue, relations, tree)
        
        action = get_action(stack[-1] if len(stack) > 0 else None,
                            queue[0] if len(queue) > 0 else None,
                            relations)
        
        if action == Actions.SHIFT:
            stack.append(queue.pop(0))

        elif action == Actions.REDUCE:
            stack.pop()

        elif action == Actions.LEFT:
            relations.append((stack[-1]["id"], queue[0]["id"]))
            stack.pop()

        elif action == Actions.RIGHT:
            relations.append((queue[0]["id"], stack[-1]["id"]))
            stack.append(queue.pop(0))

        datum += [(features, action.value)]

    return datum

## Split data and get feature arrays

In [8]:
from random import shuffle
from sklearn.feature_extraction import DictVectorizer
import numpy as np

shuffle(trees)
test_mark = int(len(trees) * 0.2)
test_trees = trees[: test_mark]
train_trees = trees[test_mark: ]

In [34]:
train = []
for tree in train_trees:
    if any(w['head'] == None for w in tree):
        continue
    train += get_data(tree)
    
test = []
for tree in test_trees:
    if any(w['head'] == None for w in tree):
        continue
    test += get_data(tree)

In [35]:
train_feats = [w[0] for w in train]
train_labs = [w[1] for w in train]

test_feats = [w[0] for w in test]
test_labs = [w[1] for w in test]

In [37]:
action_lr = Pipeline([('vect', DictVectorizer()),
                          ('lr', LogisticRegression())])

action_lr.fit(train_feats, train_labs)
action_lr.score(test_feats, test_labs)

0.8446341871756237

In [13]:
print(classification_report(action_lr.predict(test_feats), test_labs))

             precision    recall  f1-score   support

       left       0.88      0.86      0.87      7565
     reduce       0.81      0.88      0.84      7340
      right       0.82      0.78      0.80      7129
      shift       0.87      0.85      0.86      7831

avg / total       0.85      0.84      0.84     29865



In [14]:
confusion_matrix(action_lr.predict(test_feats), test_labs,
                 labels=['shift', 'reduce', 'left', 'right'])

array([[6679,  295,  377,  480],
       [ 114, 6482,  202,  542],
       [ 431,  458, 6494,  182],
       [ 484,  768,  307, 5570]])

In [38]:
def dep_parse(sentence, oracle):
    stack, queue, relations = [ROOT], sentence[:], []
    while queue or stack:
        if len(queue) > 0 and len(stack) == 0:
            print(' '.join([w['form'] for w in tree]), '\nOrphans in queue\n')
            return sorted(relations)
        if stack and not queue:
            stack.pop()
        else:
            features = get_features(stack, queue, relations, tree)
            action = oracle.predict(features)[0]
            # actual parsing
            if action == Actions.SHIFT:
                stack.append(queue.pop(0))
            elif action == Actions.REDUCE:
                stack.pop()
            elif action == Actions.LEFT:
                relations.append((stack[-1]["id"], queue[0]["id"]))
                stack.pop()
            elif action == Actions.RIGHT:
                relations.append((queue[0]["id"], stack[-1]["id"]))
                stack.append(queue.pop(0))
            else:
                print("Unknown action.")
    return sorted(relations)

In [39]:
total, tp = 0, 0
for tree in test_trees:
    golden = [(node["id"], node["head"]) for node in tree]
    predicted = dep_parse(tree, action_lr)
    total += len(tree)
    tp += len(set(golden).intersection(set(predicted)))

print("Total:", total)
print("Correctly defined:", tp)
print("UAS:", round(tp/total, 2))

У 1995-му Сінгапур із 26 дол . США на душу населення обійшов Британію ( 19 ) . 
Orphans in queue

Правила поводження з тваринами , що використовуються в наукових експериментах , тестуванні , навчальному процесі , виробництві біологічних препаратів 
Orphans in queue

Total: 14499
Correctly defined: 10335
UAS: 0.71


## Add relations data to features to improve accuracy

In [15]:
from sklearn.pipeline import Pipeline

In [64]:
def get_features_rel(stack, queue, relations, tree, is_test=True):
    feature_dict = {}
    if len(queue) > 0:
        feature_dict['q0-form'] = queue[0]['form']
        feature_dict['q0-lemma'] = queue[0]['lemma']
        feature_dict['q0-pos'] = queue[0]['upostag']
        if isinstance(queue[0]['feats'], OrderedDict):
            for k, v in queue[0]['feats'].items():
                feature_dict[f'q0-{k}'] = v

    if len(queue) > 1:
        feature_dict['q1-form'] = queue[1]['form']
        feature_dict['q1-lemma'] = queue[1]['lemma']
        feature_dict['q1-pos'] = queue[1]['upostag']

    if len(queue) > 2:
        feature_dict['q2-form'] = queue[2]['form']
        feature_dict['q2-lemma'] = queue[2]['lemma']
        feature_dict['q2-pos'] = queue[2]['upostag']

    if len(stack) > 1:
        feature_dict['st0-form'] = stack[-1]['form']
        feature_dict['st0-lemma'] = stack[-1]['lemma']
        feature_dict['st0-pos'] = stack[-1]['upostag']
        if isinstance(stack[-1]['feats'], OrderedDict):
            for k, v in stack[-1]['feats'].items():
                feature_dict[f'q0-{k}'] = v

        # Relations in stack
        if not is_test:
            st0_child_id = next((rel[0] for rel in relations if rel[1] == stack[-1]['id']), None)
            st0_child = next((tok for tok in tree if tok['id'] == st0_child_id), None)
            if st0_child:
                feature_dict['st0-child-form'] = st0_child['form']
                feature_dict['st0-child-lemma'] = st0_child['lemma']
                feature_dict['st0-child-pos'] = st0_child['upostag']

            st0_parent_id = next((rel[1] for rel in relations if rel[0] == stack[-1]['id']), None)
            st0_parent = next((tok for tok in tree if tok['id'] == st0_parent_id), None)
            if st0_parent:
                feature_dict['st0-parent-form'] = st0_parent['form']
                feature_dict['st0-parent-lemma'] = st0_parent['lemma']
                feature_dict['st0-parent-pos'] = st0_parent['upostag']

    if len(stack) > 2:
        feature_dict['st1-form'] = stack[-2]['form']
        feature_dict['st1-lemma'] = stack[-2]['lemma']
        feature_dict['st1-pos'] = stack[-2]['upostag']

    if len(stack) > 3:
        feature_dict['st2-form'] = stack[-3]['form']
        feature_dict['st2-lemma'] = stack[-3]['lemma']
        feature_dict['st2-pos'] = stack[-3]['upostag']

    if len(queue) > 0 and len(stack) > 0:
        feature_dict['distance'] = queue[0]["id"] - stack[-1]["id"]
        
    feature_dict['len_queue'] = len(queue)

    return feature_dict

In [65]:
def get_data(tree, is_test=True, estimator=None):
    datum = []
    
    stack, queue, relations = [ROOT], tree[:], []

    while queue or stack:
        if len(queue) > 0 and len(stack) == 0:
            print(' '.join([w['form'] for w in tree]), '\nOrphans in queue\n')
            return datum
        
        if estimator:
            is_test = False
        
        features = get_features_rel(stack, queue, relations, tree, is_test=is_test)
        
        if estimator:
            action = estimator.predict([features])[0]
            action = Actions[action.upper()]
            
            top_stack = stack[-1] if len(stack) > 0 else None
            top_queue = queue[0] if len(queue) > 0 else None
            
            if top_stack and not top_queue:
                action = Actions.REDUCE
            
        else:
            action = get_action(stack[-1] if len(stack) > 0 else None,
                                queue[0] if len(queue) > 0 else None,
                                relations)            
        
        if action == Actions.SHIFT:
            stack.append(queue.pop(0))

        elif action == Actions.REDUCE:
            stack.pop()

        elif action == Actions.LEFT:
            relations.append((stack[-1]["id"], queue[0]["id"]))
            stack.pop()

        elif action == Actions.RIGHT:
            relations.append((queue[0]["id"], stack[-1]["id"]))
            stack.append(queue.pop(0))
            
        if estimator:
            datum += [(features, action.value)]
        else:
            datum += [(features, action.value)]

    return datum

In [66]:
train = []
for tree in train_trees:
    if any(w['head'] == None for w in tree):
        continue
    train += get_data(tree, is_test=False)

In [67]:
train_feats = [w[0] for w in train]
train_labs = [w[1] for w in train]

In [142]:
action_lr_rel = Pipeline([('vect', DictVectorizer()),
                          ('lr', LogisticRegression(n_jobs=3,
                                                    random_state=23,
                                                    max_iter=500))])

action_lr_rel.fit(train_feats, train_labs)

  " = {}.".format(self.n_jobs))


Pipeline(memory=None,
     steps=[('vect', DictVectorizer(dtype=<class 'numpy.float64'>, separator='=', sort=True,
        sparse=True)), ('lr', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=500, multi_class='ovr', n_jobs=3,
          penalty='l2', random_state=23, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))])

**add relation data to test during prediction**<br/><br/>
    We can\`t use traditional metrics, because adding relations dynamically makes golden test tree and predicted one different

In [167]:
def dep_parse(sentence, oracle):
    stack, queue, relations = [ROOT], sentence[:], []
    while queue or stack:
        if len(queue) > 0 and len(stack) == 0:
            print('There were orphans in queue\n')
            return sorted(relations)
        if stack and not queue:
            stack.pop()
        else:
            features = get_features_rel(stack, queue, relations, tree, is_test=False)
            action = oracle.predict(features)[0]
            # actual parsing
            if action == Actions.SHIFT:
                stack.append(queue.pop(0))
            elif action == Actions.REDUCE:
                stack.pop()
            elif action == Actions.LEFT:
                relations.append((stack[-1]["id"], queue[0]["id"]))
                stack.pop()
            elif action == Actions.RIGHT:
                relations.append((queue[0]["id"], stack[-1]["id"]))
                stack.append(queue.pop(0))
            else:
                print("Unknown action.")
    return sorted(relations)

In [30]:
total, tp = 0, 0
for tree in test_trees:
    golden = [(node["id"], node["head"]) for node in tree]
    predicted = dep_parse(tree, action_lr_rel)
    total += len(tree)
    tp += len(set(golden).intersection(set(predicted)))

print("Total:", total)
print("Correctly defined:", tp)
print("UAS:", round(tp/total, 2))

Красуня 
Orphans in queue

Правила поводження з тваринами , що використовуються в наукових експериментах , тестуванні , навчальному процесі , виробництві біологічних препаратів 
Orphans in queue

Total: 14499
Correctly defined: 10638
UAS: 0.73


**\+2% to UAS score, comparing previous version, the one that did not account for relations**

Let's optimize parameters for classifier

In [53]:
from sklearn.model_selection import RandomizedSearchCV
import numpy as np

In [None]:
%%time
param_grid = {
    'lr__penalty': ['l2'],
    'lr__C': np.arange(0.5, 1.0, 0.1),
    'lr__intercept_scaling': np.arange(1, 3, 1),
    'lr__class_weight': [None, 'balanced'],
    'lr__solver': ['newton-cg', 'lbfgs', 'sag', 'saga'],
    'lr__multi_class': ['multinomial', 'ovr'],
}

param_search = RandomizedSearchCV(action_lr_rel, param_grid, n_iter=10, verbose=2)
param_search.fit(train_feats, train_labs)

In [60]:
pd.DataFrame(param_search.cv_results_).sort_values('mean_test_score', ascending=False)



Unnamed: 0,mean_fit_time,mean_score_time,mean_test_score,mean_train_score,param_lr__C,param_lr__class_weight,param_lr__intercept_scaling,param_lr__multi_class,param_lr__penalty,param_lr__solver,...,split0_test_score,split0_train_score,split1_test_score,split1_train_score,split2_test_score,split2_train_score,std_fit_time,std_score_time,std_test_score,std_train_score
3,77.590307,1.120119,0.840375,0.90592,0.6,,1,multinomial,l2,sag,...,0.84039,0.892933,0.83591,0.894462,0.844824,0.930365,0.911937,0.008252,0.003639,0.017296
6,77.72217,1.123349,0.840295,0.903459,0.5,balanced,1,multinomial,l2,sag,...,0.839765,0.890564,0.835934,0.891901,0.845185,0.927912,0.516441,0.004192,0.003795,0.017299
7,82.550735,1.121272,0.837986,0.87967,0.7,balanced,1,multinomial,l2,saga,...,0.836736,0.868273,0.832712,0.868998,0.844511,0.901738,0.231906,0.004598,0.004897,0.015608
1,88.959078,1.146315,0.837682,0.881934,0.8,,2,multinomial,l2,saga,...,0.836639,0.870377,0.832159,0.871258,0.844247,0.904167,5.549765,0.036134,0.004989,0.015725
2,84.651601,1.230611,0.837361,0.98126,0.9,,2,multinomial,l2,lbfgs,...,0.837505,0.981135,0.834299,0.981737,0.840279,0.980908,2.35467,0.131903,0.002443,0.00035
4,32.12173,1.142227,0.833586,0.952177,0.6,balanced,1,ovr,l2,newton-cg,...,0.832504,0.951955,0.830452,0.952569,0.837802,0.952005,1.068308,0.005531,0.003097,0.000278
8,36.183931,1.138237,0.832632,0.96565,0.9,balanced,1,ovr,l2,newton-cg,...,0.831542,0.965625,0.829322,0.965638,0.837033,0.965687,0.120272,0.006335,0.003241,2.7e-05
0,339.528357,1.238817,0.832504,0.879389,0.5,,1,ovr,l2,sag,...,0.830532,0.868057,0.828408,0.868745,0.838572,0.901366,11.851931,0.057442,0.004377,0.015542
5,350.488326,1.123182,0.831045,0.880203,0.5,balanced,2,ovr,l2,sag,...,0.829787,0.86855,0.827013,0.869971,0.836335,0.902087,2.009454,0.001747,0.003908,0.015486
9,353.074894,1.152179,0.826813,0.859166,0.5,balanced,2,ovr,l2,saga,...,0.824713,0.848903,0.821531,0.850843,0.834195,0.877753,0.617123,0.06532,0.005379,0.013167


In [68]:
calibrated_lr_rel = param_search.best_estimator_
calibrated_lr_rel.fit(train_feats, train_labs)



Pipeline(memory=None,
     steps=[('vect', DictVectorizer(dtype=<class 'numpy.float64'>, separator='=', sort=True,
        sparse=True)), ('lr', LogisticRegression(C=0.6, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=500, multi_class='multinomial',
          n_jobs=3, penalty='l2', random_state=23, solver='sag',
          tol=0.0001, verbose=0, warm_start=False))])

In [69]:
total, tp = 0, 0
for tree in test_trees:
    golden = [(node["id"], node["head"]) for node in tree]
    predicted = dep_parse(tree, calibrated_lr_rel)
    total += len(tree)
    tp += len(set(golden).intersection(set(predicted)))

print("Total:", total)
print("Correctly defined:", tp)
print("UAS:", round(tp/total, 2))

дуже потрібно , дякую 
Orphans in queue

Вилов та тимчасова ізоляція домашніх тварин 
Orphans in queue

Красуня 
Orphans in queue

Total: 14499
Correctly defined: 10560
UAS: 0.73


bad, bad randomized parameter search, use previous classifier

## Just use it: parse plain sentence

In [75]:
import pymorphy2
from tokenize_uk import tokenize_words

morph = pymorphy2.MorphAnalyzer(lang='uk')

In [122]:
pm2ud = {"ADJF": "ADJ", "ADJS": "ADJ", "COMP": "ADJ", "PRTF": "ADJ",
           "PRTS": "ADJ", "GRND": "VERB", "NUMR": "NUM", "ADVB": "ADV",
           "NPRO": "PRON", "PRED": "ADV", "PREP": "ADP", "PRCL": "PART"}

CONJ_COORD = ["а", "або", "але", "ані", "все", "все-таки", "втім", "ж", "же",
              "зате", "і", "й", "ніже", "однак", "одначе", "прецінь", "проте",
              "та", "так", "також", "усе", "усе-таки", "утім", "чи"]

def normalize_pos(word):
    if word.tag.POS == "CONJ":
        if word.word in CONJ_COORD:
            return "CCONJ"
        else:
            return "SCONJ"
    else:
        return pm2ud.get(word.tag.POS, word.tag.POS)

In [134]:
def get_word_dict(i, w):
    parsed = morph.parse(w)
    if len(parsed) < 1:
        return {
            'form': w,
            'lemma': w,
            'pos': None,
        }
    parsed = parsed[0]
    return {
        'id': i,
        'form': w,
        'lemma': parsed.normal_form,
        'upostag': normalize_pos(parsed),
        'feats': None
    }

In [174]:
def parse_new_sent(sent):
    print('----------------\n' + sent)
    sent = tokenize_words(sent)
    sent_word_dicts = [get_word_dict(i + 1, w) for i, w in enumerate(sent)]
    relations = dep_parse(sent_word_dicts, action_lr_rel)
    sent = ['root'] + sent
    for child, head in relations:
        print(f'{sent[child]} <--- {sent[head]}')
    return relations

In [175]:
parse_new_sent('Маленькі собаки залишаються щенятами.')
parse_new_sent('Хто розлив каву на столі?')
parse_new_sent('Щенята грались на траві й кусали своїх господарів')
parse_new_sent('Сонце гріє лице, радує це.')
parse_new_sent('Полюбляю тебе, відпустка.')

----------------
Маленькі собаки залишаються щенятами.
There were orphans in queue

Маленькі <--- залишаються
собаки <--- Маленькі
залишаються <--- root
щенятами <--- залишаються
----------------
Хто розлив каву на столі?
There were orphans in queue

розлив <--- Хто
каву <--- Хто
столі <--- на
----------------
Щенята грались на траві й кусали своїх господарів
Щенята <--- root
Щенята <--- грались
на <--- кусали
траві <--- на
й <--- кусали
кусали <--- грались
своїх <--- кусали
господарів <--- грались
----------------
Сонце гріє лице, радує це.
There were orphans in queue

Сонце <--- root
гріє <--- Сонце
лице <--- Сонце
, <--- радує
радує <--- Сонце
це <--- Сонце
----------------
Полюбляю тебе, відпустка.
There were orphans in queue

Полюбляю <--- root
тебе <--- Полюбляю
, <--- Полюбляю
відпустка <--- Полюбляю


[(1, 0), (2, 1), (3, 1), (4, 1)]

## Summary

Парсер показав непогані результати на тестовій вибірці, проте робить очевидні помилки на незнайомих реченнях. Чому? Я думаю, причина в тому, що 
    - речення у тренувальній вибірці було взято з літератури, багато з них не відображають сучасний стиль мовлення. 
    - парсер натреновано на обмеженому словнику, а слово, його лемма - це змінні в моделі.
  
З "зовнішніх" речень видно, що парсер погано впорується з питальними й скадними реченнями.

In [170]:
# from collections import defaultdict

# ud_feat_options = defaultdict(set)
# for d in train_feats:
#     for k, v in d.items():
#         if not re.search('form|lemma|-pos|distance|len_', k):
#             ud_feat_options[k.replace('q0-', '')].add(v)
# ud_feat_options