In [None]:
# https://explosion.ai/blog/parsing-english-in-python#features

In [21]:
import numpy as np
from conllu import parse, parse_tree
from collections import OrderedDict
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.feature_extraction import DictVectorizer

In [22]:
def read_conllu_file(filename):
    with open(filename) as input_file:
        text = input_file.read()
        result = parse(text)
    return result
    
    
train = read_conllu_file('../../../../UD_Ukrainian-IU/uk_iu-ud-train.conllu')
test = read_conllu_file('../../../../UD_Ukrainian-IU/uk_iu-ud-dev.conllu')
val = read_conllu_file('../../../../UD_Ukrainian-IU/uk_iu-ud-test.conllu')

In [23]:
def parsed_to_sent(parsed_toks):
    return ' '.join([t['form'] for t in parsed_toks])

In [24]:
parsed_to_sent(train[0])

'–£ –¥–æ–º—ñ —Ä–∏–º—Å—å–∫–æ–≥–æ –ø–∞—Ç—Ä–∏—Ü—ñ—è –†—É—Ñ—ñ–Ω–∞ –±—É–ª–∞ –ø—Ä–µ–≥–∞—Ä–Ω–∞ —Ñ—Ä–µ—Å–∫–∞ , –∑–æ–±—Ä–∞–∂–µ–Ω–Ω—è –í–µ–Ω–µ—Ä–∏ —Ç–∞ –ê–¥–æ–Ω—ñ—Å–∞ .'

In [25]:
train[0]

[OrderedDict([('id', 1),
              ('form', '–£'),
              ('lemma', '—É'),
              ('upostag', 'ADP'),
              ('xpostag', 'Spsl'),
              ('feats', OrderedDict([('Case', 'Loc')])),
              ('head', 2),
              ('deprel', 'case'),
              ('deps', None),
              ('misc', OrderedDict([('Id', '0003')]))]),
 OrderedDict([('id', 2),
              ('form', '–¥–æ–º—ñ'),
              ('lemma', '–¥—ñ–º'),
              ('upostag', 'NOUN'),
              ('xpostag', 'Ncmsln'),
              ('feats',
               OrderedDict([('Animacy', 'Inan'),
                            ('Case', 'Loc'),
                            ('Gender', 'Masc'),
                            ('Number', 'Sing')])),
              ('head', 6),
              ('deprel', 'obl'),
              ('deps', None),
              ('misc', OrderedDict([('Id', '0004')]))]),
 OrderedDict([('id', 3),
              ('form', '—Ä–∏–º—Å—å–∫–æ–≥–æ'),
              ('lemma', '—Ä–∏–º—Å—å–∫

In [246]:
ROOT = OrderedDict([('id', 0), ('form', 'ROOT'), ('lemma', 'ROOT'), ('upostag', 'ROOT'),
                    ('xpostag', None), ('feats', None), ('head', None), ('deprel', None),
                    ('deps', None), ('misc', None)])

def shift_by_1(array):
    return array[1:]
def head(tok):
    return tok['head'] if 'head' in tok else 0

def is_left_arc(tok1, tok2):
    return head(tok1) == tok2['id']

def is_right_arc(tok1, tok2):
    return tok1['id'] == head(tok2)

def has_parent(tok1, rels):
    all_with_parent = [pair[0] for pair in rels]
    return tok1['id'] in all_with_parent

def head_is_in_stack(queue_el, stack_el):
    return head(queue_el) < stack_el['id']

def return_static_oracle_action(s0, q0, rel, *args):
    if s0 is None:
        action = 'SHIFT'
    elif q0 is None:
        action = 'REDUCE'
    elif is_left_arc(s0, q0):
        action = 'LEFT'
    elif is_right_arc(s0, q0):
        action = 'RIGHT'
    elif has_parent(s0, rel) and head_is_in_stack(q0, s0):
        action = 'REDUCE'
    else:
        action = 'SHIFT'
    return action


# rels = [(child,parent)]
def unwrap_to_relations(tree, get_action, extra_attrs = {}):
    stack = [ROOT]
    all_toks = tree.copy()
    queue = tree.copy()
    rel = []
    actions = []
    while len(queue) or len(stack):
        s0 = stack[-1] if len(stack) else None
        q0 = queue[0] if len(queue) else None
        action = get_action(s0, q0, rel, stack, queue, all_toks, extra_attrs)
        actions.append(action)
        if action == 'LEFT':
            if s0 and q0:
                ids = (s0['id'], q0['id'])
                if not ids in rel:
                    rel.append(ids)
                stack.pop()
        elif action == 'RIGHT':
            if s0 and q0:
                ids = (q0['id'], s0['id'])
                if not ids in rel:
                    rel.append(ids)
                stack.append(q0)
                queue = shift_by_1(queue)
        elif action == 'REDUCE':
            if len(stack):
                stack.pop()
        elif action == 'SHIFT':
            if not q0 is None:
                stack.append(q0)
                queue = shift_by_1(queue)
        else:
            raise Exception('Invalid action', action)

    if len(actions) != len(tree) * 2 + 1:
        print('Warning: len of actions is not tree*2+1: {} vs {}'.format(len(actions), len(tree) * 2 + 1))
    return rel, actions

In [247]:
tree = train[0]
unwrap_to_relations(tree, return_static_oracle_action)

([(1, 2),
  (3, 4),
  (4, 2),
  (5, 4),
  (2, 6),
  (6, 0),
  (7, 8),
  (8, 6),
  (9, 10),
  (10, 8),
  (11, 10),
  (12, 13),
  (13, 11),
  (14, 6)],
 ['SHIFT',
  'LEFT',
  'SHIFT',
  'SHIFT',
  'LEFT',
  'RIGHT',
  'RIGHT',
  'REDUCE',
  'REDUCE',
  'LEFT',
  'RIGHT',
  'SHIFT',
  'LEFT',
  'RIGHT',
  'SHIFT',
  'LEFT',
  'RIGHT',
  'RIGHT',
  'SHIFT',
  'LEFT',
  'RIGHT',
  'REDUCE',
  'REDUCE',
  'REDUCE',
  'REDUCE',
  'RIGHT',
  'REDUCE',
  'REDUCE',
  'REDUCE'])

In [28]:
def unwrap_gold_relations(tree):
    return [(tok['id'], head(tok)) for tok in tree]

In [29]:
rel, actions = unwrap_to_relations(tree, return_static_oracle_action)
g_rel = unwrap_gold_relations(tree)
set(rel) == set(g_rel)

True

![](./features.png)

From the spacy post:
- The first three words of the buffer (n0, n1, n2)
- The top three words of the stack (s0, s1, s2)
- The two leftmost children of s0 (s0b1, s0b2);
- The two rightmost children of s0 (s0f1, s0f2);
- The two leftmost children of n0 (n0b1, n0b2)

For these 12 tokens, we refer to the word-form, the part-of-speech tag, and the number of left and right children attached to the token.

In [76]:
def get_children_count(tok, rels):
    count = 0
    for (child_id, parent_id) in rels:
        if tok['id'] == parent_id:
            count += 1
    return count
def get_parents_count(tok, rels):
    count = 0
    for (child_id, parent_id) in rels:
        if tok['id'] == child_id:
            count += 1
    return count

def single_word_features(label, dict_vals, keys, rels):
    res = {}
    if dict_vals is None:
        return {}
    for key in keys:
        if key == 'child_count':
            res[label + '_c_count'] = get_children_count(dict_vals, rels)
        elif key == 'parent_count':
            res[label + '_p_count'] = get_parents_count(dict_vals, rels)
        elif (type(dict_vals[key]) == OrderedDict):
            for inner_key in dict_vals[key]:
                res[label + '_' + key + '_' + inner_key] = dict_vals[key][inner_key]
        else:
            res[label + '_' + key] = dict_vals[key]
    return res

def to_features(stk, buf, rels, all_toks):
    def get_by_id(id):
        for tok in all_toks:
            if tok['id'] == id:
                return tok
        return None

    def get_child_1(tok):
        if not tok:
            return None
        for (child_id, parent_id) in rels:
            if tok['id'] == parent_id:
                return get_by_id(child_id)
        return None
    def get_head_1(tok):
        if not tok:
            return None
        for (child_id, parent_id) in rels:
            if tok['id'] == child_id:
                return get_by_id(parent_id)
        return None

    def get_n(col, n):
        if col and 0 <= n < len(col):
            return col[n]
        else:
            return None
    res = {
        **single_word_features('stk_0', get_n(stk, 0), ['form', 'upostag', 'child_count', 'parent_count'], rels),
        **single_word_features('stk_1', get_n(stk, 1), ['form', 'upostag', 'child_count', 'parent_count'], rels),
        **single_word_features('stk_2', get_n(stk, 2), ['form', 'upostag'], rels),
        **single_word_features('ldep_stk_0', get_head_1(get_n(stk, 0)), ['form', 'upostag'], rels),
        **single_word_features('rdep_stk_0', get_child_1(get_n(stk, 0)), ['form', 'upostag'], rels),
        **single_word_features('buf_0', get_n(buf, 0), ['form', 'upostag'], rels),
        **single_word_features('buf_1', get_n(buf, 1), ['form', 'upostag'], rels),
        **single_word_features('buf_2', get_n(buf, 2), ['form', 'upostag'], rels),
        **single_word_features('ldep_buf_0', get_head_1(get_n(buf, 0)), ['form', 'upostag'], rels),
        **single_word_features('rdep_buf_0', get_child_1(get_n(buf, 0)), ['upostag'], rels)
        # **single_word_features('buf_1', get_n(buf, 0), ['form', 'lemma', 'upostag', 'feats', 'deprel']),
    }
    if len(stk) and len(buf):
        res["distance"] = buf[0]["id"] - stk[-1]["id"]
    return res

def create_oracle_storing_data():
    Xs = []
    ys = []
    def oracle_extracting_data(s0, q0, rels, stk, buf, all_toks, extra_attrs = {}):
        action = return_static_oracle_action(s0, q0, rels, stk, buf, all_toks)
        X = to_features(stk, buf, rels, all_toks)
        y = action
        X.update(extra_attrs)        
        Xs.append(X)
        ys.append(y)
        return action
    return Xs, ys, oracle_extracting_data


In [77]:
train_len = len(train)
train_X, train_y, extractor = create_oracle_storing_data()
for (i, tree) in enumerate(train):
    unwrap_to_relations(tree, extractor)
"DONE"

'DONE'

In [78]:
test_len = len(test)
test_X, test_y, extractor = create_oracle_storing_data()
for (i, tree) in enumerate(test):
    unwrap_to_relations(tree, extractor)
"DONE"

'DONE'

In [79]:
val_len = len(val)
val_X, val_y, extractor = create_oracle_storing_data()
for (i, tree) in enumerate(val):
    unwrap_to_relations(tree, extractor)
"DONE"

'DONE'

In [80]:
dict_vectorizer = DictVectorizer()
dict_vectorizer.fit(train_X + test_X + val_X)

DictVectorizer(dtype=<class 'numpy.float64'>, separator='=', sort=True,
        sparse=True)

In [81]:
def fill_nans(sparse_matrix):
    sparse_matrix.data = np.nan_to_num(sparse_matrix.data)
    return sparse_matrix

In [82]:
train_features_X = fill_nans(dict_vectorizer.transform(train_X))
test_features_X = fill_nans(dict_vectorizer.transform(test_X))

In [83]:
predictor = LogisticRegression(random_state=42)
predictor.fit(train_features_X, train_y)
train_y_predicted = predictor.predict(train_features_X)
test_y_predicted = predictor.predict(test_features_X)

In [84]:
print(classification_report(train_y, train_y_predicted))

             precision    recall  f1-score   support

       LEFT       0.77      0.88      0.82     38446
     REDUCE       0.90      0.81      0.85     41165
      RIGHT       0.83      0.78      0.80     34669
      SHIFT       0.92      0.93      0.93     40429

avg / total       0.86      0.85      0.85    154709



```
             precision    recall  f1-score   support

       LEFT       0.65      0.87      0.74     38446
     REDUCE       0.84      0.69      0.76     41165
      RIGHT       0.73      0.59      0.65     34669
      SHIFT       0.89      0.91      0.90     40429

avg / total       0.78      0.77      0.77    154709
```

In [85]:
print(classification_report(test_y, test_y_predicted))

             precision    recall  f1-score   support

       LEFT       0.65      0.74      0.69      5127
     REDUCE       0.87      0.66      0.75      5821
      RIGHT       0.61      0.67      0.64      4972
      SHIFT       0.83      0.86      0.84      5399

avg / total       0.75      0.73      0.73     21319



```
     precision    recall  f1-score   support

       LEFT       0.58      0.73      0.65      5127
     REDUCE       0.85      0.61      0.71      5821
      RIGHT       0.54      0.54      0.54      4972
      SHIFT       0.83      0.87      0.85      5399

avg / total       0.71      0.69      0.69     21319
```

~~Wow, something is off here, as I get 10% worse average performance compared to results from @mariana-scorp. I even copied random_state for predictor, assuming that issue might be originating from there, but ~ same result. ü§î~~ Issue here was me adding sentence_i features which added noise to the data, removing it fixed this specific issue

In [131]:
def get_valid_moves(stack_len, queue_len):
    moves = []
    if queue_len > 0:
        moves.append('SHIFT')
    if stack_len > 0 and queue_len > 0:
        moves.append('LEFT')
        moves.append('RIGHT')
    if stack_len > 0:
        moves.append('REDUCE')
    return moves
def oracle_from_predictor(hasher, predictor):
    def action_from_predictor(s0, q0, rels, stk, buf, all_toks, extra_attrs = {}):
        possible_moves = get_valid_moves(len(stk), len(buf))
        in_X = to_features(stk, buf, rels, all_toks)
        in_X = fill_nans(hasher.transform(in_X))
        all_possibilities = list(zip(predictor.classes_, predictor.predict_proba(in_X)[0]))
        valid_possibilities = [pair for pair in all_possibilities if pair[0] in possible_moves]
        max_prob = sorted(valid_possibilities, key=lambda pair: -pair[1])[0][0]
        return max_prob
    return action_from_predictor

def get_uas(dataset, oracle):
    total = 0
    tp = 0
    for sample in dataset:
        rel_gold, _ = unwrap_to_relations(tree, return_static_oracle_action)
        rel_ours, _ = unwrap_to_relations(tree, oracle)
        total += len(sample)
        tp += len(set(rel_gold) & set(rel_ours))
    return tp/total , tp, total

In [132]:
our_oracle = oracle_from_predictor(dict_vectorizer, predictor)
ratio, tp, total = get_uas(train, our_oracle)
print('On {}. {} ({} out of {} are correct)'.format('train', ratio, tp, total))
ratio, tp, total = get_uas(test, our_oracle)
print('On {}. {} ({} out of {} are correct)'.format('test', ratio, tp, total))

On train. 0.5408532850408799 (40617 out of 75098 are correct)
On test. 0.5007231703789413 (5193 out of 10371 are correct)


```
On train. 0.5408532850408799 (40617 out of 75098 are correct)
On test. 0.5007231703789413 (5193 out of 10371 are correct)
```

TODO:
- ‚úÖfix issue with low metrics
- ‚úÖadd new features (counts as improving the algo):
- üö´possibly dyn oracle
- ‚úÖrun on new sentences

### Add new features
- adding ldep/rdep for stack[0] / queue[0] bumped avg precision/recall by 0.01 across both train / test, but it made uas drop by ~0.18 (to 0.36 on train and 0.33 on test)
      - adding children / parent count for stk_0/buf_0 slightly improved peformance of action predictor on train, almost didn't on test, yet uas didn't change at all
      - redid features similar to what described in https://explosion.ai/blog/parsing-english-in-python#features keeping following:
        1. 'stk_0' - ['form', 'upostag', 'child_count', 'parent_count']
        2. 'stk_1' - ['form', 'upostag', 'child_count', 'parent_count']
        3. 'stk_2' - ['form', 'upostag', 'child_count', 'parent_count']
        4. 'ldep_stk_0' - ['form', 'upostag', 'child_count', 'parent_count']
        5. 'rdep_stk_0' - ['form', 'upostag', 'child_count', 'parent_count']
        6. 'buf_0' - ['form', 'upostag', 'child_count', 'parent_count']
        7. 'buf_1' - ['form', 'upostag', 'child_count', 'parent_count']
        8. 'buf_2' - ['form', 'upostag', 'child_count', 'parent_count']
        9. 'ldep_buf_0' ‚Äì ['form', 'upostag', 'child_count', 'parent_count']
      It gave following results for action classification on train
                 precision    recall  f1-score   support

           LEFT       0.82      0.89      0.85     38446
         REDUCE       0.90      0.85      0.87     41165
          RIGHT       0.84      0.80      0.82     34669
          SHIFT       0.92      0.93      0.92     40429

    avg / total       0.87      0.87      0.87    154709
    
    and on test  
                 precision    recall  f1-score   support

           LEFT       0.63      0.72      0.67      5127
         REDUCE       0.84      0.68      0.75      5821
          RIGHT       0.61      0.64      0.62      4972
          SHIFT       0.81      0.85      0.83      5399

    avg / total       0.73      0.72      0.72     21319  
    
    and still gave same result on uas for train/test
    - trying to figure out whether I can leave some of the features away, removed some of the features and ended up with some endless loops which might need special handling / default fallback for actions. Updated the code to have predict probabilities and filter out invalid moves (and choosing the most probable valid move) - it didn't affect the score, but supposedly, must make it possible to fully evaluate on any corpora
    - not sure, but might be that 0.5 is upper bond for this classificator / approach on test set

### Running on new sentences

In order to run on new sentence, I need 
1. form / upostag for each token in sentence
2. head / id for each token in sentence

In [258]:
import subprocess
from difflib import Differ
from pprint import pprint
# import tokenize_uk

–î–ª—è –æ—Ç—Ä–∏–º–∞–Ω–Ω—è –Ω–µ–æ–±—Ö—ñ–¥–Ω–∏—Ö —Ñ—ñ—á –≤–∏–∫–æ—Ä–∏—Å—Ç–∞—é http://ufal.mff.cuni.cz/udpipe/users-manual ‚Äì —Å—Ö–æ–∂–µ –Ω–∞ –≥–∞—Ä–Ω–∏–π —ñ–Ω—Å—Ç—Ä—É–º–µ–Ω—Ç, –∞–ª–µ –ø—ñ–¥ –º–æ—é –û–° –¥–ª—è REST —Å–µ—Ä–≤—ñ—Å–∞ –Ω–µ–æ–±—Ö—ñ–¥–Ω–æ –∫–æ–º–ø—ñ–ª—é–≤–∞—Ç–∏, —Ç–æ–º—É –±—É–¥—É –≤–∏–∫–ª–∏–∫–∞—Ç–∏ –π–æ–≥–æ —Å –∫–æ–º–∞–Ω–¥–Ω—Ä–æ–≥–æ —Ä—è–¥–∫–∞

In [260]:
def trace_tree(tree):
    res = []
    for node in tree:
        head = node["head"]
        res.append("{} <-- {}\n".format(node["form"],
                                 tree[head - 1]["form"]
                                 if head > 0 else "root"))
    return res
def get_head_id(tok, rels):
    if not tok:
        return None
    for (child_id, parent_id) in rels:
        if tok['id'] == child_id:
            return parent_id
    return None
def trace_rels(tree, rels):
    res = []
    for node in tree:
        head = get_head_id(node, rels)
        if head is None:
            res.append("{} <-- N/A\n".format(node["form"]))
            continue
        res.append("{} <-- {}\n".format(node["form"],
                         tree[head - 1]["form"]
                         if head > 0 else "root"))
    return res
def sentence_to_dicts(sent):
    res = []
    result = subprocess.getoutput("echo \"{}\" | /Users/sudodoki/Downloads/udpipe-1.2.0-bin/bin-osx/udpipe --tokenize --tag --parse /Users/sudodoki/Downloads/Universal\ Dependencies\ 2.0\ Models\ for\ UDPipe\ \(2017-08-01\)/udpipe-ud-2.0-170801/ukrainian-ud-2.0-170801.udpipe".format(sent))
    lines = result.split('\n')[5:-1]
    for line in lines:
        id, form, lemma, upostag, xpostag, feats, head, deprel, _, _ = line.split('\t')
        tok_dict = OrderedDict([('id', int(id)), ('form', form), ('lemma', lemma),
                                ('upostag', upostag), ('xpostag', xpostag), ('feats', feats),
                               ('head', int(head)), ('deprel', deprel)])
        res.append(tok_dict)
    return res

In [261]:
diff = Differ()
def compare(sentence):
    gold = sentence_to_dicts(sentence)
    rel_gold, _ = unwrap_to_relations(gold, return_static_oracle_action)
    gold_res = trace_tree(gold)
    
    rel_ours, _ = unwrap_to_relations(gold, our_oracle)
    our_res = trace_rels(gold, rel_ours)
    
    total = len(gold)
    tp = len(set(rel_gold) & set(rel_ours))
    print("Got {} ({} out of {})".format(tp / total, tp, total))
    
    result = list(diff.compare(gold_res, our_res))
    pprint(result)

In [262]:
compare("–ü—Ä–∏–≥–∞–¥—É—é, —É–∂–µ –∑–≥–æ–¥–æ–º, –∫–æ–ª–∏ —è –≤—ñ–¥–±—É–≤–∞–≤ —Å–≤—ñ–π —Ç–µ—Ä–º—ñ–Ω —É —Ç–∞–±–æ—Ä—ñ ‚Ññ 36 —É –ö—É—á–∏–Ω–æ –ü–µ—Ä–º—Å—å–∫–æ—ó –æ–±–ª–∞—Å—Ç—ñ, —è –æ—Ç—Ä–∏–º–∞–≤ –≤—ñ–¥ –ú–∏—Ö–∞—Å—ñ –ª–∏—Å—Ç—ñ–≤–∫—É –∑ –∂–∞—Ä—Ç—ñ–≤–ª–∏–≤–∏–º –æ–ø–∏—Å–æ–º —Ç–æ–≥–æ, —è–∫ –ö–∏—ó–≤ –≥–æ—Ç—É—î—Ç—å—Å—è –¥–æ —Å–≤—è—Ç–∫—É–≤–∞–Ω–Ω—è —Å–≤–æ–≥–æ 1500-–ª—ñ—Ç—Ç—è.")

Got 0.4864864864864865 (18 out of 37)
['  –ü—Ä–∏–≥–∞–¥—É—é <-- root\n',
 '- , <-- –∑–≥–æ–¥–æ–º\n',
 '- —É–∂–µ <-- –∑–≥–æ–¥–æ–º\n',
 '- –∑–≥–æ–¥–æ–º <-- –ü—Ä–∏–≥–∞–¥—É—é\n',
 '? ^^^^^^\n',
 '+ , <-- –ü—Ä–∏–≥–∞–¥—É—é\n',
 '? ^\n',
 '+ —É–∂–µ <-- –≤—ñ–¥–±—É–≤–∞–≤\n',
 '+ –∑–≥–æ–¥–æ–º <-- –≤—ñ–¥–±—É–≤–∞–≤\n',
 '  , <-- –≤—ñ–¥–±—É–≤–∞–≤\n',
 '  –∫–æ–ª–∏ <-- –≤—ñ–¥–±—É–≤–∞–≤\n',
 '  —è <-- –≤—ñ–¥–±—É–≤–∞–≤\n',
 '- –≤—ñ–¥–±—É–≤–∞–≤ <-- –∑–≥–æ–¥–æ–º\n',
 '?              ^ ^ ^^\n',
 '+ –≤—ñ–¥–±—É–≤–∞–≤ <-- –ü—Ä–∏–≥–∞–¥—É—é\n',
 '?              ^^^ ^ ^^\n',
 '  —Å–≤—ñ–π <-- —Ç–µ—Ä–º—ñ–Ω\n',
 '  —Ç–µ—Ä–º—ñ–Ω <-- –≤—ñ–¥–±—É–≤–∞–≤\n',
 '  —É <-- —Ç–∞–±–æ—Ä—ñ\n',
 '- —Ç–∞–±–æ—Ä—ñ <-- –≤—ñ–¥–±—É–≤–∞–≤\n',
 '- ‚Ññ <-- –æ–±–ª–∞—Å—Ç—ñ\n',
 '- 36 <-- –æ–±–ª–∞—Å—Ç—ñ\n',
 '- —É <-- –æ–±–ª–∞—Å—Ç—ñ\n',
 '- –ö—É—á–∏–Ω–æ <-- –ü–µ—Ä–º—Å—å–∫–æ—ó\n',
 '+ —Ç–∞–±–æ—Ä—ñ <-- —Ç–µ—Ä–º—ñ–Ω\n',
 '+ ‚Ññ <-- —Ç–∞–±–æ—Ä—ñ\n',
 '+ 36 <-- ‚Ññ\n',
 '+ —É <-- N/A\n',
 '+ –ö—É—á–∏–Ω–æ <-- N/A\n',
 '  –ü–µ—Ä–º—Å—å–∫–æ—ó <-- –æ–±

In [263]:
compare("6C –ø—Ä–∏–∑–µ–º–ª—è—î—Ç—å—Å—è –Ω–∞ –ø–ª–µ—á–µ, –ø–µ—Ä–µ–∫–æ—á—É—é—á–∏—Å—å, –ø—Ä–æ–ª—ñ—Ç–∞—î –º–µ—Ç—Ä—ñ–≤ –ø‚Äô—è—Ç–¥–µ—Å—è—Ç —ñ –≤–∏—Ç—è–≥—É—î—Ç—å—Å—è –Ω–∞ —Å–Ω—ñ–≥—É –∑–∞ –∫—ñ–ª—å–∫–∞ –∫—Ä–æ–∫—ñ–≤ –≤—ñ–¥ –∑–∞–±—Ä–∏–∑–∫–∞–Ω–æ—ó –ø–∞–ª–∞—é—á–∏–º–∏ —É–ª–∞–º–∫–∞–º–∏ –ø–æ—Å–∞–¥–∫–æ–≤–æ—ó —Å–º—É–≥–∏.")

Got 0.64 (16 out of 25)
['- 6 <-- C\n',
 '- C <-- –ø—Ä–∏–∑–µ–º–ª—è—î—Ç—å—Å—è\n',
 '? ^\n',
 '+ 6 <-- –ø—Ä–∏–∑–µ–º–ª—è—î—Ç—å—Å—è\n',
 '? ^\n',
 '+ C <-- 6\n',
 '  –ø—Ä–∏–∑–µ–º–ª—è—î—Ç—å—Å—è <-- root\n',
 '  –Ω–∞ <-- –ø–ª–µ—á–µ\n',
 '  –ø–ª–µ—á–µ <-- –ø—Ä–∏–∑–µ–º–ª—è—î—Ç—å—Å—è\n',
 '  , <-- –ø–µ—Ä–µ–∫–æ—á—É—é—á–∏—Å—å\n',
 '  –ø–µ—Ä–µ–∫–æ—á—É—é—á–∏—Å—å <-- –ø—Ä–∏–∑–µ–º–ª—è—î—Ç—å—Å—è\n',
 '  , <-- –ø—Ä–æ–ª—ñ—Ç–∞—î\n',
 '  –ø—Ä–æ–ª—ñ—Ç–∞—î <-- –ø—Ä–∏–∑–µ–º–ª—è—î—Ç—å—Å—è\n',
 '- –º–µ—Ç—Ä—ñ–≤ <-- –ø—Ä–æ–ª—ñ—Ç–∞—î\n',
 '+ –º–µ—Ç—Ä—ñ–≤ <-- –ø—Ä–∏–∑–µ–º–ª—è—î—Ç—å—Å—è\n',
 '- –ø‚Äô—è—Ç–¥–µ—Å—è—Ç <-- –º–µ—Ç—Ä—ñ–≤\n',
 '?               ^^^^^^\n',
 '+ –ø‚Äô—è—Ç–¥–µ—Å—è—Ç <-- N/A\n',
 '?               ^^^\n',
 '  —ñ <-- –≤–∏—Ç—è–≥—É—î—Ç—å—Å—è\n',
 '- –≤–∏—Ç—è–≥—É—î—Ç—å—Å—è <-- –ø—Ä–∏–∑–µ–º–ª—è—î—Ç—å—Å—è\n',
 '?                  ^^^ ------\n',
 '+ –≤–∏—Ç—è–≥—É—î—Ç—å—Å—è <-- –ø‚Äô—è—Ç–¥–µ—Å—è—Ç\n',
 '?                  ^^^^   +\n',
 '  –Ω–∞ <-- —Å–Ω—ñ–≥—É\n',
 '  —Å–Ω—ñ–≥—É <-- –≤–∏—Ç—è–≥—É—î—Ç—å—Å—è\n

In [264]:
compare("–î—ñ–≤—á–∏–Ω–∞ —Å—Ç–æ—è–ª–∞ —Ç–∞–º, –¥–µ –π –±—É–ª–∞, —ñ –Ω–∞–º–∞–≥–∞–ª–∞—Å—è –ø—Ä–∏–≤–µ—Å—Ç–∏ –¥–æ –ª–∞–¥—É —Å–∫—É–π–æ–≤–¥–∂–µ–Ω–µ –≤–æ–ª–æ—Å—Å—è, –≤–∫—Ä–∞–π —Ä–æ–∑–ª—é—á–µ–Ω–∞ —Ç–∏–º, —â–æ —Ü–µ –ø–æ–±–∞—á–∏–ª–∏ –≤–æ–¥—ñ—ó, —è–∫—ñ —á–µ–∫–∞–ª–∏ –Ω–∞ –ø–µ—Ä–µ—ó–∑–¥—ñ.")

Got 0.5666666666666667 (17 out of 30)
['  –î—ñ–≤—á–∏–Ω–∞ <-- —Å—Ç–æ—è–ª–∞\n',
 '  —Å—Ç–æ—è–ª–∞ <-- root\n',
 '  —Ç–∞–º <-- —Å—Ç–æ—è–ª–∞\n',
 '- , <-- –¥–µ\n',
 '- –¥–µ <-- —Ç–∞–º\n',
 '- –π <-- –±—É–ª–∞\n',
 '- –±—É–ª–∞ <-- –¥–µ\n',
 '- , <-- –¥–µ\n',
 '+ , <-- N/A\n',
 '+ –¥–µ <-- N/A\n',
 '+ –π <-- N/A\n',
 '+ –±—É–ª–∞ <-- —Å—Ç–æ—è–ª–∞\n',
 '+ , <-- N/A\n',
 '  —ñ <-- –Ω–∞–º–∞–≥–∞–ª–∞—Å—è\n',
 '  –Ω–∞–º–∞–≥–∞–ª–∞—Å—è <-- —Å—Ç–æ—è–ª–∞\n',
 '- –ø—Ä–∏–≤–µ—Å—Ç–∏ <-- –Ω–∞–º–∞–≥–∞–ª–∞—Å—è\n',
 '+ –ø—Ä–∏–≤–µ—Å—Ç–∏ <-- —Å—Ç–æ—è–ª–∞\n',
 '  –¥–æ <-- –ª–∞–¥—É\n',
 '  –ª–∞–¥—É <-- –ø—Ä–∏–≤–µ—Å—Ç–∏\n',
 '  —Å–∫—É–π–æ–≤–¥–∂–µ–Ω–µ <-- –≤–æ–ª–æ—Å—Å—è\n',
 '- –≤–æ–ª–æ—Å—Å—è <-- –ø—Ä–∏–≤–µ—Å—Ç–∏\n',
 '- , <-- —Ä–æ–∑–ª—é—á–µ–Ω–∞\n',
 '+ –≤–æ–ª–æ—Å—Å—è <-- –ª–∞–¥—É\n',
 '+ , <-- —Ç–∏–º\n',
 '  –≤–∫—Ä–∞–π <-- —Ä–æ–∑–ª—é—á–µ–Ω–∞\n',
 '- —Ä–æ–∑–ª—é—á–µ–Ω–∞ <-- –≤–æ–ª–æ—Å—Å—è\n',
 '?               ^^^^^^^\n',
 '+ —Ä–æ–∑–ª—é—á–µ–Ω–∞ <-- ,\n',
 '?               ^\n',
 '- —Ç–∏–º <-- —Ä–æ–∑–ª—é—á–µ–Ω–∞\n',
 '+

–î–µ—è–∫—ñ –≤—ñ–¥–º—ñ–Ω–Ω–æ—Å—Ç—ñ –≤–∏–¥–Ω–æ —É –∑–≤'—è–∑–∫—É –∑ —Ä—ñ–∑–Ω–æ—é —Ç–æ–∫–µ–Ω—ñ–∑–∞—Ü—ñ—é (–º–æ–∂–ª–∏–≤–∏–º —Ä–æ–∑–≤'—è–∑–∫–æ–º –±—É–ª–æ –≤–∏–∫–æ—Ä–∏—Å—Ç–∞–Ω–Ω—è tokenize_uk —ñ –≤—ñ–¥–∫–ª—é—á–µ–Ω–Ω—è tokenize —É udpipe). –ù–∞ –¥–∞–Ω–∏—Ö —Ä–µ—á–µ–Ω–Ω—è—Ö uas –≤–∞—Ä—ñ—é—î—Ç—å—Å—è 0.5-0.6, 