In [210]:
from collections import OrderedDict
from conllu import parse
from enum import Enum
import pickle
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
import pymorphy2
import pickle
from sklearn.svm import SVC
import tokenize_uk
from sklearn.svm import LinearSVC
from sklearn.neural_network import MLPClassifier

In [211]:
class Actions(Enum):
    SHIFT = 1
    REDUCE = 2
    LEFT = 3
    RIGHT = 4

    
def oracle(stack, top_queue, relations):
    """
    Make a decision on the right action to do.
    """
    top_stack = stack[-1]
    # check if both stack and queue are non-empty
    if top_stack and not top_queue:
        return Actions.REDUCE
    # check if there are any clear dependencies
    elif top_queue["head"] == top_stack["id"]:
        return Actions.RIGHT
    elif top_stack["head"] == top_queue["id"]:
        return Actions.LEFT
    # check if we can reduce the top of the stack
    elif top_stack["id"] in [i[0] for i in relations] and \
         (top_queue["head"] < top_stack["id"] or \
          [s for s in stack if s["head"] == top_queue["id"]]):
        return Actions.REDUCE
    # default option
    else:
        return Actions.SHIFT


def trace_actions(tree, morph, log=True, check_for_projective=True):
    """
    Повертає набір станів із фічами для дерева з корпусу, формує набір лейблів(дій), плюс тести
    """
    relations = []

    ROOT = OrderedDict([('id', 0), ('form', 'ROOT'), ('lemma', 'ROOT'), ('upostag', 'ROOT'),
                        ('xpostag', None), ('feats', None), ('head', None), ('deprel', None),
                        ('deps', None), ('misc', None)])

    states_with_labels = []

    stack, queue, relations = [ROOT], tree[:], []
    """
    Try out the oracle to verify it's returning the right actions.
    """
    stack, queue, relations = [ROOT], tree[:], []
    while queue or stack:
        action = oracle(stack if len(stack) > 0 else None,
                        queue[0] if len(queue) > 0 else None,
                        relations)
        if len(queue) > 0:
            buf0_morph = morph.parse(queue[0]['form'])[0]
        if len(stack) > 0:
            stk0_morph = morph.parse(stack[-1]['form'])[0]


        state = {'stk0_form': stack[-1]['form'] if len(stack) > 1 else 'None',
                 'buf0_form': queue[0]['form'] if len(queue) > 0 else 'None',
                 'buf1_form': queue[1]['form'] if len(queue) > 1 else 'None',
                 'buf0_lemma': queue[0]['lemma'] if len(queue) > 0 else 'None',
                 'stk0_lemma': stack[-1]['lemma'] if len(stack) > 0 else 'None',
                 'buf0_upostag': queue[0]['upostag'] if len(queue) > 0 else 'None',
                 'buf1_upostag': queue[1]['upostag'] if len(queue) > 1 else 'None',
                 'buf2_upostag': queue[2]['upostag'] if len(queue) > 2 else 'None',
                 'stk0_upostag': stack[-1]['upostag'] if len(stack) > 0 else 'None',
                 'stk1_upostag': stack[-2]['upostag'] if len(stack) > 1 else 'None',
                 'stk2_upostag': stack[-3]['upostag'] if len(stack) > 2 else 'None',
                 # кожна з наступних фіч трохи покращує результат
                 'buf3_upostag': queue[3]['upostag'] if len(queue) > 3 else 'None',
                 'buf0_morph_tag': str(buf0_morph.tag.POS) if len(queue) > 0 else 'None',
                 'stk0_morph_tag': str(stk0_morph.tag.POS) if len(stack) > 0 else 'None',
                 'stk0_morph_case': str(stk0_morph.tag.case) if len(stack) > 0 else 'None',
                 'buf0_morph_case': str(buf0_morph.tag.case) if len(queue) > 0 else 'None',
                 'buf0_morph_voice': str(buf0_morph.tag.voice) if len(queue) > 0 else 'None',
                 'stk0_morph_voice': str(stk0_morph.tag.voice) if len(stack) > 0 else 'None',
                 'buf0_morph_gender': str(buf0_morph.tag.gender) if len(queue) > 0 else 'None',
                 'stk0_morph_gender': str(stk0_morph.tag.gender) if len(stack) > 0 else 'None',
                 'buf0_morph_tense': str(buf0_morph.tag.tense) if len(queue) > 0 else 'None',
                 'stk0_morph_tense': str(stk0_morph.tag.tense) if len(stack) > 0 else 'None',
                 'buf0_morph_number': str(buf0_morph.tag.number) if len(queue) > 0 else 'None',
                 'stk0_morph_number': str(stk0_morph.tag.number) if len(stack) > 0 else 'None',
                 'buf0_morph_aspect': str(buf0_morph.tag.aspect) if len(queue) > 0 else 'None',
                 'stk0_morph_aspect': str(stk0_morph.tag.aspect) if len(stack) > 0 else 'None',
                 'stk1_morph_tag': str(morph.parse(stack[-2]['form'])[0].tag.POS) if len(stack) > 1 else 'None',

                 # наступні фічі лише зменшують точність
                 # 'buf0_istitle': str(queue[0]['form']).istitle() if len(queue) > 0 else False,
                 # 'stk0_istitle': str(stack[-1]['form']).istitle() if len(stack) > 0 else False,
                 # 'buf_stk_dif': (len(queue)-len(stack)),
                 # 'buf_len': len(queue),
                 # 'stk_len': len(stack),
                 # 'stk3_upostag': stack[-4]['upostag'] if len(stack) > 3 else 'None'

                 # 'buf0_morph_normal_form': str(buf0_morph.normal_form) if len(queue) > 0 else 'None',
                 # 'stk0_morph_normal_form': str(stk0_morph.normal_form) if len(stack) > 0 else 'None',

                 # 'buf0_morph_transitivity': str(buf0_morph.tag.transitivity) if len(queue) > 0 else 'None',
                 # 'stk0_morph_transitivity': str(stk0_morph.tag.transitivity) if len(stack) > 0 else 'None'
                 # 'buf0_morph_mood': str(buf0_morph.tag.mood) if len(queue) > 0 else 'None',
                 # 'stk0_morph_mood': str(stk0_morph.tag.mood) if len(stack) > 0 else 'None'
                 # 'buf1_morph_tag': str(morph.parse(queue[1]['form'])[0].tag.POS) if len(queue) > 1 else 'None',
                }
        states_with_labels.append((state, str(action)))
#         import ipdb; ipdb.set_trace()
        if log:
            print("Stack:", [i["form"]+"_"+str(i["id"]) for i in stack])
            print("Queue:", [i["form"]+"_"+str(i["id"]) for i in queue])
            print("Relations:", relations)
            print(action)
            print("========================")
        if action == Actions.SHIFT:
            stack.append(queue.pop(0))
        elif action == Actions.REDUCE:
            stack.pop()
        elif action == Actions.LEFT:
            relations.append((stack[-1]["id"], queue[0]["id"]))
            stack.pop()
        elif action == Actions.RIGHT:
            relations.append((queue[0]["id"], stack[-1]["id"]))
            stack.append(queue.pop(0))
        else:
            print("Unknown action.")
    gold_relations = [(node["id"], node["head"]) for node in tree]
    if log:
        print("Gold relations:")
        print(gold_relations)
        print("Retrieved relations:")
        print(sorted(relations))
    # check if tree is projective
    is_projective = False

    if check_for_projective:
        for i in range(0, len(gold_relations)-1):
            a = gold_relations[i]
            for j in range(0, len(gold_relations)-1):
                if ( i!=j ):
                    b = gold_relations[j]
                    # if (a[0] < b[0]) and (a[1] > b[1]) and (b[1] < a[0]):
                    if (a[0] < b[0]) and (b[0] < a[1]) and ((a[1] < b[1]) or (b[1] < a[0])):
                        is_projective = True
                        break

    return (states_with_labels, is_projective)

In [212]:
# завантаження дерев та формування данних для навчання та тесту моделі
path = '/home/ds/projects/nlp/UD_Ukrainian-IU'

with open(path + "/uk_iu-ud-train.conllu", "r") as f:
    data = f.read()
trees = parse(data)

with open(path + "/uk_iu-ud-test.conllu", "r") as f:
    data_test = f.read()
trees_test = parse(data_test)


morph = pymorphy2.MorphAnalyzer(lang='uk')
i=0
train_data = []
for tree in trees:
#     i=i+1
#     if i%100==0:
#         print(i)
    skip = False
    for n in tree:
        #check format exception
        if type(n['id']) != int:
            skip = True
    if not skip:
        states_with_labels, is_projective = trace_actions(tree, morph, log=False)
        if not is_projective:
            train_data = train_data + states_with_labels

test_data = []
for tree in trees_test:
#     i=i+1
#     if i%100==0:
#         print(i)
        
    skip = False

    for n in tree:
        if type(n['id']) != int:
            skip = True
    if not skip:
        states_with_labels, is_projective = trace_actions(tree,morph, log=False)
        if not is_projective:
            test_data = test_data + states_with_labels
            
x_train = [t[0] for t in train_data]
y_train = [t[1] for t in train_data]
x_test = [t[0] for t in test_data]
y_test = [t[1] for t in test_data]


# навчання моделі і тест моделі
vectorizer = DictVectorizer()

vec_train_tokens_transformed = vectorizer.fit_transform(x_train)

vec_test_tokens = vectorizer.transform(x_test)

clf = LogisticRegression()
clf.fit(vec_train_tokens_transformed, y_train)
predictions = clf.predict(vec_test_tokens)

print(classification_report(y_test, predictions))
print("Accuracy:", accuracy_score(y_test, predictions)) 






                precision    recall  f1-score   support

  Actions.LEFT       0.86      0.88      0.87      7640
Actions.REDUCE       0.84      0.79      0.81      8320
 Actions.RIGHT       0.76      0.80      0.78      7447
 Actions.SHIFT       0.86      0.87      0.86      7692

     micro avg       0.83      0.83      0.83     31099
     macro avg       0.83      0.83      0.83     31099
  weighted avg       0.83      0.83      0.83     31099

Accuracy: 0.8311521270780411


### Друга частина

In [213]:
DET = ['інакший', 'його', 'тамтой', 'чий', 'їх', 'інш.', 'деякий', 'ввесь', 'ваш',
       'ніякий', 'весь', 'інший', 'чийсь', 'жадний', 'другий', 'кожний',
       'такий', 'оцей', 'скілька', 'цей', 'жодний', 'все', 'кілька', 'увесь',
       'кожній', 'те', 'сей', 'ін.', 'отакий', 'котрий', 'усякий', 'самий',
       'наш', 'усілякий', 'будь-який', 'сам', 'свій', 'всілякий', 'всенький', 'її',
       'всякий', 'отой', 'небагато', 'який', 'їхній', 'той', 'якийсь', 'ин.', 'котрийсь',
       'твій', 'мій', 'це']

PREP = ["до", "на"]

mapping = {"ADJF": "ADJ", "ADJS": "ADJ", "COMP": "ADJ", "PRTF": "ADJ",
           "PRTS": "ADJ", "GRND": "VERB", "NUMR": "NUM", "ADVB": "ADV",
           "NPRO": "PRON", "PRED": "ADV", "PREP": "ADP", "PRCL": "PART"}

# для приведення речень до формату дерев корпусу  
def normalize_pos(word):
    if word.tag.POS == "CONJ":
        if "coord" in word.tag:
            return "CCONJ"
        else:
            return "SCONJ"
    elif "PNCT" in word.tag:
        return "PUNCT"
    elif word.normal_form in PREP:
        return "PREP"
    else:
        return mapping.get(word.tag.POS, word.tag.POS)

def convert_string_to_tree_format(text):
    """
    перетрорення речення в формат корпусу
    """
    tokens= [t['form'] for t in trees_test[0]]

    tokens = tokenize_uk.tokenize_uk.tokenize_words(text)
    i = 0
    tree = []
    for token in tokens:
        i = i + 1
        # pos = str(morph.parse(token)[0].tag.POS)
        word = morph.parse(token)[0]
        token_ = OrderedDict([('id', i), ('form', str(token)), ('lemma', str(morph.parse(token)[0].normal_form)),
                              ('upostag', normalize_pos(word)),
                        ('xpostag', None), ('feats', None), ('head', None), ('deprel', None),
                        ('deps', None), ('misc', None)])
        tree.append(token_)
    return tree



def generate_relations_by_prediction(tree, log=True, check_for_projective=False):
    """
    для речення покроково будуємо розбір 
    """
    relations = []

    ROOT = OrderedDict([('id', 0), ('form', 'ROOT'), ('lemma', 'ROOT'), ('upostag', 'ROOT'),
                        ('xpostag', None), ('feats', None), ('head', None), ('deprel', None),
                        ('deps', None), ('misc', None)])

    states_with_labels = []

    stack, queue, relations = [ROOT], tree[:], []
    """
    Try out the oracle to verify it's returning the right actions.
    """
    stack, queue, relations = [ROOT], tree[:], []
    while queue or stack:
#         action = oracle(stack if len(stack) > 0 else None,
#                         queue[0] if len(queue) > 0 else None,
#                         relations)
        if len(queue) > 0:
            buf0_morph = morph.parse(queue[0]['form'])[0]
        if len(stack) > 0:
            stk0_morph = morph.parse(stack[-1]['form'])[0]


        state = {'stk0_form': stack[-1]['form'] if len(stack) > 1 else 'None',
                 'buf0_form': queue[0]['form'] if len(queue) > 0 else 'None',
                 'buf1_form': queue[1]['form'] if len(queue) > 1 else 'None',
                 'buf0_lemma': queue[0]['lemma'] if len(queue) > 0 else 'None',
                 'stk0_lemma': stack[-1]['lemma'] if len(stack) > 0 else 'None',
                 'buf0_upostag': queue[0]['upostag'] if len(queue) > 0 else 'None',
                 'buf1_upostag': queue[1]['upostag'] if len(queue) > 1 else 'None',
                 'buf2_upostag': queue[2]['upostag'] if len(queue) > 2 else 'None',
                 'stk0_upostag': stack[-1]['upostag'] if len(stack) > 0 else 'None',
                 'stk1_upostag': stack[-2]['upostag'] if len(stack) > 1 else 'None',
                 'stk2_upostag': stack[-3]['upostag'] if len(stack) > 2 else 'None',
                 # кожна з наступних фіч трохи покращує результат
                 'buf3_upostag': queue[3]['upostag'] if len(queue) > 3 else 'None',
                 'buf0_morph_tag': str(buf0_morph.tag.POS) if len(queue) > 0 else 'None',
                 'stk0_morph_tag': str(stk0_morph.tag.POS) if len(stack) > 0 else 'None',
                 'stk0_morph_case': str(stk0_morph.tag.case) if len(stack) > 0 else 'None',
                 'buf0_morph_case': str(buf0_morph.tag.case) if len(queue) > 0 else 'None',
                 'buf0_morph_voice': str(buf0_morph.tag.voice) if len(queue) > 0 else 'None',
                 'stk0_morph_voice': str(stk0_morph.tag.voice) if len(stack) > 0 else 'None',
                 'buf0_morph_gender': str(buf0_morph.tag.gender) if len(queue) > 0 else 'None',
                 'stk0_morph_gender': str(stk0_morph.tag.gender) if len(stack) > 0 else 'None',
                 'buf0_morph_tense': str(buf0_morph.tag.tense) if len(queue) > 0 else 'None',
                 'stk0_morph_tense': str(stk0_morph.tag.tense) if len(stack) > 0 else 'None',
                 'buf0_morph_number': str(buf0_morph.tag.number) if len(queue) > 0 else 'None',
                 'stk0_morph_number': str(stk0_morph.tag.number) if len(stack) > 0 else 'None',
                 'buf0_morph_aspect': str(buf0_morph.tag.aspect) if len(queue) > 0 else 'None',
                 'stk0_morph_aspect': str(stk0_morph.tag.aspect) if len(stack) > 0 else 'None',
                 'stk1_morph_tag': str(morph.parse(stack[-2]['form'])[0].tag.POS) if len(stack) > 1 else 'None',
                }

        vec_state = vectorizer.transform(state)
        
        # ключовий момент! робимо передбачення наступної дії
        predictions = clf.predict(vec_state)
        
#         print(predictions)
#         import ipdb; ipdb.set_trace()
        action = predictions[0]
        states_with_labels.append((state, str(action)))
        if log:
            print("Stack:", [i["form"]+"_"+str(i["id"]) for i in stack])
            print("Queue:", [i["form"]+"_"+str(i["id"]) for i in queue])
            print("Relations:", relations)
            print(action)
            print("========================")
        if action == str(Actions.SHIFT):
            stack.append(queue.pop(0))
        elif action == str(Actions.REDUCE):
            stack.pop()
        elif action == str(Actions.LEFT):
            relations.append((stack[-1]["id"], queue[0]["id"]))
            stack.pop()
        elif action == str(Actions.RIGHT):
            relations.append((queue[0]["id"], stack[-1]["id"]))
            stack.append(queue.pop(0))
        else:
            print("Unknown action.")
    gold_relations = [(node["id"], node["head"]) for node in tree]
    if log:
        print("Gold relations:")
        print(sorted(gold_relations))
        print("Retrieved relations:")
        print(sorted(relations))
    # check if tree is projective
    is_projective = False

    if check_for_projective:
        for i in range(0, len(relations)-1):
            a = gold_relations[i]
            for j in range(0, len(relations)-1):
                if ( i!=j ):
                    b = gold_relations[j]
                    # if (a[0] < b[0]) and (a[1] > b[1]) and (b[1] < a[0]):
                    if (a[0] < b[0]) and (b[0] < a[1]) and ((a[1] < b[1]) or (b[1] < a[0])):
                        is_projective = True
                        break

    return (sorted(relations), sorted(gold_relations), is_projective)

In [228]:
tree = trees_test[7]
relations, gold_relations, is_projective = generate_relations_by_prediction(tree,log=False)
print('sentence: ', tree)
print('relations: ', relations)
print('gold_relations: ', gold_relations) # працює тільки коли речення прямо з корпусу
print('--------------\n')

sent = 'Це власне такі роздуми вплинули на фатальне рішення Антея.'
tree = convert_string_to_tree_format(sent)
relations, gold_relations, is_projective = generate_relations_by_prediction(tree,log=False)
print('sentence: ', sent)
print('relations: ', relations)
print('gold_relations: ', gold_relations)
print('--------------\n')

sent = 'Я люблю черепашок.'
tree = convert_string_to_tree_format(sent)
relations, gold_relations, is_projective = generate_relations_by_prediction(tree,log=False)
print('sentence: ', sent)
print('relations: ', relations)
print('gold_relations: ', gold_relations)
print('--------------\n')

sent = 'Я дуже сильно люблю черепашок.'
tree = convert_string_to_tree_format(sent)
relations, gold_relations, is_projective = generate_relations_by_prediction(tree,log=False)
print('sentence: ', sent)
print('relations: ', relations)
print('gold_relations: ', gold_relations)
print('--------------\n')

sent = 'Відома українська поетеса теж любить черепашок.'
tree = convert_string_to_tree_format(sent)
relations, gold_relations, is_projective = generate_relations_by_prediction(tree,log=False)
print('sentence: ', sent)
print('relations: ', relations)
print('gold_relations: ', gold_relations)
print('--------------\n')

sentence:  TokenList<Це, власне, такі, роздуми, вплинули, на, фатальне, рішення, Антея, .>
relations:  [(1, 4), (2, 3), (3, 4), (5, 4), (6, 8), (7, 8), (8, 5), (9, 8), (10, 4)]
gold_relations:  [(1, 5), (2, 3), (3, 4), (4, 5), (5, 0), (6, 8), (7, 8), (8, 5), (9, 8), (10, 5)]
--------------

sentence:  Це власне такі роздуми вплинули на фатальне рішення Антея.
relations:  [(1, 3), (2, 3), (3, 4), (5, 4), (7, 8), (9, 8), (10, 4)]
gold_relations:  [(1, None), (2, None), (3, None), (4, None), (5, None), (6, None), (7, None), (8, None), (9, None), (10, None)]
--------------

sentence:  Я люблю черепашок.
relations:  [(1, 2), (2, 0), (3, 2), (4, 2)]
gold_relations:  [(1, None), (2, None), (3, None), (4, None)]
--------------

sentence:  Я дуже сильно люблю черепашок.
relations:  [(1, 4), (2, 3), (3, 4), (4, 0), (5, 4), (6, 4)]
gold_relations:  [(1, None), (2, None), (3, None), (4, None), (5, None), (6, None)]
--------------

sentence:  Відома українська поетеса теж любить черепашок.
relation