In [8]:
#https://github.com/EmilStenstrom/conllu
#https://github.com/UniversalDependencies/UD_English-EWT
from conllu import parse, parse_tree
from collections import OrderedDict
import numpy as np

In [9]:
with open('../UD_Ukrainian-IU/uk_iu-ud-train.conllu') as f:
    c= f.read()
    trees= parse(c)

In [10]:
trees[0][0]

OrderedDict([('id', 1),
             ('form', 'У'),
             ('lemma', 'у'),
             ('upostag', 'ADP'),
             ('xpostag', 'Spsl'),
             ('feats', OrderedDict([('Case', 'Loc')])),
             ('head', 2),
             ('deprel', 'case'),
             ('deps', None),
             ('misc', OrderedDict([('Id', '0003')]))])

In [11]:
tree=trees[0]

In [12]:
from enum import Enum
class Action_type(Enum):
    Left = 1
    Right = 2
    Reduse = 3
    Shift = 4
    
class oracle:
    def __init__(self, tree):
        self.rel=[]
        self.stack=[OrderedDict([('id', 0),('form','ROOT'),("head",0),("lemma","root")])]
        self.queue=tree
        self.tree_len=len(tree)
        self.step_processor={Action_type.Left:self.Left,
                             Action_type.Shift:self.Shift,
                             Action_type.Right:self.Right,
                             Action_type.Reduse:self.Reduse}
        
    def Shift(self):
        self.stack.append(self.queue.pop(0))
        
    def Left(self):
        self.rel.append((self.stack.pop()['id'],self.queue[0]['id']))
        
    def Right(self):
        self.rel.append((self.queue[0]['id'],self.stack[-1]['id']))
        self.stack.append(self.queue.pop(0))
        
    def Reduse(self):
        self.stack.pop()
        
    def do_steps(self, debug=False):
        history_step1=-1
        histiry_step2=-1
        self.features =[]
        self.labels = []
        self.dep = []
        while self.stack or self.queue:
            top_stack = self.stack[-1] if len(self.stack) > 0 else None
            first_queue =  self.queue[0] if len(self.queue) > 0 else None
            step_type=self.get_action(top_stack,first_queue)
            self.labels.append(step_type)
            if (step_type==Action_type.Left):
                self.dep.append(top_stack['deprel'])
            if (step_type==Action_type.Right):
                self.dep.append(first_queue['deprel'])
            else:
                self.dep.append('-')

            self.features.append(extract_features(self.stack, self.queue, self.rel,history_step1,histiry_step2,self.tree_len))
            histiry_step2=history_step1
            history_step1=step_type.value
            if debug:
                self.info(step_type)
            self.step_processor[step_type]()
        return self.rel
    
    def get_action(self, stack,queue):
        if stack and not queue:
            return Action_type.Reduse
        if stack['head']==queue['id']:
             return Action_type.Left
        if stack['id']==queue['head']:
             return Action_type.Right
            
        if stack['id'] in  [i[0] for i in self.rel] and stack["id"] > queue["head"]:
             return Action_type.Reduse
        else:
            return Action_type.Shift
    
    def info(self, step_type):
        print('step: ', step_type)
        print('stack',  self.print_item(self.stack))
        print('queue',  self.print_item(self.queue))
        print('rel',  self.rel)
        print("====================================")

    def print_item(self, lst):
        return [i['form'] for i in lst]
    
    def get_relations(tree):
        return [(i['id'],i['head']) for i in tree]

In [13]:
import warnings
warnings.filterwarnings('ignore')

import sys
import os
import numpy as np
import gensim


model = gensim.models.KeyedVectors.load_word2vec_format('ubercorpus.lowercased.tokenized.word2vec.300d.bz2', binary=False)



In [14]:
# generate features
def extract_features(stack, queue, rel, history_step1, histiry_step2, sentense_len):
    #print("stack",stack)
    stack1=stack[-1] if len(stack)>0 else None
    stack2=stack[-2] if len(stack)>1 else None
    stack3=stack[-3] if len(stack)>2 else None
    queue1=queue[0] if len(queue)>0 else None
    queue2=queue[1] if len(queue)>1 else None
    queue3=queue[2] if len(queue)>2 else None
    queue4=queue[3] if len(queue)>4 else None
    
    features = dict()
    if len(stack) > 0:
        stack_top = stack[-1]
        s0_features = get_word_features(stack_top,'s0_',stack2,queue1, True,  )
    else:
        s0_features = get_word_features(None,'s0_',stack2,queue1 , True, )
    features = {**features, **s0_features}
    
    if len(stack) > 1:
        s1_features = get_word_features(stack[-2],'s1_',stack2,queue1 , True )
    else:
        s1_features = get_word_features(None,'s1_',stack2,queue1, True )
    features = {**features, **s1_features}
    
        
    if queue:
        queue_top = queue[0]
        q0_features = get_word_features(queue_top,'q0_',stack1,queue2, True )
    else: 
        q0_features = get_word_features(None,'q0_',stack1,queue2, True )
    features = {**features, **q0_features}
    
    if len(queue)>1:
        queue1 = queue[1]
        q1_features = get_word_features(queue1,'q1_',stack1,queue3)
    else: 
        q1_features = get_word_features(None,'q1_',stack1,queue3)
    features = {**features, **q1_features}
    
    if len(queue)>1:
        queue1 = queue[1]
        q2_features = get_word_features(queue1,'q2_',stack1,queue4)
    else: 
        q2_features = get_word_features(None,'q2_',stack1,queue4)
    features = {**features, **q2_features}
    
    features['history_step1']=history_step1
    features['histiry_step2']=histiry_step2

    try:
        features['is_first_word']=int(queue[0]["id"]==1)
        features['is_last_word']=int(queue[0]["id"]==sentense_len)
    except:
        features['is_first_word']=0
        features['is_last_word']=0
    features['stack_len']=len(stack)
    features['queue_len']=len(queue)
    features['rel_len']=len(rel)
    features['parrent_of']= sum([int(i[1]==stack_top['id']) for i in rel])
    if stack and queue:
        features["distance"] = queue[0]["id"] - stack[-1]["id"]
        features["word_index"] = queue[0]["id"]
    else:
        features["distance"]=-1
        features["word_index"]=-1
    return features

def get_word_features(node, prefix, left, right, add_embeding=False):
    try:
        left_tag = left['upostag']
    except:
        left_tag=''
    try:
        right_tag = right['upostag']
    except:
        right_tag=''
    try:
        node_tag = node['upostag']
    except:
        node_tag=''
    try:
        for key in node['feats']:
            features[prefix+'feats_'+key]=node['feats']['key']
    except:
        rer=1+1

    features = dict()
    
    try:
            word_vector=model.get_vector(word_info.normal_form.lower())
    except:
            word_vector=[0]*300
    for i in range(0,300):
        features[prefix+'vector+'+str(i)]=word_vector[i]
    if node:
        features[prefix+'lemma']=node['lemma'].lower()
        #features[prefix+'word']=node['form']
    
    features[prefix+'POS']=node_tag
    features[prefix+'left_item']= '{0}_{1}'.format(left_tag, node_tag)
    features[prefix+'item_right']= '{0}_{1}'.format(node_tag,right_tag)
    features[prefix+'left_right']= '{0}_{1}'.format(left_tag, right_tag)

    return features

# collect  features

In [15]:
import pandas as pd

In [16]:
def get_vectors_df(trees):
    from tqdm import tqdm
    features =[]
    labels = []
    dep_labels = []


    for index in tqdm(range(0,len(trees))):
        tree=trees[index]
        o = oracle(tree.copy())
        parsed_relations = o.do_steps()
        features=features + o.features
        labels=labels + o.labels
        dep_labels=dep_labels+o.dep
    
    return features,labels,dep_labels




vectorizer = None
def encode_feature_df(features,vectorizer):
    if vectorizer == None:
        vectorizer = DictVectorizer(sparse=False)
        vectorizer.fit(features)
    return vectorizer.transform(features)

In [17]:
from sklearn.feature_extraction import DictVectorizer

In [18]:
with open('../UD_Ukrainian-IU/uk_iu-ud-train.conllu') as f:
    c= f.read()
    trees= parse(c)

features_df,y_train, y_train_dep = get_vectors_df(trees)
vectorizer = DictVectorizer(sparse=True)
X_train=vectorizer.fit_transform(features_df)

100%|██████████| 4513/4513 [02:56<00:00, 25.64it/s]


In [19]:
y_train = [i.value for i in y_train]

In [20]:
#[col for col in features_df.columns.values if 'vector' not in col]

In [21]:
import gc
gc.collect()

815

# get test

In [22]:
with open('../UD_Ukrainian-IU/uk_iu-ud-test.conllu') as f:
    c= f.read()
    trees= parse(c)

features_df,y_test, y_test_dep = get_vectors_df(trees)
X_test= vectorizer.transform(features_df)
y_test = [i.value for i in y_test] 

100%|██████████| 783/783 [00:32<00:00, 24.17it/s]


In [23]:
import gc
gc.collect()

267

In [24]:
1+1

2

In [25]:
test_trees=trees

In [26]:
X_test.shape[1]

69837

In [27]:
1+1

2

# build model

In [21]:
from sklearn.ensemble import RandomForestClassifier

In [22]:
clf = RandomForestClassifier(max_depth=19, random_state=0)
clf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=19, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [23]:
predicted_test = clf.predict(X_test)

In [24]:
from sklearn.metrics import classification_report
print(classification_report(y_test, predicted_test, target_names=['Left','Right','Reduse','Shift']))

             precision    recall  f1-score   support

       Left       0.82      0.89      0.85      7352
      Right       0.76      0.78      0.77      7182
     Reduse       0.92      0.66      0.77      8370
      Shift       0.76      0.91      0.83      7757

avg / total       0.82      0.81      0.81     30661



# SGD - LogisticRegression

In [28]:
from sklearn.linear_model import SGDClassifier, LogisticRegression

In [29]:
clf = LogisticRegression( random_state=0)
clf.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=0, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [30]:
predicted_test = clf.predict(X_train)

In [31]:
from sklearn.metrics import classification_report
print(classification_report(y_train, predicted_test, target_names=['Left','Right','Reduse','Shift']))

             precision    recall  f1-score   support

       Left       0.94      0.97      0.96     38446
      Right       0.91      0.92      0.91     34669
     Reduse       0.95      0.93      0.94     41165
      Shift       0.95      0.94      0.95     40429

avg / total       0.94      0.94      0.94    154709



In [32]:
predicted_test = clf.predict(X_test)

In [33]:
from sklearn.metrics import classification_report
print(classification_report(y_test, predicted_test, target_names=['Left','Right','Reduse','Shift']))

             precision    recall  f1-score   support

       Left       0.90      0.93      0.92      7352
      Right       0.80      0.82      0.81      7182
     Reduse       0.87      0.82      0.84      8370
      Shift       0.89      0.90      0.90      7757

avg / total       0.87      0.87      0.87     30661



# NN

In [31]:
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasClassifier
from keras.utils import np_utils

Using TensorFlow backend.


In [32]:
from keras.layers import Dropout
dummy_y = np_utils.to_categorical(y_train)

In [33]:
np.unique(y_train)

array([1, 2, 3, 4])

In [34]:
X_test.shape[1]

5649

In [35]:
#NN
# define baseline model
def baseline_model():
    # create model
    model = Sequential()
    model.add(Dense(5649, input_dim=5649, activation='relu'))
    model.add(Dense(2300, input_dim=2300, activation='relu'))
    model.add(Dense(5, activation='softmax'))
    # Compile model
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

In [38]:
estimator = KerasClassifier(build_fn=baseline_model, epochs=5, batch_size=200, verbose=1)

In [39]:
history = estimator.fit( X_train, dummy_y)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [41]:
predicted = estimator.predict(X_test)



In [44]:
print(classification_report(y_test, predicted, target_names=['Left','Right','Reduse','Shift']))

             precision    recall  f1-score   support

       Left       0.92      0.92      0.92      7352
      Right       0.84      0.78      0.81      7182
     Reduse       0.84      0.87      0.85      8370
      Shift       0.89      0.92      0.90      7757

avg / total       0.87      0.87      0.87     30661



# BB

In [40]:
gc.collect()
from sklearn.ensemble import BaggingClassifier
bagging = BaggingClassifier(LogisticRegression(),max_samples=0.5, max_features=0.5, n_jobs=8)
bagging.fit(X_train, y_train)

BaggingClassifier(base_estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
         bootstrap=True, bootstrap_features=False, max_features=0.5,
         max_samples=0.5, n_estimators=10, n_jobs=8, oob_score=False,
         random_state=None, verbose=0, warm_start=False)

In [41]:
predicted_test = bagging.predict(X_train)

In [42]:
from sklearn.metrics import classification_report
print(classification_report(y_train, predicted_test, target_names=['Left','Right','Reduse','Shift']))

             precision    recall  f1-score   support

       Left       0.89      0.94      0.92     38446
      Right       0.84      0.86      0.85     34669
     Reduse       0.92      0.84      0.88     41165
      Shift       0.90      0.92      0.91     40429

avg / total       0.89      0.89      0.89    154709



In [45]:
predicted_test = bagging.predict(X_test)

In [46]:
from sklearn.metrics import classification_report
print(classification_report(y_test, predicted_test, target_names=['Left','Right','Reduse','Shift']))

             precision    recall  f1-score   support

       Left       0.88      0.93      0.90      7352
      Right       0.79      0.83      0.81      7182
     Reduse       0.89      0.77      0.83      8370
      Shift       0.87      0.90      0.89      7757

avg / total       0.86      0.86      0.86     30661



# Model Results: 

In [None]:
#LogisticRegression beter than SGD on 0.06
#LogisticRegression using words cause overfiting  (0.09) score on validation 0.87
#LogisticRegression using word emedings overfiting (0.02) but score also decrese on 0.02, score on validation 0.85
#NN using word emedings  score on validation 0.87, but
#BB on LogisticRegression 0.86

# feature importance

In [33]:
clf.coef_[0]

array([ 0.04088878,  0.10514956,  0.64145491, ...,  0.06549302,
       -0.30445747,  0.66288475])

In [42]:
np.array( list(vectorizer.vocabulary_.keys()))[clf.coef_[0].argsort()[::-1][:40]]

array(['s0_vector+22', 's0_left_item=VERB_PROPN', 's0_vector+100',
       's1_item_right=SCONJ_PART', 's0_left_item=ADV_PRON', 's1_vector+2',
       's1_left_right=VERB_INTJ', 's0_vector+14', 's0_left_item=ADV_ADP',
       's0_left_right=_ADP', 'q1_left_item=ADV_ADP',
       'q0_left_right=SCONJ_PROPN', 's1_item_right=PART_ADJ',
       'q2_left_right=VERB_AUX', 's1_item_right=PART_NUM',
       'q1_left_right=ADJ_DET', 's1_vector+3', 'q2_item_right=ADV_',
       's0_vector+38', 'q2_left_right=ADJ_ADV', 'q1_left_item=PRON_AUX',
       'q2_left_right=X_ADP', 's1_vector+123', 'q1_left_item=INTJ_VERB',
       's1_item_right=ADJ_PRON', 's0_vector+23', 'q2_vector+146',
       's0_vector+18', 's0_vector+129', 'q0_left_right=VERB_',
       's0_vector+7', 'q2_vector+56', 's0_vector+27', 's0_vector+57',
       's1_left_right=VERB_', 's0_left_item=AUX_NOUN',
       'q1_left_item=PRON_PART', 'q2_vector+265', 's1_item_right=_ADP',
       's0_item_right=AUX_PUNCT'], dtype='<U25')

# Calculate the unlabeled attachment score

In [51]:
def dep_parse(sentence, model):
    stack, queue, relations = [OrderedDict([('id', 0),('form','ROOT'),("head",0),("lemma","root")])], sentence[:], []
    history_step1=-1
    history_step2=-1
    while queue or stack:
        if stack and not queue:
            stack.pop()
        else:
            features = extract_features(stack, queue, relations,history_step1, history_step2, len(sentence))
            features = vectorizer.transform([features])
            action = clf.predict(features)[0]
            # actual parsing
            if action == 4: #:Action_type.Shift:
                stack.append(queue.pop(0))
            elif action == 3: #:Action_type.Reduse:
                stack.pop()
            elif action == 1: #Action_type.Left:
                relations.append((stack[-1]["id"], queue[0]["id"]))
                stack.pop()
            elif action == 2: #Action_type.Right:
                relations.append((queue[0]["id"], stack[-1]["id"]))
                stack.append(queue.pop(0))
            else:
                print("Unknown action.", action)
            histiry_step2=history_step1
            history_step1=action
    return sorted(relations)

In [52]:
print([node["form"] for node in test_trees[1]])
print(dep_parse(test_trees[1], clf))
print([(node["id"], node["head"]) for node in test_trees[1]])

['Продавши', 'свій', 'шедевр', 'Меценатові', ',', 'еллінський', 'скульптор', 'споневажив', 'саме', 'мистецтво', ':', 'Ти', 'не', 'продався', ',', '—', 'гірше', '!']
[(1, 0), (2, 3), (3, 1), (4, 3), (5, 8), (6, 7), (7, 8), (8, 1), (9, 10), (10, 8), (11, 14), (12, 14), (13, 14), (14, 1), (15, 1), (16, 17), (17, 1), (18, 1)]
[(1, 8), (2, 3), (3, 1), (4, 1), (5, 1), (6, 7), (7, 8), (8, 0), (9, 10), (10, 8), (11, 14), (12, 14), (13, 14), (14, 8), (15, 17), (16, 17), (17, 14), (18, 14)]


In [53]:
gc.collect()

598

In [54]:
from tqdm import tqdm 
total, tp = 0, 0
for index in tqdm(range(0,len(test_trees))):
    tree=test_trees[index]
    golden = [(node["id"], node["head"]) for node in tree]
    predicted = dep_parse(tree, clf)
    total += len(tree)
    tp += len(set(golden).intersection(set(predicted)))

print("Total:", total)
print("Correctly defined:", tp)
print("UAS:", round(tp/total, 2))


  0%|          | 0/783 [00:00<?, ?it/s][A
  0%|          | 2/783 [00:00<00:47, 16.36it/s][A
  1%|          | 4/783 [00:00<00:57, 13.57it/s][A
  1%|          | 5/783 [00:00<01:16, 10.11it/s][A
  1%|          | 6/783 [00:00<01:22,  9.38it/s][A
  1%|          | 8/783 [00:00<01:22,  9.43it/s][A
  1%|          | 9/783 [00:01<01:27,  8.82it/s][A
  1%|▏         | 10/783 [00:01<01:30,  8.53it/s][A
  1%|▏         | 11/783 [00:01<01:34,  8.16it/s][A
  2%|▏         | 12/783 [00:01<01:52,  6.86it/s][A
  2%|▏         | 14/783 [00:01<01:47,  7.15it/s][A
  2%|▏         | 16/783 [00:02<01:40,  7.63it/s][A
  2%|▏         | 19/783 [00:02<01:28,  8.63it/s][A
  3%|▎         | 21/783 [00:02<01:23,  9.11it/s][A
  3%|▎         | 24/783 [00:02<01:17,  9.85it/s][A
  3%|▎         | 27/783 [00:02<01:12, 10.37it/s][A
  4%|▎         | 29/783 [00:02<01:16,  9.86it/s][A
  4%|▍         | 31/783 [00:03<01:16,  9.86it/s][A
  4%|▍         | 33/783 [00:03<01:23,  8.95it/s][A
  4%|▍         | 35/783 [0

Total: 14939
Correctly defined: 11055
UAS: 0.74





In [None]:
#UAS: 0.63 - on NN + VE
#UAS: 0.72 - on LR + VE
#UAS: 0.74 - on LR + VE + word
#UAS: 0.75 - on LR + word 