In [1]:
train_file = "../../../../NLP/dep/UD_Ukrainian-IU-master/uk_iu-ud-train.conllu"
test_file = "../../../../NLP/dep/UD_Ukrainian-IU-master/uk_iu-ud-test.conllu"

In [2]:
with open(train_file, encoding='utf-8') as f:
    train = f.read()

with open(test_file,  encoding='utf-8') as f:
    test = f.read()

In [3]:
from conllu import parse
from collections import deque
import collections
from sklearn import linear_model
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import classification_report

In [4]:
from enum import Enum
from collections import OrderedDict

In [5]:
train_trees_original = parse(train)

In [6]:
test_trees_original = parse(test)

In [7]:
def get_relations_from_tree(t):
    return [(x['id'],x['head']) for x in t]

In [8]:
def extract_features(stack, queue, rel):
    features = {}
    if (len(stack) > 0):
        features["s_0_form"] = stack[-1]["form"]
        features["s_0_lemma"] = stack[-1]["lemma"]
        features["s_0_postag"] = stack[-1]["upostag"]
        
    
    if len(stack) > 1:
        features["s_1_postag"] = stack[-2]["upostag"]
    
    if (len(queue) > 0):
        features["q_0_form"] = queue[0]["form"]
        features["q_0_lemma"] = queue[0]["lemma"]
        features["q_0_postag"] = queue[0]["upostag"]
    
    if (len(queue) > 1):
        features["q_1_form"] = queue[1]["form"]
        features["q_1_postag"] = queue[1]["upostag"]
    
    if (len(queue) > 2):
        features["q_1_postag"] = queue[2]["upostag"]
    
    if (len(queue) > 3):
        features["q_1_postag"] = queue[3]["upostag"]
    return features

def static_oracle(tree):
    stack = [collections.OrderedDict({'id':0, 'head':0, "form": "ROOT", "lemma":"", "upostag":""})]
    queue = deque(list(tree))
    actions = []
    features = []
    rel = []
    while(len(queue) > 0):
        head_s = stack[-1]
        head_q = queue[0]        
        features.append(extract_features(stack, queue, rel))
        if head_s["head"] == head_q['id']:
            actions.append("LEFT")
            rel.append((head_s['id'], head_q['id']))
            stack.pop()
        elif head_q["head"] == head_s["id"]:           
            actions.append("RIGHT")
            rel.append((head_q['id'], head_s['id']))
            stack.append(head_q)
            queue.popleft()            
        elif head_s["id"] in [x for (x,y) in rel] and head_s["id"] not in [x['head'] for x in queue]:
            actions.append("REDUCE")
            stack.pop()
        else:          
            actions.append("SHIFT")
            stack.append(head_q)
            queue.popleft()
    while(len(stack) > 0):
        features.append(extract_features(stack,queue,rel))        
        actions.append("REDUCE")
        stack.pop()    
    return actions,features

In [9]:
def check_cross(tree):
    ranges = [(min(x['id'], x['head']), max(x['id'], x['head'])) for x in tree]
    for x in tree:
        childI = x['id']
        headI = x['head']    
        for r1,r2 in ranges:        
            if (r1 < childI < r2):
                if headI < r1 or headI > r2:
                    return True
            if (r1 < headI < r2):
                if childI < r1 or childI > r2:
                    return True
    return False
train_trees = [t for t in train_trees_original if not check_cross(t)]
test_trees = [t for t in test_trees_original if not check_cross(t)]

In [10]:
def get_acts_with_features(trees):
    actions = []
    features = []
    for t in trees:
        a,f = static_oracle(t)
        actions += a
        features += f
    return actions,features

In [11]:
vec = DictVectorizer()

In [12]:
y_train,X_train_features = get_acts_with_features(train_trees)
# y_test,X_test_features = get_acts_with_features(test_trees)
X_train = vec.fit_transform(X_train_features)
# X_test = vec.transform(X_test_features)

In [13]:
logistic = linear_model.LogisticRegression()
logistic.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [25]:
predicted_train = logistic.predict(X_train)
predicted_test = logistic.predict(X_test)

In [26]:
print(classification_report(y_train, predicted_train))
print()
print(classification_report(y_test, predicted_test))

             precision    recall  f1-score   support

       LEFT       0.93      0.96      0.95     34862
     REDUCE       0.95      0.92      0.93     36150
      RIGHT       0.93      0.92      0.92     31945
      SHIFT       0.91      0.92      0.91     34862

avg / total       0.93      0.93      0.93    137819


             precision    recall  f1-score   support

       LEFT       0.87      0.89      0.88      6997
     REDUCE       0.84      0.77      0.81      7630
      RIGHT       0.80      0.85      0.82      6882
      SHIFT       0.77      0.78      0.77      6997

avg / total       0.82      0.82      0.82     28506



In [14]:
def parse(tree, classifier):
    stack = [collections.OrderedDict({'id':0, 'head':0, "form": "ROOT", "lemma":"", "upostag":""})]
    queue = deque(list(tree))
    rel = []
    LEFT = 0
    REDUCE = 1
    RIGHT = 2
    SHIFT = 3
    
    def get_valid(stack, queue, heads):
        forbidden = []
        if stack[-1]["id"] in heads or stack[-1]["id"] == 0:
            forbidden.append(LEFT)
        if queue[0]["id"] in heads:
            forbidden.append(RIGHT)
        if len(queue) == 1:            
            forbidden.append(SHIFT)
        if len(queue) == 1 and any([x["id"] != 0 and x["id"] not in heads for x in stack]): 
            forbidden.append(RIGHT) 
        if stack[-1]["id"] not in heads:
            forbidden.append(REDUCE)
        return [x for x in [LEFT,REDUCE,RIGHT,SHIFT] if x not in forbidden]
    
    def get_action(stack, queue, rel, valid):        
        predict_prob = list(classifier.predict_proba(vec.transform(extract_features(stack,queue,rel)))[0])
        ind =  predict_prob.index(max([x for  i, x in enumerate(predict_prob) if i in valid]))
        true_ind = predict_prob.index(max([x for  i, x in enumerate(predict_prob)]))
        return [LEFT,REDUCE,RIGHT,SHIFT][ind]
    
    while(queue):
        valid = get_valid(stack,queue,[x for (x,y) in rel])
        if valid == [] : break
        action = get_action(stack,queue,rel,valid)
        head_q = queue[0]
        if action == SHIFT:
            stack.append(head_q)
            queue.popleft()
            continue
        head_s = stack[-1]
        if action == LEFT:            
            rel.append((head_s['id'], head_q['id']))
            stack.pop()
        elif action == RIGHT:
            rel.append((head_q['id'], head_s['id']))
            stack.append(head_q)
            queue.popleft()            
        elif action == REDUCE:
            stack.pop()
    rel.sort()
    return rel

In [15]:
def test_parsing(trees, classifier):
    predicted = []
    true = []
    for i,t in enumerate(trees):
        p = parse(t, classifier)
        t = get_relations_from_tree(t)
        if len(p) != len(t):
            print("FAIL:",i)
            continue
        predicted += p
        true += t
    matched = sum([1 for i in range(len(predicted)) if predicted[i] == true[i]])
    print (matched/len(predicted))

In [16]:
test_parsing(train_trees, logistic)

0.8320086218510037


In [17]:
test_parsing(test_trees, logistic)

0.6522083723611212


In [18]:
test_parsing(train_trees_original, logistic)

0.8097952009374417


In [19]:
test_parsing(test_trees_original, logistic)

0.6533235156302296


In [20]:
from sklearn import svm

In [27]:
svc = svm.SVC(probability = True)

In [28]:
svc.fit(X_train, y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=True, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [29]:
test_parsing(test_trees, svc)

0.09618848620217595


In [30]:
test_parsing(test_trees_original, svc)

0.09451770533502912


In [31]:
test_parsing(train_trees, svc)

0.08479650336042631


In [32]:
test_parsing(train_trees_original, svc)

0.08337106181256491
