## Класифікація залежностей із використанням нейронних мереж

In [1]:
import conllu
import gzip
import bz2
from collections import OrderedDict
import string
import numpy as np
import pickle
import gensim
from sklearn.metrics import classification_report
from tokenize_uk import tokenize_words

In [3]:
import keras
import keras.preprocessing.text as kpt
from keras.preprocessing.text import Tokenizer
from keras.layers import Dense, Flatten, LSTM, Dropout, Embedding, Bidirectional
from keras.layers.merge import Concatenate, concatenate
from keras import Input, Model
from keras.models import Sequential
from keras.utils import to_categorical

Я спробував різні векторні представлення, вони дають схожі результати, але LexVec трохи кращі, ніж word2vec та GloVe.

In [None]:
# code for parsing text file with vectors
"""
embeddings = {}
with bz2.open('/mnt/hdd/Data/NLP/ubercorpus.lowercased.lemmatized.word2vec.300d.bz2', 'rb') as vf:
    count = 0
    for line in vf:
        if count == 0:
            count += 1
            continue
        count += 1
        line = line.decode()
        values = line.split()
        if not values:
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings[word] = coefs

pickle.dump(embeddings, open('/mnt/hdd/Data/NLP/glove_lemma_embeddings.pkl', 'wb'))"""

In [4]:
word_vectors = pickle.load(open('/mnt/hdd/Data/NLP/lex_lemma_embeddings.pkl', 'rb'))

In [5]:
fname = 'uk_iu-ud-train.conllu.gz'
with gzip.open(fname, 'rb') as f:
    raw_train = f.read().decode()

fname3 = 'uk_iu-ud-test.conllu.gz'
with gzip.open(fname3, 'rb') as f3:
    raw_test = f3.read().decode()
    
train_data = conllu.parse(raw_train)
test_data = conllu.parse(raw_test)

# fix an error in test set:
for sent in test_data:
    for w in sent:
        if w['deprel'] == 'dep':
            w['deprel'] = 'det'

Щоб використати для парсера не тільки слова, але і частини мов (як у https://cs.stanford.edu/~danqi/papers/emnlp2014.pdf), можна натренувати векторні представлення частин мови на основі цього ж самого датасету:

In [6]:
pos_sequences = []
for sent in train_data:
    seq = [w['upostag'] for w in sent]
    pos_sequences.append(seq)

In [7]:
pos_model = gensim.models.Word2Vec(pos_sequences, size=300, window=3, min_count=1, iter=100)
pos_vectors = {k:pos_model.wv[k] for k in pos_model.wv.vocab}

Я спробував два підходи: простіший, при якому ми збираємо з "золотих" дерев всю інформацію про залежності між словами, окрім власне типу залежності, і потім визначаємо тип залежності класифікатором; і складніший, у якому ми використовуємо класифікатор у процесі парсингу для визначення наступної дії (SHIFT, REDUCE, LEFT+залежність, RIGHT+залежність). При цьому використовується інформація з 3 слів стеку та 3 слів черги. Функції для першого підходу позначені як "simple".

У обох випадках я зробив одну модель з використанням тільки представлень слів, і іншу з представленнями слів і частин мови.

У мене був лишився код парсера ще з домашки, тому в деяких місцях я просто адаптував уже готові функції замість писання нових (хоча це зробило б код читабельнішим).

In [8]:
class LabeledParser():
    """
    Dependency parser using static oracle,
    for labeled dependencies.
    """
    
    def __init__(self, word_vectors, pos_vectors, train_data):
        self.ROOT = OrderedDict({'form': 'ROOT', 'id': 0, 'head': -1, 
                                 'lemma': 'ROOT', 'upostag': 'UNK',
                                 'deprel': 'root'})
        self.train_data = train_data
        self.word_vectors = word_vectors
        self.pos_vectors = pos_vectors
        # setting unknown word/POS to average vector seems to work a bit better than 0 vector
        self.avg_vec = np.average([v for (k,v) in self.word_vectors.items()], axis=0)
        self.avg_pos_vec = np.average([v for (k,v) in self.pos_vectors.items()], axis=0)
        # initialize indices for word vectors (only for words present in train data)
        self.dictionary = self.build_dictionary(self.train_data)
        # initialize indices for labels
        self.label_dict = self.build_label_dict(self.train_data)
        self.label_dict_s = self.build_label_dict_simple(self.train_data)
        # initialize indices for POS vectors
        self.pos_index = {pos[0]:i for (i, pos) 
                          in enumerate(self.pos_vectors.items())}
        self.pos_vectors.update({'UNK': self.avg_pos_vec})
        self.pos_index.update({'UNK': len(self.pos_index)})
    
    def strip_colon(self, deprel):
        """
        Strip the second part of deprel (after colon).
        """
        if ':' in deprel:
            new = deprel.split(':')[0].strip()
            return new
        else:
            return deprel
        
    def make_action(self, action, stack, queue, relations):
        """
        Applies action to the stack, the queue, and the relations.
        """
        w1 = stack[-1]
        w2 = queue[0]
        action = action.split('_')[0]
        if action == 'SHIFT':
            stack.append(queue.pop(0))
        elif action == 'REDUCE':
            stack.pop()
        elif action == 'LEFT':
            relations.append((w1['id'], self.strip_colon(w1['deprel']), w2['id']))
            stack.pop()
        elif action == 'RIGHT':
            relations.append((w2['id'], self.strip_colon(w2['deprel']), w1['id']))
            stack.append(queue.pop(0))
        return stack, queue, relations
    
    def apply_actions(self, tree, train=False, pos=False):
        """
        Produce dependencies for the tree with known dependencies.
        If train=True, also get features and labels in the process.
        If pos=True, also get POS indices as features.
        """
        stack = [self.ROOT]
        queue = tree[:]
        relations = []
        label_indices = []
        data_indices = []
        pos_indices = []
        while stack and queue:
            top_stack = stack[-1] if stack else None
            top_queue = queue[0] if queue else None
            action = self.oracle(top_stack, top_queue, relations)
            if train:
                label_indices.append(self.label_dict[action])
                data_indices.append(self.get_indices(stack, queue))
                pos_indices.append(self.get_pos(stack, queue))
            stack, queue, relations = self.make_action(action, stack, queue, relations)
        if train and not pos:
            return relations, data_indices, label_indices
        if train and pos:
            return relations, data_indices, label_indices, pos_indices
        return relations
    
    def get_relations(self, tree):
        """
        For 'simple' model.
        """
        relations = []
        for w in tree:
            w1 = w['lemma']
            deprel = self.strip_colon(w['deprel'])
            if w['head'] == 0:
                w2 = 'root'
            else:
                w2 = tree[w['head']-1]['lemma']
            relations.append((w1, deprel, w2))
        return relations
    
    def get_relation_data(self, tree):
        """
        For 'simple' model.
        """
        relations = self.get_relations(tree)
        label_indices = []
        word_indices = []
        for r in relations:
            w1, deprel, w2 = r
            label_indices.append(self.label_dict_s[deprel])
            w1i = self.word_to_index(w1)
            w2i = self.word_to_index(w2)
            word_indices.append((w1i, w2i))
        return word_indices, label_indices
    
    def get_relation_pos_data(self, tree):
        """
        POS data from tree for 'simple' model.
        """
        relations = []
        for w in tree:
            w1 = w['upostag']
            if w['head'] == 0:
                w2 = 'UNK'
            else:
                w2 = tree[w['head']-1]['upostag']
            relations.append((w1, w2))
        pos_indices = []
        for r in relations:
            w1, w2 = r
            w1i = self.pos_index.get(w1)
            w2i = self.pos_index.get(w2)
            pos_indices.append((w1i, w2i))
        return pos_indices
    
    def word_to_index(self, word):
        try:
            wi = self.dictionary[word]
        except:
            wi = self.dictionary['unk']
        return wi
    
    def pad_stack(self, stack, length=3):
        """
        Make sure stack is always of same length.
        """
        top_stack = stack[-length:]
        diff = length-len(top_stack)
        unk_w = {'lemma': 'unk', 'upostag': 'UNK'}
        padded = diff*[unk_w] + [w for w in top_stack]
        return padded
    
    def append_queue(self, queue, length=3):
        """
        Make sure queue is always of same length.
        """
        top_queue = queue[:length]
        diff = length-len(top_queue)
        unk_w = {'lemma': 'unk', 'upostag': 'UNK'}
        appended = [w for w in top_queue] + diff*[unk_w]
        return appended

    def get_indices(self, stack, queue):
        """
        Get word indices for stack and queue.
        """
        top_stack = self.pad_stack(stack, 3)
        top_queue = self.append_queue(queue, 3)
        res = [self.word_to_index(w['lemma'].lower()) for w in top_stack] +\
              [self.word_to_index(w['lemma'].lower()) for w in top_queue]
        return np.array(res)
    
    def get_pos(self, stack, queue):
        """
        Get POS indices for stack and queue.
        """
        top_stack = self.pad_stack(stack, 3)
        top_queue = self.append_queue(queue, 3)
        res = [self.pos_index.get(w['upostag']) for w in top_stack] +\
              [self.pos_index.get(w['upostag']) for w in top_queue]
        return np.array(res)
    
    def oracle(self, top_stack, top_queue, relations):
        """
        Returns the right action given the state
        of the stack, the queue, and the relations.
        """
        if top_stack and not top_queue:
            return 'REDUCE'
        elif top_queue['head'] == top_stack['id']:
            return 'RIGHT' + '_' + self.strip_colon(top_queue['deprel'])
        elif top_stack['head'] == top_queue['id']:
            return 'LEFT' + '_' + self.strip_colon(top_stack['deprel'])
        elif (top_stack['id'] in [i[0] for i in relations] and 
             top_queue['head'] < top_stack['id']):
            return 'REDUCE'
        else:
            return 'SHIFT'
    
    def build_dictionary(self, train_data):
        """
        Create word dictionary using keras tools.
        """
        tokenizer = Tokenizer(oov_token='unk', lower=True)
        texts = []
        for tree in train_data:
            text = ' '.join([w['lemma'] for w in tree])
            texts.append(text)
        tokenizer.fit_on_texts(texts)
        sequences = tokenizer.texts_to_sequences(texts)
        padded_sequences = keras.preprocessing.sequence.pad_sequences(sequences, maxlen=12)
        dictionary = tokenizer.word_index
        return dictionary
    
    def build_label_dict_simple(self, train_data):
        """
        Label dict without parser actions ('LEFT' etc)
        """
        all_deps = []
        for t in train_data:
            for w in t:
                all_deps.append(self.strip_colon(w['deprel']))
        all_deps = set(all_deps)
        label_index = {k:i for (i,k) in enumerate(list(all_deps))}
        return label_index
    
    def build_label_dict(self, train_data):
        """
        Dictionary of labels + actions ('LEFT_case' etc)
        """
        all_deps = []
        for t in train_data:
            for w in t:
                all_deps.append(self.strip_colon(w['deprel']))
        all_deps = set(all_deps)
        left = ['LEFT_'+dep for dep in list(all_deps)]
        right = ['RIGHT_'+dep for dep in list(all_deps)]
        labels = left + right + ['SHIFT', 'REDUCE']
        label_index = {k:i for (i,k) in enumerate(labels)}
        return label_index
        
    def word_emb_layer(self, input_len=2):
        """
        Create Embedding layer for word embeddings.
        """
        if not self.dictionary:
            return None
        embedding_matrix = np.zeros((len(self.dictionary) + 1, 300))
        for word, i in self.dictionary.items():
            embedding_vector = self.word_vectors.get(word)
            if embedding_vector is not None:
                embedding_matrix[i] = embedding_vector
            else:
                embedding_matrix[i] = self.avg_vec
        embedding_layer = Embedding(len(self.dictionary) + 1,
                            300,
                            weights=[embedding_matrix],
                            input_length=input_len,
                            trainable=False)
        return embedding_layer
    
    def pos_emb_layer(self, input_len=2):
        """
        Create Embedding layer for POS embeddings.
        """
        emb_matrix = np.zeros((len(self.pos_index)+1, 300))
        for pos, i in self.pos_index.items():
            emb_vector = self.pos_vectors.get(pos)
            emb_matrix[i] = emb_vector
        emb_layer = Embedding(len(self.pos_index)+1,
                              300,
                              weights=[emb_matrix],
                              input_length=input_len,
                              trainable=False)
        return emb_layer
    
    def get_data_indices(self, data):
        """
        Get the input data (word indices).
        """
        x_indices, y_indices = [], []
        for tree in data:
            rels, data_i, label_i = self.apply_actions(tree, train=True)
            x_indices.extend(data_i)
            y_indices.extend(label_i)
        return np.array(x_indices), np.array(y_indices)
    
    def get_data_indices_simple(self, data):
        """
        Input data for 'simple' model.
        """
        x_indices, y_indices = [], []
        for tree in data:
            data_i, label_i = self.get_relation_data(tree)
            x_indices.extend(data_i)
            y_indices.extend(label_i)
        return np.array(x_indices), np.array(y_indices)
    
    def get_pos_indices(self, data):
        """
        Get the input data (POS indices).
        """
        pos_indices = []
        for tree in data:
            rels, data_i, label_i, pos_i = self.apply_actions(tree, train=True, pos=True)
            pos_indices.extend(pos_i)
        return np.array(pos_indices)
    
    def get_pos_indices_simple(self, data):
        """
        POS inputs for 'simple' model.
        """
        pos_indices = []
        for tree in data:
            pos_i = self.get_relation_pos_data(tree)
            pos_indices.extend(pos_i)
        return np.array(pos_indices)
    
    def NN_simple(self, test_data):
        """
        Model predicting only type of relation, 
        using word embeddings.
        """
        model = Sequential()
        model.add(self.word_emb_layer(2))
        model.add(Dense(1024, activation='relu'))
        model.add(Dropout(0.3))
        model.add(Dense(1024, activation='relu'))
        model.add(Dropout(0.3))
        model.add(Dense(1024, activation='relu'))
        model.add(Flatten())
        model.add(Dense(len(self.label_dict_s), activation='softmax'))
        model.compile(loss='categorical_crossentropy',
                      optimizer='adam',
                      metrics=['acc'])
        x_train, y_train = self.get_data_indices_simple(self.train_data)
        y_train = to_categorical(y_train)
        x_test, y_test = self.get_data_indices_simple(test_data)
        y_test = to_categorical(y_test)
        model.fit(x_train, y_train, epochs=5, batch_size=128,
                  validation_data=(x_test, y_test))
        predicted = model.predict(x_test)
        return predicted
    
    def NN_simple_pos(self, test_data):
        """
        Model predicting only type of relation, using both
        word and POS embeddings.
        """
        model_word_in = Input(shape=(2, ))
        model_word_emb = self.word_emb_layer(2)(model_word_in)
        model_word = Model(model_word_in, model_word_emb)
        model_pos_in = Input(shape=(2, ))
        model_pos_emb = self.pos_emb_layer(2)(model_pos_in)
        model_pos = Model(model_pos_in, model_pos_emb)
        concatenated = concatenate([model_word_emb, model_pos_emb])
        x = Bidirectional(LSTM(1024, dropout=0.2, recurrent_dropout=0.2,
                 return_sequences=True))(concatenated)
        x = Dense(1024, activation='relu')(concatenated)
        x = Dropout(0.5)(x)
        x = Dense(1024, activation='relu')(x)
        x = Flatten()(x)
        out = Dense(len(self.label_dict_s), activation='softmax', name='output_layer')(x)
        
        merged_model = Model([model_word_in, model_pos_in], out)
        merged_model.compile(loss='categorical_crossentropy', optimizer='adam', 
                             metrics=['accuracy'])
        x_train, y_train = self.get_data_indices_simple(self.train_data)
        pos_train = self.get_pos_indices_simple(self.train_data)
        y_train = to_categorical(y_train)
        x_test, y_test = self.get_data_indices_simple(test_data)
        pos_test = self.get_pos_indices_simple(test_data)
        y_test = to_categorical(y_test)
        merged_model.fit([x_train, pos_train], y_train, batch_size=128, epochs=5,
                         validation_data=([x_test, pos_test], y_test))
        predicted = merged_model.predict([x_test, pos_test])
        return predicted
    
    def NN(self, test_data):
        """
        Model predicting next action for transition parser 
        with LSTM and Dense layers and taking word embeddings as input.
        """
        model = Sequential()
        model.add(self.word_emb_layer(6))
        model.add(Bidirectional(LSTM(128, dropout=0.2, recurrent_dropout=0.2,
                 return_sequences=True)))
        model.add(Dense(256, activation='relu'))
        model.add(Dropout(0.3))
        model.add(Dense(128, activation='relu'))
        model.add(Flatten())
        model.add(Dense(len(self.label_dict), activation='softmax'))
        model.compile(loss='categorical_crossentropy',
                      optimizer='rmsprop',
                      metrics=['acc'])
        x_train, y_train = self.get_data_indices(self.train_data)
        y_train = to_categorical(y_train)
        x_test, y_test = self.get_data_indices(test_data)
        y_test = to_categorical(y_test)
        model.fit(x_train, y_train, epochs=5, batch_size=128,
                  validation_data=(x_test, y_test))
        return model
    
    def NN_pos(self, test_data):
        """
        Model predicting next action for transition parser 
        with LSTM and Dense layers and taking both word embeddings 
        and POS embeddings as input.
        """
        model_word_in = Input(shape=(6, ))
        model_word_emb = self.word_emb_layer(6)(model_word_in)
        model_word = Model(model_word_in, model_word_emb)
        model_pos_in = Input(shape=(6, ))
        model_pos_emb = self.pos_emb_layer(6)(model_pos_in)
        model_pos = Model(model_pos_in, model_pos_emb)
        concatenated = concatenate([model_word_emb, model_pos_emb])
        x = Bidirectional(LSTM(128, dropout=0.2, recurrent_dropout=0.2,
                 return_sequences=True))(concatenated)
        x = Dense(128, activation='relu')(x)
        x = Dropout(0.3)(x)
        x = Dense(128, activation='relu')(x)
        x = Flatten()(x)
        out = Dense(len(self.label_dict), activation='softmax')(x)
        model = Model([model_word_in, model_pos_in], out)
        model.compile(loss='categorical_crossentropy',
                      optimizer='rmsprop',
                      metrics=['acc'])
        x_train, y_train = self.get_data_indices(self.train_data)
        pos_train = self.get_pos_indices(self.train_data)
        y_train = to_categorical(y_train)
        x_test, y_test = self.get_data_indices(test_data)
        pos_test = self.get_pos_indices(test_data)
        y_test = to_categorical(y_test)
        model.fit([x_train, pos_train], y_train, epochs=5, batch_size=128,
                  validation_data=([x_test, pos_test], y_test))
        return model

In [9]:
lp = LabeledParser(word_vectors, pos_vectors, train_data)

Перша проста модель - кілька Dense шарів із дропаутами.

In [10]:
predicted_simple = lp.NN_simple(test_data)

Train on 75098 samples, validate on 14939 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


Друга проста модель використовує функціональний API бібліотеки keras, тому що в sequential API складніше об'єднати два набори векторних представлень (слова і POS-теги), що покращує точність до 80%. Також тут є шар LSTM - він дав додаткове покращення, незважаючи на відсутність великих послідовностей у даних.

In [11]:
predicted_simple_pos = lp.NN_simple_pos(test_data)

Train on 75098 samples, validate on 14939 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


Точність на тестовій вибірці понад 83% - найвищий результат, який у мене виходив. Детальніше можна подивитись у таблиці - деякі типи залежностей, які легко передбачити через частину мови, мають практично 100% точність.

In [12]:
lab_dict = {v:k for (k,v) in lp.label_dict_s.items()}
pred_indices = np.argmax(predicted_simple_pos, axis=1)
pred_labels = [lab_dict[i] for i in pred_indices]
true_labels = [lp.strip_colon(w['deprel']) for tree in test_data for w in tree]
print(classification_report(true_labels, pred_labels))

             precision    recall  f1-score   support

        acl       0.77      0.91      0.83       176
      advcl       0.31      0.28      0.29       149
     advmod       0.93      0.99      0.96       644
       amod       0.98      1.00      0.99      1459
      appos       0.21      0.04      0.06       105
        aux       1.00      0.93      0.96        27
       case       1.00      1.00      1.00      1373
         cc       0.99      0.99      0.99       554
      ccomp       0.60      0.31      0.41        83
   compound       0.84      0.46      0.59        81
       conj       0.55      0.32      0.40       787
        cop       0.98      1.00      0.99        80
      csubj       0.78      0.66      0.71        47
        det       0.99      0.98      0.98       447
  discourse       0.89      0.91      0.90       183
       expl       0.45      0.77      0.57        13
      fixed       0.82      0.74      0.78        31
       flat       0.73      0.59      0.65   

  'precision', 'predicted', average, warn_for)


Перша "складна" модель також використовує LSTM, але без векторних представлень POS-тегів.

In [13]:
lp.NN(test_data)

Train on 138889 samples, validate on 27802 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.models.Sequential at 0x7fc98dc33f28>

Нарешті, остання модель використовує вектори POS-тегів і це дозволяє досягнути 79% точності на тестовій вибірці.

In [14]:
model_pos = lp.NN_pos(test_data)

Train on 138889 samples, validate on 27802 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


У цьому випадку є сенс окремо порахувати labeled attachment score, тому що цей парсер неідеальний навіть для золотих дерев, тому накопичення помилок парсера та помилок класифікатора може дати гірший результат:

In [15]:
def predict_tree(sentence, model, parser):
    stack, queue, relations = [parser.ROOT], sentence[:], []
    lab_dict = {v:k for (k,v) in parser.label_dict.items()}
    while queue and stack:
        word_i = np.array([parser.get_indices(stack, queue)])
        pos_i = np.array([parser.get_pos(stack, queue)])
        action_i = model.predict([word_i, pos_i])
        pred_index = np.argmax(action_i, axis=1)[0]
        action_dep = lab_dict[pred_index]
        action = action_dep.split('_')[0]
        if len(action_dep.split('_')) > 1:
            deprel = action_dep.split('_')[1]
        if action == 'SHIFT':
            stack.append(queue.pop(0))
        elif action == 'REDUCE':
            stack.pop()
        elif action == 'LEFT':
            relations.append((stack[-1]["id"], deprel, queue[0]["id"]))
            stack.pop()
        elif action == 'RIGHT':
            relations.append((queue[0]["id"], deprel, stack[-1]["id"]))
            stack.append(queue.pop(0))
        else:
            print("Unknown action.")
    return sorted(relations)

total, tp = 0, 0
for tree in test_data:
    golden = [(node["id"], lp.strip_colon(node["deprel"]), node["head"]) for node in tree]
    predicted = predict_tree(tree, model_pos, lp)
    total += len(tree)
    tp += len(set(golden).intersection(set(predicted)))

print("Total:", total)
print("Correctly defined:", tp)
print("LAS:", round(tp/total, 2))

Total: 14939
Correctly defined: 9618
LAS: 0.64
