In [48]:
from gensim.models import KeyedVectors
from keras import metrics
from keras.models import Sequential
from keras.layers import Dense, Embedding, Dropout, LSTM, TimeDistributed, Activation
import keras_metrics

from read_files import read_file
from read_files import debug
from read_files import files

import numpy as np

from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# Location of the pre-trained word2vec vectors, relative to the notebook's
# working directory. The file name suggests lowercased, tokenized news text
# with 300-dimensional vectors -- TODO confirm against the data source.
uk_vectors_file = './data/news.lowercased.tokenized.word2vec.300d'

# Load the vectors once for the whole notebook; `binary=False` selects the
# plain-text word2vec format. `_get_embedding` below reads from this object.
uk_vectors = KeyedVectors.load_word2vec_format(uk_vectors_file, binary=False)

In [49]:
from keras import backend as K

def f1(y_true, y_pred):
    """Batch-wise F1 score, usable as a Keras metric.

    Precision and recall are derived from counts taken over the current batch
    only (values are clipped to [0, 1] and rounded before counting), then
    combined as 2 * P * R / (P + R). ``K.epsilon()`` is added to every
    denominator so a zero count cannot cause a division by zero.

    Args:
        y_true: tensor of ground-truth indicators.
        y_pred: tensor of predicted scores.

    Returns:
        A scalar tensor holding the F1 value for the batch.
    """
    def _count_ones(tensor):
        # Clip into [0, 1], round to the nearest integer, and sum the result.
        return K.sum(K.round(K.clip(tensor, 0, 1)))

    hits = _count_ones(y_true * y_pred)
    batch_precision = hits / (_count_ones(y_pred) + K.epsilon())
    batch_recall = hits / (_count_ones(y_true) + K.epsilon())

    harmonic_core = (batch_precision * batch_recall) / (batch_precision + batch_recall + K.epsilon())
    return 2 * harmonic_core

In [54]:
def precision(y_true, y_pred):
    """Batch-wise precision: rounded true positives over rounded predicted positives.

    Both tensors are clipped to [0, 1] and rounded before counting;
    ``K.epsilon()`` keeps the denominator non-zero.
    """
    hits = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    flagged = K.sum(K.round(K.clip(y_pred, 0, 1)))
    return hits / (flagged + K.epsilon())


def recall(y_true, y_pred):
    """Batch-wise recall: rounded true positives over rounded actual positives.

    Both tensors are clipped to [0, 1] and rounded before counting;
    ``K.epsilon()`` keeps the denominator non-zero.
    """
    hits = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    relevant = K.sum(K.round(K.clip(y_true, 0, 1)))
    return hits / (relevant + K.epsilon())

In [50]:
# Dependency relation labels the model can predict (they look like Universal
# Dependencies relations -- TODO confirm against the treebank).
# The position of a label in this list is its index in the one-hot vector
# built by `_get_encoded_label`, and the list length sets the width of the
# softmax output layer, so reordering or editing it changes the encoding.
all_labels = ['dislocated', 'advcl', 'amod', 'obj', 'root', 'iobj', 'discourse', 'fixed', 'goeswith', 'det', 'list',
              'ccomp', 'flat', 'mark', 'obl', 'punct', 'parataxis', 'acl', 'nummod', 'cc', 'csubj',
              'compound', 'advmod', 'xcomp', 'appos', 'conj', 'expl', 'reparandum', 'aux', 'nmod', 'nsubj',
              'case', 'vocative', 'cop']

In [41]:
def filter_trees(trees):
    """Keep only the trees in which every node's ``'head'`` is exactly an ``int``.

    The type check is strict (``type(...) == int``), so ``None``, ``bool`` and
    other non-int heads disqualify the whole tree. An empty tree is kept.

    Args:
        trees: iterable of trees, each an iterable of mapping-like nodes.

    Returns:
        A new list with the qualifying trees, in their original order.
    """
    return [
        tree
        for tree in trees
        # A list (not a generator) so every node is inspected, as before.
        if all([type(node['head']) == int for node in tree])
    ]


def _get_embedding(node):
    word = node.get('form').lower()
    try:
        return uk_vectors.get_vector(word)
    except Exception as e:
        return None


def _get_encoded_label(lbl):
    if lbl not in all_labels:
        return None
    res = np.zeros(len(all_labels))
    index = all_labels.index(lbl)
    res[index] = 1
    return res


def _get_head_data(head_index, tree):
    head = tree[head_index] if head_index < len(tree) else None
    if not head:
        return None, None, None
    
    head_embedding = _get_embedding(head)
    head_label_enc = _get_encoded_label(head.get('deprel'))
    return head, head_embedding, head_label_enc


def _get_feature_vectors(word, tree):
    child = word
    result_vector = None
    child_embedding = _get_embedding(child)
    child_label_enc = _get_encoded_label(child.get('deprel'))
    if child_label_enc is None:
        return None, None
    
    head_1_index = child.get('head')
    if not head_1_index:
        return None, None
    head_1, head_embedding_1, head_label_enc_1 = _get_head_data(head_1_index, tree)
    if head_1 is None:
        return None, None
    
    head_2_index = head_1.get('head')
    if not head_2_index:
        return None, None
    _, head_embedding_2, head_label_enc_2 = _get_head_data(head_2_index, tree)
    
    if child_embedding is not None and head_embedding_1 is not None and head_label_enc_1 is not None and head_embedding_2 is not None and head_label_enc_2 is not None:
        result_vector = np.hstack((child_embedding, head_embedding_1, head_embedding_2))
        result_label = child_label_enc
        return result_label, result_vector
    else:
        return None, None


def _get_labels_features(filename):
    trees = read_file(filename)
    trees = filter_trees(trees)
    labels, features = [], []
    for tree in trees:
        tree_features = []
        tree_features_stepped = []
        tree_labels = []
        tree_labels_stepped = []

        for word in tree:
            label, feature = _get_feature_vectors(word, tree)
            if feature is not None and label is not None:
                tree_features.append(feature)
                tree_labels.append(label)

        if len(tree_features) > 0 and len(tree_labels) > 0:
            for i in range(2, len(tree_features)):
                step_1_features = tree_features[i-2]
                step_2_features = tree_features[i-1]
                step_3_features = tree_features[i]
                
                step_1_lbls = tree_labels[i-2]
                step_2_lbls = tree_labels[i-1]
                step_3_lbls = tree_labels[i]
                
                feats = np.vstack((step_3_features, step_2_features, step_1_features))
                lbls = np.vstack((step_3_lbls, step_2_lbls, step_1_lbls))
                
                tree_features_stepped.append(feats)
                tree_labels_stepped.append(lbls)
                
            if len(tree_features_stepped) > 0 and len(tree_labels_stepped) > 0:
                tree_features = np.dstack(tree_features_stepped)
                features.append(tree_features)

                tree_labels = np.dstack(tree_labels_stepped)
                labels.append(tree_labels)

    return labels, features

In [42]:
def _to_batch_first(per_tree_arrays):
    """Join per-tree arrays along their last axis and move that axis to the front."""
    return np.moveaxis(np.dstack(per_tree_arrays), -1, 0)


# files[0] is used as the training split and files[1] as the test split.
train_labels, train_features = _get_labels_features(files[0])
test_labels, test_features = _get_labels_features(files[1])
print(set(all_labels))

# Every window becomes one sample on axis 0.
train_features = _to_batch_first(train_features)
test_features = _to_batch_first(test_features)

train_labels = _to_batch_first(train_labels)
test_labels = _to_batch_first(test_labels)

print('Features shape {}'.format(train_features.shape))  #(19136, 3, 900)
print('Labels shape {}'.format(train_labels.shape))  #(19136, 3, 34)

print('Num classes {}'.format(len(all_labels)))
print('Data preparation finished')

{'det', 'fixed', 'obj', 'obl', 'expl', 'case', 'list', 'aux', 'compound', 'reparandum', 'nummod', 'amod', 'conj', 'goeswith', 'cop', 'xcomp', 'dislocated', 'ccomp', 'flat', 'advmod', 'csubj', 'vocative', 'iobj', 'discourse', 'nsubj', 'punct', 'advcl', 'cc', 'parataxis', 'mark', 'appos', 'root', 'acl', 'nmod'}
Features shape (19136, 3, 900)
Labels shape (19136, 3, 34)
Num classes 34
Data preparation finished


In [55]:
# Input geometry: 3 time steps per sample, each a 900-dimensional vector
# (three concatenated word embeddings), matching the prepared feature shape.
timestep = 3
dim = 900

# Two stacked LSTMs that emit one output per time step, dropout, a
# per-time-step dense projection, and a softmax over the label set.
model = Sequential([
    LSTM(512, input_shape=(timestep, dim), return_sequences=True),
    LSTM(256, return_sequences=True),
    Dropout(0.5),
    TimeDistributed(Dense(256)),
    Dense(units=len(all_labels), activation='softmax'),
])

model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=[precision, recall, f1],
)

model.fit(train_features, train_labels, epochs=5, batch_size=10)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x15ab4b048>

In [56]:
# Score the trained model on the held-out test split.
loss_and_metrics = model.evaluate(test_features, test_labels, batch_size=128)

# Metric names first, then the matching values, one per line.
print(model.metrics_names, loss_and_metrics, sep='\n')

# Recorded run (loss omitted):
# ['precision', 'recall',   'f1']
# [     0.766,     0.73,   0.75]

['loss', 'precision', 'recall', 'f1']
[1.2100322953099407, 0.765577699043549, 0.7299275730309859, 0.7472582935074441]
