In [7]:
import bz2
import os
import numpy as np
%matplotlib inline

import conllu
import pandas as pd
import numpy as np
import spacy
import en_core_web_lg
import collections
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.model_selection import cross_validate
from sklearn.metrics import classification_report, make_scorer, precision_recall_fscore_support
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from tqdm import tqdm, tqdm_notebook

In [3]:
from keras.models import Sequential
from keras.layers import Dense

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [9]:
# Load embeddings
# The archive location can be overridden with the GLOVE_EMBEDDINGS_PATH
# environment variable; the default is the original hard-coded location.
EMBEDDINGS_PATH = os.environ.get(
    "GLOVE_EMBEDDINGS_PATH",
    "/Users/serhiinechyporhuk/courses/ubercorpus.cased.tokenized.glove.300d.bz2",
)
# Number of lines expected in the archive; only used to size the progress bar.
EMBEDDINGS_TOTAL = 595119

# Maps token -> float32 vector. bz2.open defaults to binary mode, so every key
# is a `bytes` object and lookups must encode the word first.
embedding_index = {}
# The context manager closes the archive even if a line fails to parse.
with bz2.open(EMBEDDINGS_PATH) as f:
    for line in tqdm_notebook(f, total=EMBEDDINGS_TOTAL):
        values = line.split()
        if not values:
            # Skip blank lines instead of failing on values[0].
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embedding_index[word] = coefs

HBox(children=(IntProgress(value=0, max=595119), HTML(value='')))




In [10]:
# Sanity check: report how many vectors ended up in embedding_index.
print("Number of word embeddings:", len(embedding_index))

Number of word embeddings: 595119


In [12]:
def _load_conllu(path):
    """Read a CoNLL-U file and return the result of ``conllu.parse``.

    The file is decoded explicitly as UTF-8 (the encoding required by the
    CoNLL-U format) instead of the platform default, and the handle is closed
    as soon as the text has been read.
    """
    with open(path, encoding="utf-8") as conllu_file:
        return conllu.parse(conllu_file.read())


train = _load_conllu("uk_iu-ud-train.conllu")
test = _load_conllu("uk_iu-ud-test.conllu")
dev = _load_conllu("uk_iu-ud-dev.conllu")

In [13]:
# Give every distinct surface form in the training set an integer id, in order
# of first appearance, and collect the set of dependency labels seen.
word_index = {}
labels_uniq = set()
for sentence in tqdm_notebook(train):
    for token in sentence:
        # setdefault only inserts unseen forms; len() is the next free id.
        word_index.setdefault(token['form'], len(word_index))
        labels_uniq.add(token['deprel'])
print("Length of dictionary:", len(word_index))
print("Unique labels:", len(labels_uniq))

HBox(children=(IntProgress(value=0, max=4513), HTML(value='')))


Length of dictionary: 23976
Unique labels: 49


In [14]:
# Map each dependency label to an integer class id, numbered in sorted order.
labels = {deprel: class_id for class_id, deprel in enumerate(sorted(labels_uniq))}

In [16]:
# Width of each embedding vector (second dimension of the embedding matrix).
VECT_LENGTH=300
# Number of distinct word forms indexed from the training set.
VOCAB_LENGTH = len(word_index)
# NOTE(review): not referenced in the cells visible here; presumably the number
# of input words per example (child, head) — confirm or remove.
INPUT_LENGTH = 2
# Number of distinct dependency labels; used as the size of the output layer.
OUTPUT_LENGTH = len(labels_uniq)

In [17]:
from keras.utils import to_categorical
def prepare(rows):
    """Turn parsed sentences into (child, head) index pairs with one-hot labels.

    For every token that has a positive integer head and a dependency label
    present in the module-level ``labels`` mapping, one example is emitted:
    the ``word_index`` id of the token, the ``word_index`` id of its head, and
    the label id. Forms missing from ``word_index`` map to ``VOCAB_LENGTH``.

    Args:
        rows: iterable of parsed sentences; each sentence is an iterable of
            token mappings with 'id', 'form', 'head' and 'deprel' keys.

    Returns:
        ``((X_child, X_head), y)`` where both X arrays have shape ``(n, 1)``
        and ``y`` is a one-hot matrix of shape ``(n, len(labels))``.
    """
    child_ids = []
    head_ids = []
    label_ids = []
    for row in tqdm_notebook(rows):
        # Resolve heads through token ids rather than list positions: the two
        # only coincide when the sentence has no multi-word or empty-node lines.
        tokens_by_id = {
            tok.get('id'): tok for tok in row if isinstance(tok.get('id'), int)
        }
        for odict in row:
            head_id = odict['head']
            # Tokens without a real head (root, or lines whose head is not an
            # integer) are skipped instead of raising on `None > 0`.
            if not isinstance(head_id, int) or head_id <= 0:
                continue
            if odict['deprel'] not in labels:
                continue
            head_token = tokens_by_id.get(head_id)
            if head_token is None:
                # Head id does not refer to a token of this sentence.
                continue
            child_ids.append(word_index.get(odict['form'], VOCAB_LENGTH))
            head_ids.append(word_index.get(head_token['form'], VOCAB_LENGTH))
            label_ids.append(labels[odict['deprel']])
    X_child = np.asarray(child_ids, dtype=int).reshape(-1, 1)
    X_head = np.asarray(head_ids, dtype=int).reshape(-1, 1)
    # Fix the one-hot width to the full label set; otherwise it depends on the
    # largest label id that happens to occur in this particular split.
    y = to_categorical(label_ids, num_classes=len(labels))
    return (X_child, X_head), y

# Build the child/head index arrays and one-hot labels for the training split.
(X_train1, X_train2), y_train = prepare(train)

HBox(children=(IntProgress(value=0, max=4513), HTML(value='')))




In [18]:
# Sanity check: vocabulary size, number of label classes, number of training pairs.
VOCAB_LENGTH, OUTPUT_LENGTH, len(X_train1)

(23976, 49, 70585)

In [19]:
# Row k holds the pretrained vector of the word whose word_index id is k.
# Words without a pretrained vector, and the extra last row used for
# out-of-vocabulary forms, stay all-zero.
embedding_matrix = np.zeros((VOCAB_LENGTH+1, VECT_LENGTH))
for form, row_idx in word_index.items():
    # embedding_index is looked up by UTF-8 encoded bytes.
    key = form.encode('utf8')
    if key in embedding_index:
        embedding_matrix[row_idx] = embedding_index[key]

In [20]:
# Expect (VOCAB_LENGTH + 1, VECT_LENGTH).
embedding_matrix.shape

(23977, 300)

In [21]:
from keras import backend as K

def f1(y_true, y_pred):
    """Batch-wise F1 score built from Keras backend ops.

    Values are clipped to [0, 1] and rounded before counting, and the counts
    are summed over the whole batch tensor:

    * true positives      -- rounded entries of ``y_true * y_pred``
    * possible positives  -- rounded entries of ``y_true``
    * predicted positives -- rounded entries of ``y_pred``

    Precision and recall are the true-positive count divided by the predicted
    and possible counts respectively; ``K.epsilon()`` is added to every
    denominator to avoid division by zero. The result is their harmonic mean
    for the current batch only, not an epoch-level score.
    """
    eps = K.epsilon()

    def _count(tensor):
        # Number of entries that round to 1 after clipping to [0, 1].
        return K.sum(K.round(K.clip(tensor, 0, 1)))

    hits = _count(y_true * y_pred)
    recall_value = hits / (_count(y_true) + eps)
    precision_value = hits / (_count(y_pred) + eps)
    return 2*((precision_value*recall_value)/(precision_value+recall_value+eps))


In [22]:
from keras.layers import Input, Embedding, Concatenate, Flatten
from keras.models import Model

def _embedded_word_input(name):
    """Create a one-token int32 input and its frozen pretrained embedding.

    Returns the Input tensor and the output of an Embedding layer initialised
    from ``embedding_matrix`` with ``trainable=False``.
    """
    word_input = Input(shape=(1,), dtype='int32', name=name)
    embedded = Embedding(
        VOCAB_LENGTH+1,
        VECT_LENGTH,
        weights=[embedding_matrix],
        input_length=1,
        trainable=False
    )(word_input)
    return word_input, embedded


# One branch per word of the pair; prepare() feeds child ids to 'w1' and head
# ids to 'w2'.
w1, embed_w1 = _embedded_word_input('w1')
w2, embed_w2 = _embedded_word_input('w2')

# Concatenate both embeddings, apply one hidden layer, flatten the length-1
# sequence axis, and classify over the dependency labels.
merged = Concatenate(axis=-1, name='concat')([embed_w1, embed_w2])
hidden = Dense(1000, activation='relu', name='dense1')(merged)
flat = Flatten()(hidden)
x = Dense(OUTPUT_LENGTH, activation='softmax', name='outputl')(flat)

model = Model(inputs=[w1, w2], outputs=x)
model.compile(
    optimizer='rmsprop',
    loss='categorical_crossentropy',
    metrics=['acc', f1],
)

In [23]:
# Train for 5 epochs; inputs are keyed by the Input layer names 'w1' and 'w2'.
model.fit(x={'w1': X_train1, 'w2': X_train2}, y=y_train, epochs=5,)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x134bec978>

In [24]:
# Build the same child/head index arrays and one-hot labels for the test split.
(X_test1, X_test2), y_test = prepare(test)

HBox(children=(IntProgress(value=0, max=783), HTML(value='')))




In [25]:
# Evaluate on the test split; the result lists the loss followed by the
# metrics passed to compile ('acc', then f1).
model.evaluate(x={'w1': X_test1, 'w2': X_test2}, y=y_test)



[2.0349075026779637, 0.5992652253695627, 0.5993291275370201]