In [1]:
import os

import tensorflow as tf
import pandas as pd
import numpy as np
import gensim 
    
from sklearn.metrics import f1_score, classification_report
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from conllu import parse

In [2]:
dataset_path = '/Users/victor/Project/data/UD_Ukrainian-IU'
train_path = os.path.join(dataset_path, 'uk_iu-ud-train.conllu')
dev_path = os.path.join(dataset_path, 'uk_iu-ud-dev.conllu')
test_path = os.path.join(dataset_path, 'uk_iu-ud-test.conllu')

with open(train_path, 'r') as f: 
    content = f.read()
    train = parse(content)

with open(dev_path, 'r') as f: 
    content = f.read()
    dev = parse(content)

with open(test_path, 'r') as f: 
    content = f.read()
    test = parse(content)

In [3]:
def to_record(data): 
    records = []
    for sentence in data: 
        references = {rec['id']: rec for rec in sentence}
        for word in sentence: 
            head_id = word['head']
            if head_id == 0:
                head = 'root'
            else:
                head = references[word['head']]['lemma']
            records.append({'child': word['lemma'].lower(), 
                            'head': head.lower(),
                            'y': word['deprel'].split(':')[0]})
    return records
            

In [4]:
train_df = pd.DataFrame(to_record(train))
dev_df = pd.DataFrame(to_record(dev))
test_df = pd.DataFrame(to_record(test))

df = pd.concat([train_df, dev_df, test_df])

In [5]:
le = LabelEncoder()
oe = OneHotEncoder()

y = le.fit_transform(df['y'].values).reshape(-1, 1)
oe.fit(y)

OneHotEncoder(categorical_features='all', dtype=<class 'numpy.float64'>,
       handle_unknown='error', n_values='auto', sparse=True)

In [6]:
classes = np.unique(pd.concat([train_df, dev_df, test_df])['y'])

In [7]:
num_classes = len(classes)

In [8]:
w2v_model = gensim.models.KeyedVectors.load_word2vec_format('/Users/victor/Downloads/fiction.lowercased.tokenized.word2vec.300d', binary=False)

In [9]:
vocab = dict()
for i, w in enumerate(w2v_model.wv.index2entity):
    vocab[w] = i + 1

vocabulary_size = len(w2v_model.wv.index2entity)
embedding_size = 300


  from ipykernel import kernelapp as app


In [10]:
embedding_matrix = np.zeros((vocabulary_size + 1, embedding_size))

for k, v in vocab.items(): 
    embedding_matrix[v] = w2v_model[k]

In [11]:
def word2index(df): 
    X_train_child = []
    X_train_head = []
    for _, row in df.iterrows(): 
        child_idx = vocab.get(row['child'], None)
        child_idx = child_idx if child_idx else 0
        head_idx = vocab.get(row['head'], None)
        head_idx = head_idx if head_idx else 0
        
        X_train_child.append(child_idx)
        X_train_head.append(head_idx)
    return np.expand_dims(np.array(X_train_child), axis=1), np.expand_dims(np.array(X_train_head), axis=1)

X_train_child, X_train_head = word2index(train_df)
X_dev_child, X_dev_head = word2index(dev_df)
X_test_child, X_test_head = word2index(test_df)

In [12]:
y_train = le.transform(train_df['y']).reshape(-1, 1)
y_train = oe.transform(y_train).toarray()

y_dev = le.transform(dev_df['y']).reshape(-1, 1)
y_dev = oe.transform(y_dev).toarray()

y_test = le.transform(test_df['y']).reshape(-1, 1)
y_test = oe.transform(y_test).toarray()

In [22]:
l2_reg_lambda = 0

num_epochs = 20
batch_size = 128


with tf.Graph().as_default():
    input_child = tf.placeholder(tf.int32, shape=[None, 1], name='input_child')
    input_head = tf.placeholder(tf.int32, shape=[None, 1], name='input_head')
    input_y = tf.placeholder(tf.float32, shape=[None, num_classes], name='input_y')
    dropout_keep_prob = tf.placeholder(tf.float32, name='dropout_keep_prob')

    l2_loss = tf.constant(0.0)
    
    with tf.name_scope('Embedding'):
        W = tf.get_variable('word_embeddings', shape=[vocabulary_size + 1, embedding_size], 
                            initializer=tf.constant_initializer(embedding_matrix), 
                            trainable=False)
        child_embedded_chars = tf.nn.embedding_lookup(W, input_child)
        child_embedded_chars_reshaped = tf.reshape(child_embedded_chars, shape=[-1, 300])

        head_embedded_chars = tf.nn.embedding_lookup(W, input_head)
        head_embedded_chars_reshaped = tf.reshape(head_embedded_chars, shape=[-1, 300])

        embeddings = tf.concat([child_embedded_chars_reshaped, head_embedded_chars_reshaped], 
                               axis=1)
    
    with tf.name_scope('Hidden1'): 
        hidden_1 = tf.layers.dense(embeddings, units=1024, activation=tf.nn.relu, 
                                   kernel_initializer=tf.contrib.layers.variance_scaling_initializer(mode="FAN_AVG"), 
                                   name='hidden1')
        hidden_1 = tf.nn.dropout(hidden_1, dropout_keep_prob)
        
    with tf.name_scope('Output'): 
        W = tf.get_variable('W', shape=[1024, num_classes], 
                            initializer=tf.contrib.layers.xavier_initializer())
        b = tf.get_variable('b', shape=[num_classes], 
                            initializer=tf.zeros_initializer())
        l2_loss += tf.nn.l2_loss(W)
        l2_loss += tf.nn.l2_loss(b)
        scores = tf.nn.xw_plus_b(hidden_1, W, b, name='scores')
        predictions = tf.argmax(scores, axis=1, name='predictionss')
        y_pred = tf.one_hot(predictions, depth=num_classes)
    
    with tf.name_scope('loss'): 
        losses = tf.nn.softmax_cross_entropy_with_logits(logits=scores, labels=input_y)
        loss = tf.reduce_mean(losses) + l2_reg_lambda * l2_loss
        
    with tf.name_scope('accuracy'): 
        correct_predictions = tf.equal(predictions, tf.argmax(input_y, 1))
        accuracy = tf.reduce_mean(tf.cast(correct_predictions, 'float'), name='accuracy')
        
    global_step = tf.Variable(0, name='global_step', trainable=False)
    optimizer = tf.train.AdamOptimizer(1e-4)
    grad_and_vars = optimizer.compute_gradients(loss)
    train_op = optimizer.apply_gradients(grad_and_vars, global_step=global_step)

    
    session_conf = tf.ConfigProto(allow_soft_placement=False, log_device_placement=True)
    with tf.Session(config=session_conf) as session: 
        session.run(tf.global_variables_initializer())
        
        for epoch in range(num_epochs): 
            num_batches = int(X_train_head.shape[0] / batch_size)
            shuffle_indices = np.random.permutation(num_batches)
            current_index = 0
            for batch in range(num_batches): 
                idx = shuffle_indices[current_index:current_index + batch_size]
                feed_dict = {
                    input_child: X_train_child[idx], 
                    input_head: X_train_head[idx],
                    input_y: y_train[idx],
                    dropout_keep_prob: 0.3
                }
                _, step, l, a = session.run([train_op, global_step, loss, accuracy], feed_dict=feed_dict)

                current_step = tf.train.global_step(session, global_step)
                if current_step % 100 == 0 and current_step != 0:
                    feed_dict = {
                        input_child: X_dev_child,
                        input_head: X_dev_head,
                        input_y: y_dev,
                        dropout_keep_prob: 1
                    }
                    step, l, a = session.run([global_step, loss, accuracy], feed_dict=feed_dict)
                    print('Dev, epoch: {}, step: {}, loss: {}, accuracy: {}'.format(epoch, step, l, a))
                    
                current_index += batch_size
        
        feed_dict = {
            input_child: X_dev_child,
            input_head: X_dev_head,
            input_y: y_dev, 
            dropout_keep_prob: 1
        }
        y_p, a = session.run([y_pred, accuracy], feed_dict=feed_dict)
        print('Dev F1 score: ', f1_score(y_dev, y_p, average='macro'), 'Accuracy: ', a)
        print(classification_report(y_dev, y_p))

        feed_dict = {
            input_child: X_test_child,
            input_head: X_test_head,
            input_y: y_test, 
            dropout_keep_prob: 1
        }
        y_p, a = session.run([y_pred, accuracy], feed_dict=feed_dict)
        print('Test F1 score: ', f1_score(y_test, y_p, average='macro'), 'Accuracy: ', a)
        print(classification_report(y_test, y_p))        

Dev, epoch: 0, step: 100, loss: 3.4550044536590576, accuracy: 0.1279529482126236
Dev, epoch: 0, step: 200, loss: 3.4549901485443115, accuracy: 0.1279529482126236
Dev, epoch: 0, step: 300, loss: 3.4549901485443115, accuracy: 0.1279529482126236
Dev, epoch: 0, step: 400, loss: 3.4549901485443115, accuracy: 0.1279529482126236
Dev, epoch: 0, step: 500, loss: 3.4549901485443115, accuracy: 0.1279529482126236
Dev, epoch: 1, step: 600, loss: 2.884950637817383, accuracy: 0.3638993203639984
Dev, epoch: 1, step: 700, loss: 2.7456514835357666, accuracy: 0.38704079389572144
Dev, epoch: 1, step: 800, loss: 2.745649814605713, accuracy: 0.38704079389572144
Dev, epoch: 1, step: 900, loss: 2.745649814605713, accuracy: 0.38704079389572144
Dev, epoch: 1, step: 1000, loss: 2.745649814605713, accuracy: 0.38704079389572144
Dev, epoch: 1, step: 1100, loss: 2.745649814605713, accuracy: 0.38704079389572144
Dev, epoch: 2, step: 1200, loss: 2.496044874191284, accuracy: 0.44016969203948975
Dev, epoch: 2, step: 1300

  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


Dev F1 score:  0.23858094607202268 Accuracy:  0.51296884
             precision    recall  f1-score   support

          0       0.64      0.11      0.19       184
          1       0.12      0.05      0.07       119
          2       0.66      0.61      0.64       484
          3       0.41      0.63      0.50       846
          4       0.00      0.00      0.00        71
          5       1.00      0.11      0.19        19
          6       0.99      0.94      0.97       945
          7       0.89      0.78      0.83       358
          8       0.00      0.00      0.00        50
          9       0.00      0.00      0.00        64
         10       0.06      0.03      0.04       475
         11       0.66      0.75      0.70        57
         12       0.00      0.00      0.00        52
         13       0.00      0.00      0.00         0
         14       0.71      0.88      0.79       240
         15       0.47      0.49      0.48       139
         16       0.00      0.00      0.0