In [2]:
import codecs
import numpy as np
from itertools import groupby

# NOTE(review): presumably the size of the embedding vectors used further down
# the notebook; none of the cells visible here reference it -- confirm.
EMBEDDINGS_DIM = 50

In [3]:
def load_dataset(dataset_file):
    """
    Load the dataset from a UTF-8 file of tab-separated (x, y, label) rows
    :param dataset_file: path of the dataset file
    :return: a dictionary mapping each (x, y) pair to its label; when a pair
             appears in more than one row, the label of the last row is kept
    :raises ValueError: if a non-blank row does not have exactly three
                        tab-separated fields
    """
    dataset = {}

    with codecs.open(dataset_file, 'r', 'utf-8') as f_in:
        for line in f_in:
            row = line.strip()

            # Blank lines (e.g. an empty line at the end of the file) hold no
            # instance; skip them rather than failing on the unpacking below
            if not row:
                continue

            x, y, label = row.split('\t')
            dataset[(x, y)] = label

    return dataset

In [4]:
def get_paths(corpus, x, y):
    """
    Get the paths that connect x and y in the corpus, in both directions
    :param corpus: the corpus' resource object; it must provide
                   get_relations(a, b), returning a mapping from path id to
                   count, and get_path_by_id(path_id), returning the path string
    :param x: the first item of the pair
    :param y: the second item of the pair
    :return: a dictionary mapping each path string to its count. Paths found
             in the y -> x direction have their 'X/' and 'Y/' markers swapped.
             If a swapped path equals a path already found in the x -> y
             direction, its count replaces (is not added to) the earlier one.
    """
    x_to_y_paths = corpus.get_relations(x, y)
    y_to_x_paths = corpus.get_relations(y, x)

    # items() instead of iteritems(): same pairs, but also available on Python 3
    paths = {corpus.get_path_by_id(path_id): count
             for (path_id, count) in x_to_y_paths.items()}

    for path_id, count in y_to_x_paths.items():
        # Swap the 'X/' and 'Y/' markers; '@@@' is a temporary placeholder so
        # the two replacements do not clobber each other
        swapped = (corpus.get_path_by_id(path_id)
                   .replace('X/', '@@@')
                   .replace('Y/', 'X/')
                   .replace('@@@', 'Y/'))
        paths[swapped] = count

    return paths

In [5]:
def vectorize_path(path, lemma_index, pos_index, dep_index, dir_index):
    """
    Build the vector representation of a path, edge by edge
    :param path: the path string; its edges are separated by '_'
    :param lemma_index: lemma index, handed to vectorize_edge
    :param pos_index: part-of-speech index, handed to vectorize_edge
    :param dep_index: dependency label index, handed to vectorize_edge
    :param dir_index: direction index, handed to vectorize_edge
    :return: a tuple with one vectorized edge per edge of the path, or None
             if any of the edges could not be vectorized
    """
    edge_vectors = []

    # Vectorize every edge before checking for failures
    for raw_edge in path.split('_'):
        edge_vectors.append(
            vectorize_edge(raw_edge, lemma_index, pos_index, dep_index, dir_index))

    # A single failed edge invalidates the whole path
    if any(vector is None for vector in edge_vectors):
        return None

    return tuple(edge_vectors)

In [6]:
def vectorize_edge(edge, lemma_index, pos_index, dep_index, dir_index):
    """
    Return a vector representation of the edge: the indices of its lemma, pos,
    dep and direction symbol
    :param edge: the edge string, 'lemma/pos/dep', optionally with a '<' or '>'
                 marker at its start or at its end
    :param lemma_index: mapping from lemma to index; lemmas that are missing
                        from it get index 0
    :param pos_index: mapping from part-of-speech tag to index
    :param dep_index: mapping from dependency label to index
    :param dir_index: mapping from direction symbol to index; the symbols are
                      ' ' (no marker), 's<' / 's>' (marker at the start) and
                      'e<' / 'e>' (marker at the end)
    :return: the tuple (lemma index, pos index, dep index, direction index), or
             None if the edge does not consist of exactly three '/'-separated
             fields
    :raises KeyError: if the pos, dep or direction symbol is missing from its index
    """
    direction = ' '

    # Get the direction and strip its marker off the edge
    if edge.startswith(('<', '>')):
        direction = 's' + edge[0]
        edge = edge[1:]
    elif edge.endswith(('<', '>')):
        direction = 'e' + edge[-1]
        edge = edge[:-1]

    # Check the field count explicitly instead of catching every exception, so
    # that only malformed edges give None and unrelated errors stay visible
    fields = edge.split('/')
    if len(fields) != 3:
        return None
    lemma, pos, dep = fields

    return (lemma_index.get(lemma, 0), pos_index[pos], dep_index[dep], dir_index[direction])

In [15]:
# Read dataset/lex/test.tsv (path relative to the working directory) into `data`:
# per load_dataset, a dictionary keyed by (x, y) pairs with the label as value
data = load_dataset('dataset/lex/test.tsv')

#### Test LSTM implementation