# Extracting Input/Output matrices for training

In [1]:
from conll_reader import DependencyStructure, conll_reader
from collections import defaultdict
import copy
import sys
import keras
import numpy as np

Using TensorFlow backend.


In [2]:
class State(object):
    """Configuration of a transition-based dependency parser.

    Attributes:
        stack:  list of word ids; the top of the stack is the last element.
        buffer: list of word ids held in reverse sentence order, so the next
                word to process is the last element and can be taken with pop().
        deps:   set of (parent, child, label) triples collected so far.
    """

    def __init__(self, sentence=None):
        """Create a state with an empty stack and `sentence` loaded into the buffer.

        Args:
            sentence: optional sequence of word ids in sentence order. ``None``
                or an empty sequence gives an empty buffer.
        """
        # `None` replaces the former mutable `[]` default. Results are unchanged
        # because the argument is only copied, never stored.
        self.stack = []
        self.buffer = list(reversed(sentence)) if sentence else []
        self.deps = set()

    def __repr__(self):
        return "{},{},{}".format(self.stack, self.buffer, self.deps)

    def shift(self):
        """Move the next buffer word onto the stack."""
        self.stack.append(self.buffer.pop())

    def left_arc(self, label):
        """Record the next buffer word as parent of the stack top, which is popped."""
        self.deps.add((self.buffer[-1], self.stack.pop(), label))

    def right_arc(self, label):
        """Record the stack top as parent of the next buffer word.

        Both are removed, and the parent is then put back at the front of the
        buffer so it becomes the next word to process.
        """
        parent = self.stack.pop()
        self.deps.add((parent, self.buffer.pop(), label))
        self.buffer.append(parent)

def apply_sequence(seq, sentence):
    """Replay (transition, label) pairs on a fresh State and return its dependency set.

    Transition names other than "shift", "left_arc" and "right_arc" are ignored.
    """
    state = State(sentence)
    for rel, label in seq:
        if rel == "shift":
            state.shift()
        elif rel in ("left_arc", "right_arc"):
            # Both arc transitions take the label, so dispatch on the method name.
            getattr(state, rel)(label)
    return state.deps
   
class RootDummy(object):
    """Stand-in node for the artificial root: id 0, with no head and no relation."""

    def __init__(self):
        self.head = None
        self.id = 0
        self.deprel = None

    def __repr__(self):
        return "<ROOT>"

def get_training_instances(dep_structure):
    """Derive the transition sequence that rebuilds `dep_structure`.

    Args:
        dep_structure: object whose `deprels` maps a word id to a node exposing
            `head`, `id` and `deprel`.

    Returns:
        A list of (state, (transition, label)) pairs, where `state` is a deep
        copy of the parser state taken just before the transition is applied.
    """
    deprels = dep_structure.deprels

    # Start with every word id in ascending order on the buffer and the root (0) on the stack.
    state = State(sorted(deprels))
    state.stack.append(0)

    # Number of children of each head that have not been attached yet.
    pending_children = defaultdict(int)
    for node in deprels.values():
        pending_children[node.head] += 1

    instances = []
    while state.buffer:
        if not state.stack:
            transition = ("shift", None)
        else:
            top_id = state.stack[-1]
            stack_node = RootDummy() if top_id == 0 else deprels[top_id]
            buffer_node = deprels[state.buffer[-1]]

            if stack_node.head == buffer_node.id:
                pending_children[buffer_node.id] -= 1
                transition = ("left_arc", stack_node.deprel)
            elif buffer_node.head == stack_node.id and pending_children[buffer_node.id] == 0:
                # Attach the buffer word only once all of its own children are attached.
                pending_children[stack_node.id] -= 1
                transition = ("right_arc", buffer_node.deprel)
            else:
                transition = ("shift", None)

        # Snapshot the state before applying the chosen transition.
        instances.append((copy.deepcopy(state), transition))

        action, label = transition
        if action == "left_arc":
            state.left_arc(label)
        elif action == "right_arc":
            state.right_arc(label)
        else:
            state.shift()
    return instances

In [3]:
# Dependency relation labels; each one yields a left_arc and a right_arc output class.
dep_relations = [
    'tmod', 'vmod', 'csubjpass', 'rcmod', 'ccomp', 'poss', 'parataxis', 'appos', 'dep',
    'iobj', 'pobj', 'mwe', 'quantmod', 'acomp', 'number', 'csubj', 'root', 'auxpass',
    'prep', 'mark', 'expl', 'cc', 'npadvmod', 'prt', 'nsubj', 'advmod', 'conj', 'advcl',
    'punct', 'aux', 'pcomp', 'discourse', 'nsubjpass', 'predet', 'cop', 'possessive',
    'nn', 'xcomp', 'preconj', 'num', 'amod', 'dobj', 'neg', 'dt', 'det',
]

class FeatureExtractor(object):
    """Turns parser states and transitions into numeric training arrays.

    Attributes:
        word_vocab:    dict mapping a word (or special token) to its integer index.
        pos_vocab:     dict mapping a POS tag to its integer index.
        output_labels: dict mapping a (transition, label) pair to its class index.
    """

    def __init__(self, word_vocab_file, pos_vocab_file):
        """Load both vocabularies from iterables of "<token> <index>" lines."""
        self.word_vocab = self.read_vocab(word_vocab_file)
        self.pos_vocab = self.read_vocab(pos_vocab_file)
        self.output_labels = self.make_output_labels()

    def make_output_labels(self):
        """Build the (transition, label) -> class index table.

        Index 0 is ('shift', None); every relation in `dep_relations` then
        contributes a left_arc entry followed by a right_arc entry.
        """
        labels = [('shift', None)]
        for rel in dep_relations:
            labels.append(("left_arc", rel))
            labels.append(("right_arc", rel))
        return {label: index for index, label in enumerate(labels)}

    def read_vocab(self, vocab_file):
        """Parse an iterable of "<token> <index>" lines into a dict.

        Blank lines are skipped. Any other line that does not consist of
        exactly two whitespace-separated fields raises ValueError.
        """
        vocab = {}
        for line in vocab_file:
            fields = line.split()
            if not fields:
                # Tolerate blank lines (e.g. a trailing empty line at end of file).
                continue
            word, index_s = fields
            vocab[word] = int(index_s)
        return vocab

    def get_input_representation(self, words, pos, state):
        """Encode the top three stack words and the next three buffer words.

        Args:
            words: sequence of word forms indexed by word id.
            pos:   sequence of POS tags indexed by word id.
            state: object with `stack` and `buffer` lists of word ids.

        Returns:
            An int array of length 6. Slots 0-2 hold the stack words (top
            first), slots 3-5 the buffer words (next word first); slots with
            no word keep the <NULL> index.
        """
        #<CD> stands for any number (anything tagged with the POS tag CD)
        #<NNP> stands for any proper name (anything tagged with the POS tag NNP)
        #<UNK> stands for unknown words (in the training data, any word that appears only once)
        #<ROOT> is a special root symbol (the word associated with the word 0, which is initially placed on the stack of the dependency parser)
        #<NULL> is used to pad context windows.

        inpt = np.full(6, self.word_vocab['<NULL>'], dtype=int)

        def getWord(idx):
            word = words[idx]
            if idx == 0:
                word = '<ROOT>'
            elif pos[idx] == 'NNP':
                word = '<NNP>'
            elif pos[idx] == 'CD':
                word = '<CD>'
            # NOTE(review): the lookup is case-sensitive; if the vocabulary file
            # stores lower-cased words, capitalised tokens fall back to <UNK> - confirm.
            return self.word_vocab.get(word, self.word_vocab['<UNK>'])

        # [:-4:-1] walks the last three elements from the end, i.e. the top of
        # the stack / the front of the (reversed) buffer first.
        for n, idx in enumerate(state.stack[:-4:-1]):
            inpt[n] = getWord(idx)

        for n, idx in enumerate(state.buffer[:-4:-1]):
            inpt[3 + n] = getWord(idx)

        return inpt

    def get_output_representation(self, output_pair):
        """Return the one-hot vector for a (transition, label) pair.

        Raises:
            KeyError: if `output_pair` is not in `output_labels`.
        """
        # Size the vector from the label table rather than a hard-coded 91, so
        # it stays correct if `dep_relations` changes.
        return keras.utils.to_categorical(
            self.output_labels[output_pair],
            num_classes=len(self.output_labels),
            dtype=int,
        )

In [21]:
# Debug lookup: invert word_vocab (index -> word) and show the word stored at index 1385.
# NOTE(review): `extractor` is only created in the following cell, so on a fresh kernel
# this cell fails unless that cell has been run first.
{v:k for (k, v) in extractor.word_vocab.items()}[1385]

'evaluates'

In [4]:
# Paths of the word and POS vocabulary files.
WORD_VOCAB_FILE = 'data/words.vocab'
POS_VOCAB_FILE = 'data/pos.vocab'

# Build the shared feature extractor; both files are closed once it has been constructed.
with open(WORD_VOCAB_FILE, 'r') as word_vocab_f:
    with open(POS_VOCAB_FILE, 'r') as pos_vocab_f:
        extractor = FeatureExtractor(word_vocab_f, pos_vocab_f)

In [5]:
def get_training_matrices(extractor, in_file):
    """Build the stacked input and target matrices for every sentence in `in_file`.

    Each training instance of each sentence contributes one input row and one
    target row. A "." is written to stdout for every 100th sentence (starting
    with the first), followed by a newline once the file is exhausted.

    Returns:
        (inputs, outputs): two arrays produced by np.vstack over the per-instance rows.
    """
    feature_rows, target_rows = [], []
    for sentence_no, dtree in enumerate(conll_reader(in_file)):
        words, pos = dtree.words(), dtree.pos()
        for state, output_pair in get_training_instances(dtree):
            feature_rows.append(extractor.get_input_representation(words, pos, state))
            target_rows.append(extractor.get_output_representation(output_pair))
        if sentence_no % 100 == 0:
            # Progress marker; flushed so it shows up immediately.
            sys.stdout.write(".")
            sys.stdout.flush()
    sys.stdout.write("\n")
    return np.vstack(feature_rows), np.vstack(target_rows)

In [7]:
# Extract the training matrices and store them as .npy files.
print("Starting feature extraction... (each . represents 100 sentences)")
with open("data/train.conll", "r") as in_file:
    inputs, outputs = get_training_matrices(extractor, in_file)

# The matrices are complete here, so the CoNLL file no longer needs to be open.
np.save("data/input_train.npy", inputs)
np.save("data/target_train.npy", outputs)

Starting feature extraction... (each . represents 100 sentences)
...............................................................................................................................................................................................................................................................................................................................................................................................................


In [8]:
# Extract the development-set matrices and store them as .npy files.
print("Starting feature extraction... (each . represents 100 sentences)")
with open("data/dev.conll", "r") as in_file:
    # Capture the result. The return value used to be discarded, so the saves
    # below wrote whatever `inputs`/`outputs` held from the training cell into
    # the dev files. Separate names keep the training matrices intact.
    dev_inputs, dev_outputs = get_training_matrices(extractor, in_file)
    np.save("data/input_dev.npy", dev_inputs)
    np.save("data/target_dev.npy", dev_outputs)

Starting feature extraction... (each . represents 100 sentences)
...................................................


In [9]:
# Inspect the training input matrix: one row of six word indices per parser state.
inputs

array([[    3,     4,     4,     1,     2,     1],
       [    1,     3,     4,     2,     1,     2],
       [    3,     4,     4,     1,     1,     2],
       ...,
       [12841,     3,     4,  4968,     4,     4],
       [    3,     4,     4, 12841,     4,     4],
       [    4,     4,     4,     3,     4,     4]])