In [5]:
import sys
import argparse
import os
import glob
import time
import math
import json


from collections import Counter
from copy import deepcopy

import numpy as np
import theano
import theano.tensor as T

sys.setrecursionlimit(100000000)
theano.config.floatX = 'float32'

Data Loaders

In [6]:
class Loader(object):
    def __init__(self, argv):
        self.argv = argv

    def load(self, **kwargs):
        raise NotImplementedError

    @staticmethod
    def load_data(fn):
        with gzip.open(fn, 'rb') as gf:
            return pickle.load(gf)

    @staticmethod
    def load_key_value_format(fn):
        data = []
        with open(fn, 'r') as f:
            for line in f:
                key, value = line.rstrip().split()
                data.append((key, int(value)))
        return data

    @staticmethod
    def load_hdf5(path):
        return h5py.File(path, 'r')

    def load_txt_from_dir(self, dir_path, file_prefix):
        file_names = get_file_names_in_dir(dir_path + '/*')
        file_names = [fn for fn in file_names
                      if os.path.basename(fn).startswith(file_prefix)
                      and fn.endswith('txt')]
        return [self.load(path=fn) for fn in file_names]

    def load_hdf5_from_dir(self, dir_path, file_prefix):
        file_names = get_file_names_in_dir(dir_path + '/*')
        file_names = [fn for fn in file_names
                      if os.path.basename(fn).startswith(file_prefix)
                      and fn.endswith('hdf5')]
        return [self.load_hdf5(fn) for fn in file_names]

class Conll12Loader(Loader):

    def load(self, path, data_size=1000000, is_test=False):
        if path is None:
            return []

        corpus = []
        sent = []

        with open(path) as f:
            for line in f:
                elem = [l for l in line.rstrip().split()]
                if len(elem) > 10:
                    if is_test:
                        sent.append(elem[:11])
                    else:
                        sent.append(elem)
                elif len(elem) == 0:
                    corpus.append(sent)
                    sent = []
                if len(corpus) >= data_size:
                    break
        return corpus

Data Preprocessing

In [7]:
HYPH = u'-'
UNK = u'UNKNOWN'


class Vocab(object):
    def __init__(self):
        self.i2w = []
        self.w2i = {}

    def add_word(self, word):
        if word not in self.w2i:
            new_id = self.size()
            self.i2w.append(word)
            self.w2i[word] = new_id

    def get_id(self, word):
        return self.w2i.get(word)

    def get_id_or_unk_id(self, word):
        if word in self.w2i:
            return self.w2i.get(word)
        return self.w2i.get(UNK)

    def get_and_add_id(self, word):
        self.add_word(word)
        return self.w2i.get(word)

    def get_word(self, w_id):
        return self.i2w[w_id]

    def has_key(self, word):
        return word in self.w2i

    def size(self):
        return len(self.i2w)

In [8]:
def save_key_value_format(fn, keys, values):
    assert len(keys) == len(values)
    if type(values[0]) is not str:
        values = map(lambda v: str(v), values)
    with open(fn + '.txt', 'w') as f:
        for key, value in zip(keys, values):
            f.write("%s\t%s\n" % (key, value))

def load_key_value_format(fn):
    data = []
    with open(fn, 'r') as f:
        for line in f:
            key, value = line.rstrip().split()
            data.append((key, int(value)))
    return data

In [9]:
class Sent(object):
    def __init__(self, sent, is_test=True):
        self.words = self._make_words(sent=sent, is_test=is_test)

        self.forms = [word.form for word in self.words]
        self.strings = [word.string for word in self.words]
        self.marks = self._set_marks(self.words)
        self.props = [word.prop for word in self.words]

        self.prd_indices = self._set_prd_indices(self.marks)
        self.prd_forms = [self.forms[i] for i in self.prd_indices]
        self.prd_bio_labels = self._set_prd_bio_labels(self.props)
        self.has_prds = True if len(self.prd_indices) > 0 else False

        self.n_words = len(sent)
        self.n_prds = len(self.prd_indices)

        self.word_ids = None
        self.mark_ids = None
        self.elmo_emb = None
        self.bio_label_ids = None
        self.span_triples = None
        self.span_triples_with_null = None

    def _make_words(self, sent, is_test=True):
        return [self._make_word(line, is_test) for line in sent]

    @staticmethod
    def _make_word(line, is_test=True):
        raise NotImplementedError

    def _set_marks(self, words):
        raise NotImplementedError

    @staticmethod
    def _make_bio_labels(prop):
        """
        :param prop: 1D: n_words; elem=bracket label
        :return: 1D: n_words; elem=BIO label
        """
        labels = []
        prev = None
        for arg in prop:
            if arg.startswith('('):
                if arg.endswith(')'):
                    prev = arg.split("*")[0][1:]
                    label = 'B-' + prev
                    prev = None
                else:
                    prev = arg[1:-1]
                    label = 'B-' + prev
            else:
                if prev:
                    label = 'I-' + prev
                    if arg.endswith(')'):
                        prev = None
                else:
                    label = 'O'
            labels.append(label)
        return labels

    @staticmethod
    def _set_prd_indices(marks):
        return [i for i, mark in enumerate(marks) if mark != HYPH]

    def _set_prd_bio_labels(self, props):
        """
        :param props: 1D: n_words, 2D: n_prds
        :return: 1D: n_prds, 2D: n_words
        """
        props = map(lambda p: p, zip(*props))
        return [self._make_bio_labels(prop) for prop in props]

    def set_word_ids(self, vocab_word):
        self.word_ids = array(str_to_id(sent=self.forms,
                                        vocab=vocab_word,
                                        unk=UNK))

    def set_mark_ids(self):
        mark_ids = [[0 for _ in range(self.n_words)] for _ in range(self.n_prds)]
        for i, prd_index in enumerate(self.prd_indices):
            mark_ids[i][prd_index] = 1
        self.mark_ids = array(mark_ids)

    def set_label_ids(self, vocab_label):
        """
        :param vocab_label: Vocab (BIO labels); e.g. B-A0, I-A0
        """
        assert len(self.prd_indices) == len(self.prd_bio_labels)
        label_ids = []
        for prd_index, props in zip(self.prd_indices, self.prd_bio_labels):
            y = str_to_id(sent=props, vocab=vocab_label, unk='O')
            label_ids.append(y)
        self.bio_label_ids = array(label_ids)

    def set_elmo_emb(self, elmo_emb):
        """
        :param elmo_emb: 1D: n_layers, 2D: n_words, 3D: dim
        """
        elmo_emb = np.asarray(elmo_emb)
        elmo_emb = elmo_emb.transpose((1, 0, 2))
        assert len(elmo_emb) == self.n_words
        self.elmo_emb = elmo_emb

    def set_span_triples(self, vocab_label):
        """
        :param vocab_label: Vocab (labels); e.g. A0, A1
        """
        triples = []
        for bio_labels in self.prd_bio_labels:
            prd_triples = []
            for (label, i, j) in self._get_spans(bio_labels):
                r = vocab_label.get_id(label)
                prd_triples.append((r, i, j))
            triples.append(prd_triples)
        self.span_triples = triples

    @staticmethod
    def _get_spans(bio_labels):
        """
        :param bio_labels: 1D: n_words; elem=bio label
        :return: 1D: n_spans; elem=[label, i, j]
        """
        spans = []
        span = []
        for i, label in enumerate(bio_labels):
            if label[-2:] == '-V':
                continue
            if label.startswith('B-'):
                if span:
                    spans.append(span)
                span = [label[2:], i, i]
            elif label.startswith('I-'):
                if span:
                    if label[2:] == span[0]:
                        span[2] = i
                    else:
                        spans.append(span)
                        span = [label[2:], i, i]
                else:
                    span = [label[2:], i, i]
            else:
                if span:
                    spans.append(span)
                span = []
        if span:
            spans.append(span)
        return spans

    def set_span_triples_with_null(self, n_labels):
        assert len(self.span_triples) == len(self.prd_indices)
        triples_with_null = []
        for prd_index, spans in zip(self.prd_indices, self.span_triples):
            used_labels = [r for (r, i, j) in spans]
            null_spans = [(r, prd_index, prd_index)
                          for r in range(n_labels)
                          if r not in used_labels]
            triples = spans + null_spans
            triples.sort(key=lambda s: s[0])
            triples_with_null.append(triples)
        self.span_triples_with_null = triples_with_null

class Conll12Sent(Sent):
    @staticmethod
    def _make_word(line, is_test=False):
        return Word(form=line[3],
                    mark=line[6],
                    sense=line[7],
                    prop=line[11:-1] if is_test is False else [])

    def _set_marks(self, words):
        return list(map(lambda w: w.mark if w.sense != HYPH else HYPH, words))

class Word(object):
    def __init__(self, form, mark, sense, prop):
        self.form = form.lower()
        self.string = form
        self.mark = mark
        self.sense = sense
        self.prop = prop

In [10]:
class Preprocessor(object):
    def __init__(self, argv):
        self.argv = argv
        self.data_type = argv.data_type

    @staticmethod
    def make_vocab_word(word_list):
        vocab_word = Vocab()
        vocab_word.add_word(UNK)
        for w in word_list:
            vocab_word.add_word(w)
        return vocab_word

    def make_and_save_vocab_label(self,
                                  sents,
                                  vocab_label_init=None,
                                  save=False,
                                  load=False):
        argv = self.argv

        if load and argv.load_label:
            label_key_value = load_key_value_format(argv.load_label)
            vocab_label = make_vocab_from_ids(label_key_value)
        else:
            vocab_label = self.make_vocab_label(sents=sents,
                                                vocab_label_init=vocab_label_init)
        if save:
            if argv.output_dir:
                dir_name = argv.output_dir
            else:
                dir_name = 'output'
            if argv.output_fn:
                file_name = '/label_ids.' + argv.output_fn
            else:
                file_name = '/label_ids'

            fn = dir_name + file_name
            values, keys = map(lambda x: x, zip(*enumerate(vocab_label.i2w)))
            save_key_value_format(fn=fn, keys=keys, values=values)

        return vocab_label

    def make_sents(self, corpus):
        """
        :param corpus: 1D: n_sents, 2D: n_words
        :return: 1D: n_sents
        """
        if len(corpus) == 0:
            return []

        if self.data_type == 'conll05':
            column = 6
            gen_sent = Conll05Sent
        else:
            column = 12
            gen_sent = Conll12Sent

        is_test = True if len(corpus[0][0]) < column else False
        return [gen_sent(sent, is_test) for sent in corpus]

    @staticmethod
    def split_x_and_y(batches, index=-1):
        """
        :param batches: 1D: n_batches, 2D: batch_size; elem=(x, m, y)
        :param index: split column index
        :return 1D: n_batches, 2D: batch_size; elem=(x, m)
        :return 1D: n_batches, 2D: batch_size; elem=y
        """
        x = []
        y = []
        for batch in batches:
            x.append(batch[:index])
            y.append(batch[index])
        return x, y

    def make_batches(self,
                     samples,
                     is_valid_data=False,
                     shuffle=True):
        """
        :param samples: 1D: n_samples, 2D: [x, m, y]
        :param is_valid_data: boolean
        :param shuffle: boolean
        :return 1D: n_batches, 2D: batch_size; elem=[x, m, y]
        """
        if shuffle:
            np.random.shuffle(samples)
            samples.sort(key=lambda sample: len(sample[0]))

        batches = []
        batch = []
        prev_n_words = len(samples[0][0])

        for sample in samples:
            n_words = len(sample[0])
            if len(batch) == self.argv.batch_size or prev_n_words != n_words:
                batches.append(self._make_one_batch(batch, is_valid_data))
                batch = []
                prev_n_words = n_words
            batch.append(sample)

        if batch:
            batches.append(self._make_one_batch(batch, is_valid_data))

        if shuffle:
            np.random.shuffle(batches)

        for batch in batches:
            yield batch

    @staticmethod
    def _make_one_batch(batch, is_valid_data):
        raise NotImplementedError

    @staticmethod
    def make_batch_per_sent(sents):
        """
        :param sents: 1D: n_sents; Sent()
        :return 1D: n_sents, 2D: n_prds; elem=[x, m]
        """
        batches = []
        for sent in sents:
            x = []

            x_word_ids = sent.word_ids
            if x_word_ids is not None:
                x.append(x_word_ids)

            x_elmo_emb = sent.elmo_emb
            if x_elmo_emb is not None:
                x.append(x_elmo_emb)

            batch = list(map(lambda m: x + [m], sent.mark_ids))
            batches.append(list(map(lambda b: b, zip(*batch))))

        return batches

    @staticmethod
    def set_sent_config(sents, elmo_emb, vocab_word, vocab_label):
        raise NotImplementedError

    @staticmethod
    def make_samples(sents, is_valid_data=False):
        raise NotImplementedError

    def make_vocab_label(self,
                         sents,
                         vocab_label_init=None):
        raise NotImplementedError

class SpanPreprocessor(Preprocessor):
    def make_vocab_label(self,
                         sents,
                         vocab_label_init=None):
        if len(sents) == 0:
            return None

        if vocab_label_init:
            vocab_label = deepcopy(vocab_label_init)
        else:
            vocab_label = Vocab()
            if self.argv.data_type == 'conll05':
                core_labels = ["A0", "A1", "A2", "A3", "A4", "A5"]
            else:
                core_labels = ["ARG0", "ARG1", "ARG2", "ARG3", "ARG4", "ARG5"]
            for label in core_labels:
                vocab_label.add_word(label)

        bio_labels = []
        for sent in sents:
            for props in sent.prd_bio_labels:
                bio_labels += props
        cnt = Counter(bio_labels)
        bio_labels = [(w, c) for w, c in cnt.most_common()]

        for label, count in bio_labels:
            if not label.endswith('-V') and len(label) > 1:
                vocab_label.add_word(label[2:])

        return vocab_label

    @staticmethod
    def set_sent_config(sents, elmo_emb, vocab_word, vocab_label):
        for index, sent in enumerate(sents):
            sent.set_mark_ids()
            if vocab_word:
                sent.set_word_ids(vocab_word)
            if elmo_emb:
                sent.set_elmo_emb(elmo_emb[str(index)])
            if vocab_label:
                sent.set_span_triples(vocab_label)
                sent.set_span_triples_with_null(vocab_label.size())
        return sents

    @staticmethod
    def make_samples(sents, is_valid_data=False):
        samples = []

        for sent in sents:
            x = []

            x_word_ids = sent.word_ids
            if x_word_ids is not None:
                x.append(x_word_ids)

            x_elmo_emb = sent.elmo_emb
            if x_elmo_emb is not None:
                x.append(x_elmo_emb)

            if is_valid_data:
                triples = sent.span_triples
            else:
                triples = sent.span_triples_with_null

            assert len(sent.mark_ids) == len(triples)
            for m, spans in zip(sent.mark_ids, triples):
                # spans: 1D: n_spans, 2D: (r, i, j)
                samples.append(x + [m, spans])

        return samples

    @staticmethod
    def _make_one_batch(batch, is_valid_data):
        if is_valid_data:
            return list(map(lambda b: b, zip(*batch)))

        b = []
        y = []
        n_words = len(batch[0][0])
        for b_index, sample in enumerate(batch):
            b.append(sample[:-1])
            y_tmp = []
            for (r, i, j) in sample[-1]:
                span_index = span_to_span_index(i, j, n_words)
                y_tmp.append([b_index, r, span_index])
            y += y_tmp

        x = list(map(lambda b_i: b_i, zip(*b)))

        return x + [y]

Evaluator

In [11]:
def f_score(crr_total, p_total, r_total):
    precision = crr_total / p_total if p_total > 0 else 0.
    recall = crr_total / r_total if r_total > 0 else 0.
    f1 = (2 * precision * recall) / (precision + recall) if precision + recall > 0 else 0.
    return precision, recall, f1

def concat_c_spans_from_spans(spans, vocab_label):
    spans = [[vocab_label.get_word(l), i, j] for (l, i, j) in spans]
    labels = [l for (l, i, j) in spans]
    c_indices = [index for index, (l, i, j) in enumerate(spans) if l.startswith('C')]
    non_ant_c_spans = []

    for c_index in c_indices:
        c_span = spans[c_index]
        label = c_span[0][2:]
        if label in labels:
            spans[labels.index(label)].extend(c_span[1:])

    concated_spans = [span for i, span in enumerate(spans) if i not in c_indices]
    spans = concated_spans + non_ant_c_spans
    return spans

class Evaluator(object):
    def __init__(self, argv):
        self.argv = argv

    def f_score(self, y_true, y_pred, vocab_label):
        """
        :param y_true: 1D: n_batches, 2D: batch_size, 3D: n_spans, 4D: [label_id, pre_index, post_index]
        :param y_pred: 1D: n_batches, 2D: batch_size, 3D: n_spans, 4D: [label_id, pre_index, post_index]
        """
        correct, p_total, r_total = self.metrics(y_true=y_true,
                                                 y_pred=y_pred,
                                                 vocab_label=vocab_label)
        p, r, f = f_score(correct, p_total, r_total)
        write('\tF:{:>7.2%}  P:{:>7.2%} ({:>5}/{:>5})  R:{:>7.2%} ({:>5}/{:>5})'.format(
            f, p, int(correct), int(p_total), r, int(correct), int(r_total))
        )
        return f

    def metrics(self, **kwargs):
        raise NotImplementedError

class SpanEvaluator(Evaluator):
    def metrics(self, y_true, y_pred, vocab_label):
        """
        :param y_true: 1D: n_batches, 2D: batch_size, 3D: n_spans, 4D: [label_id, pre_index, post_index]
        :param y_pred: 1D: n_batches, 2D: batch_size, 3D: n_spans, 4D: [label_id, pre_index, post_index]
        """
        p_total = 0.
        r_total = 0.
        correct = 0.
        for span_true_batch, span_pred_batch in zip(y_true, y_pred):
            for spans_true, spans_pred in zip(span_true_batch, span_pred_batch):
                spans_true = concat_c_spans_from_spans(spans_true, vocab_label)
                spans_pred = concat_c_spans_from_spans(spans_pred, vocab_label)
                p_total += len(spans_pred)
                r_total += len(spans_true)
                for span in spans_pred:
                    if span in spans_true:
                        correct += 1
        return correct, p_total, r_total


Decoder part

In [12]:
from copy import deepcopy
from itertools import combinations_with_replacement as comb

def span_to_span_index(i, j, n_words):
    return i * (n_words - 1) + j - np.arange(i).sum()

class Decoder(object):
    def __init__(self, argv, vocab_label):
        self.argv = argv
        self.core_label_ids = self.set_core_labels(vocab_label)
        self.span_list = None

    def set_core_labels(self, vocab_label):
        if self.argv.data_type == 'conll05':
            core_labels = ["A0", "A1", "A2", "A3", "A4", "A5"]
        else:
            core_labels = ["ARG0", "ARG1", "ARG2", "ARG3", "ARG4", "ARG5"]
        return [vocab_label.get_id(label)
                for label in core_labels
                if vocab_label.has_key(label)]

    def argmax_span_triples(self, span_indices, marks):
        """
        :param span_indices: 1D: batch_size, 2D; n_labels; span index
        :param marks: 1D: batch_size, 2D; n_words
        :return: 1D: batch_size, 2D: n_spans; [r, i, j]
        """
        n_words = len(marks[0])
        self.span_list = list(comb(range(n_words), 2))
        return [self._argmax_search(span_indices_i, mark)
                for span_indices_i, mark in zip(span_indices, marks)]

    def _argmax_search(self, span_indices, mark):
        spans = []
        prd_index = mark.nonzero()[0][0]
        for r, span_index in enumerate(span_indices):
            (i, j) = self.span_list[span_index]
            if i <= prd_index <= j:
                continue
            spans.append([r, i, j])
        return spans

    def greedy_span_triples(self, scores, marks):
        """
        :param scores: 1D: batch_size, 2D; n_labels, 3D: n_spans; score
        :param marks: 1D: batch_size, 2D; n_words
        :return: 1D: batch_size, 2D: n_spans; [r, i, j]
        """
        n_words = len(marks[0])
        self.span_list = list(comb(range(n_words), 2))
        return [self._greedy_search(score, mark)
                for score, mark in zip(scores, marks)]

    def _greedy_search(self, scores, mark):
        """
        :param scores: 1D: n_labels, 2D: n_spans; score
        :param mark: 1D: n_words; elem=0/1
        :return: 1D: n_spans, 2D: [r, i, j]
        """
        triples = []
        used_words = deepcopy(mark)
        used_labels = []

        n_words = len(mark)
        prd_index = mark.nonzero()[0][0]
        prd_span_index = span_to_span_index(i=prd_index,
                                            j=prd_index,
                                            n_words=n_words)
        spans = self._sort_spans(scores=scores,
                                 prd_index=prd_index,
                                 prd_span_index=prd_span_index)

        for (r, i, j, _) in spans:
            if r in used_labels:
                continue
            if used_words[i: j + 1].sum() > 0:
                continue

            triples.append([r, i, j])

            used_words[i: j + 1] = 1
            if r in self.core_label_ids:
                used_labels.append(r)

        return triples

    def _sort_spans(self, scores, prd_index, prd_span_index):
        """
        :param scores: 1D: n_labels, 2D: n_spans; score
        :return: 1D: n_labels, 2D: n_words * n_words; elem=(r, i, j, score)
        """
        spans = []
        for r, scores_row in enumerate(scores):
            score_prd = scores_row[prd_span_index]
            for index, score in enumerate(scores_row):
                (i, j) = self.span_list[index]
                if i <= prd_index <= j:
                    continue
                if score_prd < score:
                    spans.append((r, i, j, score))
        spans.sort(key=lambda span: span[-1], reverse=True)
        return spans

In [13]:
class Initializer(object):
    def __call__(self, shape, shared=True, name=None):
        raise NotImplementedError


class Zero(Initializer):
    def __call__(self, shape, shared=True, name=None):
        param = np.zeros(shape, theano.config.floatX)
        if shared:
            return theano.shared(value=param, name=name, borrow=True)
        return param


class One(Initializer):
    def __call__(self, shape, shared=True, name=None):
        param = np.ones(shape, theano.config.floatX)
        if shared:
            return theano.shared(value=param, name=name, borrow=True)
        return param


class Identity(Initializer):
    def __call__(self, shape, shared=True, name=None):
        assert len(shape) == 2
        param = np.ones(shape[0], theano.config.floatX)
        param = np.diag(param)
        if shared:
            return theano.shared(value=param, name=name, borrow=True)
        return param

class Uniform(Initializer):
    def __call__(self, shape, shared=True, name=None):
        param = np.asarray(np.random.uniform(low=-0.01,
                                             high=0.01,
                                             size=shape),
                           dtype=theano.config.floatX)
        if shared:
            return theano.shared(value=param, name=name, borrow=True)
        return param


class Normal(Initializer):
    def __call__(self, shape, shared=True, name=None):
        param = np.asarray(np.random.normal(0.0, 0.01, shape),
                           dtype=theano.config.floatX)
        if shared:
            return theano.shared(value=param, name=name, borrow=True)
        return param


class Xavier(Initializer):
    def __call__(self, shape, shared=True, name=None):
        param = np.asarray(np.random.uniform(low=-np.sqrt(6.0 / np.sum(shape)),
                                             high=np.sqrt(6.0 / np.sum(shape)),
                                             size=shape),
                           dtype=theano.config.floatX)
        if shared:
            return theano.shared(value=param, name=name, borrow=True)
        return param


class Orthonormal(Initializer):
    
    def __call__(self, shape, shared=True, name=None):
        assert len(shape) == 2
        if shape[0] == shape[1]:
            M = np.random.randn(*shape).astype(theano.config.floatX)
            Q, R = np.linalg.qr(M)
            Q = Q * np.sign(np.diag(R))
            param = Q * 1.0
        else:
            M1 = np.random.randn(shape[0], shape[0]).astype(theano.config.floatX)
            M2 = np.random.randn(shape[1], shape[1]).astype(theano.config.floatX)
            Q1, R1 = np.linalg.qr(M1)
            Q2, R2 = np.linalg.qr(M2)
            Q1 = Q1 * np.sign(np.diag(R1))
            Q2 = Q2 * np.sign(np.diag(R2))
            n_min = min(shape[0], shape[1])
            param = np.dot(Q1[:, :n_min], Q2[:n_min, :]) * 1.0
        if shared:
            return theano.shared(value=param, name=name, borrow=True)
        return param

def softmax(x):
    if x.ndim == 3:
        x_shape = x.shape
        x = x.reshape((x_shape[0] * x_shape[1], x_shape[2]))
        return T.nnet.softmax(x).reshape(x_shape)
    elif x.ndim == 4:
        x_shape = x.shape
        x = x.reshape((x_shape[0] * x_shape[1] * x_shape[2], x_shape[3]))
        return T.nnet.softmax(x).reshape(x_shape)
    return T.nnet.softmax(x)


def sigmoid(x):
    return T.nnet.sigmoid(x)


def tanh(x):
    return T.tanh(x)


def relu(x):
    return T.nnet.relu(x)

In [14]:

class Unit(object):
    def __init__(self, name='unit'):
        self.name = name

    @staticmethod
    def _set_param(shape, init_type=None, name=None):
        if init_type == 'zero':
            init = Zero()
        elif init_type == 'one':
            init = One()
        elif init_type == 'xavier':
            init = Xavier()
        elif init_type == 'orth':
            init = Orthonormal()
        elif init_type == 'identity':
            init = Identity()
        elif init_type == 'uniform':
            init = Uniform()
        else:
            init = Normal()
        return init(shape=shape, name=name)

    @staticmethod
    def _set_activation(activation_type):
        if activation_type == 'sigmoid':
            return sigmoid
        elif activation_type == 'tanh':
            return tanh
        elif activation_type == 'relu':
            return relu
        elif activation_type == 'softmax':
            return softmax
        return None

class Dense(Unit):
    def __init__(self,
                 input_dim,
                 output_dim,
                 activation=None,
                 use_bias=True,
                 weight_init='xavier',
                 bias_init='zero'):
        super(Dense, self).__init__(name='Dense(%dx%d,%s)' % (input_dim, output_dim, activation))

        self.W = self._set_param(shape=(input_dim, output_dim),
                                 init_type=weight_init,
                                 name='W_dense')
        if use_bias:
            self.b = self._set_param(shape=output_dim,
                                     init_type=bias_init,
                                     name='b_dense')
            self.params = [self.W, self.b]
        else:
            self.b = None
            self.params = [self.W]

        self.activation = self._set_activation(activation)

    def forward(self, x):
        h = T.dot(x, self.W)
        if self.b:
            h = h + self.b
        if self.activation:
            h = self.activation(h)
        return h

class Dropout(Unit):
    """
    Reference: [Dropout: A Simple Way to Prevent Neural Networks from Overfitting]
    """
    def __init__(self, rate, seed=0):
        super(Dropout, self).__init__(name='Dropout(p={:>1.1})'.format(rate))
        self.rate = min(1., max(0., rate))
        self.srng = T.shared_randomstreams.RandomStreams(seed=seed)

    def forward(self, x, is_train):
        drop_mask = self.srng.binomial(size=x.shape, n=1, p=1 - self.rate, dtype=theano.config.floatX)
        return T.switch(T.eq(is_train, 1), x * drop_mask, x * (1 - self.rate))

class Embedding(Unit):
    def __init__(self,
                 input_dim,
                 output_dim,
                 init_emb=None,
                 param_init='xavier',
                 param_fix=False,
                 drop_rate=0.0,
                 name=None):
        super(Embedding, self).__init__(name=name if name else 'Emb(%dx%d)' % (input_dim, output_dim))
        self.dropout = Dropout(drop_rate)

        self.W = self._set_weight(input_dim, output_dim, init_emb, param_init)
        if param_fix:
            self.params = []
        else:
            self.params = [self.W]

    def _set_weight(self, input_dim, output_dim, init_emb, param_init):
        if init_emb is None:
            return self._set_param(shape=(input_dim, output_dim),
                                   init_type=param_init,
                                   name='embedding')
        return theano.shared(init_emb)

    def forward(self, x, is_train=0):
        return self.dropout.forward(x=self.W[x], is_train=is_train)

LSTM Code

In [15]:
class LSTM(Unit):
    def __init__(self,
                 input_dim,
                 output_dim,
                 use_bias=True,
                 recurrent_init='orth',
                 bias_init='zero'):
        super(LSTM, self).__init__(name='LSTM(%dx%d)' % (input_dim, output_dim))

        self.input_dim = input_dim
        self.output_dim = output_dim

        # inout gate parameters
        self.W_xi = self._set_param(shape=(input_dim, output_dim),
                                    init_type=recurrent_init,
                                    name='W_xi')
        self.W_hi = self._set_param(shape=(output_dim, output_dim),
                                    init_type=recurrent_init,
                                    name='W_hi')
        self.W_ci = self._set_param(shape=output_dim,
                                    init_type='xavier',
                                    name='W_ci')

        # forget gate parameters
        self.W_xf = self._set_param(shape=(input_dim, output_dim),
                                    init_type=recurrent_init,
                                    name='W_xf')
        self.W_hf = self._set_param(shape=(output_dim, output_dim),
                                    init_type=recurrent_init,
                                    name='W_hf')
        self.W_cf = self._set_param(shape=output_dim,
                                    init_type='xavier',
                                    name='W_cf')

        # cell parameters
        self.W_xc = self._set_param(shape=(input_dim, output_dim),
                                    init_type=recurrent_init,
                                    name='W_xc')
        self.W_hc = self._set_param(shape=(output_dim, output_dim),
                                    init_type=recurrent_init,
                                    name='W_hc')

        # output gate parameters
        self.W_xo = self._set_param(shape=(input_dim, output_dim),
                                    init_type=recurrent_init,
                                    name='W_xf')
        self.W_ho = self._set_param(shape=(output_dim, output_dim),
                                    init_type=recurrent_init,
                                    name='W_hf')
        self.W_co = self._set_param(shape=output_dim,
                                    init_type='xavier',
                                    name='W_cf')

        if use_bias:
            self.b_xi = self._set_param(shape=output_dim,
                                        init_type=bias_init,
                                        name='b_xi')
            self.b_xf = self._set_param(shape=output_dim,
                                        init_type='one',
                                        name='b_xf')
            self.b_xc = self._set_param(shape=output_dim,
                                        init_type=bias_init,
                                        name='b_xc')
            self.b_xo = self._set_param(shape=output_dim,
                                        init_type=bias_init,
                                        name='b_xo')
            self.params = [self.W_xi, self.W_hi, self.W_ci, self.W_xf, self.W_hf, self.W_cf,
                           self.W_xc, self.W_hc, self.W_xo, self.W_ho, self.W_co,
                           self.b_xi, self.b_xf, self.b_xc, self.b_xo]
        else:
            self.b_xi = None
            self.b_xf = None
            self.b_xc = None
            self.b_xo = None
            self.params = [self.W_xi, self.W_hi, self.W_ci, self.W_xf, self.W_hf, self.W_cf,
                           self.W_xc, self.W_hc, self.W_xo, self.W_ho, self.W_co]

    def _step(self, xi_t, xf_t, xc_t, xo_t, h_tm1, c_tm1):
        i_t = sigmoid(xi_t + T.dot(h_tm1, self.W_hi) + c_tm1 * self.W_ci)
        f_t = sigmoid(xf_t + T.dot(h_tm1, self.W_hf) + c_tm1 * self.W_cf)
        c_t = f_t * c_tm1 + i_t * tanh(xc_t + T.dot(h_tm1, self.W_hc))
        o_t = sigmoid(xo_t + T.dot(h_tm1, self.W_ho) + c_t * self.W_co)
        h_t = o_t * tanh(c_t)
        return h_t, c_t

    def forward(self, x, h0=None, mask=None):
        xi = T.dot(x, self.W_xi) + self.b_xi
        xf = T.dot(x, self.W_xf) + self.b_xf
        xc = T.dot(x, self.W_xc) + self.b_xc
        xo = T.dot(x, self.W_xo) + self.b_xo

        inputs = [xi, xf, xc, xo]

        if h0 is None:
            h0 = T.zeros(shape=(x[0].shape[0], self.output_dim), dtype=theano.config.floatX)
        c0 = T.zeros(shape=(x[0].shape[0], self.output_dim), dtype=theano.config.floatX)

        [h, _], _ = theano.scan(fn=self._step,
                                sequences=inputs,
                                outputs_info=[h0, c0])
        return h

class StackLayer(object):
    def __init__(self, name='StackLayer'):
        self.name = name
        self.layers = []
        self.params = []

    def _set_layers(self):
        raise NotImplementedError

    @staticmethod
    def _set_rnn_unit(unit_type):
        return LSTM

    @staticmethod
    def _set_connect_unit(connect_type):
        return Dense

    def _set_params(self):
        params = []
        for layer in self.layers:
            params.extend(layer.params)
        return params

    def forward(self, x, **kwargs):
        raise NotImplementedError


class BiRNNLayer(StackLayer):
    def __init__(self,
                 input_dim,
                 output_dim,
                 n_layers,
                 unit_type,
                 connect_type,
                 drop_rate=0.0):
        name = 'BiRNNs-%d:(%dx%d)' % (n_layers, input_dim, output_dim)
        super(BiRNNLayer, self).__init__(name=name)

        self.input_dim = input_dim
        self.output_dim = output_dim
        self.n_layers = n_layers
        self.rnn_unit = self._set_rnn_unit(unit_type)
        self.connect_unit = self._set_connect_unit(connect_type)
        self.dropout = Dropout(drop_rate)

        self.layers = self._set_layers()
        self.params = self._set_params()

    def _set_layers(self):
        layers = []
        for i in range(self.n_layers):
            if i == 0:
                rnn_input_dim = self.input_dim
                connect_input_dim = self.input_dim + self.output_dim
            else:
                rnn_input_dim = self.output_dim
                connect_input_dim = self.output_dim * 2

            r_unit = self.rnn_unit(input_dim=rnn_input_dim,
                                   output_dim=self.output_dim)
            c_unit = self.connect_unit(input_dim=connect_input_dim,
                                       output_dim=self.output_dim,
                                       activation='relu')
            layers += [r_unit, c_unit]
        return layers

    def forward(self, x, mask=None, is_train=False):
        n_layers = int(len(self.layers) / 2)
        for i in range(n_layers):
            if mask is None:
                h = self.layers[i * 2].forward(x=x)
                h = self.dropout.forward(x=h, is_train=is_train)
                x = self.layers[i * 2 + 1].forward(T.concatenate([x, h], axis=2))
            else:
                h = self.layers[i * 2].forward(x=x, mask=mask)
                h = self.dropout.forward(x=h, is_train=is_train)
                x = self.layers[i * 2 + 1].forward(T.concatenate([x, h], axis=2)) * mask
                mask = mask[::-1]
            x = x[::-1]
        if (n_layers % 2) == 1:
            return x[::-1]
        return x

In [16]:
class Regularizer(object):
    def __call__(self, **kwargs):
        raise NotImplementedError


class L2Regularizer(Regularizer):
    def __call__(self, alpha, params):
        return alpha * l2_sqr(params) / 2.

def l2_sqr(params):
    sqr = 0.0
    for p in params:
        sqr += T.sum((p ** 2))
    return sqr

def logsumexp3d(x, axis=2):
    # 1D: batch_size, 2D: n_labels, 3D: 1
    x_max = T.max(x, axis=axis, keepdims=True)
    # 1D: batch_size, 2D: n_labels
    return T.log(T.sum(T.exp(x - x_max), axis=axis)) + x_max.dimshuffle(0, 1)

Span Models

In [17]:
class Model(object):
    def __init__(self):
        self.is_train = theano.shared(0, borrow=True)
        self.inputs = None
        self.outputs = None
        self.dropout = None
        self.input_layers = []
        self.hidden_layers = []
        self.output_layers = []
        self.layers = []
        self.params = []

    def compile(self, **kwargs):
        raise NotImplementedError

    def _set_params(self):
        for l in self.layers:
            self.params += l.params

class FeatureLayer(Model):
    def compile(self, **kwargs):
        self._set_layers(kwargs)
        self._set_params()

    def forward(self, inputs):
        embs = []
        for i in range(len(inputs)):
            # 1D: batch_size, 2D: n_words, 3D: input_dim
            emb_i = self.input_layers[i].forward(x=inputs[i],
                                                 is_train=self.is_train)
            embs.append(emb_i)

        # 1D: batch_size, 2D: n_words, 3D: input_dim
        x = T.concatenate(tensor_list=embs, axis=2)
        # 1D: n_words, 2D: batch_size, 3D: hidden_dim
        h = self.hidden_layers[0].forward(x=x.dimshuffle(1, 0, 2),
                                          is_train=self.is_train)
        return h

    def _set_layers(self, args):
        x_w_dim, x_m_dim = args['input_dim']
        hidden_dim = args['hidden_dim']
        drop_rate = args['drop_rate']

        if args['vocab_word_size'] > 0:
            emb_word = Embedding(input_dim=args['vocab_word_size'],
                                 output_dim=x_w_dim,
                                 init_emb=args['word_emb'],
                                 param_fix=True,
                                 drop_rate=drop_rate,
                                 name='EmbWord')
            self.input_layers.append(emb_word)

        if args['use_elmo']:
            emb_elmo = ElmoLayer(drop_rate=0.5,
                                 name='EmbElmo')
            self.input_layers.append(emb_elmo)

        emb_mark = Embedding(input_dim=2,
                             output_dim=x_m_dim,
                             init_emb=None,
                             param_init='xavier',
                             param_fix=False,
                             drop_rate=drop_rate,
                             name='EmbMark')
        self.input_layers.append(emb_mark)

        if args['use_elmo']:
            hidden_input_dim = (len(self.input_layers) - 2) * x_w_dim + x_m_dim + 1024
        else:
            hidden_input_dim = (len(self.input_layers) - 1) * x_w_dim + x_m_dim
        hidden_layer = BiRNNLayer(input_dim=hidden_input_dim,
                                  output_dim=hidden_dim,
                                  n_layers=args['n_layers'],
                                  unit_type='lstm',
                                  connect_type='dense',
                                  drop_rate=drop_rate)
        self.hidden_layers = [hidden_layer]
        self.layers = self.input_layers + self.hidden_layers

class LabelLayer(Model):
    def compile(self, **kwargs):
        self._set_layers(hidden_dim=kwargs['feat_dim'],
                         output_dim=kwargs['output_dim'])
        self._set_params()

    def _set_layers(self, hidden_dim, output_dim):
        self.layers = [Dense(input_dim=hidden_dim,
                             output_dim=output_dim)]

    def span_feats2(self, h):
        """
        :param h: 1D: n_words, 2D: batch_size, 3D: hidden_dim
        :return: 1D: batch_size, 2D: n_spans, 3D: 2 * hidden_dim
        """
        h = h.dimshuffle(1, 0, 2)
        n_words = h.shape[1]

        m = T.triu(T.ones(shape=(n_words, n_words)))
        indices = m.nonzero()

        # 1D: batch_size, 2D: n_spans, 3D: hidden_dim
        h_i = h[:, indices[0]]
        h_j = h[:, indices[1]]

        h_diff = h_i - h_j
        h_add = h_i + h_j

        return T.concatenate([h_add, h_diff], axis=2)

    def span_feats(self, h):
        h = h.dimshuffle(1, 0, 2)
        n_words = h.shape[1]
        pad = T.zeros(shape=(h.shape[0], 1, h.shape[2]))
        h_pad = T.concatenate([h, pad], axis=1)

        m = T.triu(T.ones(shape=(n_words, n_words)))
        indices = m.nonzero()

        # 1D: batch_size, 2D: n_spans, 3D: hidden_dim
        h_i = h[:, indices[0]]
        h_j = h_pad[:, indices[1] + 1]

        h_diff = h_i - h_j
        h_add = h_i + h_j

        return T.concatenate([h_add, h_diff], axis=2)

    def logit_scores(self, h):
        return self.layers[-1].forward(h).dimshuffle(0, 2, 1)

class SpanModel(Model):
    def __init__(self):
        super(SpanModel, self).__init__()
        self.feat_layer = None
        self.label_layer = None

    def compile(self, inputs, **kwargs):
        self.inputs = inputs
        self.feat_layer = FeatureLayer()
        self.feat_layer.compile(**kwargs)
        self.label_layer = LabelLayer()
        self.label_layer.compile(**kwargs)
        self.layers = self.feat_layer.layers + self.label_layer.layers
        self._set_params()

    def span_feats(self, inputs):
        """
        :param inputs: 1D: n_inputs, 2D: batch_size, 3D: n_words; feat id
        :return: 1D: batch_size, 2D: n_spans, 3D: 2 * hidden_dim
        """
        # 1D: n_words, 2D: batch_size, 3D: 2 * hidden_dim
        h_rnn = self.feat_layer.forward(inputs)
        return self.label_layer.span_feats(h_rnn)

    @staticmethod
    def argmax_span(span_score):
        """
        :param span_score: 1D: batch_size, 2D: n_labels, 3D: n_spans
        :return: 1D: batch_size, 2D: n_labels; span index
        """
        return T.argmax(span_score, axis=2)

    @staticmethod
    def loss(span_score, span_true):
        """
        :param span_score: 1D: batch_size, 2D: n_labels, 3D: n_spans
        :param span_true: 1D: batch_size * n_spans; (batch index, label id, span index)
        """
        batch_size = span_score.shape[0]

        # 1D: batch_size * n_spans; index
        batch_index = span_true[:, 0]
        label_index = span_true[:, 1]
        span_index = span_true[:, 2]

        # 1D: batch_size * n_spans; score
        true_span_score = span_score[batch_index, label_index, span_index]

        # 1D: batch_size, 2D: n_labels; elem=score
        z = logsumexp3d(span_score, axis=2)
        # 1D: batch_size * n_spans; score
        z = z[batch_index, label_index]

        # 1D: batch_size * n_spans; score
        nll = true_span_score - z

        return - T.sum(nll) / batch_size

    @staticmethod
    def exp_score(span_score):
        """
        :param span_score: 1D: batch_size, 2D: n_labels, 3D: n_spans; logit score
        :return: 1D: batch_size, 2D: n_labels, 3D: n_spans
        """
        return T.exp(span_score)


In [18]:
import gzip
import pickle

def save_pickle(fn, data):
    with gzip.open(fn + '.pkl.gz', 'wb') as gf:
        pickle.dump(data, gf, pickle.HIGHEST_PROTOCOL)

def load_pickle(fn):
    with gzip.open(fn, 'rb') as gf:
        return pickle.load(gf)


Optimizers

In [19]:
def get_optimizer(argv):
    if argv.opt_type == 'adam':
        return Adam(argv=argv, lr=argv.lr, grad_clip=argv.grad_clip)
    return SGD(argv=argv, lr=argv.lr, grad_clip=argv.grad_clip)

class Optimizer(object):
    def __init__(self, **kwargs):
        self.argv = kwargs['argv']
        self.grad_clip = kwargs['grad_clip']
        self.params = []

    def __call__(self, grads, params):
        raise NotImplementedError

    def set_params(self, **kwargs):
        raise NotImplementedError

    def init_params(self):
        for p in self.params:
            p.set_value(p.get_value(borrow=True) * 0)

    @staticmethod
    def _grad_clipping(gradients, max_norm=5.0):
        global_grad_norm = T.sqrt(sum(map(lambda x: T.sqr(x).sum(), gradients)))
        multiplier = T.switch(global_grad_norm < max_norm, 1.0, max_norm / global_grad_norm)
        return [g * multiplier for g in gradients]

    def save_params(self, epoch=0):
        argv = self.argv
        if argv.output_dir:
            dir_name = argv.output_dir
        else:
            dir_name = 'output'
        if argv.output_fn:
            file_name = '/opt.param.%s.epoch-%d' % (argv.output_fn, epoch)
        else:
            file_name = '/opt.param.%s.epoch-%d' % (argv.method, epoch)

        fn = dir_name + file_name
        params = [p.get_value(borrow=True) for p in self.params]
        save_pickle(fn=fn, data=params)

    def load_params(self, path):
        params = load_pickle(path)
        assert len(self.params) == len(params)
        for p1, p2 in zip(self.params, params):
            p1.set_value(p2)


class SGD(Optimizer):
    def __init__(self, lr=0.001, **kwargs):
        super(SGD, self).__init__(**kwargs)
        self.lr = theano.shared(np.asarray(lr, dtype=theano.config.floatX), borrow=True)

    def __call__(self, params, grads):
        updates = []
        if self.grad_clip:
            grads = self._grad_clipping(grads, max_norm=1.0)
        for p, g in zip(params, grads):
            updates.append((p, p - self.lr * g))
        return updates

    def set_params(self):
        pass


class Adam(Optimizer):
    def __init__(self, lr=0.001, b1=0.9, b2=0.999, eps=1e-8, **kwargs):
        super(Adam, self).__init__(**kwargs)
        self.lr = theano.shared(np.asarray(lr, dtype=theano.config.floatX), borrow=True)
        self.b1 = b1
        self.b2 = b2
        self.eps = eps

    def __call__(self, params, grads):
        updates = []

        i = self.params[0]
        i_t = i + 1.
        a_t = self.lr * T.sqrt(1 - self.b2 ** i_t) / (1 - self.b1 ** i_t)

        if self.grad_clip:
            grads = self._grad_clipping(grads, max_norm=1.0)

        for index, (p, g) in enumerate(zip(params, grads)):
            v = self.params[2 * index + 1]
            r = self.params[2 * index + 2]
            index += 2

            v_t = self.b1 * v + (1. - self.b1) * g
            r_t = self.b2 * r + (1. - self.b2) * g ** 2

            step = a_t * v_t / (T.sqrt(r_t) + self.eps)

            updates.append((v, v_t))
            updates.append((r, r_t))
            updates.append((p, p - step))

        updates.append((i, i_t))
        return updates

    def set_params(self, params):
        i = theano.shared(np.asarray(.0, dtype=theano.config.floatX))
        self.params.append(i)
        for p in params:
            p_tm = p.get_value(borrow=True)
            v = theano.shared(np.zeros(p_tm.shape, dtype=p_tm.dtype))
            r = theano.shared(np.zeros(p_tm.shape, dtype=p_tm.dtype))
            self.params += [v, r]

In [3]:
def correct_and_pred_spans(span_true, span_pred, marks):
    
    correct = 0.
    n_pred_spans = 0.
    n_words = len(marks[0])
    _, prd_indices = np.array(marks).nonzero()
    prd_indices = [span_to_span_index(p, p, n_words) for p in prd_indices]

    for b_index, span_pred_tmp in enumerate(span_pred):
        prd_index = prd_indices[b_index]
        for label_id, span_index in enumerate(span_pred_tmp):
            if span_index == prd_index:
                continue
            if [b_index, label_id, span_index] in span_true:
                correct += 1
            n_pred_spans += 1

    return correct, n_pred_spans



In [21]:

class SpanModelAPI(object):
    def __init__(self, argv):
        self.argv = argv

        self.model = None
        self.experts = None
        self.train_func = None
        self.pred_func = None

        self.vocab_word = None
        self.vocab_label = None
        self.vocab_label_valid = None

        self.input_dim = None
        self.hidden_dim = None
        self.output_dim = None
        self.use_elmo = None

        self.decoder = None
        self.optimizer = None

        self.n_true_spans = 0.

    def set_model(self, **kwargs):
        write('Setting a model...')
        argv = self.argv

        self.vocab_word = kwargs['vocab_word']
        self.use_elmo = kwargs['use_elmo']
        self.vocab_label = kwargs['vocab_label']
        self.vocab_label_valid = kwargs['vocab_label_valid']
        word_emb = kwargs['word_emb']
        vocab_word_size = self.vocab_word.size() if self.vocab_word else 0

        self.input_dim = argv.emb_dim if word_emb is None else word_emb.shape[1]
        self.hidden_dim = argv.hidden_dim
        self.output_dim = -1

        self.decoder = Decoder(argv=argv, vocab_label=self.vocab_label)

        self.model = SpanModel()
        self.model.compile(inputs=self._set_inputs(),
                           vocab_word_size=vocab_word_size,
                           use_elmo=self.use_elmo,
                           word_emb=word_emb,
                           input_dim=[self.input_dim, self.input_dim],
                           hidden_dim=self.hidden_dim,
                           feat_dim=2 * self.hidden_dim,
                           output_dim=self.vocab_label.size(),
                           n_layers=argv.n_layers,
                           drop_rate=argv.drop_rate)

        write('\t- {}'.format("\n\t- ".join([l.name for l in self.model.layers])))
        self._show_model_config()

    def set_ensemble_model(self, **kwargs):
        write('Setting a model...')
        argv = self.argv

        self.vocab_word = kwargs['vocab_word']
        self.use_elmo = kwargs['use_elmo']
        self.vocab_label = kwargs['vocab_label']
        self.vocab_label_valid = kwargs['vocab_label_valid']
        word_emb = kwargs['word_emb']
        vocab_word_size = self.vocab_word.size() if self.vocab_word else 0

        self.input_dim = argv.emb_dim if word_emb is None else word_emb.shape[1]
        self.hidden_dim = argv.hidden_dim
        self.output_dim = -1

        self.decoder = Decoder(argv=argv, vocab_label=self.vocab_label)

        
        inputs = self._set_inputs()
        self.model = MoEModel()
        self.model.compile(inputs=inputs,
                           feat_dim=2 * self.hidden_dim,
                           output_dim=self.vocab_label.size(),
                           drop_rate=argv.drop_rate,
                           n_experts=argv.n_experts)
        write('\t- {}\n'.format("\n\t- ".join([l.name for l in self.model.layers])))

        
        experts = []
        for _ in range(argv.n_experts):
            model = SpanModel()
            model.compile(inputs=self.model.inputs,
                          vocab_word_size=vocab_word_size,
                          use_elmo=self.use_elmo,
                          input_dim=[self.input_dim, self.input_dim],
                          hidden_dim=self.hidden_dim,
                          feat_dim=2 * self.hidden_dim,
                          output_dim=self.vocab_label.size(),
                          n_layers=argv.n_layers,
                          word_emb=word_emb,
                          drop_rate=argv.drop_rate)
            write('\t- {}\n'.format("\n\t- ".join([l.name for l in model.layers])))
            experts.append(model)

        self.experts = experts

    def _set_inputs(self):
        x = []
        if self.vocab_word:
            x.append(T.imatrix('x_word'))
        if self.use_elmo:
            x.append(T.ftensor4('x_elmo'))
        x.append(T.imatrix('x_mark'))
        assert len(x) > 1
        return x

    def _show_model_config(self):
        model = self.model
        write('Model configuration')
        write('\t- Input  Dim: {}'.format(self.input_dim))
        write('\t- Hidden Dim: {}'.format(self.hidden_dim))
        write('\t- Output Dim: {}'.format(self.output_dim))
        write('\t- Parameters: {}'.format(sum(len(x.get_value(borrow=True).ravel())
                                              for x in model.params)))

    def save_params(self, epoch=-1):
        argv = self.argv
        if argv.output_dir:
            dir_name = argv.output_dir
        else:
            dir_name = 'output'
        if argv.output_fn:
            file_name = '/param.%s.epoch-%d' % (argv.output_fn, epoch)
        else:
            file_name = '/param.epoch-%d' % epoch

        fn = dir_name + file_name
        params = [p.get_value(borrow=True) for p in self.model.params]
        save_pickle(fn=fn, data=params)

    def load_params(self, path):
        params = load_pickle(path)
        assert len(self.model.params) == len(params)
        for p1, p2 in zip(self.model.params, params):
            p1.set_value(p2)

    def load_experts_params(self, path):
        write('Loading experts params...')
        param_files = glob.glob(path + '/*')
        param_files = [fn for fn in param_files
                       if fn.split('/')[-1].startswith('param')]
        write("\t - Param Files: %s" % str(param_files))
        for i, path in enumerate(param_files[:self.argv.n_experts]):
            params = load_pickle(path)
            assert len(self.experts[i].params) == len(params)
            for p1, p2 in zip(self.experts[i].params, params):
                p1.set_value(p2)

    def set_init_ensemble_param(self):
        write('Initializing params...')
        W = np.zeros(shape=(2 * self.hidden_dim, self.vocab_label.size()),
                     dtype=theano.config.floatX)
        b = np.zeros(shape=self.vocab_label.size(),
                     dtype=theano.config.floatX)
        for model in self.experts:
            W += model.params[-2].get_value(borrow=True)
        for model in self.experts:
            b += model.params[-1].get_value(borrow=True)
        W = W / len(self.experts)
        b = b / len(self.experts)
        self.model.params[-2].set_value(W)
        self.model.params[-1].set_value(b)

    def set_train_func(self):
        write('Building a training function...')

        self.optimizer = get_optimizer(self.argv)
        self.optimizer.set_params(self.model.params)
        if self.argv.load_opt_param:
            self.optimizer.load_params(self.argv.load_opt_param)

        span_true = T.imatrix('span_true')

        h_span = self.model.span_feats(inputs=self.model.inputs)
        span_score = self.model.label_layer.logit_scores(h=h_span)
        span_pred = self.model.argmax_span(span_score=span_score)

        nll = self.model.loss(span_score, span_true)
        l2_reg = L2Regularizer()
        objective = nll + l2_reg(alpha=self.argv.reg,
                                 params=self.model.params)

        grads = T.grad(cost=objective, wrt=self.model.params)
        updates = self.optimizer(grads=grads, params=self.model.params)

        self.train_func = theano.function(
            inputs=self.model.inputs + [span_true],
            outputs=[objective, span_pred],
            updates=updates,
            mode='FAST_RUN'
        )

    def set_pred_func(self):
        write('Building a predicting function...')
        if self.argv.search == 'argmax':
            self.set_pred_argmax_func()
        else:
            self.set_pred_score_func()

    def set_pred_argmax_func(self):
        h_span = self.model.span_feats(inputs=self.model.inputs)
        logits = self.model.label_layer.logit_scores(h_span)
        span_pred = self.model.argmax_span(logits)

        self.pred_func = theano.function(
            inputs=self.model.inputs,
            outputs=span_pred,
            mode='FAST_RUN'
        )

    def set_pred_score_func(self):
        h_span = self.model.span_feats(inputs=self.model.inputs)
        logits = self.model.label_layer.logit_scores(h_span)
        span_score = self.model.exp_score(logits)

        self.pred_func = theano.function(
            inputs=self.model.inputs,
            outputs=span_score,
            mode='FAST_RUN'
        )

    def set_ensemble_train_func(self):
        write('Building an ensemble training function...')

        self.optimizer = get_optimizer(self.argv)
        self.optimizer.set_params(self.model.params)
        if self.argv.load_opt_param:
            self.optimizer.load_params(self.argv.load_opt_param)

        span_true = T.imatrix('span_true')

        h_span = self.model.feat_layer.forward(self.model.inputs,
                                               self.experts)
        logits = self.model.feat_layer.logit_scores(h=h_span)
        span_pred = self.model.argmax_span(logits)

        nll = self.model.loss(logits, span_true)
        l2_reg = L2Regularizer()
        objective = nll + l2_reg(alpha=self.argv.reg,
                                 params=self.model.params)

        grads = T.grad(cost=objective, wrt=self.model.params)
        updates = self.optimizer(grads=grads,
                                 params=self.model.params)

        self.train_func = theano.function(
            inputs=self.model.inputs + [span_true],
            outputs=[objective, span_pred],
            updates=updates,
            mode='FAST_RUN'
        )

    def set_ensemble_pred_func(self):
        write('Building an ensemble predicting function...')
        if self.argv.search == 'argmax':
            self.set_ensemble_pred_argmax_func()
        else:
            self.set_ensemble_pred_score_func()

    def set_ensemble_pred_argmax_func(self):
        # 1D: batch_size, 2D: n_spans, 3D: 2 * hidden_dim
        h_span = self.model.feat_layer.forward(self.model.inputs,
                                               self.experts)
        # 1D: batch_size, 2D: n_labels, 3D: n_spans; score
        span_score = self.model.feat_layer.logit_scores(h=h_span)
        # 1D: batch_size, 2D: n_labels; span index
        span_pred = self.model.argmax_span(span_score=span_score)

        self.pred_func = theano.function(
            inputs=self.model.inputs,
            outputs=span_pred,
            mode='FAST_RUN'
        )

    def set_ensemble_pred_score_func(self):
        # 1D: batch_size, 2D: n_spans, 3D: 2 * hidden_dim
        h_span = self.model.feat_layer.forward(self.model.inputs,
                                               self.experts)
        # 1D: batch_size, 2D: n_labels, 3D: n_spans; score
        logits = self.model.feat_layer.logit_scores(h=h_span)
        # 1D: batch_size, 2D: n_labels, 3D: n_spans; score
        span_score = self.model.exp_score(logits)

        self.pred_func = theano.function(
            inputs=self.model.inputs,
            outputs=span_score,
            mode='FAST_RUN'
        )

    def train(self, batches):
        start = time.time()
        n_batches = 0.
        loss_total = 0.
        p_total = 0.
        correct = 0.

        self.model.feat_layer.is_train.set_value(1)
        if self.experts:
            for model in self.experts:
                model.feat_layer.is_train.set_value(1)

        for inputs in batches:
            n_batches += 1

            if n_batches % 100 == 0:
                sys.stdout.write("%d " % n_batches)
                sys.stdout.flush()

            n_words = len(inputs[0][0])
            if n_words < 2 or 100 < n_words:
                continue

            loss, span_pred = self.train_func(*inputs)

            if math.isnan(loss):
                write('\n\nNAN: Index: %d\n' % n_batches)
                exit()

            loss_total += loss
            correct_i, p_total_i = correct_and_pred_spans(span_true=inputs[-1],
                                                          span_pred=span_pred,
                                                          marks=inputs[1])
            correct += correct_i
            p_total += p_total_i

        self.model.feat_layer.is_train.set_value(0)
        if self.experts:
            for model in self.experts:
                model.feat_layer.is_train.set_value(0)

        avg_loss = loss_total / n_batches
        p, r, f = f_score(correct, p_total, self.n_true_spans)

        write('\n\tTime: %f seconds' % (time.time() - start))
        write('\tAverage Negative Log Likelihood: %f(%f/%d)' % (avg_loss, loss_total, n_batches))
        write('\tF:{:>7.2%}  P:{:>7.2%} ({:>5}/{:>5})  R:{:>7.2%} ({:>5}/{:>5})'.format(
            f, p, int(correct), int(p_total), r, int(correct), int(self.n_true_spans)))

    def predict(self, batches):
        if self.argv.search == 'argmax':
            return self.predict_argmax(batches)
        else:
            return self.predict_greedy(batches)

    def predict_argmax(self, batches):
        """
        :param batches: 1D: n_sents, 2D: n_prds, 3D: n_feats, 4D: n_words; elem=(x_w, x_m)
        :return: y: 1D: n_sents, 2D: n_prds, 3D: n_spans, 3D: [label_id, pre_index, post_index]
        """
        start = time.time()
        y = []

        for index, inputs in enumerate(batches):
            if (index + 1) % 100 == 0:
                sys.stdout.write("%d " % (index + 1))
                sys.stdout.flush()

            if len(inputs) == 0:
                span_triples = []
            else:
                span_pred = self.pred_func(*inputs)
                span_triples = self.decoder.argmax_span_triples(span_indices=span_pred,
                                                                marks=inputs[-1])
            y.append(span_triples)

        write('\n\tTime: %f seconds' % (time.time() - start))
        return y

    def predict_greedy(self, batches):
        """
        :param batches: 1D: n_sents, 2D: n_prds, 3D: n_feats, 4D: n_words; elem=(x_w, x_m)
        :return: y: 1D: n_sents, 2D: n_prds, 3D: n_spans, 3D: [label_id, pre_index, post_index]
        """
        start = time.time()
        y = []

        for index, inputs in enumerate(batches):
            if (index + 1) % 100 == 0:
                sys.stdout.write("%d " % (index + 1))
                sys.stdout.flush()

            if len(inputs) == 0:
                span_triples = []
            else:
                scores = self.pred_func(*inputs)
                span_triples = self.decoder.greedy_span_triples(scores=scores,
                                                                marks=inputs[-1])
            y.append(span_triples)

        write('\n\tTime: %f seconds' % (time.time() - start))
        return y

In [22]:
def count_true_spans(sents):
    """
    :param sents: 1D: n_sents
    :return: total number of spans
    """
    return sum([len(triple) for sent in sents for triple in sent.span_triples])

def load_emb(path):
    word_list = []
    emb = []
    with open(path) as f:
        for line in f:
            line = line.rstrip().split()
            word_list.append(line[0])
            emb.append(line[1:])
    emb = np.asarray(emb, dtype=theano.config.floatX)

    if UNK not in word_list:
        word_list = [UNK] + word_list
        unk_vector = np.mean(emb, axis=0)
        emb = np.vstack((unk_vector, emb))

    return word_list, emb

def write(s, stream=sys.stdout):
    stream.write(s + '\n')
    stream.flush()


def show_score_history(history, memo=''):
    write('F1 HISTORY' + memo)
    for k, v in sorted(history.items()):
        epoch_tm = '\t- EPOCH-{:d}  '.format(k)
        if len(v) == 1:
            f1_valid = '\tBEST VALID {:>7.2%}'.format(v[0])
            write(epoch_tm + f1_valid)
        else:
            v1, v2 = v
            f1_valid = '\tBEST VALID {:>7.2%}'.format(v1)
            f1_evalu = '\tEVALU {:>7.2%}'.format(v2)
            write(epoch_tm + f1_valid + f1_evalu)

def str_to_id(sent, vocab, unk):
    """
    :param sent: 1D: n_words
    :param vocab: Vocab()
    :return: 1D: n_words; elem=id
    """
    return list(map(lambda w: vocab.get_id(w) if vocab.has_key(w) else vocab.get_id(unk), sent))


def make_vocab_from_ids(key_value_format):
    vocab = Vocab()
    for key, value in key_value_format:
        vocab.add_word(key)
    return vocab


def array(sample, is_float=False):
    if is_float:
        return np.asarray(sample, dtype=theano.config.floatX)
    return np.asarray(sample, dtype='int32')


def average_vector(emb):
    return np.mean(np.asarray(emb[2:], dtype=theano.config.floatX), axis=0)


def unit_vector(vecs, axis):
    return vecs / np.sqrt(np.sum(vecs ** 2, axis=axis, keepdims=True))


def make_output_dir(argv):
    if argv.output_dir:
        output_dir = argv.output_dir
    else:
        output_dir = 'output'
    os.makedirs(output_dir, exist_ok=True)


def join_dir_and_file_names(dir_name, file_name):
    return os.path.join(dir_name, file_name)


def get_file_names_in_dir(dir_path, prefix=None, suffix=None):
    file_names = glob.glob(dir_path + '/*')
    if prefix:
        file_names = [fn for fn in file_names
                      if os.path.basename(fn).startswith(prefix)]
    if suffix:
        file_names = [fn for fn in file_names
                      if fn.endswith(suffix)]
    return file_names


def get_latest_param_fn(file_names):
    latest_epoch = -1
    latest_fn = None
    for fn in file_names:
        for elem in fn.split('.'):
            if elem.startswith('epoch'):
                epoch = int(elem[6:])
                if latest_epoch < epoch:
                    latest_epoch = epoch
                    latest_fn = fn
                    break
    assert latest_fn is not None
    return latest_fn, latest_epoch


def span_to_span_index(i, j, n_words):
    return i * (n_words - 1) + j - np.arange(i).sum()

Training the model

In [23]:
class Trainer(object):
    def __init__(self,
                 argv,
                 loader,
                 preprocessor,
                 evaluator,
                 model_api):
        self.argv = argv
        self.loader = loader
        self.preprocessor = preprocessor
        self.evaluator = evaluator
        self.model_api = model_api

        self.f1_history = {}
        self.best_valid_f1 = 0.0
        self.best_epoch = -1

    def train(self):
        write('\nTRAINING START\n')

        argv = self.argv
        loader = self.loader
        pproc = self.preprocessor

        make_output_dir(self.argv)

        
        if argv.word_emb:
            write('Loading Word Embeddings...')
            word_list, word_emb = load_emb(argv.word_emb)
            vocab_word = pproc.make_vocab_word(word_list)
            write('\t- # Vocabs: %d' % vocab_word.size())
        else:
            vocab_word = word_emb = None


        
        write('Loading Corpus...')
        train_corpus = loader.load(path=argv.train_data,
                                   data_size=argv.data_size,
                                   is_test=False)
        valid_corpus = loader.load(path=argv.dev_data,
                                   data_size=argv.data_size,
                                   is_test=False)
        write('\t- # Sents: Train:%d  Valid:%d' % (len(train_corpus), len(valid_corpus)))

        
        train_sents = pproc.make_sents(train_corpus)
        valid_sents = pproc.make_sents(valid_corpus)

        
        write('Making Labels...')
        vocab_label_train = pproc.make_and_save_vocab_label(sents=train_sents,
                                                            vocab_label_init=None,
                                                            save=argv.save,
                                                            load=True)
        vocab_label_valid = pproc.make_and_save_vocab_label(sents=valid_sents,
                                                            vocab_label_init=vocab_label_train,
                                                            save=False,
                                                            load=False)
        write('\t- # Labels: %d' % vocab_label_train.size())

        
        train_sents = pproc.set_sent_config(sents=train_sents,
                                            elmo_emb=train_elmo_emb,
                                            vocab_word=vocab_word,
                                            vocab_label=vocab_label_train)
        valid_sents = pproc.set_sent_config(sents=valid_sents,
                                            elmo_emb=valid_elmo_emb,
                                            vocab_word=vocab_word,
                                            vocab_label=vocab_label_valid)

        
        write('Making Samples...')
        train_samples = pproc.make_samples(sents=train_sents,
                                           is_valid_data=False)
        valid_samples = pproc.make_samples(sents=valid_sents,
                                           is_valid_data=True)
        write('\t- # Samples: Train:%d  Valid:%d' % (len(train_samples),
                                                     len(valid_samples)))

        
        if train_elmo_emb is not None:
            use_elmo = True
        else:
            use_elmo = False

        if argv.n_experts > 0:
            is_ensemble = True
        else:
            is_ensemble = False

        if argv.method == 'span':
            self.model_api.n_true_spans = count_true_spans(train_sents)

        if is_ensemble:
            self.model_api.set_ensemble_model(word_emb=word_emb,
                                              use_elmo=use_elmo,
                                              vocab_word=vocab_word,
                                              vocab_label=vocab_label_train,
                                              vocab_label_valid=vocab_label_valid)
            self.model_api.load_experts_params(argv.load_param_dir)
            self.model_api.set_init_ensemble_param()
            self.model_api.set_ensemble_train_func()
            if self.model_api.vocab_label_valid:
                self.model_api.set_ensemble_pred_func()
            init_epoch = 0
        else:
            self.model_api.set_model(word_emb=word_emb,
                                     use_elmo=use_elmo,
                                     vocab_word=vocab_word,
                                     vocab_label=vocab_label_train,
                                     vocab_label_valid=vocab_label_valid)
            if argv.load_param_latest:
                if argv.output_dir:
                    dir_name = argv.output_dir
                else:
                    dir_name = 'output'
                param_fns = get_file_names_in_dir(dir_path=dir_name,
                                                  prefix='param')
                opt_param_fns = get_file_names_in_dir(dir_path=dir_name,
                                                      prefix='opt')
                param_fn, latest_epoch = get_latest_param_fn(file_names=param_fns)
                opt_param_fn, _ = get_latest_param_fn(file_names=opt_param_fns)
                self.model_api.argv.load_param = param_fn
                self.model_api.argv.load_opt_param = opt_param_fn
                self.model_api.load_params(param_fn)
                init_epoch = latest_epoch + 1
            elif argv.load_param:
                self.model_api.load_params(argv.load_param)
                init_epoch = 0
            else:
                init_epoch = 0

            self.model_api.set_train_func()
            if self.model_api.vocab_label_valid:
                self.model_api.set_pred_func()

        
        self._run_epochs(train_samples, valid_samples, init_epoch)

    def _run_epochs(self, train_samples, valid_samples=None, init_epoch=0):
        write('\nTRAIN START')

        argv = self.argv
        pproc = self.preprocessor
        vocab_label_valid = self.model_api.vocab_label_valid

        if valid_samples:
            valid_batches = pproc.make_batches(samples=valid_samples,
                                               is_valid_data=True)
            valid_batch_x, valid_batch_y = pproc.split_x_and_y(valid_batches)
        else:
            valid_batch_x = valid_batch_y = []

        
        if (argv.load_param or argv.load_param_dir) and valid_samples:
            write('\nEpoch: 0 (Using the Pre-trained Params)')
            write('VALID')
            valid_batch_y_pred = self.model_api.predict(valid_batch_x)
            self.best_valid_f1 = self.evaluator.f_score(y_true=valid_batch_y,
                                                        y_pred=valid_batch_y_pred,
                                                        vocab_label=vocab_label_valid)

        
        for epoch in range(init_epoch, argv.epoch):
            write('\nEpoch: %d' % (epoch + 1))
            write('TRAIN')

            if argv.halve_lr and epoch > 49 and (epoch % 25) == 0:
                lr = self.model_api.optimizer.lr.get_value(borrow=True)
                self.model_api.optimizer.lr.set_value(lr * 0.5)
                write('### HALVE LEARNING RATE: %f -> %f' % (lr, lr * 0.5))

            
            train_batches = pproc.make_batches(train_samples)
            self.model_api.train(train_batches)

            
            if valid_samples:
                write('VALID')
                valid_batch_y_pred = self.model_api.predict(valid_batch_x)
                valid_f1 = self.evaluator.f_score(y_true=valid_batch_y,
                                                  y_pred=valid_batch_y_pred,
                                                  vocab_label=vocab_label_valid)
                if self.best_valid_f1 < valid_f1:
                    self.best_valid_f1 = valid_f1
                    self.best_epoch = epoch
                    self.f1_history[self.best_epoch + 1] = [self.best_valid_f1]

                    if argv.save:
                        self.model_api.save_params(epoch=0)
                        self.model_api.optimizer.save_params(epoch=0)

            show_score_history(self.f1_history)

In [32]:
def parse_args(argument):
    parser = argparse.ArgumentParser(description='SPAN SELECTION MODEL')

    parser.add_argument('--mode', default='train', help='train/test')
    
    # Input Datasets 
    parser.add_argument('--train_data', help='path to train data')
    parser.add_argument('--dev_data', help='path to dev data')
    parser.add_argument('--test_data', help='path to test data')
    parser.add_argument('--data_type', default='conll12', help='conll05/conll12')
    parser.add_argument('--data_size', type=int, default=100000000, help='data size to be used')

    
    # Output Options 
    parser.add_argument('--save', action='store_true', default=False, help='parameters to be saved or not')
    parser.add_argument('--output_dir', type=str, default='output', help='output directory name')
    parser.add_argument('--output_fn', type=str, default=None, help='output file name')

    # Search 
    parser.add_argument('--search', type=str, default='greedy', help='argmax/greedy')

    
    # NN Architecture 
    parser.add_argument('--emb_dim', type=int, default=50, help='dimension of embeddings')
    parser.add_argument('--hidden_dim', type=int, default=32, help='dimension of hidden layer')
    parser.add_argument('--n_layers', type=int, default=1, help='number of layers')
    parser.add_argument('--n_experts', type=int, default=0, help='number of ensemble models')

    # Training 
    parser.add_argument('--epoch', type=int, default=30, help='number of epochs to train')
    parser.add_argument('--batch_size', type=int, default=32, help='mini-batch size')
    parser.add_argument('--word_emb', default=None, help='Initial embeddings to be loaded')

    # Optimization 
    parser.add_argument('--lr', type=float, default=0.001, help='learning rate')
    parser.add_argument('--halve_lr', action='store_true', default=False, help='halve learning rate')
    parser.add_argument('--opt_type', default='adam', help='sgd/adam')
    parser.add_argument('--grad_clip', action='store_true', default=False, help='gradient clipping')
    parser.add_argument('--reg', type=float, default=0.0001, help='L2 Reg rate')
    parser.add_argument('--drop_rate', type=float, default=0.0, help='Dropout Rate')

    
    # Loading Options 
    parser.add_argument('--load_param', default=None, help='path to params')
    parser.add_argument('--load_param_dir', default=None, help='path to param dir')
    parser.add_argument('--load_param_latest', action='store_true', default=False, help='load the latest params')
    parser.add_argument('--load_opt_param', default=None, help='path to params')
    parser.add_argument('--load_label', default=None, help='path to labels')

    return parser.parse_args(argument.split())

In [29]:
# Main part 
# training the model

argv = parse_args("--method span --mode train --train_data train_data.txt --dev_data dev_data.txt --data_type conll012 --drop_rate 0.1 --reg 0.0001 --hidden_dim 300 --n_layers 4 --halve_lr --word_emb senna.emb.txt --save --output_dir output")
np.random.seed(argv.seed)

loader = Conll12Loader(argv)

Trainer(argv=argv,
      loader=loader,
      preprocessor=SpanPreprocessor(argv),
      evaluator=SpanEvaluator(argv),
      model_api=SpanModelAPI(argv)
      ).train()


TRAINING START

Loading Word Embeddings...
	- # Vocabs: 130000
Loading Corpus...
	- # Sents: Train:749  Valid:819
Making Labels...
	- # Labels: 28
Making Samples...
	- # Samples: Train:1423  Valid:1942
Setting a model...
	- EmbWord
	- EmbMark
	- BiRNNs-4:(100x300)
	- Dense(600x28,None)
Model configuration
	- Input  Dim: 50
	- Hidden Dim: 300
	- Output Dim: -1
	- Parameters: 3326528
Building a training function...




Building a predicting function...

TRAIN START

Epoch: 1
TRAIN

	Time: 57.885332 seconds
	Average Negative Log Likelihood: 42.191060(3290.902646/78)
	F:  4.02%  P:  3.05% (  198/ 6489)  R:  5.89% (  198/ 3364)
VALID

	Time: 30.639748 seconds
	F: 10.15%  P: 22.57% (  269/ 1192)  R:  6.55% (  269/ 4109)
F1 HISTORY
	- EPOCH-1  	BEST VALID  10.15%

Epoch: 2
TRAIN

	Time: 58.745191 seconds
	Average Negative Log Likelihood: 13.068132(1019.314283/78)
	F: 22.87%  P: 35.41% (  568/ 1604)  R: 16.88% (  568/ 3364)
VALID

	Time: 30.854769 seconds
	F: 17.46%  P: 44.60% (  446/ 1000)  R: 10.85% (  446/ 4109)
F1 HISTORY
	- EPOCH-1  	BEST VALID  10.15%
	- EPOCH-2  	BEST VALID  17.46%

Epoch: 3
TRAIN

	Time: 57.140293 seconds
	Average Negative Log Likelihood: 11.064164(863.004819/78)
	F: 30.55%  P: 43.58% (  791/ 1815)  R: 23.51% (  791/ 3364)
VALID

	Time: 30.637875 seconds
	F: 28.68%  P: 41.87% (  896/ 2140)  R: 21.81% (  896/ 4109)
F1 HISTORY
	- EPOCH-1  	BEST VALID  10.15%
	- EPOCH-2  	BEST VALID  

In [30]:
def load_emb(path):
    word_list = []
    emb = []
    with open(path) as f:
        for line in f:
            line = line.rstrip().split()
            word_list.append(line[0])
            emb.append(line[1:])
    emb = np.asarray(emb, dtype=theano.config.floatX)

    if UNK not in word_list:
        word_list = [UNK] + word_list
        unk_vector = np.mean(emb, axis=0)
        emb = np.vstack((unk_vector, emb))

    return word_list, emb

def write(s, stream=sys.stdout):
    stream.write(s + '\n')
    stream.flush()

def make_vocab_from_ids(key_value_format):
    vocab = Vocab()
    for key, value in key_value_format:
        vocab.add_word(key)
    return vocab

In [31]:
class Tester(object):
    def __init__(self,
                 argv,
                 loader,
                 saver,
                 preprocessor,
                 evaluator,
                 model_api):
        self.argv = argv
        self.loader = loader
        self.saver = saver
        self.preprocessor = preprocessor
        self.evaluator = evaluator
        self.model_api = model_api

    def predict(self):
        argv = self.argv
        pproc = self.preprocessor
        loader = self.loader

        
        # Load dataset 
        write('Loading Dataset...')
        test_corpus = loader.load(path=argv.test_data,
                                  data_size=argv.data_size,
                                  is_test=True)
        test_sents = pproc.make_sents(test_corpus)

        
        # Load init emb 
        if argv.word_emb:
            write('Loading Embeddings...')
            word_list, word_emb = load_emb(argv.word_emb)
            vocab_word = pproc.make_vocab_word(word_list)
            write('\t- # Embedding Words: %d' % vocab_word.size())
        else:
            vocab_word = word_emb = None

        if argv.test_elmo_emb:
            write('Loading ELMo Embeddings...')
            test_elmo_emb = loader.load_hdf5(argv.test_elmo_emb)
        else:
            test_elmo_emb = None

        
        # Make labels 
        label_key_value = loader.load_key_value_format(argv.load_label)
        vocab_label = make_vocab_from_ids(label_key_value)
        write('\t- # Labels: %d' % vocab_label.size())

        
        # Set sent params 
        test_sents = pproc.set_sent_config(sents=test_sents,
                                           elmo_emb=test_elmo_emb,
                                           vocab_word=vocab_word,
                                           vocab_label=None)
        
        # Make samples 
        write('Making Test Samples...')
        test_batches = pproc.make_batch_per_sent(sents=test_sents)
        write('\t- # Test Samples: %d' % len(test_batches))

        
        # Model API 
        use_elmo = True if test_elmo_emb is not None else False

        if argv.n_experts > 0:
            self.model_api.set_ensemble_model(word_emb=word_emb,
                                              use_elmo=use_elmo,
                                              vocab_word=vocab_word,
                                              vocab_label=vocab_label,
                                              vocab_label_valid=None)
            self.model_api.load_params(argv.load_param)
            self.model_api.load_experts_params(argv.load_param_dir)
            self.model_api.set_ensemble_pred_func()
        else:
            self.model_api.set_model(word_emb=word_emb,
                                     use_elmo=use_elmo,
                                     vocab_word=vocab_word,
                                     vocab_label=vocab_label,
                                     vocab_label_valid=None)
            self.model_api.load_params(argv.load_param)
            self.model_api.set_pred_func()

        
        # Testing 
        write('\nPREDICTION START')
        test_y_pred = self.model_api.predict(test_batches)
        self.saver.save_props(corpus=test_sents,
                              labels=test_y_pred,
                              vocab_label=vocab_label)
        self.saver.save_json_format(corpus=test_sents,
                                    labels=test_y_pred,
                                    vocab_label=vocab_label)

In [27]:
class Saver(object):

    def __init__(self, argv):
        self.argv = argv

    def save_props(self, **kwargs):
        raise NotImplementedError

    def save_json_format(self, **kwargs):
        raise NotImplementedError

        
class SpanSaver(Saver):
    def save_props(self, corpus, labels, vocab_label):
        """
        :param corpus: 1D: n_sents, 2D: n_words; elem=line
        :param labels: 1D: n_sents, 2D: n_prds, 3D: n_spans, 4D: [r, i, j]
        """
        assert len(corpus) == len(labels), '%d %d' % (len(corpus), len(labels))

        fn = self.argv.output_dir
        if self.argv.output_fn:
            fn += '/results.%s.prop' % self.argv.output_fn
        else:
            fn += '/results.prop'
        f = open(fn, 'w')

        for sent, spans_sent in zip(corpus, labels):
            columns = [[mark] for mark in sent.marks]
            n_words = sent.n_words
            assert len(sent.prd_indices) == len(spans_sent)
            for prd_index, spans in zip(sent.prd_indices, spans_sent):
                prop = self._span_to_prop(spans=spans,
                                          prd_index=prd_index,
                                          n_words=n_words,
                                          vocab_label=vocab_label)
                for i, p in enumerate(prop):
                    columns[i].append(p)
            for c in columns:
                f.write("%s\n" % "\t".join(c))
            f.write("\n")
        f.close()

    def save_json_format(self, corpus, labels, vocab_label):
        """
        :param corpus: 1D: n_sents, 2D: n_words; elem=line
        :param labels: 1D: n_sents, 2D: n_prds, 3D: n_spans, 4D: [r, i, j]
        :param vocab_label: Vocab()
        """
        assert len(corpus) == len(labels), '%d %d' % (len(corpus), len(labels))

        fn = self.argv.output_dir
        if self.argv.output_fn:
            fn += '/results.%s.json' % self.argv.output_fn
        else:
            fn += '/results.json'
        f = open(fn, 'w')

        corpus_dic = {}
        for sent_index, (sent, spans_sent) in enumerate(zip(corpus, labels)):
            assert len(sent.prd_indices) == len(spans_sent)

            prop_dic = {}
            for prd_index, spans in zip(sent.prd_indices, spans_sent):
                arg_dic = {}
                for (r, i, j) in spans:
                    key = '(%s,%d,%d)' % (vocab_label.get_word(r), i, j)
                    value = " ".join(sent.strings[i: j + 1])
                    arg_dic[key] = value

                prd_dic = {'prd': sent.forms[prd_index],
                           'arg': arg_dic}
                prop_dic['prd-%d' % prd_index] = prd_dic

            sent_dic = {'text': " ".join(sent.strings),
                        'mark': " ".join(sent.marks),
                        'prop': prop_dic}
            corpus_dic['sent-%d' % sent_index] = sent_dic

        json.dump(corpus_dic, f, indent=4)
        f.close()

    @staticmethod
    def _span_to_prop(spans, prd_index, n_words, vocab_label):
        """
        :param spans: 1D: n_spans, 2D: [r, i, j]
        :return: 1D: n_words; elem=str; e.g. (A0* & *)
        """
        prop = ['*' for _ in range(n_words)]
        prop[prd_index] = '(V*)'
        for (label_id, pre_index, post_index) in spans:
            label = vocab_label.get_word(label_id)
            if pre_index == post_index:
                prop[pre_index] = '(%s*)' % label
            else:
                prop[pre_index] = '(%s*' % label
                prop[post_index] = '*)'
        return prop

In [35]:
# testing the model
argv = parse_args("--method span --mode test --test_data test_data_short.txt --data_type conll12 --drop_rate 0.1 --hidden_dim 300 --n_layers 4 --output_dir output --output_fn conll2005.test --word_emb senna.emb.txt --load_label output/label_ids.txt --load_param output/param.epoch-0.pkl.gz")
np.random.seed(argv.seed)

Tester(argv=argv,
                   loader=loader,
                   saver=SpanSaver(argv),
                   preprocessor=SpanPreprocessor(argv),
                   evaluator=SpanEvaluator(argv),
                   model_api=SpanModelAPI(argv)
                   ).predict()

Loading Dataset...
Loading Embeddings...
	- # Embedding Words: 130000
	- # Labels: 28
Making Test Samples...
	- # Test Samples: 430
Setting a model...
	- EmbWord
	- EmbMark
	- BiRNNs-4:(100x300)
	- Dense(600x28,None)
Model configuration
	- Input  Dim: 50
	- Hidden Dim: 300
	- Output Dim: -1
	- Parameters: 3326528
Building a predicting function...

PREDICTION START
100 200 300 400 
	Time: 35.937697 seconds
