In [1]:
import codecs

import numpy as np
import tensorflow as tf


In [2]:
#Constants
START_TAG = '<BOS>'
STOP_TAG = '<EOS>'

### Load data and preprocess

In [3]:
def load_sentences(path):
    """
    Load sentences. A line must contain at least a word and its tag.
    Sentences are separated by empty lines.
    """
    sentences = []
    sentence = []
    for line in codecs.open(path, 'r', 'utf8'):
        line = line.strip()
        if not line:
            if len(sentence) > 0:
                if 'DOCSTART' not in sentence[0][0]:
                    sentences.append(sentence)
                sentence = []
        else:
            word = line.split()
            assert len(word) >= 2
            sentence.append(word)
    if len(sentence) > 0:
        if 'DOCSTART' not in sentence[0][0]:
            sentences.append(sentence)
    return sentences

In [4]:
train_sentences = load_sentences("data/eng.test.txt")
test_sentences = load_sentences("data/eng.test.txt")

### Create Mappings for Words, Characters and Tags

In [5]:
def create_dico(item_list):
    """
    Create a dictionary of items from a list of list of items.
    """
    assert type(item_list) is list
    dico = {}
    for items in item_list:
        for item in items:
            if item not in dico:
                dico[item] = 1
            else:
                dico[item] += 1
    return dico

def create_mapping(dico):
    """
    Create a mapping (item to ID / ID to item) from a dictionary.
    Items are ordered by decreasing frequency.
    """
    sorted_items = sorted(dico.items(), key=lambda x: (-x[1], x[0]))
    id_to_item = {i: v[0] for i, v in enumerate(sorted_items)}
    item_to_id = {v: k for k, v in id_to_item.items()}
    return item_to_id, id_to_item

def word_mapping(sentences, lower=False):
    """
    Create a dictionary and a mapping of words, sorted by frequency.
    """
    words = [[x[0].lower() if lower else x[0] for x in s] for s in sentences]
    dico = create_dico(words)
    dico['<UNK>'] = 10000000 #UNK tag for unknown words
    word_to_id, id_to_word = create_mapping(dico)
    print("Found %i unique words (%i in total)" % (
        len(dico), sum(len(x) for x in words)
    ))
    return dico, word_to_id, id_to_word

def char_mapping(sentences):
    """
    Create a dictionary and mapping of characters, sorted by frequency.
    """
    chars = ["".join([w[0] for w in s]) for s in sentences]
    dico = create_dico(chars)
    char_to_id, id_to_char = create_mapping(dico)
    print("Found %i unique characters" % len(dico))
    return dico, char_to_id, id_to_char

def tag_mapping(sentences):
    """
    Create a dictionary and a mapping of tags, sorted by frequency.
    """
    tags = [[word[-1] for word in s] for s in sentences]
    dico = create_dico(tags)
    dico[START_TAG] = -1
    dico[STOP_TAG] = -2
    tag_to_id, id_to_tag = create_mapping(dico)
    print("Found %i unique named entity tags" % len(dico))
    return dico, tag_to_id, id_to_tag

In [6]:
dico_words, word_to_id, id_to_word = word_mapping(train_sentences)
dico_chars, char_to_id, id_to_char = char_mapping(train_sentences)
dico_tags, tag_to_id, id_to_tag = tag_mapping(train_sentences)

Found 780 unique words (1855 in total)
Found 71 unique characters
Found 7 unique named entity tags


### Preparing final dataset

In [7]:
def lower_case(x,lower=False):
    if lower:
        return x.lower()  
    else:
        return x

In [8]:
def prepare_dataset(sentences, word_to_id, char_to_id, tag_to_id, lower=False):
    """
    Prepare the dataset. Return a list of lists of dictionaries containing:
        - word indexes
        - word char indexes
        - tag indexes
    """
    data = []
    for s in sentences:
        str_words = [w[0] for w in s]
        words = [word_to_id[lower_case(w,lower) if lower_case(w,lower) in word_to_id else '<UNK>']
                 for w in str_words]
        # Skip characters that are not in the training set
        chars = [[char_to_id[c] for c in w if c in char_to_id]
                 for w in str_words]
        tags = [tag_to_id[w[-1]] for w in s]
        data.append({
            'str_words': str_words,
            'words': words,
            'chars': chars,
            'tags': tags,
        })
    return data

train_data = prepare_dataset(
    train_sentences, word_to_id, char_to_id, tag_to_id
)
test_data = prepare_dataset(
    test_sentences, word_to_id, char_to_id, tag_to_id
)

In [9]:
train_data

[{'chars': [[38, 29, 44, 38, 57, 26, 27],
   [17],
   [41, 26, 44, 38, 26, 25, 27, 26, 29, 25, 47, 44, 29, 26],
   [27, 23, 57, 26],
   [40, 62, 26, 29],
   [23, 27],
   [27, 40, 51],
   [23, 55, 27, 26, 29],
   [44, 31, 31, 44, 31, 53, 25],
   [62, 44, 38, 27, 40, 29, 63],
   [20]],
  'str_words': ['CRICKET',
   '-',
   'LEICESTERSHIRE',
   'TAKE',
   'OVER',
   'AT',
   'TOP',
   'AFTER',
   'INNINGS',
   'VICTORY',
   '.'],
  'tags': [0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0],
  'words': [88, 15, 432, 512, 462, 338, 515, 154, 420, 538, 2]},
 {'chars': [[41, 40, 31, 45, 40, 31],
   [18, 32, 32, 48, 17, 37, 42, 17, 28, 37]],
  'str_words': ['LONDON', '1996-08-30'],
  'tags': [3, 0],
  'words': [67, 30]},
 {'chars': [[61, 0, 7, 2],
   [44, 3, 9, 4, 1, 3],
   [1, 10, 10, 17, 6, 5, 11, 3, 9, 0, 6],
   [51, 8, 4, 10],
   [25, 4, 13, 13, 5, 3, 7],
   [2, 5, 5, 36],
   [16, 5, 11, 6],
   [16, 5, 6],
   [28, 42],
   [5, 3],
   [55, 6, 4, 9, 1, 14],
   [1, 7],
   [41, 0, 4, 12, 0, 7, 2, 0, 6, 7, 8, 4,

In [10]:
train_chars = []
train_words = []
train_tags = []

for data in train_data:
    train_chars.append(data['chars'])
    train_words.append(data['words'])
    train_tags.append(data['tags'])

In [11]:
def _pad_sequences(sequences, pad_tok, max_length):
    """
    Args:
        sequences: a generator of list or tuple
        pad_tok: the char to pad with
    Returns:
        a list of list where each sublist has same length
    """
    sequence_padded, sequence_length = [], []

    for seq in sequences:
        seq = list(seq)
        seq_ = seq[:max_length] + [pad_tok]*max(max_length - len(seq), 0)
        sequence_padded +=  [seq_]
        sequence_length += [min(len(seq), max_length)]

    return sequence_padded, sequence_length


def pad_sequences(sequences, pad_tok, nlevels=1):
    """
    Args:
        sequences: a generator of list or tuple
        pad_tok: the char to pad with
        nlevels: "depth" of padding, for the case where we have characters ids
    Returns:
        a list of list where each sublist has same length
    """
    if nlevels == 1:
        max_length = max(map(lambda x : len(x), sequences))
        sequence_padded, sequence_length = _pad_sequences(sequences,
                                            pad_tok, max_length)

    elif nlevels == 2:
        max_length_word = max([max(map(lambda x: len(x), seq))
                               for seq in sequences])
        sequence_padded, sequence_length = [], []
        for seq in sequences:
            # all words are same length now
            sp, sl = _pad_sequences(seq, pad_tok, max_length_word)
            sequence_padded += [sp]
            sequence_length += [sl]

        max_length_sentence = max(map(lambda x : len(x), sequences))
        sequence_padded, _ = _pad_sequences(sequence_padded,
                [pad_tok]*max_length_word, max_length_sentence)
        sequence_length, _ = _pad_sequences(sequence_length, 0,
                max_length_sentence)

    return sequence_padded, sequence_length

In [12]:
train_char_ids_padded, train_word_lengths_padded = pad_sequences(train_chars, 0, nlevels = 2)

In [13]:
train_word_ids_padded, train_sequence_lengths_padded = pad_sequences(train_words, 0, nlevels = 1)

In [14]:
train_tag_ids_padded, train_tag_lengths_padded = pad_sequences(train_tags, 0, nlevels = 1)

In [15]:
feed = {
    'word_ids': np.array(train_word_ids_padded),
    "sequence_lengths": np.array(train_sequence_lengths_padded),
    "char_ids": np.array(train_char_ids_padded),
    "word_lengths": np.array(train_word_lengths_padded),
    "labels": np.array(train_tag_ids_padded)
}

In [16]:
print(feed['word_ids'].shape)
print(feed['sequence_lengths'].shape)
print(feed['char_ids'].shape)
print(feed['word_lengths'].shape)
print(feed['labels'].shape)

(128, 41)
(128,)
(128, 41, 17)
(128, 41)
(128, 41)


### Model

#### Word Representation

In [17]:
tf.reset_default_graph()

nwords = len(word_to_id)
word_length = 10
dim_word = 300

nchars = len(char_to_id)
dim_char = 100
char_hidden_size = 100

hidden_size = 300

ntags = len(tag_to_id)

In [18]:
with tf.variable_scope("words"):
    # shape = (batch size, max length of sentence in batch)
    word_ids = tf.placeholder(tf.int32, shape=[None, None], name="word_ids")

    # shape = (batch size)
    sequence_lengths = tf.placeholder(tf.int32, shape=[None], name="sequence_lengths")

    # word_embeddings = tf.Variable(embeddings, dtype=tf.float32, trainable=False)
    word_embedding = tf.get_variable(name="word_embeddings", dtype=tf.float32,
                            shape=[nwords, dim_word])
    # shape = (batch, sentence, word_vector_size)
    word_rep = tf.nn.embedding_lookup(word_embedding, word_ids)

In [19]:
with tf.variable_scope("chars"):
    # shape = (batch size, max length of sentence, max length of word)
    char_ids = tf.placeholder(tf.int32, shape=[None, None, None], name="char_ids")

    # shape = (batch_size, max_length of sentence)
    _word_lengths = tf.placeholder(tf.int32, shape=[None, None], name="word_length")


    # 1. get character embeddings
    char_embedding = tf.get_variable(name="char_embeddings", dtype=tf.float32,
        shape=[nchars, dim_char])
    # shape = (batch, sentence, word, dim of char embeddings)
    char_embeddings = tf.nn.embedding_lookup(char_embedding, char_ids)

    # 2. put the time dimension on axis=1 for dynamic_rnn
    s = tf.shape(char_embeddings) # store old shape
    # shape = (batch x sentence, word, dim of char embeddings)
    char_embeddings = tf.reshape(char_embeddings, shape=[s[0]*s[1], s[-2], dim_char])
    word_lengths = tf.reshape(_word_lengths, shape=[s[0]*s[1]], name="debug2")

    # 3. bi lstm on chars
    cell_fw = tf.contrib.rnn.LSTMCell(char_hidden_size, state_is_tuple=True)
    cell_bw = tf.contrib.rnn.LSTMCell(char_hidden_size, state_is_tuple=True)

    _, ((_, output_fw), (_, output_bw)) = tf.nn.bidirectional_dynamic_rnn(cell_fw,
        cell_bw, char_embeddings, sequence_length=word_lengths,
        dtype=tf.float32)
    # shape = (batch x sentence, 2 x char_hidden_size)
    output = tf.concat([output_fw, output_bw], axis=-1)

    # shape = (batch, sentence, 2 x char_hidden_size)
    char_rep = tf.reshape(output, shape=[-1, s[1], 2*char_hidden_size])

In [20]:
word_embeddings = tf.concat([word_rep, char_rep], axis=-1)

In [22]:
print(word_rep.shape)
print(char_rep.shape)
print(word_embeddings.shape)

(?, ?, 300)
(?, ?, 200)
(?, ?, 500)


#### Contextual Word Reprsentation

In [23]:
with tf.variable_scope("bi-lstm"):
    cell_fw = tf.contrib.rnn.LSTMCell(hidden_size)
    cell_bw = tf.contrib.rnn.LSTMCell(hidden_size)

    (output_fw, output_bw), _ = tf.nn.bidirectional_dynamic_rnn(cell_fw,
        cell_bw, word_embeddings, sequence_length=sequence_lengths,
        dtype=tf.float32)

    context_rep = tf.concat([output_fw, output_bw], axis=-1)

#### Decoding

In [24]:
with tf.variable_scope("proj"):
    W = tf.get_variable("W", shape=[2*hidden_size, ntags],
                    dtype=tf.float32)

    b = tf.get_variable("b", shape=[ntags], dtype=tf.float32,
                    initializer=tf.zeros_initializer())

    ntime_steps = tf.shape(context_rep)[1]
    context_rep_flat = tf.reshape(context_rep, [-1, 2*hidden_size])
    pred = tf.matmul(context_rep_flat, W) + b
    scores = tf.reshape(pred, [-1, ntime_steps, ntags])

#### Training

In [25]:
# shape = (batch, sentence)
labels = tf.placeholder(tf.int32, shape=[None, None], name="labels")

log_likelihood, transition_params = tf.contrib.crf.crf_log_likelihood(
scores, labels, sequence_lengths)

loss = tf.reduce_mean(-log_likelihood)

In [26]:
losses = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=scores, labels=labels)
# shape = (batch, sentence, nclasses)
mask = tf.sequence_mask(sequence_lengths)
# apply mask
losses = tf.boolean_mask(losses, mask)

loss = tf.reduce_mean(losses)

In [27]:
optimizer = tf.train.AdamOptimizer(0.001)
train_op = optimizer.minimize(loss)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


In [28]:
n_epochs = 10

init = tf.global_variables_initializer()
with tf.Session() as sess:
    init.run()
    for epoch in range(n_epochs):
        sess.run(train_op, feed_dict={word_ids: feed['word_ids'], sequence_lengths: feed['sequence_lengths'], 
                                          char_ids: feed['char_ids'], _word_lengths: feed['word_lengths'],
                                        labels: feed['labels']})
        loss_train = loss.eval(feed_dict={word_ids: feed['word_ids'], sequence_lengths: feed['sequence_lengths'], 
                                          char_ids: feed['char_ids'], _word_lengths: feed['word_lengths'],
                                        labels: feed['labels']})
        print(epoch, "Train loss:", loss_train)

0 Train loss: 1.8488938
1 Train loss: 1.7202827
2 Train loss: 1.4938074
3 Train loss: 1.1238835
4 Train loss: 1.0737103
5 Train loss: 1.1224397
6 Train loss: 1.0353462
7 Train loss: 0.91076684
8 Train loss: 0.8362394
9 Train loss: 0.8340448
