# Creating a Multi Ouput Network

Author: Pierre Nugues


In the lab on named entity recognition, we used the words to predict the parts of speech and the named entities. The network will have one input and two outputs.


## The modules

In [1]:
from keras.models import Model
from keras import layers
from keras import Input
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical

Using TensorFlow backend.


# Reading the Corpus

In [2]:
vilde = False
if vilde:
    BASE_DIR = '/home/pierre/Cours/EDAN20/corpus/CoNLL2003/'
else:
    BASE_DIR = '/Users/pierre/Projets/Corpora/CoNLL2003/'


def load_conll2003_en():
    train_file = BASE_DIR + 'NER-data/eng.train'
    dev_file = BASE_DIR + 'NER-data/eng.valid'
    test_file = BASE_DIR + 'NER-data/eng.test'
    column_names = ['form', 'ppos', 'pchunk', 'ner']
    train_sentences = open(train_file).read().strip()
    dev_sentences = open(dev_file).read().strip()
    test_sentences = open(test_file).read().strip()
    return train_sentences, dev_sentences, test_sentences, column_names

### The dictorizer that transforms the CoNLL files into dictionaries

In [3]:
import regex as re

class Token(dict):
    pass

class CoNLLDictorizer:

    def __init__(self, column_names, sent_sep='\n\n', col_sep=' +'):
        self.column_names = column_names
        self.sent_sep = sent_sep
        self.col_sep = col_sep

    def fit(self):
        pass

    def transform(self, corpus):
        corpus = corpus.strip()
        sentences = re.split(self.sent_sep, corpus)
        return list(map(self._split_in_words, sentences))

    def fit_transform(self, corpus):
        return self.transform(corpus)

    def _split_in_words(self, sentence):
        rows = re.split('\n', sentence)
        return [Token(dict(zip(self.column_names,
                               re.split(self.col_sep, row))))
                for row in rows]

In [4]:
train_sentences, dev_sentences, test_sentences, column_names = load_conll2003_en()

conll_dict = CoNLLDictorizer(column_names, col_sep=' +')
train_dict = conll_dict.transform(train_sentences)
print(train_dict[0])
print(train_dict[1])

[{'form': '-DOCSTART-', 'ppos': '-X-', 'pchunk': 'O', 'ner': 'O'}]
[{'form': 'EU', 'ppos': 'NNP', 'pchunk': 'I-NP', 'ner': 'I-ORG'}, {'form': 'rejects', 'ppos': 'VBZ', 'pchunk': 'I-VP', 'ner': 'O'}, {'form': 'German', 'ppos': 'JJ', 'pchunk': 'I-NP', 'ner': 'I-MISC'}, {'form': 'call', 'ppos': 'NN', 'pchunk': 'I-NP', 'ner': 'O'}, {'form': 'to', 'ppos': 'TO', 'pchunk': 'I-VP', 'ner': 'O'}, {'form': 'boycott', 'ppos': 'VB', 'pchunk': 'I-VP', 'ner': 'O'}, {'form': 'British', 'ppos': 'JJ', 'pchunk': 'I-NP', 'ner': 'I-MISC'}, {'form': 'lamb', 'ppos': 'NN', 'pchunk': 'I-NP', 'ner': 'O'}, {'form': '.', 'ppos': '.', 'pchunk': 'O', 'ner': 'O'}]


## Building the sequences

### The function to build the sequences

In [5]:
def build_sequences(corpus_dict, key_x='form', key_y='pos', tolower=True):
    """
    Creates sequences from a list of dictionaries
    :param corpus_dict:
    :param key_x:
    :param key_y:
    :return:
    """
    X = []
    Y = []
    for sentence in corpus_dict:
        x = [word[key_x] for word in sentence]
        y = [word[key_y] for word in sentence]
        if tolower:
            x = list(map(str.lower, x))
        X += [x]
        Y += [y]
    return X, Y

### We build the words and NER tags (the first output)

In [6]:
X_words, Y_ner = build_sequences(train_dict, key_x='form', key_y='ner')
print('First sentence, words', X_words[1])
print('First sentence, NER', Y_ner[1])

First sentence, words ['eu', 'rejects', 'german', 'call', 'to', 'boycott', 'british', 'lamb', '.']
First sentence, NER ['I-ORG', 'O', 'I-MISC', 'O', 'O', 'O', 'I-MISC', 'O', 'O']


### We build the words and POS tags (the second output)

In [7]:
Y_pos, Y_ner = build_sequences(train_dict, key_x='ppos', key_y='ner')
print('First sentence, words', Y_pos[1])
print('First sentence, POS', Y_ner[1])

First sentence, words ['nnp', 'vbz', 'jj', 'nn', 'to', 'vb', 'jj', 'nn', '.']
First sentence, POS ['I-ORG', 'O', 'I-MISC', 'O', 'O', 'O', 'I-MISC', 'O', 'O']


In [8]:
print('input:', X_words[1])
print('output 1:', Y_pos[1])
print('output 2:', Y_ner[1])

input: ['eu', 'rejects', 'german', 'call', 'to', 'boycott', 'british', 'lamb', '.']
output 1: ['nnp', 'vbz', 'jj', 'nn', 'to', 'vb', 'jj', 'nn', '.']
output 2: ['I-ORG', 'O', 'I-MISC', 'O', 'O', 'O', 'I-MISC', 'O', 'O']


### We now extract the list of unique words, POS, and NER

In [9]:
word_set = sorted(list(set([item for sublist in X_words for item in sublist])))
pos_set = sorted(list(set([item for sublist in Y_pos for item in sublist])))
ner_set = sorted(list(set([item for sublist in Y_ner for item in sublist])))
print(len(word_set))
print(len(pos_set))
print(pos_set)
print(len(ner_set))
ner_set

21010
46
['"', '$', "''", '(', ')', ',', '-x-', '.', ':', 'cc', 'cd', 'dt', 'ex', 'fw', 'in', 'jj', 'jjr', 'jjs', 'ls', 'md', 'nn', 'nnp', 'nnps', 'nns', 'nn|sym', 'pdt', 'pos', 'prp', 'prp$', 'rb', 'rbr', 'rbs', 'rp', 'sym', 'to', 'uh', 'vb', 'vbd', 'vbg', 'vbn', 'vbp', 'vbz', 'wdt', 'wp', 'wp$', 'wrb']
8


['B-LOC', 'B-MISC', 'B-ORG', 'I-LOC', 'I-MISC', 'I-ORG', 'I-PER', 'O']

## Building the indices

In [10]:
idx_word = dict(enumerate(word_set, start=2))
idx_pos = dict(enumerate(pos_set, start=2))
idx_ner = dict(enumerate(ner_set, start=2))
word_idx = {v: k for k, v in idx_word.items()}
pos_idx = {v: k for k, v in idx_pos.items()}
ner_idx = {v: k for k, v in idx_ner.items()}

## Converting the matrices
We convert the matrices into numbers

Before: We have the symbols

In [11]:
print(X_words[1])
print(Y_pos[1])
Y_ner[1]

['eu', 'rejects', 'german', 'call', 'to', 'boycott', 'british', 'lamb', '.']
['nnp', 'vbz', 'jj', 'nn', 'to', 'vb', 'jj', 'nn', '.']


['I-ORG', 'O', 'I-MISC', 'O', 'O', 'O', 'I-MISC', 'O', 'O']

In [12]:
X_words_idx = [list(map(lambda x: word_idx.get(x, 1), x)) for x in X_words]
Y_pos_idx = [list(map(lambda x: pos_idx.get(x, 1), x)) for x in Y_pos]
Y_ner_idx = [list(map(lambda x: ner_idx.get(x, 1), x)) for x in Y_ner]

After: We have the indices

In [13]:
print(X_words_idx[1])
print(Y_pos_idx[1])
Y_ner_idx[1]

[8780, 16385, 9880, 5996, 19360, 5682, 5783, 12212, 125]
[23, 43, 17, 22, 36, 38, 17, 22, 9]


[7, 9, 6, 9, 9, 9, 6, 9, 9]

### We pad the sequences

In [14]:
X_words_idx = pad_sequences(X_words_idx)
Y_pos_idx = pad_sequences(Y_pos_idx)
Y_ner_idx = pad_sequences(Y_ner_idx)

In [15]:
print(X_words_idx[1])
print(Y_pos_idx[1])
Y_ner_idx[1]

[    0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0  8780 16385  9880  5996
 19360  5682  5783 12212   125]
[ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0 

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7, 9, 6, 9, 9, 9,
       6, 9, 9], dtype=int32)

### We create one encodings for the outputs

In [16]:
Y_pos_idx_cat = to_categorical(Y_pos_idx)
Y_ner_idx_cat = to_categorical(Y_ner_idx)

## The graph

### The word input

In [17]:
text_vocabulary_size = len(word_set) + 2
text_input = Input(shape=(None,), dtype='int32', name='text')
embedded_text = layers.Embedding(text_vocabulary_size, 
                                 64, mask_zero=True)(text_input)
encoded_text = layers.LSTM(32, 
                           return_sequences=True)(embedded_text)

### The POS output

In [18]:
pos_vocabulary_size = len(pos_set) + 2
pos_output = layers.Dense(pos_vocabulary_size,
                      activation='softmax',
                  name='pos')(encoded_text)

### The NER output

In [19]:
ner_vocabulary_size = len(ner_set) + 2
ner_output = layers.Dense(ner_vocabulary_size,
                          activation='softmax',
                          name='ner')(encoded_text)

### The Model

In [20]:
model = Model(text_input, [pos_output, ner_output])
model.compile(optimizer='rmsprop',
              loss=['categorical_crossentropy', 
                    'categorical_crossentropy'],
              metrics=['acc'])

### We fit the model

In [21]:
model.fit(X_words_idx, 
          {'pos':Y_pos_idx_cat, 'ner':Y_ner_idx_cat},
          epochs=5, batch_size=128)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.callbacks.History at 0x7fcc4a2b3e90>