# Recognize named entities on Twitter with LSTMs

For example, we want to extract persons' and organizations' names from the text. Then for the input text:

    Ian Goodfellow works for Google Brain

a NER model needs to provide the following sequence of tags:

    B-PER I-PER    O     O   B-ORG  I-ORG

Here the *B-* and *I-* prefixes mark the beginning and the inside of an entity, while *O* stands for "outside", i.e. no entity tag. Markup with this prefix scheme is called *BIO markup*. It makes it possible to distinguish consecutive entities of the same type.


In [1]:
from collections import defaultdict
import numpy as np
import tensorflow as tf
import numpy as np
from evaluation import precision_recall_f1




In [2]:
def read_data(file_path):
    """Read a token/tag file and group its lines into tweets.

    Every non-empty line must contain exactly two whitespace-separated
    fields: a token and its tag. Blank lines separate tweets. Tokens that
    start with "http://" or "https://" are replaced by "<URL>" and tokens
    that start with "@" are replaced by "<USR>".

    Args:
        file_path: path to a UTF-8 encoded text file.

    Returns:
        A pair ``(tokens, tags)`` of parallel lists; element ``i`` of each
        list holds the tokens / tags of the i-th tweet.

    Raises:
        ValueError: if a non-empty line does not split into exactly two fields.
    """
    tokens = []
    tags = []

    tweet_tokens = []
    tweet_tags = []
    # The context manager closes the file even when a malformed line raises.
    with open(file_path, encoding='utf-8') as data_file:
        for line in data_file:
            line = line.strip()
            if not line:
                # A blank line ends the current tweet; consecutive blank
                # lines must not produce empty tweets.
                if tweet_tokens:
                    tokens.append(tweet_tokens)
                    tags.append(tweet_tags)
                tweet_tokens = []
                tweet_tags = []
                continue

            token, tag = line.split()

            if token.startswith(("http://", "https://")):
                token = "<URL>"
            elif token.startswith("@"):
                token = "<USR>"

            tweet_tokens.append(token)
            tweet_tags.append(tag)

    # Flush the last tweet when the file does not end with a blank line;
    # otherwise it would be silently dropped.
    if tweet_tokens:
        tokens.append(tweet_tokens)
        tags.append(tweet_tags)

    return tokens, tags

And now we can load three separate parts of the dataset:
 - *train* data for training the model;
 - *validation* data for evaluation and hyperparameters tuning;
 - *test* data for final evaluation of the model.

In [3]:
# Load the three dataset splits; each call returns parallel per-tweet lists
# of tokens and tags.
train_tokens, train_tags = read_data('data/train.txt')
validation_tokens, validation_tags = read_data('data/validation.txt')
test_tokens, test_tags = read_data('data/test.txt')

In [4]:
# Number of tweets in the train, validation and test splits.
print(len(train_tokens), len(validation_tokens), len(test_tokens))

5795 724 724


You should always understand what kind of data you are dealing with. For this purpose, you can print the data by running the following cell:

In [5]:
# Print the first two training tweets, one token/tag pair per line, with an
# empty line after each tweet.
for i in range(2):
    for token, tag in zip(train_tokens[i], train_tags[i]):
        print('%s\t\t%s' % (token, tag))
    print()

RT		O
<USR>		O
:		O
Online		O
ticket		O
sales		O
for		O
Ghostland		B-musicartist
Observatory		I-musicartist
extended		O
until		O
6		O
PM		O
EST		O
due		O
to		O
high		O
demand		O
.		O
Get		O
them		O
before		O
they		O
sell		O
out		O
...		O

Apple		B-product
MacBook		I-product
Pro		I-product
A1278		I-product
13.3		I-product
"		I-product
Laptop		I-product
-		I-product
MD101LL/A		I-product
(		O
June		O
,		O
2012		O
)		O
-		O
Full		O
read		O
by		O
eBay		B-company
<URL>		O
<URL>		O



In [6]:
def build_dict(tokens_or_tags, special_tokens):
    """Build item-to-index and index-to-item mappings.

    Special tokens receive the first indices, in the order given. The
    remaining items are numbered in order of first appearance in
    ``tokens_or_tags``.

    Args:
        tokens_or_tags: a list of lists of tokens or tags.
        special_tokens: items that must get the lowest indices.

    Returns:
        A pair ``(tok2idx, idx2tok)``. ``tok2idx`` is a ``defaultdict`` that
        returns 0 for keys it has never seen, i.e. the index of the first
        special token when ``special_tokens`` is non-empty (note that such
        a lookup also stores the key). ``idx2tok`` is a plain dict with the
        inverse mapping.
    """
    tok2idx = defaultdict(int)  # unknown keys map to index 0
    idx2tok = {}

    for next_idx, special in enumerate(special_tokens):
        tok2idx[special] = next_idx
        idx2tok[next_idx] = special

    next_idx = len(special_tokens)
    for sequence in tokens_or_tags:
        for item in sequence:
            # The membership test does not trigger the default factory, so
            # unseen items are detected without being inserted by accident.
            if item not in tok2idx:
                tok2idx[item] = next_idx
                idx2tok[next_idx] = item
                next_idx += 1

    return tok2idx, idx2tok

After implementing the function *build_dict* you can make dictionaries for tokens and tags. Special tokens in our case will be:
 - `<UNK>` token for out of vocabulary tokens;
 - `<PAD>` token for padding sentence to the same length when we create batches of sentences.

In [7]:
# '<UNK>' is listed first so that it receives index 0, which is also the
# value the mapping returned by build_dict yields for keys it has never seen.
special_tokens = ['<UNK>', '<PAD>']
# 'O' is listed first so that it receives tag index 0.
special_tags = ['O']

# Create dictionaries from the train and validation splits (the test split
# is not used here).
token2idx, idx2token = build_dict(train_tokens + validation_tokens, special_tokens)
tag2idx, idx2tag = build_dict(train_tags + validation_tags, special_tags)

In [8]:
# Display the index-to-tag mapping.
idx2tag

{0: 'O',
 1: 'B-musicartist',
 2: 'I-musicartist',
 3: 'B-product',
 4: 'I-product',
 5: 'B-company',
 6: 'B-person',
 7: 'B-other',
 8: 'I-other',
 9: 'B-facility',
 10: 'I-facility',
 11: 'B-sportsteam',
 12: 'B-geo-loc',
 13: 'I-geo-loc',
 14: 'I-company',
 15: 'I-person',
 16: 'B-movie',
 17: 'I-movie',
 18: 'B-tvshow',
 19: 'I-tvshow',
 20: 'I-sportsteam'}

In [9]:
def words2idxs(tokens_list):
    """Return the list of indices that ``token2idx`` assigns to the tokens."""
    # Item access (not .get) is used so the mapping's own handling of
    # unknown tokens applies.
    return list(map(token2idx.__getitem__, tokens_list))

def tags2idxs(tags_list):
    """Return the list of indices that ``tag2idx`` assigns to the tags."""
    return list(map(tag2idx.__getitem__, tags_list))

def idxs2words(idxs):
    """Return the list of tokens that ``idx2token`` stores for the indices."""
    return list(map(idx2token.__getitem__, idxs))

def idxs2tags(idxs):
    """Return the list of tags that ``idx2tag`` stores for the indices."""
    return list(map(idx2tag.__getitem__, idxs))

In [10]:
def batches_generator(batch_size, tokens, tags,
                      shuffle=True, allow_smaller_last_batch=True):
    """Yield padded mini-batches of token indices and tag indices.

    Args:
        batch_size: number of samples per batch.
        tokens: list of token sequences.
        tags: list of tag sequences, parallel to ``tokens``.
        shuffle: visit the samples in a random order (drawn once from
            ``np.random.permutation``) instead of their original order.
        allow_smaller_last_batch: also yield the final, incomplete batch
            when the number of samples is not a multiple of ``batch_size``.

    Yields:
        Tuples ``(x, y, lengths)`` where ``x`` and ``y`` are int32 arrays of
        shape [current_batch_size, longest sequence in the batch], padded
        with ``token2idx['<PAD>']`` and ``tag2idx['O']`` respectively, and
        ``lengths`` is an int32 array with the unpadded length of each row.
    """
    n_samples = len(tokens)
    order = np.random.permutation(n_samples) if shuffle else np.arange(n_samples)

    full_batches, leftover = divmod(n_samples, batch_size)
    n_batches = full_batches + (1 if allow_smaller_last_batch and leftover else 0)

    for batch_no in range(n_batches):
        start = batch_no * batch_size
        stop = min(start + batch_size, n_samples)
        current_batch_size = stop - start

        # Convert the selected samples to index sequences and track the
        # longest one to size the padded arrays.
        x_rows, y_rows = [], []
        longest = 0
        for sample in order[start:stop]:
            x_rows.append(words2idxs(tokens[sample]))
            y_rows.append(tags2idxs(tags[sample]))
            longest = max(longest, len(tags[sample]))

        # Start from arrays that contain only padding indices.
        shape = [current_batch_size, longest]
        x = np.full(shape, token2idx['<PAD>'], dtype=np.int32)
        y = np.full(shape, tag2idx['O'], dtype=np.int32)
        lengths = np.zeros(current_batch_size, dtype=np.int32)

        # Copy each sequence into the left part of its row.
        for row, (x_row, y_row) in enumerate(zip(x_rows, y_rows)):
            row_len = len(x_row)
            x[row, :row_len] = x_row
            lengths[row] = row_len
            y[row, :row_len] = y_row

        yield x, y, lengths

The model uses the following placeholders:
 - *input_batch* — sequences of words (shape [batch_size, sequence_len]);
 - *ground_truth_tags* — sequences of tags (shape [batch_size, sequence_len]);
 - *lengths* — lengths of the unpadded sequences (shape [batch_size]);
 - *dropout_ph* — dropout keep probability; this placeholder has a default value of 1;
 - *learning_rate_ph* — learning rate; we need this placeholder because we want to change the value during training.


In [11]:
class BiLSTMModel:
    """Bidirectional LSTM sequence tagger built as a TensorFlow 1.x graph.

    Creating an instance adds the following to the current default graph:
    input placeholders, an embedding matrix, a bidirectional LSTM with
    dropout, a dense projection to per-token tag logits, a cross-entropy
    loss masked at padding positions, and an Adam train op with per-gradient
    norm clipping.
    """

    def __init__(self, vocabulary_size, n_tags, embedding_dim, n_hidden_rnn, PAD_index):
        """Build the complete graph.

        Args:
            vocabulary_size: number of rows of the embedding matrix.
            n_tags: number of output classes per token.
            embedding_dim: size of each token embedding.
            n_hidden_rnn: number of units of each LSTM direction.
            PAD_index: token index treated as padding when masking the loss.
        """
        self._declare_placeholders()
        self._build_layers(vocabulary_size, embedding_dim, n_hidden_rnn, n_tags)
        self._compute_predictions()
        self._compute_loss(n_tags, PAD_index)
        self._perform_optimization()

    def _declare_placeholders(self):
        """Create the placeholders through which data is fed to the graph."""
        # Token indices and ground-truth tag indices; both dimensions are
        # left unspecified.
        self.input_batch = tf.placeholder(
            dtype=tf.int32, shape=[None, None], name='input_batch')
        self.ground_truth_tags = tf.placeholder(
            dtype=tf.int32, shape=[None, None], name="gt_tags")

        # One length per sequence in the batch.
        self.lengths = tf.placeholder(dtype=tf.int32, shape=[None], name='lengths')

        # Dropout keep probability; evaluates to 1.0 when nothing is fed.
        self.dropout_ph = tf.placeholder_with_default(
            tf.cast(1.0, tf.float32), shape=[])

        # Scalar learning rate, fed on every training step.
        self.learning_rate_ph = tf.placeholder(dtype=tf.float32, shape=[])

    def _build_layers(self, vocabulary_size, embedding_dim, n_hidden_rnn, n_tags):
        """Create the embedding, bi-LSTM and dense layers; sets ``self.logits``."""
        # Embedding matrix initialised with Gaussian noise scaled by
        # 1 / sqrt(embedding_dim).
        embedding_init = np.random.randn(vocabulary_size, embedding_dim) / np.sqrt(embedding_dim)
        embedding_matrix = tf.Variable(embedding_init, dtype=tf.float32)

        def dropout_lstm_cell():
            # The same keep probability is applied to inputs, outputs and state.
            return tf.nn.rnn_cell.DropoutWrapper(
                tf.nn.rnn_cell.BasicLSTMCell(n_hidden_rnn),
                output_keep_prob=self.dropout_ph,
                state_keep_prob=self.dropout_ph,
                input_keep_prob=self.dropout_ph)

        forward_cell = dropout_lstm_cell()
        backward_cell = dropout_lstm_cell()

        # Shape: [batch_size, sequence_len, embedding_dim].
        embeddings = tf.nn.embedding_lookup(embedding_matrix, self.input_batch)

        # Run both directions and join their outputs along the feature axis.
        # Shape after concat: [batch_size, sequence_len, 2 * n_hidden_rnn].
        (rnn_output_fw, rnn_output_bw), _ = tf.nn.bidirectional_dynamic_rnn(
            cell_fw=forward_cell,
            cell_bw=backward_cell,
            inputs=embeddings,
            sequence_length=self.lengths,
            dtype=tf.float32)
        rnn_output = tf.concat([rnn_output_fw, rnn_output_bw], axis=2)

        # Linear projection to tag scores.
        # Shape: [batch_size, sequence_len, n_tags].
        self.logits = tf.layers.dense(rnn_output, n_tags, activation=None)

    def _compute_predictions(self):
        """Set ``self.predictions`` to the most probable tag index per token."""
        probabilities = tf.nn.softmax(self.logits)
        # The last axis enumerates the tags.
        self.predictions = tf.argmax(probabilities, axis=-1)

    def _compute_loss(self, n_tags, PAD_index):
        """Set ``self.loss`` to the masked cross-entropy between logits and tags."""
        one_hot_targets = tf.one_hot(self.ground_truth_tags, n_tags)
        per_token_loss = tf.nn.softmax_cross_entropy_with_logits_v2(
            labels=one_hot_targets, logits=self.logits)

        # 1.0 where the input token is not padding, 0.0 where it is.
        mask = tf.cast(tf.not_equal(self.input_batch, PAD_index), tf.float32)

        # The mean runs over every position of the batch; padded positions
        # contribute zero to the sum but still count in the denominator.
        masked_loss = tf.multiply(mask, per_token_loss)
        self.loss = tf.reduce_mean(masked_loss)

    def _perform_optimization(self):
        """Create the Adam optimizer and ``self.train_op`` with clipped gradients."""
        self.optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate_ph)
        raw_grads_and_vars = self.optimizer.compute_gradients(self.loss)

        # Each gradient is clipped on its own to an L2 norm of at most 1.0.
        clip_norm = tf.cast(1.0, tf.float32)
        self.grads_and_vars = [
            (tf.clip_by_norm(gradient, clip_norm), variable)
            for gradient, variable in raw_grads_and_vars
        ]

        self.train_op = self.optimizer.apply_gradients(self.grads_and_vars)

    def train_on_batch(self, session, x_batch, y_batch, lengths, learning_rate, dropout_keep_probability):
        """Run one optimisation step on a batch.

        Args:
            session: session in which the train op is run.
            x_batch: token indices fed to ``input_batch``.
            y_batch: tag indices fed to ``ground_truth_tags``.
            lengths: sequence lengths fed to ``lengths``.
            learning_rate: value fed to the learning-rate placeholder.
            dropout_keep_probability: value fed to the dropout placeholder.
        """
        feed_dict = {
            self.input_batch: x_batch,
            self.ground_truth_tags: y_batch,
            self.lengths: lengths,
            self.learning_rate_ph: learning_rate,
            self.dropout_ph: dropout_keep_probability,
        }
        session.run(self.train_op, feed_dict=feed_dict)

    def predict_for_batch(self, session, x_batch, lengths):
        """Return what ``session.run`` yields for ``self.predictions``.

        Dropout is disabled by feeding a keep probability of 1.0.
        """
        feed_dict = {
            self.input_batch: x_batch,
            self.lengths: lengths,
            self.dropout_ph: 1.0,
        }
        return session.run(self.predictions, feed_dict=feed_dict)

In [12]:
def predict_tags(model, session, token_idxs_batch, lengths):
    """Predict tags for a batch and decode all indices to strings.

    Args:
        model: object providing ``predict_for_batch(session, x, lengths)``.
        session: session passed through to the model.
        token_idxs_batch: batch of token index sequences.
        lengths: sequence lengths passed through to the model.

    Returns:
        A pair ``(tags_batch, tokens_batch)``: for every sequence, the
        predicted tags looked up in ``idx2tag`` and the input tokens looked
        up in ``idx2token``.
    """
    predicted_idxs_batch = model.predict_for_batch(session, token_idxs_batch, lengths)

    tags_batch = []
    tokens_batch = []
    for predicted_row, token_row in zip(predicted_idxs_batch, token_idxs_batch):
        # Pair predictions with tokens position by position.
        aligned = list(zip(predicted_row, token_row))
        tags_batch.append([idx2tag[tag_idx] for tag_idx, _ in aligned])
        tokens_batch.append([idx2token[token_idx] for _, token_idx in aligned])
    return tags_batch, tokens_batch
    
    
def eval_conll(model, session, tokens, tags, short_report=True):
    """Compute NER quality measures using the CoNLL shared task script.

    The data is processed one sequence at a time. The ground-truth and
    predicted tags of all sequences are concatenated, with an extra 'O'
    appended after every sequence, and passed to ``precision_recall_f1``.

    Args:
        model: model passed to ``predict_tags``.
        session: session passed to ``predict_tags``.
        tokens: list of token sequences.
        tags: list of ground-truth tag sequences, parallel to ``tokens``.
        short_report: forwarded to ``precision_recall_f1``.

    Returns:
        Whatever ``precision_recall_f1`` returns.

    Raises:
        ValueError: if the number of predicted tags differs from the number
            of input tokens.
    """
    y_true, y_pred = [], []
    # Evaluate in the original sample order: shuffling here would only draw
    # a permutation from NumPy's global RNG. NOTE(review): the metrics are
    # expected to be independent of sample order because every sequence is
    # terminated with 'O' below; confirm against the evaluation script.
    for x_batch, y_batch, lengths in batches_generator(1, tokens, tags, shuffle=False):

        tags_batch, tokens_batch = predict_tags(model, session, x_batch, lengths)
        if len(x_batch[0]) != len(tags_batch[0]):
            raise ValueError("Incorrect length of prediction for the input, "
                             "expected length: %i, got: %i" % (len(x_batch[0]), len(tags_batch[0])))

        predicted_tags = []
        ground_truth_tags = []
        for gt_tag_idx, pred_tag, token in zip(y_batch[0], tags_batch[0], tokens_batch[0]):
            # Positions that hold the padding token are not scored.
            if token != '<PAD>':
                ground_truth_tags.append(idx2tag[gt_tag_idx])
                predicted_tags.append(pred_tag)

        # We extend every prediction and ground truth sequence with 'O' tag
        # to indicate a possible end of entity.
        y_true.extend(ground_truth_tags + ['O'])
        y_pred.extend(predicted_tags + ['O'])

    results = precision_recall_f1(y_true, y_pred, print_results=True, short_report=short_report)
    return results

## Run your experiment

Create *BiLSTMModel* model with the following parameters:
 - *vocabulary_size* — number of tokens;
 - *n_tags* — number of tags;
 - *embedding_dim* — dimension of embeddings, recommended value: 200;
 - *n_hidden_rnn* — size of hidden layers for RNN, recommended value: 200;
 - *PAD_index* — an index of the padding token (`<PAD>`).

Set hyperparameters. You might want to start with the following recommended values:
- *batch_size*: 32;
- 4 epochs;
- starting value of *learning_rate*: 0.005
- *learning_rate_decay*: a square root of 2;
- *dropout_keep_probability*: try several values: 0.1, 0.5, 0.9.


In [13]:
# Reset the global default graph before building the model.
tf.reset_default_graph()

model = BiLSTMModel(vocabulary_size=len(token2idx), n_tags=len(tag2idx), embedding_dim=200, n_hidden_rnn=200, PAD_index=token2idx['<PAD>'])  


# Training hyperparameters.

batch_size = 32 
n_epochs = 10 
learning_rate = 0.005 
# The learning rate is divided by this factor after every epoch.
learning_rate_decay = np.sqrt(2) 
# Keep probability fed to the dropout placeholder during training.
dropout_keep_probability = 0.7

Instructions for updating:
This class is equivalent as tf.keras.layers.LSTMCell, and will be replaced by that in Tensorflow 2.0.
Instructions for updating:
Please use `keras.layers.Bidirectional(keras.layers.RNN(cell))`, which is equivalent to this API
Instructions for updating:
Please use `keras.layers.RNN(cell)`, which is equivalent to this API
Instructions for updating:
Please use `layer.add_weight` method instead.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
Instructions for updating:
Use keras.layers.Dense instead.
Instructions for updating:
Please use `layer.__call__` method instead.


Finally, we are ready to run the training!

In [14]:
# Create the session and initialise all variables of the graph.
sess = tf.Session()
sess.run(tf.global_variables_initializer())

print('Start training... \n')
for epoch in range(n_epochs):
    # For each epoch evaluate the model on train and validation data.
    # Evaluation runs before the training pass, so the numbers printed for
    # epoch k describe the model after k - 1 passes over the training data.
    print('-' * 20 + ' Epoch {} '.format(epoch+1) + 'of {} '.format(n_epochs) + '-' * 20)
    print('Train data evaluation:')
    eval_conll(model, sess, train_tokens, train_tags, short_report=True)
    print('Validation data evaluation:')
    eval_conll(model, sess, validation_tokens, validation_tags, short_report=True)
    
    # Train the model: one pass over the training batches.
    for x_batch, y_batch, lengths in batches_generator(batch_size, train_tokens, train_tags):
        model.train_on_batch(sess, x_batch, y_batch, lengths, learning_rate, dropout_keep_probability)
        
    # Decaying the learning rate
    learning_rate = learning_rate / learning_rate_decay
    
print('...training finished.')

Start training... 

-------------------- Epoch 1 of 10 --------------------
Train data evaluation:
processed 105778 tokens with 4489 phrases; found: 75579 phrases; correct: 191.

precision:  0.25%; recall:  4.25%; F1:  0.48

Validation data evaluation:
processed 12836 tokens with 537 phrases; found: 9257 phrases; correct: 28.

precision:  0.30%; recall:  5.21%; F1:  0.57

-------------------- Epoch 2 of 10 --------------------
Train data evaluation:
processed 105778 tokens with 4489 phrases; found: 2687 phrases; correct: 477.

precision:  17.75%; recall:  10.63%; F1:  13.29

Validation data evaluation:
processed 12836 tokens with 537 phrases; found: 203 phrases; correct: 55.

precision:  27.09%; recall:  10.24%; F1:  14.86

-------------------- Epoch 3 of 10 --------------------
Train data evaluation:
processed 105778 tokens with 4489 phrases; found: 4418 phrases; correct: 2385.

precision:  53.98%; recall:  53.13%; F1:  53.55

Validation data evaluation:
processed 12836 tokens with 53


## Full evaluation report

In [15]:
# Evaluate the trained model on each split with short_report=False and keep
# the returned results.
print('-' * 20 + ' Train set quality: ' + '-' * 20)
train_results = eval_conll(model, sess, train_tokens, train_tags, short_report=False)

print('-' * 20 + ' Validation set quality: ' + '-' * 20)
validation_results = eval_conll(model, sess, validation_tokens, validation_tags, short_report=False)

print('-' * 20 + ' Test set quality: ' + '-' * 20)
test_results = eval_conll(model, sess, test_tokens, test_tags, short_report=False)

-------------------- Train set quality: --------------------
processed 105778 tokens with 4489 phrases; found: 4523 phrases; correct: 4430.

precision:  97.94%; recall:  98.69%; F1:  98.31

	     company: precision:   97.97%; recall:   97.51%; F1:   97.74; predicted:   640

	    facility: precision:   95.96%; recall:   98.41%; F1:   97.17; predicted:   322

	     geo-loc: precision:   98.90%; recall:   99.50%; F1:   99.20; predicted:  1002

	       movie: precision:   89.19%; recall:   97.06%; F1:   92.96; predicted:    74

	 musicartist: precision:   98.71%; recall:   99.14%; F1:   98.92; predicted:   233

	       other: precision:   97.02%; recall:   98.81%; F1:   97.91; predicted:   771

	      person: precision:   99.44%; recall:   99.77%; F1:   99.61; predicted:   889

	     product: precision:   99.37%; recall:   99.06%; F1:   99.21; predicted:   317

	  sportsteam: precision:   97.70%; recall:   97.70%; F1:   97.70; predicted:   217

	      tvshow: precision:   82.76%; recall:  

In [16]:
# Generator of shuffled training batches with five sequences each.
batch_gen = batches_generator(5,train_tokens,train_tags, shuffle=True)

In [17]:
# Take one batch: padded token indices, padded tag indices and lengths.
tokens,tags,lens = next(batch_gen)

In [18]:
# Predict tags for the batch; `tokens` is rebound from the index array to
# the decoded token strings returned by predict_tags.
predicted_tags, tokens = predict_tags(model,sess,tokens,lens)

In [19]:
# For every sequence of the batch print the tokens, the ground-truth tags
# and the predicted tags; padded positions are included in all three lines.
for i,t in enumerate(tokens):
    print("\nSentence:")
    print(" ".join(t))
    print("\nOriginal:")
    print(" ".join(idxs2tags(tags[i])))
    print("\nPredicted:")
    print(" ".join(predicted_tags[i]))


Sentence:
RT <USR> : The only school I would willing go to tomorrow <URL> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>

Original:
O O O O O O O O O O O O O O O O O O O O O O O O O O

Predicted:
O O O O O O O O O O O O O O O O O O O O O O O O O O

Sentence:
&lt; 3 <USR> always brightens up my day . you should follow her and listen to her wonderful music . &lt; 3 <PAD> <PAD> <PAD> <PAD>

Original:
O O O O O O O O O O O O O O O O O O O O O O O O O O

Predicted:
O O O O O O O O O O O O O O O O O O O O O O O O O O

Sentence:
Friday got me like #FridayFeeling <URL> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>

Original:
O O O O O O O O O O O O O O O O O O O O O O O O O O

Predicted:
O O O O O O O O O O O O O O O O O O O O O O O O O O

Sentence:
. <USR> says ALL customers along George Dieter/Rex Baxter now have water . A water main broke Sat . cutting off services to 524 . <USR>

Origi