# Named Entity Recoginition

1. Design the architecture of a neural network, train it, and test it.
2. Process features and represents them
3. Understand word padding
4. Implement LSTMs
5. Test with your own sentence

In [40]:
import os 
import numpy as np
import pandas as pd
import random as rnd
import trax 
import pickle
import gzip



from utils import get_params, get_vocab
from trax.supervised import training
from trax import layers as tl

In [4]:
# display original kaggle data
data = pd.read_csv("data/ner_dataset.csv", encoding = "ISO-8859-1") 
train_sents = open('data/small/train/sentences.txt', 'r').readline()
train_labels = open('data/small/train/labels.txt', 'r').readline()
print('SENTENCE:', train_sents)
print('SENTENCE LABEL:', train_labels)
print('ORIGINAL DATA:\n', data.head())
del(data, train_sents, train_labels)

SENTENCE: Thousands of demonstrators have marched through London to protest the war in Iraq and demand the withdrawal of British troops from that country .

SENTENCE LABEL: O O O O O O B-geo O O O O O B-geo O O O O O B-gpe O O O O O

ORIGINAL DATA:
     Sentence #           Word  POS Tag
0  Sentence: 1      Thousands  NNS   O
1          NaN             of   IN   O
2          NaN  demonstrators  NNS   O
3          NaN           have  VBP   O
4          NaN        marched  VBN   O


In [5]:
vocab, tag_map = get_vocab('data/large/words.txt', 'data/large/tags.txt')
t_sentences, t_labels, t_size = get_params(vocab, tag_map, 'data/large/train/sentences.txt', 'data/large/train/labels.txt')
v_sentences, v_labels, v_size = get_params(vocab, tag_map, 'data/large/val/sentences.txt', 'data/large/val/labels.txt')
test_sentences, test_labels, test_size = get_params(vocab, tag_map, 'data/large/test/sentences.txt', 'data/large/test/labels.txt')


In [8]:
# vocab translates from a word to a unique number
print('vocab["the"]:', vocab["an"])
# Pad token
print('padded token:', vocab['<PAD>'])

vocab["the"]: 134
padded token: 35180


In [9]:
print(tag_map)

{'O': 0, 'B-geo': 1, 'B-gpe': 2, 'B-per': 3, 'I-geo': 4, 'B-org': 5, 'I-org': 6, 'B-tim': 7, 'B-art': 8, 'I-art': 9, 'I-per': 10, 'I-gpe': 11, 'I-tim': 12, 'B-nat': 13, 'B-eve': 14, 'I-eve': 15, 'I-nat': 16}


In [10]:
def data_generator(batch_size, x, y, pad, shuffle=False, verbose=False):
    '''
      Input: 
        batch_size - integer describing the batch size
        x - list containing sentences where words are represented as integers
        y - list containing tags associated with the sentences
        shuffle - Shuffle the data order
        pad - an integer representing a pad character
        verbose - Print information during runtime
      Output:
        a tuple containing 2 elements:
        X - np.ndarray of dim (batch_size, max_len) of padded sentences
        Y - np.ndarray of dim (batch_size, max_len) of tags associated with the sentences in X
    '''
    
    # count the number of lines in data_lines
    num_lines = len(x)
    
    # create an array with the indexes of data_lines that can be shuffled
    lines_index = [*range(num_lines)]
    
    # shuffle the indexes if shuffle is set to True
    if shuffle:
        rnd.shuffle(lines_index)
    
    index = 0 # tracks current location in x, y
    while True:
        buffer_x = [0] * batch_size # Temporal array to store the raw x data for this batch
        buffer_y = [0] * batch_size # Temporal array to store the raw y data for this batch
                
        max_len = 0 
        for i in range(batch_size):
            
            if index >= num_lines:
                
                index = 0
                if shuffle:
                    rnd.shuffle(lines_index)
            
            
            buffer_x[i] = x[lines_index[index]]
            buffer_y[i] = y[lines_index[index]]
            
            lenx = len(x[lines_index[index]]) 
            if lenx > max_len:
                max_len = lenx  
            
            index += 1


        # create X,Y, NumPy arrays of size (batch_size, max_len) 'full' of pad value
        X = np.full((batch_size, max_len), pad)
        Y = np.full((batch_size, max_len), pad)

        for i in range(batch_size):
        
            x_i =buffer_x[i]
            y_i = buffer_y[i]
            
            # Walk through each word in x_i
            for j in range(len(x_i)):
                # store the word in x_i at position j into X
                X[i, j] = x_i[j]
                
                # store the label in y_i at position j into Y
                Y[i, j] = y_i[j]
                
        if verbose: print("index=", index)
        yield((X,Y))

In [11]:
batch_size = 5
mini_sentences = t_sentences[0: 8]
mini_labels = t_labels[0: 8]
dg = data_generator(batch_size, mini_sentences, mini_labels, vocab["<PAD>"], shuffle=False, verbose=True)
X1, Y1 = next(dg)
X2, Y2 = next(dg)
print(Y1.shape, X1.shape, Y2.shape, X2.shape)
print(X1[0][:], "\n", Y1[0][:])

index= 5
index= 2
(5, 30) (5, 30) (5, 30) (5, 30)
[    0     1     2     3     4     5     6     7     8     9    10    11
    12    13    14     9    15     1    16    17    18    19    20    21
 35180 35180 35180 35180 35180 35180] 
 [    0     0     0     0     0     0     1     0     0     0     0     0
     1     0     0     0     0     0     2     0     0     0     0     0
 35180 35180 35180 35180 35180 35180]


### Structure
1. tl.Serial: Combinator that applies layers serially (by function composition).

2. tl.Embedding: Initializes the embedding. In this case it is the dimension of the model by the size of the vocabulary.

3. tl.LSTM:Trax LSTM layer.

4. tl.Dense: A dense layer.

5. tl.LogSoftmax: Log of the output probabilities.

In [24]:
def NER(tags,vocab_size = 35181, model_d = 50):
    model = tl.Serial(
        tl.Embedding(vocab_size,model_d),
        tl.LSTM(model_d),
        tl.Dense(len(tags)),
        tl.LogSoftmax()

    )
    return model
    
        

In [25]:
model = NER(tag_map)
# display your model
print(model)

Serial[
  Embedding_35181_50
  LSTM_50
  Dense_17
  LogSoftmax
]


### Training

In [26]:
# Setting random seed for reproducibility and testing
rnd.seed(33)
batch_size = 50

# Create training data, mask pad id=35180 for training.
train_generator = trax.data.inputs.add_loss_weights(
    data_generator(batch_size, t_sentences, t_labels, vocab['<PAD>'], True),
    id_to_mask=vocab['<PAD>'])

# Create validation data, mask pad id=35180 for training.
eval_generator = trax.data.inputs.add_loss_weights(
    data_generator(batch_size, v_sentences, v_labels, vocab['<PAD>'], True),
    id_to_mask=vocab['<PAD>'])

In [45]:
def train_model(NER, train_generator, eval_generator, train_steps=1, output_dir='model'):
    '''
    Input: 
        NER - the model you are building
        train_generator - The data generator for training examples
        eval_generator - The data generator for validation examples,
        train_steps - number of training steps
        output_dir - folder to save your model
    Output:
        training_loop - a trax supervised training Loop
    '''
    train_task = training.TrainTask(
      train_generator, # A train data generator
      loss_layer = tl.CrossEntropyLoss(), # A cross-entropy loss function
      optimizer = trax.optimizers.Adam(0.01) # The adam optimizer
    ) 

    eval_task = training.EvalTask(
      labeled_data = eval_generator,  # A labeled data generator
      metrics = [tl.CrossEntropyLoss(), tl.Accuracy()], # Evaluate with cross-entropy loss and accuracy
      n_eval_batches = 10, # Number of batches to use on each evaluation
    )

    training_loop = training.Loop( 
        NER, # A model to train
        train_task, # A train task
        eval_tasks = [eval_task], # The evaluation task
        output_dir = output_dir # The output directory
    )

    # Train with train_steps
    training_loop.run(n_steps = train_steps)
   

    return training_loop

In [46]:
train_steps = 100            # In coursera we can only train 100 steps
!rm -f 'model/model.pkl.gz'  # Remove old model.pkl if it exists

# Train the model
training_loop = train_model(NER(tag_map), train_generator, eval_generator, train_steps)

  pid, fd = os.forkpty()
  with gzip.GzipFile(fileobj=f, compresslevel=compresslevel) as gzipf:



Step      1: Total number of trainable weights: 1780117
Step      1: Ran 1 train steps in 1.14 secs
Step      1: train CrossEntropyLoss |  4.05691528


  with gzip_lib.GzipFile(fileobj=f, compresslevel=2) as gzipf:


Step      1: eval  CrossEntropyLoss |  2.95789111
Step      1: eval          Accuracy |  0.01963737


  with gzip.GzipFile(fileobj=f, compresslevel=compresslevel) as gzipf:



Step    100: Ran 99 train steps in 19.96 secs
Step    100: train CrossEntropyLoss |  0.60303080


  with gzip_lib.GzipFile(fileobj=f, compresslevel=2) as gzipf:


Step    100: eval  CrossEntropyLoss |  0.27933300
Step    100: eval          Accuracy |  0.93120872


In [44]:
# loading in a pretrained model..
model = NER(tag_map)
model.init(trax.shapes.ShapeDtype((1, 1), dtype=np.int32))


((Array([[-0.15745609, -0.21091457, -0.20777333, ...,  0.04007557,
          -0.1087181 , -0.08918095],
         [ 0.00036214, -0.16220818, -0.03831241, ..., -0.09247386,
           0.08686876,  0.1546312 ],
         [ 0.14603308,  0.12723622,  0.13791455, ...,  0.03616654,
           0.21975254,  0.19938189],
         ...,
         [ 0.1579755 ,  0.22411539,  0.15102719, ...,  0.00439399,
           0.22644942,  0.06844332],
         [-0.16524605, -0.00113011, -0.12634876, ...,  0.09563256,
           0.04182005, -0.03099705],
         [ 0.21736857,  0.1870471 , -0.03000413, ...,  0.14903527,
           0.00824561, -0.15180111]], dtype=float32),
  (((), ((), ())),
   ((Array([[ 0.01381226, -0.0610578 , -0.03491003, ..., -0.0573127 ,
              0.0134791 ,  0.12583719],
            [ 0.01947761,  0.08709538, -0.1005499 , ..., -0.00982018,
              0.06879852, -0.00971121],
            [-0.03230394,  0.01808858,  0.09263638, ..., -0.12480059,
             -0.03904063,  0.0016865

### Testing

In [47]:
# create the evaluation inputs
x, y = next(data_generator(len(test_sentences), test_sentences, test_labels, vocab['<PAD>']))
print("input shapes", x.shape, y.shape)

input shapes (7194, 70) (7194, 70)


In [48]:
# sample prediction
tmp_pred = model(x)
print(type(tmp_pred))
print(f"tmp_pred has shape: {tmp_pred.shape}")

<class 'jaxlib.xla_extension.ArrayImpl'>
tmp_pred has shape: (7194, 70, 17)


In [49]:
def evaluate_prediction(pred, labels, pad):
    """
    Inputs:
        pred: prediction array with shape 
            (num examples, max sentence length in batch, num of classes)
        labels: array of size (batch_size, seq_len)
        pad: integer representing pad character
    Outputs:
        accuracy: float
    """
    outputs = np.argmax(pred,axis=2)
    print("outputs shape:", outputs.shape)


    mask = labels!=pad
    print("mask shape:", mask.shape, "mask[0][20:30]:", mask[0][20:30])

    accuracy = np.sum(outputs==labels)/float(np.sum(mask))

    return accuracy

In [50]:
accuracy = evaluate_prediction(model(x), y, vocab['<PAD>'])
print("accuracy: ", accuracy)

outputs shape: (7194, 70)
mask shape: (7194, 70) mask[0][20:30]: [ True  True  True False False False False False False False]
accuracy:  0.0003584367


In [51]:
def predict(sentence, model, vocab, tag_map):
    s = [vocab[token] if token in vocab else vocab['UNK'] for token in sentence.split(' ')]
    batch_data = np.ones((1, len(s)))
    batch_data[0][:] = s
    sentence = np.array(batch_data).astype(int)
    output = model(sentence)
    outputs = np.argmax(output, axis=2)
    labels = list(tag_map.keys())
    pred = []
    for i in range(len(outputs[0])):
        idx = outputs[0][i] 
        pred_label = labels[idx]
        pred.append(pred_label)
    return pred

In [54]:

sentence = "Naruto is a Japanese manga series written and illustrated by Masashi Kishimoto. It tells the story of Naruto Uzumaki, a young ninja who seeks recognition from his peers and dreams of becoming the Hokage, the leader of his village. The story is told in two parts: the first is set in Naruto's pre-teen years (volumes 1–27), and the second in his teens (volumes 28–72). The series is based on two one-shot manga by Kishimoto: Karakuri (1995), which earned Kishimoto an honorable mention in Shueisha's monthly Hop Step Award the following year, and Naruto"
s = [vocab[token] if token in vocab else vocab['UNK'] for token in sentence.split(' ')]
predictions = predict(sentence, model, vocab, tag_map)
for x,y in zip(sentence.split(' '), predictions):
    if y != 'O':
        print(x,y)

Naruto I-org
is B-nat
a B-nat
Japanese B-nat
manga B-nat
series B-art
written B-art
and B-art
illustrated B-art
by B-art
Masashi B-art
Kishimoto. B-art
It B-art
tells B-art
the B-art
story B-art
of B-art
Naruto B-art
Uzumaki, B-art
a B-art
young B-art
ninja B-art
who B-nat
seeks B-art
recognition B-art
from B-art
his B-art
peers B-art
and B-art
dreams B-art
of B-art
becoming B-art
the B-art
Hokage, B-art
the B-art
leader B-art
of B-art
his B-art
village. B-art
The B-art
story B-art
is B-art
told B-art
in B-art
two B-art
parts: B-art
the B-art
first B-art
is B-art
set B-art
in B-art
Naruto's B-art
pre-teen B-art
years B-art
(volumes B-art
1–27), B-art
and B-art
the B-art
second B-art
in B-art
his B-art
teens B-art
(volumes B-art
28–72). B-art
The B-art
series B-art
is B-art
based B-art
on B-art
two B-art
one-shot B-art
manga B-art
by B-art
Kishimoto: B-art
Karakuri B-art
(1995), B-art
which B-art
earned B-art
Kishimoto B-art
an B-art
honorable B-art
mention B-art
in B-art
Shueisha's B-a