In [4]:
#!pip install trax==1.3.1

In [5]:
import trax
from trax import layers as tl
import os
import numpy as np
import pandas as pd
import random as rnd

from utils import get_vocab,get_params

# Seed trax's random number generators (seed 33) to make this notebook easier to replicate.
trax.supervised.trainer_lib.init_random_number_generators(33)
# Show the working directory: the relative data/ and model/ paths used in later cells resolve against it.
print(os.getcwd())

/home/siddarthathentu/Desktop/NLP/LSTM-NER


# Part 1: Exploring the data

We will be using a dataset from Kaggle. The original data consists of four columns, the sentence number, the word, the part of speech of the word, and the tags. A few tags you might expect to see are:

    geo: geographical entity
    org: organization
    per: person
    gpe: geopolitical entity
    tim: time indicator
    art: artifact
    eve: event
    nat: natural phenomenon
    O: filler word


In [6]:
# Display the original Kaggle data next to one preprocessed sentence/label pair.
data = pd.read_csv("ner_dataset.csv",encoding="ISO-8859-1")
# Only the first line of each file is needed; `with` guarantees the file handles
# are closed again (the bare open(...).readline() form left them open).
with open('data/small/train/sentences.txt',"r") as sentences_file:
    train_sents = sentences_file.readline()
with open('data/small/train/labels.txt',"r") as labels_file:
    train_labels = labels_file.readline()
print("SENTENCE : ",train_sents)
print("SENTENCE LABEL :",train_labels)
print("ORIGINAL DATA:\n ",data.head(5))
# These objects are only needed for this preview; drop them from the notebook namespace.
del(data,train_sents,train_labels)
del sentences_file, labels_file

SENTENCE :  Thousands of demonstrators have marched through London to protest the war in Iraq and demand the withdrawal of British troops from that country .

SENTENCE LABEL : O O O O O O B-geo O O O O O B-geo O O O O O B-gpe O O O O O

ORIGINAL DATA:
      Sentence #           Word  POS Tag
0  Sentence: 1      Thousands  NNS   O
1          NaN             of   IN   O
2          NaN  demonstrators  NNS   O
3          NaN           have  VBP   O
4          NaN        marched  VBN   O


# 1.1 Importing the data
In this part, we will import the preprocessed data and explore it

In [7]:
# Build the lookup tables from the vocabulary and tag files (word -> id, tag -> id).
vocab,tag_map = get_vocab("data/large/words.txt","data/large/tags.txt")
# Load the train / validation / test splits. Each call returns (sentences, labels, size);
# the examples printed further down show sentences and labels as lists of integer ids.
# NOTE(review): get_vocab / get_params live in utils.py, which is not visible here — confirm their exact contract there.
t_sentences,t_labels,t_size = get_params(vocab,tag_map,'data/large/train/sentences.txt', 'data/large/train/labels.txt')
v_sentences,v_labels,v_size = get_params(vocab,tag_map,'data/large/val/sentences.txt', 'data/large/val/labels.txt')
test_sentences,test_labels,test_size = get_params(vocab,tag_map,'data/large/test/sentences.txt', 'data/large/test/labels.txt')

vocab is a dictionary that translates a word string to a unique number. Given a sentence, you can represent it as an array of numbers translating with this dictionary. The dictionary contains a <PAD> token.

When training an LSTM using batches, all the input sentences in a batch must be the same size. To accomplish this, we set the length of our sentences to a certain number and add the generic <PAD> token to fill all the empty spaces.


In [8]:
# vocab translates from a word to a unique number
print("vocab[the] : ",vocab["the"])
# id of the <PAD> token that is used to fill sentences up to a common length
print("padded token :  ",vocab["<PAD>"])

vocab[the] :  9
padded token :   35180


The tag_map maps each possible tag a word can have to a class index. Run the cell below to see the possible classes we will be predicting. The prefixes in the tags mean:

    I: Token is inside an entity.
    B: Token begins an entity.



In [9]:
# Show the tag -> class-index mapping that the model will predict over.
print(tag_map)

{'O': 0, 'B-geo': 1, 'B-gpe': 2, 'B-per': 3, 'I-geo': 4, 'B-org': 5, 'I-org': 6, 'B-tim': 7, 'B-art': 8, 'I-art': 9, 'I-per': 10, 'I-gpe': 11, 'I-tim': 12, 'B-nat': 13, 'B-eve': 14, 'I-eve': 15, 'I-nat': 16}


So the coding scheme that tags the entities is a minimal one where B- indicates the first token in a multi-token entity, and I- indicates one in the middle of a multi-token entity. If we had the sentence

"Sharon flew to Miami on Friday"

the outputs would look like:

Sharon B-per
flew   O
to     O
Miami  B-geo
on     O
Friday B-tim

our tags would reflect three tokens beginning with B-, since there are no multi-token entities in the sequence. But if we added Sharon's last name to the sentence:

"Sharon Floyd flew to Miami on Friday"

Sharon B-per
Floyd  I-per
flew   O
to     O
Miami  B-geo
on     O
Friday B-tim

then our tags would change to show first "Sharon" as B-per, and "Floyd" as I-per, where I- indicates an inner token in a multi-token sequence.


In [10]:
# Explore basic information about the data
print("Then number of outputs in tag_map :  ",len(tag_map))
# Number of vocabulary tokens including PAD; vocab_size is reused when sizing the embedding layer
vocab_size = len(vocab)
print("Number of vocabulary words ; ",vocab_size)
print("The training size is ",t_size)
print("The validation size is ",v_size)
# Show one encoded training sentence together with its encoded labels
print("An example of first sentence is :\n")
print(t_sentences[0])
print("\nAn example of its corresponding label is ",t_labels[0])

Then number of outputs in tag_map :   17
Number of vocabulary words ;  35181
The training size is  33570
The validation size is  7194
An example of first sentence is :

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 9, 15, 1, 16, 17, 18, 19, 20, 21]

An example of its corresponding label is  [0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0]


We have already encoded each sentence into a tensor by converting each word into a number. We also have 17 possible classes, as shown in the tag map.

# 1.2 Data generator¶

In python, a generator is a function that behaves like an iterator. It will return the next item. 

In many AI applications it is very useful to have a data generator. 

In [11]:
def data_generator(batch_size,x,y,pad,shuffle=False,verbose=False):
    """Yield an endless stream of padded ``(X, Y)`` batches.

    Examples are taken in order (or in a shuffled order when ``shuffle`` is
    true). When the data is exhausted the generator wraps around to the start,
    re-shuffling if requested, so a batch may mix the end of one pass with the
    beginning of the next.

    Args:
        batch_size: number of examples per batch.
        x: sequence of sentences, each a sequence of token ids.
        y: sequence of label sequences, aligned with ``x``.
        pad: value used to fill both ``X`` and ``Y`` up to the length of the
            longest sentence in the batch.
        shuffle: if true, visit the examples in random order (uses ``rnd``).
        verbose: if true, print the running example index before each yield.

    Yields:
        Tuple ``(X, Y)`` of numpy arrays with shape ``(batch_size, max_len)``,
        where ``max_len`` is the longest sentence in that batch.

    Raises:
        ValueError: if ``x`` is empty, or a label sequence is shorter than its
            sentence.
    """
    num_lines = len(x)
    if num_lines == 0:
        # Without this guard the first lookup below fails with an opaque IndexError.
        raise ValueError("data_generator needs at least one example in x")

    lines_index = list(range(num_lines))
    if shuffle:
        rnd.shuffle(lines_index)

    index = 0

    while True:
        buffer_x = []
        buffer_y = []

        for _ in range(batch_size):
            if index >= num_lines:
                # Wrap around to the start of the data and re-shuffle for the next pass.
                index = 0
                if shuffle:
                    rnd.shuffle(lines_index)

            line = lines_index[index]
            buffer_x.append(x[line])
            buffer_y.append(y[line])
            index += 1

        max_length = max((len(sentence) for sentence in buffer_x), default=0)

        # Start from all-padding arrays and overwrite the leading positions of each row.
        X = np.full((batch_size, max_length), pad)
        Y = np.full((batch_size, max_length), pad)

        for row, (sentence, labels) in enumerate(zip(buffer_x, buffer_y)):
            length = len(sentence)
            if len(labels) < length:
                # Guard explicitly: a length-1 label list would otherwise be
                # silently broadcast across the whole row by the slice assignment.
                raise ValueError(
                    "label sequence is shorter than its sentence "
                    "({} < {})".format(len(labels), length))
            X[row, :length] = sentence
            Y[row, :length] = labels[:length]

        if verbose:print("index = ",index)
        yield((X,Y))
            

In [12]:
# Smoke-test the generator on the first 8 training sentences with a batch size of 5,
# so the second batch has to wrap around to the start of the data.
batch_size=5
mini_sentences = t_sentences[0:8]
mini_labels = t_labels[0:8]
data_gen = data_generator(batch_size,mini_sentences,mini_labels,vocab["<PAD>"],shuffle=False,verbose=True)
X1,Y1 = next(data_gen)
X2,Y2 = next(data_gen)
# Every batch is padded to the length of its own longest sentence.
print(Y1.shape,X1.shape,Y2.shape,X2.shape)
# First padded sentence and its labels; both are filled with the <PAD> id.
print(X1[0][:],"\n",Y1[0][:])

index =  5
index =  2
(5, 30) (5, 30) (5, 30) (5, 30)
[    0     1     2     3     4     5     6     7     8     9    10    11
    12    13    14     9    15     1    16    17    18    19    20    21
 35180 35180 35180 35180 35180 35180] 
 [    0     0     0     0     0     0     1     0     0     0     0     0
     1     0     0     0     0     0     2     0     0     0     0     0
 35180 35180 35180 35180 35180 35180]


# Part 2:  Building the model

We will now implement the model. We will be using Google's Trax library. Our model will be able to distinguish the following:
<table>
    <tr>
        <td>
<img src = 'ner1.png' width="width" height="height" style="width:500px;height:150px;"/>
        </td>
    </tr>
</table>

The model architecture will be as follows: 

<img src = 'ner2.png' width="width" height="height" style="width:600px;height:250px;"/>

Concretely: 

* Use the input tensors you built in our data generator
* Feed it into an Embedding layer, to produce more semantic entries
* Feed it into an LSTM layer
* Run the output through a linear layer
* Run the result through a log softmax layer to get the predicted class for each word.

We won't implement the LSTM unit drawn above. However, we will build the model. 


In [13]:
def NER(vocab_size=35181,d_model=50,tags=tag_map):
    """Assemble the NER tagger as a serial stack of trax layers.

    Args:
        vocab_size: number of rows in the embedding table.
        d_model: embedding size, also used as the number of LSTM units.
        tags: collection of tags; only its length is used, to size the
            output Dense layer.

    Returns:
        A ``tl.Serial`` model: Embedding -> LSTM -> Dense -> LogSoftmax.
    """
    n_tags = len(tags)
    layers = [
        tl.Embedding(vocab_size, d_model),  # token ids -> d_model-sized vectors
        tl.LSTM(n_units=d_model),
        tl.Dense(n_units=n_tags),           # one score per tag
        tl.LogSoftmax(),
    ]
    return tl.Serial(*layers)

In [14]:
# Instantiate the model with its default sizes and print the layer stack.
model = NER()
print(model)

Serial[
  Embedding_35181_50
  LSTM_50
  Dense_17
  LogSoftmax
]


# Part 3:  Train the Model 

This section will train our model.

Before we start, we need to create the data generators for training and validation data. It is important that we mask padding in the loss weights of our data, which can be done using the `id_to_mask` argument of `trax.supervised.inputs.add_loss_weights`.

In [15]:
from trax.supervised import training

# Seed Python's `random` module: data_generator shuffles with it.
rnd.seed(33)

batch_size=64

# Wrap the shuffled training generator so that positions equal to the <PAD> id
# are masked in the loss weights (id_to_mask).
train_generator = trax.supervised.inputs.add_loss_weights(
                  data_generator(batch_size,t_sentences,t_labels,vocab["<PAD>"],True),
                  id_to_mask=vocab["<PAD>"])

# Same wrapping for the validation data.
eval_generator = trax.supervised.inputs.add_loss_weights(
                 data_generator(batch_size,v_sentences,v_labels,vocab["<PAD>"],True),
                 id_to_mask=vocab["<PAD>"] )


In [16]:
def train_model(NER,train_generator,eval_generator,train_steps=1,output_dir="/home/siddarthathentu/Desktop/NLP/LSTM-NER/model"):
    """Run a trax training loop on the given model and return the loop.

    Args:
        NER: the model instance to train (note: shadows the ``NER`` builder
            function inside this function).
        train_generator: generator of training batches.
        eval_generator: generator of evaluation batches.
        train_steps: number of training steps to run.
        output_dir: directory handed to ``training.Loop`` as its output directory.

    Returns:
        The ``training.Loop`` object after ``train_steps`` steps have run.
    """
    # NOTE(review): the default output_dir is an absolute path on the author's
    # machine; callers on other machines should pass output_dir explicitly.
    cross_entropy = tl.CrossEntropyLoss()
    adam = trax.optimizers.Adam(0.01)
    train_task = training.TrainTask(
        train_generator,
        loss_layer=cross_entropy,
        optimizer=adam,
    )

    # Evaluation reports cross-entropy and accuracy over 10 batches.
    eval_metrics = [tl.CrossEntropyLoss(), tl.Accuracy()]
    eval_task = training.EvalTask(
        labeled_data=eval_generator,
        metrics=eval_metrics,
        n_eval_batches=10,
    )

    loop = training.Loop(
        NER,
        train_task,
        eval_task=eval_task,
        output_dir=output_dir,
    )
    loop.run(n_steps=train_steps)
    return loop

In [None]:
train_steps=1000
# Delete any previous checkpoint before training, presumably so the loop does not
# resume from an earlier run — TODO confirm against trax's checkpoint handling.
# NOTE(review): this path is relative while train_model's default output_dir is
# absolute, so the two only agree when the notebook runs from the project directory.
!rm -rf "model/model.pkl.gz"

training_loop = train_model(NER(),train_generator,eval_generator,train_steps)

Step      1: train CrossEntropyLoss |  2.89228582
Step      1: eval  CrossEntropyLoss |  1.91789852
Step      1: eval          Accuracy |  0.66664734
Step    100: train CrossEntropyLoss |  0.60055101
Step    100: eval  CrossEntropyLoss |  0.36314742
Step    100: eval          Accuracy |  0.91259873
Step    200: train CrossEntropyLoss |  0.30090800
Step    200: eval  CrossEntropyLoss |  0.24988867
Step    200: eval          Accuracy |  0.93519678
Step    300: train CrossEntropyLoss |  0.23232746
Step    300: eval  CrossEntropyLoss |  0.21319010
Step    300: eval          Accuracy |  0.94073426
Step    400: train CrossEntropyLoss |  0.19600554
Step    400: eval  CrossEntropyLoss |  0.19388027
Step    400: eval          Accuracy |  0.94635340
Step    500: train CrossEntropyLoss |  0.18234240
Step    500: eval  CrossEntropyLoss |  0.17009229
Step    500: eval          Accuracy |  0.95031501
Step    600: train CrossEntropyLoss |  0.15213504
Step    600: eval  CrossEntropyLoss |  0.16813952


In [1]:
# NOTE(review): the recorded execution count is In [1] and the output is a NameError,
# i.e. this cell was run on a fresh kernel before train_steps was defined. It only
# works after the training cell has been executed; consider removing it.
print(train_steps)

NameError: name 'train_steps' is not defined

In [23]:
# Load the pretrained model
model = NER()
# Initialise the model with an int32 input signature of shape (1, 1) before loading weights.
model.init(trax.shapes.ShapeDtype((1,1),dtype=np.int32))

# Replace the freshly initialised weights with the ones stored in the checkpoint file.
model.init_from_file("model/pre-model.pkl.gz",weights_only=True)

# Part 4:  Compute Accuracy

We will now evaluate the model on the test set. Previously, we have seen the loss on the training set and the accuracy on the validation (noted as eval) set. To get a good evaluation, we will need to create a mask to avoid counting the padding tokens when computing the accuracy.

In [24]:
# Example of an element-wise comparison on an array: the result is a boolean array
a = np.array([1, 2, 3, 4])
a == 2

array([False,  True, False, False])

In [25]:
# Generate test data: a single batch whose size is the whole test set, padded with the <PAD> id
x, y = next(data_generator(len(test_sentences), test_sentences, test_labels, vocab['<PAD>']))
print("input shapes", x.shape, y.shape)

input shapes (7194, 70) (7194, 70)


In [26]:
# Sample prediction: run the model on the padded test batch
tmp_pred = model(x)
print(type(tmp_pred))
print("Shape of tmp_pred : ",tmp_pred.shape)

<class 'jax.interpreters.xla.DeviceArray'>
Shape of tmp_pred :  (7194, 70, 17)


In [27]:
#print(x[0])
# Model output for the first test sentence (the model ends in a LogSoftmax layer).
print(tmp_pred[0])

[[-4.48355865e+00 -3.36047173e-01 -3.35761213e+00 ... -6.91771650e+00
  -8.58630657e+00 -7.36349249e+00]
 [-1.18455887e-02 -8.34251595e+00 -9.30842590e+00 ... -1.01982880e+01
  -9.44873428e+00 -9.18822384e+00]
 [-2.06794739e-02 -7.45346165e+00 -1.26838894e+01 ... -1.15326271e+01
  -1.26500635e+01 -1.19806232e+01]
 ...
 [-1.46134377e-01 -6.71558952e+00 -9.38210487e+00 ... -9.09346294e+00
  -8.63788605e+00 -8.78505135e+00]
 [-1.46547794e-01 -6.71508074e+00 -9.38067627e+00 ... -9.09435844e+00
  -8.63772011e+00 -8.78440857e+00]
 [-1.46930695e-01 -6.71461391e+00 -9.37936115e+00 ... -9.09518337e+00
  -8.63756752e+00 -8.78381729e+00]]


Note that the model's prediction has 3 axes: 
- the number of examples
- the number of words in each example (padded to be as long as the longest sentence in the batch)
- the number of possible targets (the 17 named entity tags).

In [28]:
def evaluate_prediction(pred,labels,pad):
    """Compute token-level accuracy while ignoring padded positions.

    Args:
        pred: array of per-class scores whose last axis indexes the classes.
        labels: array of true class ids with the shape of ``pred`` minus its
            last axis; padded positions hold ``pad``.
        pad: label value that marks padding.

    Returns:
        Number of correctly predicted non-padding tokens divided by the number
        of non-padding tokens.
    """
    labels = np.asarray(labels)
    outputs = np.argmax(pred,axis=-1)

    # True for real tokens, False for padding.
    mask = labels != pad

    # Count a hit only on real tokens. Without the mask, a padded position would
    # be counted as correct whenever `pad` happens to be a valid class index.
    numerator = np.sum((outputs == labels) & mask)
    denominator = np.sum(mask)
    print(numerator,denominator)
    acc = numerator/denominator

    return acc

In [29]:
# Accuracy on the full test batch, excluding positions whose label is the <PAD> id.
accuracy = evaluate_prediction(model(x),y,vocab["<PAD>"])
print("Accuracy of the model = ",accuracy)

149106 156234
Accuracy of the model =  0.9543761281155191


# Part 5:  Testing with our own sentence

In [32]:
def predict(sentence,model,vocab,tag_map):
    """Predict one tag per space-separated token of ``sentence``.

    Args:
        sentence: raw text; it is split on single spaces, so punctuation stays
            attached to its neighbouring word.
        model: callable mapping an integer array of shape ``(1, n_tokens)`` to
            per-class scores of shape ``(1, n_tokens, n_classes)``.
        vocab: mapping from token to id; tokens not in it fall back to
            ``vocab["UNK"]``.
        tag_map: mapping from tag name to class index.

    Returns:
        List of tag names, one per token.
    """
    token_ids = [vocab[token] if token in vocab else vocab["UNK"] for token in sentence.split(' ')]

    # A batch holding a single sentence.
    batch = np.array([token_ids], dtype=int)
    output = model(batch)
    outputs = np.argmax(output,axis=-1)

    # Invert tag_map explicitly instead of relying on the dict's insertion
    # order matching the class indices.
    index_to_tag = {index: tag for tag, index in tag_map.items()}

    return [index_to_tag[int(idx)] for idx in outputs[0]]

In [34]:
sentence = "Peter Navarro, the White House director of trade and manufacturing policy of U.S, said in an interview on Sunday morning that the White House was working to prepare for the possibility of a second wave of the coronavirus in the fall, though he said it wouldn’t necessarily come"
predictions = predict(sentence,model,vocab,tag_map)

# Print only the tokens tagged as part of an entity.
# Dedicated loop names are used on purpose: looping with `x`/`y` would overwrite
# the test arrays `x` and `y` and break a re-run of the accuracy cell.
for token,tag in zip(sentence.split(' '),predictions):
    if tag!='O':
        print(token, "  ",tag)

Peter    B-per
Navarro,    I-per
White    B-org
House    I-org
Sunday    B-tim
morning    I-tim
White    B-org
House    I-org
coronavirus    B-tim
fall,    B-tim
