In [None]:
!pip install -q -U trax

[K     |████████████████████████████████| 368kB 16.8MB/s 
[K     |████████████████████████████████| 2.6MB 49.9MB/s 
[K     |████████████████████████████████| 1.5MB 48.3MB/s 
[K     |████████████████████████████████| 163kB 56.2MB/s 
[K     |████████████████████████████████| 81kB 10.9MB/s 
[K     |████████████████████████████████| 983kB 47.5MB/s 
[K     |████████████████████████████████| 5.3MB 49.0MB/s 
[K     |████████████████████████████████| 368kB 40.4MB/s 
[K     |████████████████████████████████| 358kB 56.8MB/s 
[K     |████████████████████████████████| 194kB 43.8MB/s 
[K     |████████████████████████████████| 655kB 49.5MB/s 
[K     |████████████████████████████████| 307kB 52.5MB/s 
[K     |████████████████████████████████| 1.1MB 50.1MB/s 
[K     |████████████████████████████████| 778kB 49.9MB/s 
[K     |████████████████████████████████| 3.5MB 46.2MB/s 
[K     |████████████████████████████████| 51kB 7.2MB/s 
[K     |████████████████████████████████| 235kB 54.8MB/s 


In [None]:
import pandas as pd
import numpy as np
import csv
import trax
import trax.fastmath.numpy as fastnp
from trax import layers as tl

INFO:tensorflow:tokens_length=568 inputs_length=512 targets_length=114 noise_density=0.15 mean_noise_span_length=3.0 


In [None]:
# Parse the NER CSV into two parallel lists: one space-joined string of
# words and one space-joined string of tags per sentence.
with open('ner_dataset.csv','r',encoding='ISO-8859-1') as file:
  data = csv.reader(file)
  next(data)  # skip the header row
  sentences, tags = [], []
  text, label = "", ""
  for row in data:
    if not row[0]:
      # Empty first column: this row continues the current sentence.
      text += " "+row[1]
      label += " "+row[3]
      continue
    # A non-empty first column starts a new sentence, so store the one
    # collected so far before starting over.
    if text and label:
      sentences.append(text)
      tags.append(label)
    text, label = row[1], row[3]
  # Store the final sentence; no later row triggers a flush for it.
  if text and label:
    sentences.append(text)
    tags.append(label)
   

In [None]:
# Number of sentences parsed from the CSV.
len(sentences)

47959

In [None]:
# Number of tag sequences; there is one per sentence.
len(tags)

47959

In [None]:
# Write the tag sequences to listfile.txt, one sentence per line.
with open('listfile.txt', 'w') as filehandle:
    for tag_line in tags:
        filehandle.write("%s\n" % tag_line)

In [None]:
# Word -> integer id lookup. Id 0 is reserved for unknown words; every
# distinct whitespace-separated token then receives the next free id in
# order of first appearance.
vocab = {"__UNK__":0}
for line in sentences:
  for word in line.split():
    # setdefault only inserts when the word has not been seen yet.
    vocab.setdefault(word, len(vocab))

# The padding token gets the id right after the last word.
vocab['<PAD>'] = len(vocab)
print(vocab['<PAD>'])

35177


In [None]:
# Vocabulary size, including the __UNK__ and <PAD> entries.
len(vocab)

35178

In [None]:
# Tag -> integer id lookup; tags are numbered in order of first appearance.
tags_map = {}
for line in tags:
  for tag in line.split():
    # setdefault only inserts when the tag has not been seen yet.
    tags_map.setdefault(tag, len(tags_map))

In [None]:
# Display the tag -> id mapping.
tags_map

{'B-art': 8,
 'B-eve': 14,
 'B-geo': 1,
 'B-gpe': 2,
 'B-nat': 13,
 'B-org': 5,
 'B-per': 3,
 'B-tim': 7,
 'I-art': 9,
 'I-eve': 15,
 'I-geo': 4,
 'I-gpe': 11,
 'I-nat': 16,
 'I-org': 6,
 'I-per': 10,
 'I-tim': 12,
 'O': 0}

In [None]:
def text_to_tensor(text,unk = "__UNK__"):
  """Convert a sentence into a list of vocabulary ids.

  Args:
    text: sentence whose tokens are separated by whitespace.
    unk: key in the global `vocab` whose id replaces words that are not
      in the vocabulary.

  Returns:
    A list with one integer id per token of `text`.
  """
  fallback_id = vocab[unk]
  return [vocab.get(token,fallback_id) for token in text.split()]

In [None]:
def tags_to_tensor(tags):
  """Convert a whitespace-separated tag string into a list of tag ids.

  Args:
    tags: string of tags separated by whitespace.

  Returns:
    A list with one integer id per tag, looked up in the global
    `tags_map`; a tag missing from `tags_map` becomes 0.
  """
  return [tags_map.get(name,0) for name in tags.split()]

In [None]:
# Quick check of tags_to_tensor on a hand-written tag sequence.
tx = 'O O O O O O B-geo O O O O O B-geo O O O O O B-gpe O O O O O'
print(tags_to_tensor(tx))

[0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0]


In [None]:
# Split by position (no shuffling): the first 70% is training data, the
# next 20% is evaluation data, and the test set takes everything left.
train_len = int(len(sentences)*0.7)
eval_len = int(len(sentences)*0.2)
test_len = int(len(sentences)*0.1)  # not used by the slicing below
eval_end = train_len+eval_len
train_x, train_y = sentences[:train_len], tags[:train_len]
eval_x, eval_y = sentences[train_len:eval_end], tags[train_len:eval_end]
test_x, test_y = sentences[eval_end:], tags[eval_end:]

In [None]:
# Print the sentence count and tag-sequence count of every split.
for split_x, split_y in ((train_x,train_y),(eval_x,eval_y),(test_x,test_y)):
  print(len(split_x),len(split_y))

33571 33571
9591 9591
4797 4797


In [None]:
import random
def data_generator(texts,tags,batch_size,pad,shuffle = True):
  """Yield padded (inputs, targets, mask) batches forever.

  Sentences are visited in index order, or in a random order that is
  reshuffled each time the data is exhausted when `shuffle` is true. A
  batch that reaches the end of the data continues from the start, so
  every batch holds exactly `batch_size` examples.

  Args:
    texts: sequence of sentences; each is converted with `text_to_tensor`.
    tags: sequence of tag strings parallel to `texts`; each is converted
      with `tags_to_tensor`.
    batch_size: number of examples per batch; must be at least 1.
    pad: id appended to inputs and targets so every row is as long as
      the longest input of the batch.
    shuffle: whether to randomise the visiting order.

  Yields:
    A tuple `(inputs, targets, mask)` of `fastnp` arrays with one row per
    example; `mask` is 1 on real positions and 0 on padded positions.

  Raises:
    ValueError: if `batch_size` is smaller than 1.
  """
  if batch_size < 1:
    # No batch could ever fill up, so the loop below would never yield
    # and would keep accumulating examples.
    raise ValueError("batch_size must be at least 1, got %r" % (batch_size,))

  index = 0
  data_len = len(texts)
  index_lines = list(range(data_len))
  if shuffle:
    random.shuffle(index_lines)

  X = []
  Y = []

  while True:
    if index >= data_len:
      # Every example has been used: start over, in a new order if asked.
      index = 0
      if shuffle:
        random.shuffle(index_lines)
    position = index_lines[index]
    X.append(text_to_tensor(texts[position]))
    Y.append(tags_to_tensor(tags[position]))
    index += 1

    if len(X) == batch_size:
      max_len = max(len(x) for x in X)
      batch_x = []
      batch_y = []
      mask = []
      for x, y in zip(X, Y):
        batch_x.append(x + [pad]*(max_len-len(x)))
        batch_y.append(y + [pad]*(max_len-len(y)))
        # Derive the mask from the true length rather than from comparing
        # values with `pad`, so a real tag id equal to `pad` stays unmasked.
        mask.append([1]*len(y) + [0]*(max_len-len(y)))

      yield fastnp.array(batch_x),fastnp.array(batch_y),fastnp.array(mask)
      X = []
      Y = []




In [None]:
# Smoke test: draw one batch of two training sentences and display its
# inputs, targets and mask.
batch = next(data_generator(train_x,train_y,2,vocab['<PAD>']))
batch[0],batch[1],batch[2]

(DeviceArray([[  368,  8960, 14303,    29,    60, 10261,   878,    14,
                 326,   192,   150,  1286,   170,  4274,  4275,    79,
                4899,    12,  1822,  1823,    22, 35177, 35177, 35177,
               35177, 35177, 35177],
              [  596,   173,  3869,    69,    70,   306,   294,  1862,
                   8,   467,   510,    20,    59,  2207,    36,  7569,
                8000,   121,   357,    14,    92,  3152,   226,   752,
                3869,    22,    36]], dtype=int32),
 DeviceArray([[    0,     0,     0,     0,     0,     0,     0,     0,
                   0,     0,     5,     3,    10,    10,    10,     0,
                   0,     0,     0,     0,     0, 35177, 35177, 35177,
               35177, 35177, 35177],
              [    0,     0,     1,     0,     0,     0,     0,     0,
                   0,     0,     0,     0,     0,     0,     0,     0,
                   0,     0,     0,     0,     0,     0,     0,     0,
                   1, 

In [None]:
import itertools
batch_size = 64
# data_generator loops over its data forever (`while True`), so it is used
# directly. Wrapping an infinite generator in itertools.cycle never replays
# anything, yet cycle stores every item it passes through, so memory would
# grow with every batch drawn.
train_generator = data_generator(train_x,train_y,batch_size,vocab['<PAD>'])
eval_generator = data_generator(eval_x,eval_y,batch_size,vocab['<PAD>'])

In [None]:
def NER_model(vocab_size = 35178,embed_size = 128,len_tags = 17):
  """Build the tagging network: embedding, LSTM, dense layer, log-softmax.

  Args:
    vocab_size: first argument of `tl.Embedding`.
    embed_size: second argument of `tl.Embedding` and the argument of
      `tl.LSTM`.
    len_tags: argument of the final `tl.Dense` layer.

  Returns:
    The `tl.Serial` combinator holding the four layers in order.
  """
  layers = [
      tl.Embedding(vocab_size,embed_size),
      tl.LSTM(embed_size),
      tl.Dense(len_tags),
      tl.LogSoftmax(),
  ]
  return tl.Serial(*layers)


In [None]:
from trax.supervised import training

In [None]:
def train_model(model,train_generator,eval_generator,n_steps=1,output_dir = "/ner_model"):
  """Train `model` with a Trax training loop and return the loop.

  Training uses cross-entropy loss with `Adam(0.01)`; evaluation reports
  cross-entropy loss and accuracy over 3 evaluation batches.

  Args:
    model: model handed to `training.Loop`.
    train_generator: data source for the `TrainTask`.
    eval_generator: data source for the `EvalTask`.
    n_steps: value passed to `Loop.run`.
    output_dir: value passed to `training.Loop` as `output_dir`.

  Returns:
    The `training.Loop` object after `run` has returned.
  """
  loss = tl.CrossEntropyLoss()
  optimizer = trax.optimizers.Adam(0.01)
  train_task = training.TrainTask(train_generator,loss,optimizer)

  metrics = [tl.CrossEntropyLoss(),tl.Accuracy()]
  eval_task = training.EvalTask(eval_generator,metrics,n_eval_batches=3)

  train_loop = training.Loop(model,train_task,eval_tasks=[eval_task],output_dir=output_dir)
  train_loop.run(n_steps)
  return train_loop

In [None]:
# Train a freshly built model with n_steps=500 and keep the returned loop.
loop = train_model(NER_model(),train_generator,eval_generator,500)


Step    500: Ran 100 train steps in 35.56 secs
Step    500: train CrossEntropyLoss |  0.13881032
Step    500: eval  CrossEntropyLoss |  0.13752663
Step    500: eval          Accuracy |  0.95853502

Step    600: Ran 100 train steps in 13.28 secs
Step    600: train CrossEntropyLoss |  0.13914625
Step    600: eval  CrossEntropyLoss |  0.16961588
Step    600: eval          Accuracy |  0.94559435

Step    700: Ran 100 train steps in 10.63 secs
Step    700: train CrossEntropyLoss |  0.13751015
Step    700: eval  CrossEntropyLoss |  0.17699127
Step    700: eval          Accuracy |  0.94573482

Step    800: Ran 100 train steps in 13.73 secs
Step    800: train CrossEntropyLoss |  0.13336383
Step    800: eval  CrossEntropyLoss |  0.14042162
Step    800: eval          Accuracy |  0.95812070

Step    900: Ran 100 train steps in 5.74 secs
Step    900: train CrossEntropyLoss |  0.13204646
Step    900: eval  CrossEntropyLoss |  0.14564398
Step    900: eval          Accuracy |  0.95884267


In [None]:
# Build a fresh model and initialise it from /ner_model/model.pkl.gz.
# NOTE(review): presumably the file written by the training loop, whose
# default output_dir is "/ner_model" — confirm.
model = NER_model()
model.init_from_file('/ner_model/model.pkl.gz')

In [None]:
# Shape check: run the model on one batch of four test sentences and print
# the shape of the argmax taken over the last axis of the output.
test_gen = next(data_generator(test_x,test_y,4,vocab['<PAD>']))
pred = model(test_gen[0])
print((fastnp.argmax(pred,axis = -1)).shape)

(4, 35)


In [None]:
def compute_accuracy(pred,targets,mask):
  """Return the fraction of unmasked positions that were predicted correctly.

  Args:
    pred: model scores; the predicted class is the argmax over the last
      axis.
    targets: array of target ids, compared element-wise with the
      predicted ids.
    mask: array of the same shape as `targets`, 1 on positions to count
      and 0 on positions to ignore.

  Returns:
    Number of correct unmasked predictions divided by the mask total.
  """
  predictions = fastnp.argmax(pred,axis = -1)
  # Apply the mask to the numerator too; otherwise a masked position whose
  # prediction equals its target is counted as correct and the result can
  # exceed 1.
  correct = fastnp.sum((predictions == targets) * mask)
  total = float(np.sum(mask))
  return correct/total


In [None]:
# Evaluate on the whole test set at once, as a single batch of size
# len(test_x).
# NOTE: `test_generator` holds that one batch (the result of next()), not a
# generator.
test_generator = next(data_generator(test_x,test_y,len(test_x),vocab['<PAD>']))
pred = model(test_generator[0])
print(compute_accuracy(pred,test_generator[1],test_generator[2]))

0.95596594


In [None]:
def predict(model,sentence):
  """Print the predicted tag of every whitespace-separated token.

  Args:
    model: callable applied to an array of token ids with a leading
      batch axis of size 1; the predicted tag id is the argmax over the
      last axis of its output.
    sentence: text to tag; it is converted with `text_to_tensor`.

  Returns:
    None. One tag name is printed per token.
  """
  token_ids = fastnp.array(text_to_tensor(sentence))
  # Add a leading batch axis before calling the model.
  scores = model(token_ids[None,:])
  predicted_ids = fastnp.argmax(scores,axis = -1)
  # Invert the tag -> id mapping explicitly instead of indexing
  # list(tags_map.keys()), which is only right when the insertion order of
  # tags_map happens to equal the id order.
  id_to_tag = {tag_id: tag for tag, tag_id in tags_map.items()}
  for tag_id in predicted_ids[0]:
    # int() turns the array scalar into a plain, hashable dictionary key.
    print(id_to_tag[int(tag_id)])


In [None]:
# Tag an example sentence. text_to_tensor splits on whitespace only, so
# punctuation stays attached to its word (e.g. "Navarro,").
sentence = "Peter Navarro, the White House director of trade and manufacturing policy of U.S, said in an interview on Sunday morning that the White House was working to prepare for the possibility of a second wave of the coronavirus in the fall, though he said it wouldn’t necessarily come"
predict(model,sentence)

B-per
I-per
O
B-org
I-org
O
O
O
O
O
O
O
O
O
O
O
O
O
B-tim
I-tim
O
O
B-org
I-org
O
O
O
O
O
O
O
O
O
O
O
O
O
O
O
O
O
O
O
O
O
O
O
O
