#### **RNN for Named Entity Recognition**

The NLP task [Named Entity Recognition](https://en.wikipedia.org/wiki/Named-entity_recognition) (NER) is to classify named entity's within a corpus into predefined categories.

We explore the use of a recurrent architecture to perform NER. The dataset used is the MIT-Restaurants dataset. The tagging of the dataset is in the [IOB2](https://en.wikipedia.org/wiki/Inside%E2%80%93outside%E2%80%93beginning_(tagging)) format.

Note: Framework based off https://github.com/lingo-mit/6864-hw2/blob/master/6864_hw2.ipynb

In [0]:
import numpy as np

import torch
import torch.nn as nn

import util


device = "cuda" if torch.cuda.is_available() else "cpu"
assert device == "cuda" # use a gpu!

#### **Setup**

Read in the dataset

In [3]:
train_data = util.read_file_txt("../data/mit_restaurants-train.dat", type = "word")
train_tags = util.read_file_txt("../data/mit_restaurants-train.tag", type = "word")

test_data = util.read_file_txt("../data/mit_restaurants-test.dat", type = "word")
test_tags = util.read_file_txt("../data/mit_restaurants-test.tag", type = "word")

print('number of training samples:', len(train_data))
print('number of testing samples:',  len(test_data))
# print('average sentence length in training data', (np.mean([len(sent) for sent in train_data])))
print()

print('the first few sentences are:', train_data[0:3])
print('and their cp\'ding named entity sequences are: ', str(train_tags[0:3]))
print()

number of training samples: 7660
number of testing samples: 1521

the first few sentences are: [['2', 'start', 'restaurants', 'with', 'inside', 'dining'], ['34'], ['5', 'star', 'resturants', 'in', 'my', 'town']]
and their cp'ding named entity sequences are:  [['B-Rating', 'I-Rating', 'O', 'O', 'B-Amenity', 'I-Amenity'], ['O'], ['B-Rating', 'I-Rating', 'O', 'B-Location', 'I-Location', 'I-Location']]



#### **Data Preprocessing**

In [4]:
# helper functions and more data preprocessing before we move on to implementing our models.

# from train data, collect all unique word types as a set and add 'UNK' to it (unseen words in test data will be turned into 'UNK')
vocab_set = list(set([word for sent in train_data for word in sent])) + ['UNK']
num_vocabs = len(vocab_set)
print("number of word types (including 'UNK'):", num_vocabs)
print("the first couple and last couple of words in the vocabulary set:", vocab_set[0:2] +  vocab_set[-2:])

vocab2id = {v : i for i, v in enumerate(vocab_set)}

#  collect all tag (class) types and assign an unique id to each of them. (here there won't be a unseen tag type in test data)
tag_set = list(set([tag for tag_seq in train_tags for tag in tag_seq]))
num_tags = len(tag_set)
print("number of tag types:", num_tags)
print()

# assign each tag type a unique id, also create the inverse dict of tag2id (required during evaluation)
tag2id = {t : i for i, t in enumerate(tag_set)} 
id2tag = {i : t for t, i in tag2id.items()}

# apply one-hot encoding to data.
train_data_oh_list = [util.one_hot_encoding(sent, vocab2id, vocab_set) for sent in train_data]
# print("oh data[0] - len:", len(train_data_oh_list[0]), "shape:", train_data_oh_list[0].shape)

# transform tag names into ids
train_tags_id_list = [util.encoding_idx(tag_seq, tag2id) for tag_seq in train_tags]
# print("list len, tag data:", len(train_tags_id_list))

# train_data_oh_list should now be a list of 2d-tensors, each has shape (sent_len, num_vocabs)
# Note that to utilize the `shape` attribute, each element in the list should already be a torch tensor.
print("first sentence has shape: %s" % str(train_data_oh_list[0].shape))
print("fifth sentence has shape: %s" % str(train_data_oh_list[4].shape))

# train_tags_id_list is a list of 1d-tensors, each that has shape (sent_len,)
print("first tag sequence has shape: %s" % train_tags_id_list[0].shape)
print("fifth tag sequence has shape: %s" % train_tags_id_list[4].shape)
print()


# Apply same conversion to test dataset.
test_data_oh_list = [util.one_hot_encoding(sent, vocab2id, vocab_set) for sent in test_data]
test_tags_id_list = [util.encoding_idx(tag_seq, tag2id) for tag_seq in test_tags]
# print("list len, oh test:", len(test_data_oh_list))
# print("list len, tag test:", len(test_tags_id_list))

number of word types (including 'UNK'): 3805
the first couple and last couple of words in the vocabulary set: ['nouvelle', 'kfc', 'katachi', 'UNK']
number of tag types: 17

first sentence has shape: torch.Size([6, 3805])
fifth sentence has shape: torch.Size([12, 3805])
first tag sequence has shape: 6
fifth tag sequence has shape: 12



#### **RNN**

We implement a vanilla RNN from scratch, then train it and evaluate its performance on the NER task.

##### **RNN architecture**

In [0]:
class RNN(nn.Module):
    # A torch module implementing an RNN. The `forward` function should just
    # perform one step of update and output logits before softmax.

    def __init__(self, input_size, hidden_size, output_size):
        super(RNN, self).__init__()
        # `input_size`, `hidden_size`, and `output_size` are all int.

        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size

        self.hidden = nn.Linear(self.input_size + self.hidden_size, self.hidden_size)
        self.output = nn.Linear(self.hidden_size, self.output_size)
        

    def forward(self, x, hidden):
        # `x` is a 2d-tensor of shape (1, input_size); `hidden` is another
        # 2d-tensor of shape (1, hidden_size), representing the hidden state of
        # the previous time step.

        # output = torch.zeros( (1, self.output_size))        
        # print(x.shape, hidden.shape)
        combined = torch.cat( (x, hidden), dim = 1)
        hidden = self.hidden(combined)
        output = self.output(hidden)
        
        # print(output.shape, hidden.shape)

        return output, hidden
    

    def initHidden(self):
        # Use to initialize hidden state everytime before running a sentence.
        return torch.zeros(1, self.hidden_size)

##### **Training**

In [0]:



# Now that you have defined your RNN model, we can start training it.
# We've provided the main training loop, but you will have to implement the fucntion `rnn_train_one_sample`,
# which takes a (sentence-tensor, tag-tensor)-pair as input and does one step of gradient update.
# To understand better what this function is supposed to do, you can go over the main training loop in the next section first.


learning_rate = 1e-3
rnn_hidden_size = 128


rnn_model = RNN(input_size = num_vocabs, hidden_size = rnn_hidden_size, output_size = num_tags).to(device)
 
criterion = nn.CrossEntropyLoss()
rnn_optimizer = torch.optim.Adam(rnn_model.parameters(), lr = learning_rate)


# Run through a sentence, generate output, compute loss, and perform one gradient update
# Sentence and tag are represented as a 2d-tensor `sent_tensor` and a 1d-tensor `tag_tensor`, respectively.
def rnn_train_one_sample(model, sent_tensor, tag_tensor):
    hidden = model.initHidden().to(device)     # initialize hidden state
    loss = 0.00 # torch.zeros( (1, 1))

    outputs = torch.zeros( (1, num_tags))
    
    for idx in range(sent_tensor.shape[0]):
        outputs, hidden = model(sent_tensor[idx].reshape(1, sent_tensor.shape[1]), hidden)
        loss = loss + criterion(outputs, torch.LongTensor([tag_tensor[idx]]))
        
    loss = loss / len(tag_tensor)   # average the loss over all tags in the sentance

    rnn_optimizer.zero_grad()
    loss.backward()
    rnn_optimizer.step()

    return outputs, loss.item()


# main training loop for the rnn
import time
import math


n_epochs = 2 # 5
iter_count = 0
print_every = 100 # 1000
plot_every = 50

# Keep track of losses for plotting
current_loss = 0
all_losses = []

def timeSince(since):
    now = time.time()
    s = now - since
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)

start = time.time()


rnn_model.train()
 
for epoch_i in range(n_epochs):
    for sent_tensor, tag_tensor in zip(train_data_oh_list, train_tags_id_list):
        sent_tensor = sent_tensor.to(device)
        tag_tensor = tag_tensor.to(device)
    
        output, loss = rnn_train_one_sample(rnn_model, sent_tensor, tag_tensor)
        current_loss += loss
  
        if iter_count % print_every == 0:
            print('%d %d %s %.4f' % (n_epochs, iter_count, timeSince(start), loss)) # print('%d %s %.4f' % (iter_count, timeSince(start), loss))
  
        # add current loss avg to list of losses
        if iter_count % plot_every == 0 and iter_count > 0:
            all_losses.append(current_loss / plot_every)
            current_loss = 0
  
        iter_count += 1




##### **Evaluation**

In [0]:
# plot the learning curve. The x-axis is the training iterations and the y-axis is the training loss. The loss should be going down.
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker

# plt.figure()
# plt.plot(all_losses)


# evaluation / inference
import sklearn
from sklearn.metrics import precision_recall_fscore_support


# Example: true_tag_list/predicted_tag_list:
#   [[‘O’, ‘O’, ‘I’, ‘N’, ...]
#    [‘I’, ‘I’, ‘O’, ‘N’, ...]],
# each sublist corresponds to an input sentence.
def evaluate_result(true_tag_list, predicted_tag_list):
    p_list = []
    r_list = []
    f1_list = []
    
    for true_tag, predicted_tag in zip(true_tag_list, predicted_tag_list):
        p, r, f1, _ = precision_recall_fscore_support(true_tag, predicted_tag,
                                                      average='macro',
                                                      zero_division=0)
        p_list.append(p)
        r_list.append(r)
        f1_list.append(f1)
    
    return np.mean(p_list), np.mean(r_list), np.mean(f1_list)


# Make prediction for one sentence.
def rnn_predict_one_sent(model, sent_tensor):
    hidden = model.initHidden().to(device)
 
    predicted_tag_id = None
    # Your code here!
 
    return predicted_tag_id
 
 
rnn_model.eval()
predicted_tags = []
 
for sent_tensor in test_data_oh_list:
    sent_tensor = sent_tensor.to(device)
    predicted_tag_id = rnn_predict_one_sent(rnn_model, sent_tensor)
    predicted_tags.append([id2tag[idx] for idx in predicted_tag_id.detach().cpu().numpy()])
  
   
# precision, recall, and f1 score.
evaluate_result(test_tags, predicted_tags)