# Assignment 4, task 1

In this task, we will have a final crack at the NER problem, using recurrent neural networks, or Gated Recurrent Units (GRUs) to be more exact.

We want to consider both the context of the word (the surrounding words) and the contents of the word (the letters and other symbols that make up the actual word). Therefore we are using two bi-directional GRUs, one word-level GRU for the words in the sentence, and one character-level GRU for the letters and other symbols in a word. 

We will process one sentence at a time. Each hidden state vector in the word-level GRU represents that word in relation to the other words in the sentence, whereas the final state vector(s) in the character-level RNN represent morphological and typographical information about the word. We will concatenate these vectors to obtain a single information-rich representation of the word.

In [21]:
# First run this cell
import csv
from tqdm import tqdm
import string
import codecs
import torch
import torch.optim as optim
from torch import nn
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils import clip_grad_norm_

In [None]:
# Run this cell to init mappings from characters to IDs and back again,
# from words to IDs and back again, and from labels to IDs and back again

UNKNOWN = '<UNK>'  # Unknown char or unknown word
CHARS = [UNKNOWN, '’', '—'] + list(string.punctuation) + list(string.ascii_letters) + list(string.digits)
PADDING_WORD = '<PAD>'
#CHARS.append(PADDING_WORD) #[PY: ADDED PAD to char_to_id]
char_to_id = {c:i for i,c in enumerate(CHARS)}
id_to_label = ['noname', 'name']


def label_to_id(label):
    return 0 if label == 'O' else 1

We want to have a vector representation of the syntactic and semantic properties of words, and in order to avoid having to train these from scratch, we are going to re-use pre-trained Glove vectors.

In [23]:
def load_glove_embeddings(embedding_file,
                          padding_word=PADDING_WORD, 
                          unknown_word=UNKNOWN):
    """
    Reads Glove embeddings from a file.

    Returns vector dimensionality, the word_to_id mapping (as a dict),
    and the embeddings (as a list of lists).
    """
    word_to_id = {}  # Dictionary to store word-to-ID mapping
    word_to_id[padding_word] = 0
    word_to_id[unknown_word] = 1
    embeddings = []
    with open(embedding_file, encoding='utf8') as f:
        for line in f:
            data = line.split()
            word = data[0]                     ## this is the word itself which is read from the embedding file
            vec = [float(x) for x in data[1:]]  ## slicing operation: gives the embedding vector associated with the word; converts to a list of floats
            embeddings.append(vec)
            #Assigns a unique ID to the word in the dictionary, word_to_id; len gives the "next available ID" 
            #if it's a new word, otherwise it gives existing ID
            word_to_id[word] = len(word_to_id) 
    D = len(embeddings[0])
    print(f"D is {D}")
    embeddings.insert(word_to_id[padding_word], [0]*D)  # <PAD> has an embedding of just zeros
    embeddings.insert(word_to_id[unknown_word], [-1]*D)      # <UNK> has an embedding of just minus-ones

    return D, word_to_id, embeddings


We can now create our dataset. Each datapoint will consist of a sentence and its associated labels for each word in the sentence. The label is either 1 (a name) or 0 (not a name). 

In [24]:
class NERDataset(Dataset):
    """
    A class loading NER dataset from a CSV file to be used as an input 
    to PyTorch DataLoader.

    The CSV file has 4 fields: sentence number (only at the start of a new
    sentence), word, POS tag (ignored), and label.

    Datapoints are sentences + associated labels for each word. If the 
    words have not been seen before (i.e, they are not found in the 
    'word_to_id' dict), they will be mapped to the unknown word '<UNK>'.
    """

    def __init__(self, filename, word_to_id):
        reader = csv.reader(codecs.open(filename, encoding='ascii',
                                        errors='ignore'), delimiter=',')

        self.sentences = []
        self.labels = []

        sentence, labels = [], []
        for row in reader:
            if row:
                if row[0].strip(): # A new sentence begins  -- we got a sentence number!
                    if sentence and labels:
                        self.sentences.append(sentence)
                        self.labels.append(labels)
                    sentence = [row[1].strip()]
                    labels = [label_to_id(row[3].strip())]
                else:            #else we are simply continuing with part of the previous sentence
                    sentence.append(row[1].strip())
                    labels.append(label_to_id(row[3].strip()))

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, idx):
        return self.sentences[idx], self.labels[idx]

In [25]:
# Let's check out some of these data structures
dim, word_to_id, embeddings = load_glove_embeddings('/datasets/dd2417/glove.6B.50d.txt')
print("The embedding for the word 'smart' looks like this:")
print(embeddings[word_to_id['smart']])
print()

# Read the data we are going to use for testing the model
test_set = NERDataset('/datasets/dd2417/ner_test.csv', word_to_id)
print("There are", len(test_set), "data points in the testset")
dp = 1600
sentence, labels = test_set[dp]
print("Data point", dp, "is", sentence)
print("It has the labels", labels)

D is 50
The embedding for the word 'smart' looks like this:
[-0.051342, -0.28753, 0.82755, 0.12722, 0.46844, 0.0038295, -0.4318, -1.1316, -0.28952, 0.60025, -0.32302, 0.3647, -0.26373, 0.41625, -0.35763, 0.36209, 0.13728, 1.1499, 0.24038, -0.71581, 0.093982, 0.68847, -0.61259, 0.65653, 0.62681, -1.2954, -0.4377, -0.12567, 0.76713, -0.61208, 2.0342, 0.2605, -0.19958, -0.1012, 0.55044, 0.13263, -0.086022, 0.63799, -0.61907, -0.84927, 0.71463, 0.33314, 0.082519, 0.23978, 0.6821, 0.087969, 0.36122, -0.043632, 0.60344, 1.5803]

There are 4542 data points in the testset
Data point 1600 is ['Isolated', 'grass', 'fires', 'continue', 'to', 'burn', 'in', 'the', 'southern', 'U.S.', 'states', 'of', 'Oklahoma', 'and', 'Texas', ',', 'but', 'they', 'have', 'weakened', 'since', 'killing', 'one', 'elderly', 'woman', 'and', 'scorching', 'dozens', 'of', 'homes', 'on', 'Tuesday', '.']
It has the labels [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0]


In [26]:
# Run this cell. The function below will take care of the case of
# sequences of unequal lengths.

def pad_sequence_(batch, padding_word=PADDING_WORD, padding_label=0):
    batch_data, batch_labels = zip(*batch)
    max_len = max(map(len, batch_labels))
    padded_data = [[b[i] if i < len(b) else padding_word for i in range(max_len)] for b in batch_data]
    padded_labels = [[l[i] if i < len(l) else padding_label for i in range(max_len)] for l in batch_labels]
    return padded_data, padded_labels


In [27]:
# This is how it works
x = [([1,2,3],[0,0,1]), ([4,5],[1,0]), ([6,7,8,9],[0,1,1,0])]
pad_sequence_(x)

([[1, 2, 3, '<PAD>'], [4, 5, '<PAD>', '<PAD>'], [6, 7, 8, 9]],
 [[0, 0, 1, 0], [1, 0, 0, 0], [0, 1, 1, 0]])

Here is the actual classifier, as a class extending the Pytorch 'nn.Module' class. Your task is to write the forward function (look for "YOUR CODE HERE" below).

In [30]:
class NERClassifier(nn.Module):

    def __init__(self, word_embeddings,  # Pre-trained word embeddings
                 char_to_id,             # Mapping from chars to ids
                 word_to_id,             # Mapping from words to ids
                 char_emb_size=16,
                 char_hidden_size=25,    # Hidden size of the character-level biRNN
                 word_hidden_size=100,   # Hidden size of the word-level biRNN
                 padding_word=PADDING_WORD,
                 unknown_word=UNKNOWN,
                 char_bidirectional=True,
                 word_bidirectional=True,
                 device='cuda:0'
            ):

        super(NERClassifier, self).__init__()
        self.padding_word = padding_word
        self.unknown_word = unknown_word
        self.char_to_id = char_to_id
        self.word_to_id = word_to_id
        self.char_emb_size = char_emb_size
        self.char_hidden_size = char_hidden_size
        self.word_hidden_size = word_hidden_size
        self.char_bidirectional = char_bidirectional
        self.word_bidirectional = word_bidirectional
        self.device='cuda:0'

        # Create an embedding tensor for the words and import the Glove
        # embeddings. The embeddings are frozen (i.e., they will not be
        # updated during training).
        vocabulary_size = len(word_embeddings)
        self.word_emb_size = len(word_embeddings[0])

        # Pre-trained word embeddings (GloVe) are loaded into an nn.Embedding layer
        self.word_emb = nn.Embedding(vocabulary_size, self.word_emb_size)
        # weights will not be updated/learned further; requires_grad = FALSE ==> forzen!
        self.word_emb.weight = nn.Parameter(torch.tensor(embeddings, dtype=torch.float), 
                                            requires_grad=False) 
        
        # Create an embedding tensor for character embeddings. These embeddings
        # are learnt from scratch (i.e., they are not frozen).

        # bidirectional set to TRUE => two GRUs run in parallel, one to process seq from start to end, the other
        # from end to start -- enables the network to capture context from both directions.
        
        if self.char_emb_size > 0:
            self.char_emb = nn.Embedding(len(char_to_id), char_emb_size)
            # Bidirectional RNN - to process character embeddings
            self.char_birnn = nn.GRU(
                self.char_emb_size,
                self.char_hidden_size,
                bidirectional=char_bidirectional,  
                batch_first=True
            )
        else:
            self.char_hidden_size = 0

        # Output: Bidectional setting effectively doubles the output feature (2 * char_hidden_state)
        # as it concatenates the final hidden states from both directions
        multiplier = 2 if self.char_bidirectional else 1
        self.word_birnn = nn.GRU(
            self.word_emb_size + multiplier * self.char_hidden_size,  # input size
            self.word_hidden_size,
            bidirectional=word_bidirectional,
            batch_first=True
        )

        # Binary classification - 0 if not part of the name, 1 if a name
        multiplier = 2 if self.word_bidirectional else 1
        self.final_pred = nn.Linear(multiplier * self.word_hidden_size, 2)

       
    
    def forward(self, x):
        """
        Performs a forward pass of a NER classifier
        Takes as input a 2D list `x` of dimensionality (B, T),
        where B is the batch size;
              T is the max sentence length in the batch (shorter sentences
              are already padded with the special token <PAD>)
              
        Returns logits, i.e. the output of the last linear layer before applying softmax.

        :param      x:    A batch of sentences
        :type       x:    list of strings
        """

        # First find all word IDs of all words in all sentences in the batch
        # and the character IDs of all characters in all words in all sentences
        # REPLACE WITH YOUR CODE
        word_ids = [[self.word_to_id.get(word, self.word_to_id[self.unknown_word]) for word in sentence] for sentence in x]

        #  x: batch of sentences in the form of a list of list
        #  Calculate the number of sentences (Batch size)
        number_of_sentences = len(x)

        # Assuming all sentences are padded to the same length
        if number_of_sentences > 0:
            number_of_words = len(x[0])

        # Finding out the max_word_length
        max_word_length = max(len(word) for sentence in x for word in sentence if word != self.padding_word)
        
        #next we create equal char_ids
        char_lists, char_ids = [], []
        for i in range(number_of_sentences):
            sentence = x[i]
            for j in range(number_of_words):
                word = sentence[j]
                if word != self.padding_word:   # '<PAD>'
                    char_list = list(word)
                else:
                    char_list = list()
            
                # create the corresponding char_id_list
                char_id_list = [self.char_to_id[char] for char in char_list]
                while len(char_id_list) < max_word_length:
                    char_list.append('<PAD>')
                    char_id_list.append(0)
                    
                char_lists.append(char_list)
                char_ids.append(char_id_list)

        #converting list to tensor so that it can be used as an input to self.char_emb
        char_tensor = torch.LongTensor(char_ids).to(self.device)
        char_embeddings = self.char_emb(char_tensor)
        #outputs, h_fw, h_bw = self.char_birnn(char_embeddings)
        outputs, h_n = self.char_birnn(char_embeddings)

        # Since the GRU is bidirectional and batch_first=True, h_n will have 
        # shape [num_layers * num_directions, batch, hidden_size]
        # we need to separate the forward and backward hidden states.
        if self.char_birnn.bidirectional:
            h_fw = h_n[0:h_n.size(0):2]  # Take every second element starting from 0
            h_bw = h_n[1:h_n.size(0):2]  # Take every second element starting from 1
        else:
            h_fw = h_n
            h_bw = None
        concatenated_char_hidden_states = torch.cat((h_fw, h_bw), dim=2) if self.char_birnn.bidirectional else h_fw
        
        # Reshaping it back to a 3D tensor -- intention: adjust the shape of tensor to match input expected by subsequent
        # processing layers.
        # Total nubmer of sentences; number of words and inferred feature size
        concatenated_char_hidden_states = concatenated_char_hidden_states.reshape(number_of_sentences, number_of_words, -1)
        
        # The 'to(self.device)' below is necessary for making sure that 
        # the model and the data are on the same device (CPU or CUDA).
        word_tensor = torch.tensor(word_ids).to(self.device)
        word_embeddings = self.word_emb(word_tensor)
        
        # YOUR CODE HERE
        # Contenate GloVe vectors with character-level word vectors
        concatenated_word_char = torch.cat((word_embeddings, concatenated_char_hidden_states), dim=-1)
        # Here, we use our implementation of the bidirectional GRU in GRU.py to get 
        # the outputs of the word-level BiRNN
        #outputs, h_fw, h_bw = self.word_birnn(concatenated_word_char)
        outputs, h_n = self.word_birnn(concatenated_word_char)
        if self.word_birnn.bidirectional:
            # Split the hidden states of the last layer for forward and backward directions
            h_fw = h_n[0:h_n.size(0):2]  # Take every second element starting from 0
            h_bw = h_n[1:h_n.size(0):2]  # Take every second element starting from 1
        else:
            h_fw = h_n  #only one direction, forward
            h_bw = None
        
        # REPLACE THE STATEMENT BELOW WITH YOUR RETURN STATEMENT
        #return torch.zeros((len(x), len(x[0]), 2), requires_grad=True).to(self.device)
        return self.final_pred(outputs)


In [31]:
# ================== Hyper-parameters ==================== #

learning_rate = 0.001
epochs = 5

# ======================= Training ======================= #

device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
print( "Running on", device )

dim, word_to_id, embeddings = load_glove_embeddings('/datasets/dd2417/glove.6B.50d.txt')
training_set = NERDataset('/datasets/dd2417/ner_training.csv', word_to_id)
training_loader = DataLoader(training_set, batch_size=128, collate_fn=pad_sequence_)

ner = NERClassifier(embeddings, char_to_id, word_to_id, device=device).to(device)

optimizer = optim.Adam(ner.parameters(), lr=learning_rate)
criterion = nn.CrossEntropyLoss()

ner.train()
for epoch in range(epochs):   
    for x, y in tqdm(training_loader, desc="Epoch {}".format(epoch + 1)):
        optimizer.zero_grad()
        logits = ner(x)
            
        loss = criterion(logits.reshape(-1, logits.shape[2]), torch.tensor(y).to(device).reshape(-1,))
        loss.backward()
        
        clip_grad_norm_(ner.parameters(), 5)
        optimizer.step()


Running on cuda:0
D is 50


Epoch 1: 100%|██████████| 340/340 [00:15<00:00, 22.43it/s]
Epoch 2: 100%|██████████| 340/340 [00:14<00:00, 23.96it/s]
Epoch 3: 100%|██████████| 340/340 [00:14<00:00, 23.55it/s]
Epoch 4: 100%|██████████| 340/340 [00:13<00:00, 24.50it/s]
Epoch 5: 100%|██████████| 340/340 [00:14<00:00, 24.07it/s]


In [32]:
# Evaluation
import numpy as np
!pip install terminaltables
from terminaltables import AsciiTable

ner.eval()
confusion_matrix = [[0, 0],
                    [0, 0]]
test_set = NERDataset('/datasets/dd2417/ner_test.csv', word_to_id)
for x, y in test_set:
    pred = torch.argmax(ner([x]), dim=-1).detach().cpu().numpy().reshape(-1,)
    y = np.array(y)
    tp = np.sum(pred[y == 1])
    tn = np.sum(1 - pred[y == 0])
    fp = np.sum(1 - y[pred == 1])
    fn = np.sum(y[pred == 0])

    confusion_matrix[0][0] += tn
    confusion_matrix[1][1] += tp
        
    confusion_matrix[0][1] += fp
    confusion_matrix[1][0] += fn

    
table = [['', 'Predicted no name', 'Predicted name'],
             ['Real no name', confusion_matrix[0][0], confusion_matrix[0][1]],
             ['Real name', confusion_matrix[1][0], confusion_matrix[1][1]]]

t = AsciiTable(table)
print(t.table)
print("Accuracy: {}".format(
    round((confusion_matrix[0][0] + confusion_matrix[1][1]) / np.sum(confusion_matrix), 4))
)



Collecting terminaltables
  Using cached terminaltables-3.1.10-py2.py3-none-any.whl.metadata (3.5 kB)
Using cached terminaltables-3.1.10-py2.py3-none-any.whl (15 kB)
Installing collected packages: terminaltables
Successfully installed terminaltables-3.1.10
+--------------+-------------------+----------------+
|              | Predicted no name | Predicted name |
+--------------+-------------------+----------------+
| Real no name | 84191             | 513            |
| Real name    | 1646              | 13629          |
+--------------+-------------------+----------------+
Accuracy: 0.9784
