### DataSet

Let’s use CoNLL 2002 data to build a NER system

CoNLL2002 corpus is available in NLTK. 

In [1]:
# download corpus

import nltk
nltk.download('conll2002')

# get training/testing datasets
from nltk.corpus import conll2002

[nltk_data] Downloading package conll2002 to
[nltk_data]     C:\Users\Administrator\AppData\Roaming\nltk_data...
[nltk_data]   Package conll2002 is already up-to-date!


### Data Preparation

In [2]:
## Training and testing

train_sents = list(conll2002.iob_sents('esp.train')) ## spain
test_sents = list(conll2002.iob_sents('esp.testb'))

print(train_sents[0])
#each tuple contains token, syntactic tag, ner label


[('Melbourne', 'NP', 'B-LOC'), ('(', 'Fpa', 'O'), ('Australia', 'NP', 'B-LOC'), (')', 'Fpt', 'O'), (',', 'Fc', 'O'), ('25', 'Z', 'O'), ('may', 'NC', 'O'), ('(', 'Fpa', 'O'), ('EFE', 'NC', 'B-ORG'), (')', 'Fpt', 'O'), ('.', 'Fp', 'O')]


In [3]:
# functions of sentence representations for sequence labelling
def sent2labels(sent):
    return [label for token, postag, label in sent]

def sent2tokens(sent):
    return [token for token, postag, label in sent]

In [4]:
# sentence representations for sequence labelling
train_sent_tokens = [sent2tokens(s) for s in train_sents]
train_labels = [sent2labels(s) for s in train_sents]

train_id_2_label = list(set([label for sent in train_labels for label in sent]))
train_label_2_id = {label:i for i, label in enumerate(train_id_2_label)}
print("Number of unique labels in training data:", len(train_id_2_label))

def convert_labels_to_inds(sent_labels, label_2_id):
    return [label_2_id[label] for label in sent_labels]

train_label_inds = [convert_labels_to_inds(sent_labels, train_label_2_id) for sent_labels in train_labels]

test_sent_tokens = [sent2tokens(s) for s in test_sents]
test_labels = [sent2labels(s) for s in test_sents]
test_label_inds = [convert_labels_to_inds(s, train_label_2_id) for s in test_labels]

Number of unique labels in training data: 9


In [5]:
window_size = 2

# converting tokenized sentence lists to vocabulary indices
id_2_word = list(set([token for sent in train_sent_tokens for token in sent])) + ["<pad>", "<unk>"]
word_2_id = {w:i for i,w in enumerate(id_2_word)}

def convert_tokens_to_inds(sentence, word_2_id):
    return [word_2_id.get(t, word_2_id["<unk>"]) for t in sentence]

# padding for windows
def pad_sentence_for_window(sentence, window_size, pad_token="<pad>"):
    return [pad_token]*window_size + sentence + [pad_token]*window_size 


In [6]:
import pprint
pp = pprint.PrettyPrinter()

import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from functools import partial

In [7]:
# Batching sentences together with a DataLoader

def my_collate(data, window_size, word_2_id):
    """
    For some chunk of sentences and labels
        -add winow padding
        -pad for lengths using pad_sequence
        -convert our labels to one-hots
        -return padded inputs, one-hot labels, and lengths
    """
    
    x_s, y_s = zip(*data)

    # deal with input sentences as we've seen
    window_padded = [convert_tokens_to_inds(pad_sentence_for_window(sentence, window_size), word_2_id)
                                                                                  for sentence in x_s]
    # append zeros to each list of token ids in batch so that they are all the same length
    padded = nn.utils.rnn.pad_sequence([torch.LongTensor(t) for t in window_padded], batch_first=True)
    
    # convert labels to one-hots
    labels = []
    lengths = []
    for y in y_s:
        lengths.append(len(y))
        one_hot = torch.zeros(len(y), len(train_id_2_label))
        y = torch.tensor(y)
        y = y.unsqueeze(1)
        label = one_hot.scatter_(1, y, 1)
        labels.append(label)
    padded_labels = nn.utils.rnn.pad_sequence(labels, batch_first=True)
    
    return padded.long(), padded_labels, torch.LongTensor(lengths)

In [8]:
batch_size = 4

# Shuffle True is good practice for train loaders.
# Use functools.partial to construct a partially populated collate function
train_loader = DataLoader(list(zip(train_sent_tokens, train_label_inds)), 
                            batch_size=batch_size, shuffle=True, 
                            collate_fn=partial(my_collate, window_size=2, word_2_id=word_2_id))

In [9]:
for batched_input, batched_labels, batch_lengths in train_loader:
    pp.pprint(("inputs", batched_input, batched_input.size()))
    pp.pprint(("labels", batched_labels, batched_labels.size()))
    pp.pprint(batch_lengths)
    break

('inputs',
 tensor([[26099, 26099, 21510,  9396, 19190,  6131,  1726, 10515, 23325, 16482,
          7458, 11492,  1130,  7080, 14280, 26099, 26099,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0],
        [26099, 26099, 13589,  8865,  6131, 22755, 23325, 15181, 24899, 19625,
          6131, 11056, 16979,  7657, 23065,  8144,  1478,  5174, 15093, 12142,
         11813, 13143, 23325, 24555,  7657, 16337,  8546, 23325, 16819, 14280,
         26099, 26099],
        [26099, 26099, 18705, 23325, 15181, 22565,   523,  5630,  2943,  1108,
         21510,  5480,  6131, 10654,  7458, 12768,  5660,  3385, 15676, 21510,
         18772,  7458, 13671,  6639,  6131, 23355, 14280, 26099, 26099,     0,
             0,     0],
        [26099, 26099, 17158,  9579, 17634, 15574, 12794,  6131,  2218, 12667,
         21510, 17158,  7458, 14280, 26099, 26099,     0,     0,     0,     0,
             0,     0,     0,     0,     0,    

In [10]:
class SoftmaxWordWindowClassifier(nn.Module):
    """
    A one-layer, binary word-window classifier.
    """
    def __init__(self, config, vocab_size, pad_idx=0):
        super(SoftmaxWordWindowClassifier, self).__init__()
        """
        Instance variables.
        """
        self.window_size = 2*config["half_window"]+1
        self.embed_dim = config["embed_dim"]
        self.hidden_dim = config["hidden_dim"]
        self.num_classes = config["num_classes"]
        self.freeze_embeddings = config["freeze_embeddings"]
        
        """
        Embedding layer
        -model holds an embedding for each layer in our vocab
        -sets aside a special index in the embedding matrix for padding vector (of zeros)
        -by default, embeddings are parameters (so gradients pass through them)
        """
        self.embed_layer = nn.Embedding(vocab_size, self.embed_dim, padding_idx=pad_idx)
        if self.freeze_embeddings:
            self.embed_layer.weight.requires_grad = False
        
        """
        Hidden layer
        -we want to map embedded word windows of dim (window_size+1)*self.embed_dim to a hidden layer.
        -nn.Sequential allows you to efficiently specify sequentially structured models
            -first the linear transformation is evoked on the embedded word windows
            -next the nonlinear transformation tanh is evoked.
        """
        self.hidden_layer = nn.Sequential(nn.Linear(self.window_size*self.embed_dim, 
                                                    self.hidden_dim), 
                                          nn.Tanh())
        
        """
        Output layer
        -we want to map elements of the output layer (of size self.hidden dim) to a number of classes.
        """
        self.output_layer = nn.Linear(self.hidden_dim, self.num_classes)
        
        """
        Softmax
        -The final step of the softmax classifier: mapping final hidden layer to class scores.
        -pytorch has both logsoftmax and softmax functions (and many others)
        -since our loss is the negative LOG likelihood, we use logsoftmax
        -technically you can take the softmax, and take the log but PyTorch's implementation
         is optimized to avoid numerical underflow issues.
        """
        self.log_softmax = nn.LogSoftmax(dim=2)
        
    def forward(self, inputs):
        """
        Let B:= batch_size
            L:= window-padded sentence length
            D:= self.embed_dim
            S:= self.window_size
            H:= self.hidden_dim
            
        inputs: a (B, L) tensor of token indices
        """
        B, L = inputs.size()
        
        """
        Reshaping.
        Takes in a (B, L) LongTensor
        Outputs a (B, L~, S) LongTensor
        """
        # Fist, get our word windows for each word in our input.
        token_windows = inputs.unfold(1, self.window_size, 1)
        _, adjusted_length, _ = token_windows.size()
        
        # Good idea to do internal tensor-size sanity checks, at the least in comments!
        assert token_windows.size() == (B, adjusted_length, self.window_size)
        
        """
        Embedding.
        Takes in a torch.LongTensor of size (B, L~, S) 
        Outputs a (B, L~, S, D) FloatTensor.
        """
        embedded_windows = self.embed_layer(token_windows)
        
        """
        Reshaping.
        Takes in a (B, L~, S, D) FloatTensor.
        Resizes it into a (B, L~, S*D) FloatTensor.
        -1 argument "infers" what the last dimension should be based on leftover axes.
        """
        embedded_windows = embedded_windows.view(B, adjusted_length, -1)
        
        """
        Layer 1.
        Takes in a (B, L~, S*D) FloatTensor.
        Resizes it into a (B, L~, H) FloatTensor
        """
        layer_1 = self.hidden_layer(embedded_windows)
        
        """
        Layer 2
        Takes in a (B, L~, H) FloatTensor.
        Resizes it into a (B, L~, 2) FloatTensor.
        """
        output = self.output_layer(layer_1)
        
        """
        Softmax.
        Takes in a (B, L~, 2) FloatTensor of unnormalized class scores.
        Outputs a (B, L~, 2) FloatTensor of (log-)normalized class scores.
        """
        output = self.log_softmax(output)
        
        return output

In [11]:
def loss_function(outputs, labels, lengths):
    """Computes negative LL loss on a batch of model predictions."""
    B, L, num_classes = outputs.size()
    num_elems = lengths.sum().float()
        
    # get only the values with non-zero labels
    loss = outputs*labels
    
    # rescale average
    return -loss.sum() / num_elems

In [12]:
def train_epoch(loss_function, optimizer, model, train_data):
    
    ## For each batch, we must reset the gradients
    ## stored by the model.   
    total_loss = 0
    for batch, labels, lengths in train_data:
        # clear gradients
        optimizer.zero_grad()
        # evoke model in training mode on batch
        outputs = model.forward(batch)
        # compute loss w.r.t batch
        loss = loss_function(outputs, labels, lengths)
        # pass gradients back, startiing on loss value
        loss.backward()
        # update parameters
        optimizer.step()
        total_loss += loss.item()
    
    # return the total to keep track of how you did this time around
    return total_loss

In [13]:
config = {"batch_size": 4,
          "half_window": 2,
          "embed_dim": 25,
          "hidden_dim": 25,
          "num_classes": 9,
          "freeze_embeddings": False,
         }
learning_rate = .002
num_epochs = 10
model = SoftmaxWordWindowClassifier(config, len(word_2_id))
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

In [14]:
for epoch in range(num_epochs):
    epoch_loss = train_epoch(loss_function, optimizer, model, train_loader)
    print(epoch, epoch_loss)

0 2140.4898500442505
1 1289.6083977930248
2 1206.9354143291712
3 1173.0104809589684
4 1153.8970264196396
5 1139.9451247304678
6 1127.7192901335657
7 1116.7456638664007
8 1109.0408693887293
9 1098.701995499432


### Evaluation
There is much more O entities in data set, but we’re more interested in other entities. To account for this we’ll use averaged F1 score computed for all labels except for O. sklearn-crfsuite.metrics package provides some useful metrics for sequence classification task, including this one.

In [15]:
test_loader = DataLoader(list(zip(test_sent_tokens, test_label_inds)), 
                            batch_size=batch_size, shuffle=False, 
                            collate_fn=partial(my_collate, window_size=2, word_2_id=word_2_id))

In [16]:
test_outputs = []
for test_instance, labs, _ in test_loader:
    outputs_full = model.forward(test_instance)
    outputs = torch.argmax(outputs_full, dim=2)
    for i in range(outputs.size(0)):
        test_outputs.append(outputs[i].tolist())


In [17]:
y_test = test_labels
y_pred = []
for test, pred in zip(test_labels, test_outputs):
    y_pred.append([train_id_2_label[id] for id in pred[:len(test)]])

assert len(y_pred) == len(y_test), '{} vs. {}'.format(len(y_pred), len(y_test))
for i, pred, test in zip(list(range(len(y_pred))), y_pred, y_test):
    assert len(pred) == len(test), '{}: {} vs. {}'.format(i, len(pred), len(test))


In [18]:
# evaluate CRF model
!pip install sklearn-crfsuite
from sklearn_crfsuite import metrics

metrics.flat_f1_score(y_test, y_pred, average='weighted', labels=train_id_2_label)



0.8346464921402771

In [24]:
y_pred[3:100]

[['O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O'],
 ['O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O'],
 ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-ORG', 'O', 'O'],
 ['O'],
 ['O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O

### Inspect per-class results in more detail:

In [20]:
# group B and I results
sorted_labels = sorted(
    train_id_2_label,
    key=lambda name: (name[1:], name[0])
)
print(metrics.flat_classification_report(
    y_test, y_pred, labels=sorted_labels, digits=3
))

  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           O      0.885     1.000     0.939     45355
       B-LOC      0.866     0.054     0.101      1084
       I-LOC      0.000     0.000     0.000       325
      B-MISC      0.000     0.000     0.000       339
      I-MISC      0.000     0.000     0.000       557
       B-ORG      0.848     0.127     0.221      1400
       I-ORG      0.000     0.000     0.000      1104
       B-PER      0.000     0.000     0.000       735
       I-PER      0.667     0.016     0.031       634

    accuracy                          0.885     51533
   macro avg      0.363     0.133     0.143     51533
weighted avg      0.828     0.885     0.835     51533

