### DataSet

Let’s use CoNLL 2002 data to build a NER system

CoNLL2002 corpus is available in NLTK. 

In [1]:
# download corpus

import nltk
nltk.download('conll2002')

# get training/testing datasets
from nltk.corpus import conll2002

[nltk_data] Downloading package conll2002 to
[nltk_data]     C:\Users\Administrator\AppData\Roaming\nltk_data...
[nltk_data]   Package conll2002 is already up-to-date!


### Data Preparation

In [2]:
## Training and testing

train_sents = list(conll2002.iob_sents('esp.train')) ## spain
test_sents = list(conll2002.iob_sents('esp.testb'))

print(train_sents[0])
#each tuple contains token, syntactic tag, ner label


[('Melbourne', 'NP', 'B-LOC'), ('(', 'Fpa', 'O'), ('Australia', 'NP', 'B-LOC'), (')', 'Fpt', 'O'), (',', 'Fc', 'O'), ('25', 'Z', 'O'), ('may', 'NC', 'O'), ('(', 'Fpa', 'O'), ('EFE', 'NC', 'B-ORG'), (')', 'Fpt', 'O'), ('.', 'Fp', 'O')]


In [212]:
    def read_file(file_path):
        fileobj = open(file_path, 'r', encoding='utf-8')
        samples = []
        tokens = []
        tags = []

        for content in fileobj:
 
            content = content.strip('\n')
            
            if content == '-DOCSTART- -X- -X- O':
                pass
            elif content == ''or content == '\t':
                if len(tokens) != 0:
                    samples.append((tokens, tags))
                    tokens = []
                    tags = []
            else:
                contents = content.split('\t')
                tokens.append(contents[0])
                tags.append(contents[-1])
        return samples

In [213]:
path = '.\emerging_entities_17-master\emerging_entities_17-master\wnut17train.conll'
train = read_file(path)
#train[4]
path2 = '.\emerging_entities_17-master\emerging_entities_17-master\emerging.test.conll'
test = read_file(path2)

In [214]:
length = len(test)
for i in range(length):
    
    l = len(test[i][1])
    for j in range(l):
        la = test[i][1][j].split(',')[0]
        test[i][1][j]=la
test[7]

(['(',
  'Source',
  ':',
  'ANI',
  ')',
  'Visuals',
  'of',
  'the',
  'avalanche',
  'site',
  'in',
  'Gurez',
  'sector',
  '.'],
 ['O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-location',
  'I-location',
  'O',
  'B-corporation',
  'I-location',
  'O'])

In [215]:
def sent2labels2(sent):
    return sent[1]

In [216]:
def sent2tokens2(sent):
    return sent[0]

In [217]:
X_train2 = [sent2tokens2(s) for s in train]
y_train2 = [sent2labels2(s) for s in train]

X_test2 = [sent2tokens2(s) for s in test]
y_test2 = [sent2labels2(s) for s in test]

In [273]:
len(X_train2)

3394

In [219]:
# functions of sentence representations for sequence labelling
def sent2labels(sent):
    return [label for token, postag, label in sent]

def sent2tokens(sent):
    return [token for token, postag, label in sent]

In [274]:
# sentence representations for sequence labelling
#train_sent_tokens = [sent2tokens(s) for s in train_sents]
#train_labels = [sent2labels(s) for s in train_sents]

train_sent_tokens=X_train2

train_labels = y_train2
train_labels = [i for i in train_labels  if i != '']

train_id_2_label = list(set([label for sent in train_labels for label in sent]))
#train_id_2_label = [i for i in train_id_2_label  if i != '']
train_label_2_id = {label:i for i, label in enumerate(train_id_2_label)}
print("Number of unique labels in training data:", len(train_id_2_label))

def convert_labels_to_inds(sent_labels, label_2_id):
    return [label_2_id[label] for label in sent_labels]

train_label_inds = [convert_labels_to_inds(sent_labels, train_label_2_id) for sent_labels in train_labels]

#test_sent_tokens = [sent2tokens(s) for s in test_sents]
#test_labels = [sent2labels(s) for s in test_sents]

test_sent_tokens = X_test2
test_labels =y_test2
test_label_inds = [convert_labels_to_inds(s, train_label_2_id) for s in test_labels]
test_id_2_label = list(set([label for sent in test_labels for label in sent]))


Number of unique labels in training data: 13


In [275]:
test_sent_tokens[3]
test_labels[5]
print(train_id_2_label)
print(test_id_2_label)

['B-location', 'B-product', 'O', 'I-corporation', 'I-creative-work', 'I-person', 'B-person', 'B-corporation', 'I-group', 'I-product', 'B-creative-work', 'B-group', 'I-location']
['B-location', 'B-product', 'O', 'I-corporation', 'I-creative-work', 'I-person', 'B-person', 'B-corporation', 'I-group', 'I-product', 'B-creative-work', 'B-group', 'I-location']


In [276]:
train_id_2_label
print(train_labels[888])
print(train_sent_tokens[888])

['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
['@Dishfunctional', 'yeah', '.', 'big', 'time', 'dancer', '.', 'i', 'might', 'worship', 'her', 'a', 'bit']


In [277]:
window_size = 2

# converting tokenized sentence lists to vocabulary indices
id_2_word = list(set([token for sent in train_sent_tokens for token in sent])) + ["<pad>", "<unk>"]
word_2_id = {w:i for i,w in enumerate(id_2_word)}

def convert_tokens_to_inds(sentence, word_2_id):
    return [word_2_id.get(t, word_2_id["<unk>"]) for t in sentence]

# padding for windows
def pad_sentence_for_window(sentence, window_size, pad_token="<pad>"):
    return [pad_token]*window_size + sentence + [pad_token]*window_size 


In [278]:
import pprint
pp = pprint.PrettyPrinter()

import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from functools import partial

In [279]:
# Batching sentences together with a DataLoader

def my_collate(data, window_size, word_2_id):
    """
    For some chunk of sentences and labels
        -add winow padding
        -pad for lengths using pad_sequence
        -convert our labels to one-hots
        -return padded inputs, one-hot labels, and lengths
    """
    
    x_s, y_s = zip(*data)

    # deal with input sentences as we've seen
    window_padded = [convert_tokens_to_inds(pad_sentence_for_window(sentence, window_size), word_2_id)
                                                                                  for sentence in x_s]
    # append zeros to each list of token ids in batch so that they are all the same length
    padded = nn.utils.rnn.pad_sequence([torch.LongTensor(t) for t in window_padded], batch_first=True)
    
    # convert labels to one-hots
    labels = []
    lengths = []
    for y in y_s:
        lengths.append(len(y))
        one_hot = torch.zeros(len(y), len(train_id_2_label))
        y = torch.tensor(y)
        y = y.unsqueeze(1)
        label = one_hot.scatter_(1, y, 1)
        labels.append(label)
    padded_labels = nn.utils.rnn.pad_sequence(labels, batch_first=True)
    
    return padded.long(), padded_labels, torch.LongTensor(lengths)

In [280]:
train_sent_tokens[4]

['4Dbling',
 "'s",
 'place',
 'til',
 'monday',
 ',',
 'party',
 'party',
 'party',
 '.',
 '&lt;',
 '3']

In [281]:
train_label_inds[4]

[6, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]

In [282]:
batch_size = 4

# Shuffle True is good practice for train loaders.
# Use functools.partial to construct a partially populated collate function
train_loader = DataLoader(list(zip(train_sent_tokens, train_label_inds)), 
                            batch_size=batch_size, shuffle=True, 
                            collate_fn=partial(my_collate, window_size=2, word_2_id=word_2_id))

In [283]:
for batched_input, batched_labels, batch_lengths in train_loader:
    pp.pprint(("inputs", batched_input, batched_input.size()))
    pp.pprint(("labels", batched_labels, batched_labels.size()))
    pp.pprint(batch_lengths)
    break

('inputs',
 tensor([[14878, 14878,  2668, 14376, 14705, 13386,  7544,  9014,  6449,  6484,
          4977, 11696,    79,  7544, 14878, 14878,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0],
        [14878, 14878, 11810, 12648,  6158,  9091, 11756,  3097,  8828, 11840,
          2828,  3663,  3340,  4773,  5705,  8671,  2584,  8653,  1288,  8934,
          8653, 10967,   323,  5320, 14878, 14878,     0,     0],
        [14878, 14878,  6061,  7032,  6158,  3462, 13941,  7314, 11974, 12644,
          6765,  8920, 13653, 10504, 11022,  2016,  8653,  5073,  4149,  7544,
          6463,  2913,  7544,  1558,  7859,   473, 14878, 14878],
        [14878, 14878,  7314,  6792, 12644,  8097, 12302,  5253,  7544, 10746,
          9214, 14705, 11556, 13386,  7544, 14878, 14878,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0]]),
 torch.Size([4, 28]))
('labels',
 tensor([[[0., 0., 1.,  ..., 0., 0., 0.],
         [0., 0

In [284]:
class SoftmaxWordWindowClassifier(nn.Module):
    """
    A one-layer, binary word-window classifier.
    """
    def __init__(self, config, vocab_size, pad_idx=0):
        super(SoftmaxWordWindowClassifier, self).__init__()
        """
        Instance variables.
        """
        self.window_size = 2*config["half_window"]+1
        self.embed_dim = config["embed_dim"]
        self.hidden_dim = config["hidden_dim"]
        self.num_classes = config["num_classes"]
        self.freeze_embeddings = config["freeze_embeddings"]
        
        """
        Embedding layer
        -model holds an embedding for each layer in our vocab
        -sets aside a special index in the embedding matrix for padding vector (of zeros)
        -by default, embeddings are parameters (so gradients pass through them)
        """
        self.embed_layer = nn.Embedding(vocab_size, self.embed_dim, padding_idx=pad_idx)
        if self.freeze_embeddings:
            self.embed_layer.weight.requires_grad = False
        
        """
        Hidden layer
        -we want to map embedded word windows of dim (window_size+1)*self.embed_dim to a hidden layer.
        -nn.Sequential allows you to efficiently specify sequentially structured models
            -first the linear transformation is evoked on the embedded word windows
            -next the nonlinear transformation tanh is evoked.
        """
        self.hidden_layer = nn.Sequential(nn.Linear(self.window_size*self.embed_dim, 
                                                    self.hidden_dim), 
                                          nn.Tanh())
        
        """
        Output layer
        -we want to map elements of the output layer (of size self.hidden dim) to a number of classes.
        """
        self.output_layer = nn.Linear(self.hidden_dim, self.num_classes)
        
        """
        Softmax
        -The final step of the softmax classifier: mapping final hidden layer to class scores.
        -pytorch has both logsoftmax and softmax functions (and many others)
        -since our loss is the negative LOG likelihood, we use logsoftmax
        -technically you can take the softmax, and take the log but PyTorch's implementation
         is optimized to avoid numerical underflow issues.
        """
        self.log_softmax = nn.LogSoftmax(dim=2)
        
    def forward(self, inputs):
        """
        Let B:= batch_size
            L:= window-padded sentence length
            D:= self.embed_dim
            S:= self.window_size
            H:= self.hidden_dim
            
        inputs: a (B, L) tensor of token indices
        """
        B, L = inputs.size()
        
        """
        Reshaping.
        Takes in a (B, L) LongTensor
        Outputs a (B, L~, S) LongTensor
        """
        # Fist, get our word windows for each word in our input.
        token_windows = inputs.unfold(1, self.window_size, 1)
        _, adjusted_length, _ = token_windows.size()
        
        # Good idea to do internal tensor-size sanity checks, at the least in comments!
        assert token_windows.size() == (B, adjusted_length, self.window_size)
        
        """
        Embedding.
        Takes in a torch.LongTensor of size (B, L~, S) 
        Outputs a (B, L~, S, D) FloatTensor.
        """
        embedded_windows = self.embed_layer(token_windows)
        
        """
        Reshaping.
        Takes in a (B, L~, S, D) FloatTensor.
        Resizes it into a (B, L~, S*D) FloatTensor.
        -1 argument "infers" what the last dimension should be based on leftover axes.
        """
        embedded_windows = embedded_windows.view(B, adjusted_length, -1)
        
        """
        Layer 1.
        Takes in a (B, L~, S*D) FloatTensor.
        Resizes it into a (B, L~, H) FloatTensor
        """
        layer_1 = self.hidden_layer(embedded_windows)
        
        """
        Layer 2
        Takes in a (B, L~, H) FloatTensor.
        Resizes it into a (B, L~, 2) FloatTensor.
        """
        output = self.output_layer(layer_1)
        
        """
        Softmax.
        Takes in a (B, L~, 2) FloatTensor of unnormalized class scores.
        Outputs a (B, L~, 2) FloatTensor of (log-)normalized class scores.
        """
        output = self.log_softmax(output)
        
        return output

In [285]:
def loss_function(outputs, labels, lengths):
    """Computes negative LL loss on a batch of model predictions."""
    B, L, num_classes = outputs.size()
    num_elems = lengths.sum().float()
        
    # get only the values with non-zero labels
    loss = outputs*labels
    
    # rescale average
    return -loss.sum() / num_elems

In [286]:
def train_epoch(loss_function, optimizer, model, train_data):
    
    ## For each batch, we must reset the gradients
    ## stored by the model.   
    total_loss = 0
    for batch, labels, lengths in train_data:
        # clear gradients
        optimizer.zero_grad()
        # evoke model in training mode on batch
        outputs = model.forward(batch)
        # compute loss w.r.t batch
        loss = loss_function(outputs, labels, lengths)
        # pass gradients back, startiing on loss value
        loss.backward()
        # update parameters
        optimizer.step()
        total_loss += loss.item()
    
    # return the total to keep track of how you did this time around
    return total_loss

In [311]:
config = {"batch_size": 4,
          "half_window": 2,
          "embed_dim": 25,
          "hidden_dim": 25,
          "num_classes": 13,
          "freeze_embeddings": False,
         }
learning_rate = .0005
num_epochs = 3
model = SoftmaxWordWindowClassifier(config, len(word_2_id))
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

In [312]:
for epoch in range(num_epochs):
    epoch_loss = train_epoch(loss_function, optimizer, model, train_loader)
    print(epoch, epoch_loss)

0 1991.5238295793533
1 1567.9127234220505
2 1189.688752412796


### Evaluation
There is much more O entities in data set, but we’re more interested in other entities. To account for this we’ll use averaged F1 score computed for all labels except for O. sklearn-crfsuite.metrics package provides some useful metrics for sequence classification task, including this one.

In [313]:
test_loader = DataLoader(list(zip(test_sent_tokens, test_label_inds)), 
                            batch_size=batch_size, shuffle=False, 
                            collate_fn=partial(my_collate, window_size=2, word_2_id=word_2_id))

In [314]:
test_outputs = []
for test_instance, labs, _ in test_loader:
    outputs_full = model.forward(test_instance)
    outputs = torch.argmax(outputs_full, dim=2)
    for i in range(outputs.size(0)):
        test_outputs.append(outputs[i].tolist())


In [315]:
y_test = test_labels
y_pred = []
for test, pred in zip(test_labels, test_outputs):
    y_pred.append([train_id_2_label[id] for id in pred[:len(test)]])

assert len(y_pred) == len(y_test), '{} vs. {}'.format(len(y_pred), len(y_test))
for i, pred, test in zip(list(range(len(y_pred))), y_pred, y_test):
    assert len(pred) == len(test), '{}: {} vs. {}'.format(i, len(pred), len(test))


In [316]:
# evaluate CRF model
!pip install sklearn-crfsuite
from sklearn_crfsuite import metrics

metrics.flat_f1_score(y_test, y_pred, average='weighted', labels=test_id_2_label)



0.8996352455467005

In [317]:
print(y_pred[223:1582])


[['O', 'O', 'O', 'O', 'O', 'O', 'O'], ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'], ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'I-creative-work', 'O', 'O', 'O', 'O', 'O'], ['O', 'O', 'O', 'I-creative-work', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'], ['O', 'O', 'O', 'O', 'O', 'O'], ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'], ['O', 'O', 'O', 'O', 'O'], ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-creative-work', 'O', 'O', 'O'], ['O', 'O', 'O', 'O', 'O', 'O', 'B-person', 'O', 'O', 'O', 'O', 'O', 'O'], ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'], ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'], ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'], ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'], ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-group', 'B-product', 'O', 'I-product', 'O', '

### Inspect per-class results in more detail:

In [318]:
# group B and I results
sorted_labels = sorted(
    test_id_2_label,
    key=lambda name: (name[1:], name[0])
)
print(metrics.flat_classification_report(
    y_test, y_pred, labels=sorted_labels, digits=3
))

                 precision    recall  f1-score   support

              O      0.939     0.980     0.959     21934
  B-corporation      0.000     0.000     0.000       117
  I-corporation      0.000     0.000     0.000        24
B-creative-work      0.000     0.000     0.000       231
I-creative-work      0.022     0.004     0.007       237
        B-group      0.018     0.010     0.013        97
        I-group      0.000     0.000     0.000        39
     B-location      0.000     0.000     0.000       122
     I-location      0.000     0.000     0.000        39
       B-person      0.035     0.008     0.014       355
       I-person      0.000     0.000     0.000       101
      B-product      0.000     0.000     0.000        56
      I-product      0.000     0.000     0.000        42

       accuracy                          0.919     23394
      macro avg      0.078     0.077     0.076     23394
   weighted avg      0.881     0.919     0.900     23394

