Getting everything set up....

In [0]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.autograd import Variable
import numpy as np

In [4]:
!pip install seqeval

Collecting seqeval
  Downloading https://files.pythonhosted.org/packages/34/91/068aca8d60ce56dd9ba4506850e876aba5e66a6f2f29aa223224b50df0de/seqeval-0.0.12.tar.gz
Building wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone
  Created wheel for seqeval: filename=seqeval-0.0.12-cp36-none-any.whl size=7424 sha256=753a9f317dd123dc459811fa2416fdba2ac22fc71d836952bb308e9b625bc9bb
  Stored in directory: /root/.cache/pip/wheels/4f/32/0a/df3b340a82583566975377d65e724895b3fad101a3fb729f68
Successfully built seqeval
Installing collected packages: seqeval
Successfully installed seqeval-0.0.12


In [0]:
from seqeval.metrics import accuracy_score
from seqeval.metrics import classification_report
from seqeval.metrics import f1_score


In [0]:
torch.cuda.current_device()
torch.cuda.device(0)
torch.cuda.device_count()
torch.cuda.get_device_name(0)

'Tesla P100-PCIE-16GB'

In [0]:
from typing import List, Any, Dict, Generator

Using my helper functions

In [0]:
from torch.nn.utils.rnn import pad_sequence
import torch
from typing import List, Any, Dict, Generator


def parse_file(filename: str) -> (List[List[str]], List[List[str]]):
    """
    Function for parsing a file in the CONLL format.

    :param filename: path to text file. needs to be in CONLL format.
    :return: list of lists of tokens, list of lists of tags.
    """
    tokens = []
    tags = []

    with open(filename) as f:
        sentence = []
        sentence_tags = []
        for line in f:
            line_split = line.split()
            if len(line_split) == 0:
                if len(sentence) > 1:
                  tokens.append(sentence)
                  tags.append(sentence_tags)
                  sentence = []
                  sentence_tags = []
            else:
                if line_split[0] != "-DOCSTART-":
                    sentence.append(line_split[0])
                    sentence_tags.append(line_split[-1])
                else:
                    continue

    return tokens[1:], tags[1:]


def flatten(list_of_lists: List[List[Any]], unique: bool = False) -> List[Any]:
    """
    Flattens and can reduce a list of lists to unique values

    :param list_of_lists: list of lists to flatten
    :param unique: specifying whether to return a unique or non-unique list
    :return: either a unique or non-unique list
    """
    flattened = [item for sublist in list_of_lists for item in sublist]
    if unique:
        return list(set(flattened))
    else:
        return flattened


def index_dicts(list_to_index: List[List[Any]], special_tokens: List = None) -> Dict[int, str]:
    """
    Creates the index of tokens or tags to be used in model

    :param list_to_index: list to index
    :param special_tokens: special tokens, if any, for index
    :return: dictionary with indices
    """
    word2idx = {}
    flat_list = flatten(list_to_index, unique=True)
    if special_tokens:
        flat_list = [i for i in flat_list if i not in special_tokens]
        flat_list = special_tokens + flat_list

    for v, k in enumerate(flat_list):
        if k not in word2idx.keys():
          word2idx[k] = v

    return word2idx


def prepare_sentence(sentence: List[str], index: Dict[str, int]):
    """

    :param sentence:
    :param index:
    :return:
    """
    idxs = [index[w] if w in index.keys() else index["<UNK>"] for w in sentence]
    return torch.tensor(idxs, dtype=torch.long)


def convert_to_index(sequences: [List[List[Any]]], index: Dict[str, int]) -> List[List[int]]:
    """

    :param sequences:
    :param index:
    :return:
    """
    converted = []
    for sentence in sequences:
        converted.append(prepare_sentence(sentence, index))
    return converted


def generate_batches(batch_size: int, tokens: List[List[int]], tags: List[List[int]]) -> (Generator, Generator):
    """

    :param batch_size:
    :param tokens:
    :param tags:
    :return:
    """

    for i in range(0, len(tokens), batch_size):
        token_chunk = tokens[i:i + batch_size]
        tag_chunk = tags[i:i + batch_size]

        tokens_padded = pad_sequence([word for word in token_chunk], batch_first=True)
        tags_padded = pad_sequence([tag for tag in tag_chunk], batch_first=True)
        yield tokens_padded, tags_padded


def word_to_index(word):
    return word2idx_train[word]


# Turn a line into a <line_length x 1 x n_letters>,
# or an array of one-hot letter vectors
def sentence_to_tensor(sentence, n_labels, unpadded=False):
    tensor = torch.zeros(len(sentence), dtype=torch.long)
    if unpadded:
        #sentence should be a list of words
        for num, word in enumerate(sentence):
            if word not in word2idx_train.keys():
                tensor[num] = word2idx_train["<UNK>"]
            else:
                tensor[num] = word2idx_train[word]
    else:
        for num, word in enumerate(sentence):
            tensor[num] = word
    return tensor


def generate_batches_by_len(tokens, tags):
    tokens_len = [(len(s), s) for s in tokens]
    tags_len = [(len(t), t) for t in tags]

    min_len = tokens_len[0][0]
    max_len = tokens_len[-1][0]

    for i in range(min_len, max_len + 1):
        tokens_to_return = [s[1] for s in tokens_len if s[0] == i]
        tags_to_return = [s[1] for s in tags_len if s[0] == i]

        tokens_tensor = torch.zeros([len(tokens_to_return), i], dtype=torch.long)
        tags_tensor = torch.zeros([len(tags_to_return), i], dtype=torch.long)

        for s in range(len(tokens_to_return)):
            tokens_tensor[s] = tokens_to_return[s]
            tags_tensor[s] = tags_to_return[s]

        yield tokens_tensor, tags_tensor


def predict(X, model, index):
    predictions = []
    for line in X:
        output = model(line)
        output_arg_max = [torch.argmax(i).item() for i in output]
        sentence = [index[tag] for tag in output_arg_max]
        predictions.append(sentence)
    return predictions

In [0]:
!ls

sample_data


Read in and format data.

In [0]:
train_tokens, train_tags = parse_file("train.txt")
valid_tokens, valid_tags = parse_file("valid.txt")
test_tokens, test_tags = parse_file("test.txt")

In [0]:
train_tokens_sorted = sorted(train_tokens, key=len)
train_tags_sorted = sorted(train_tags, key=len)

In [0]:
special_tokens = ["<PAD>", "<UNK>"]
word2idx_train = index_dicts(train_tokens, special_tokens=special_tokens)
tag2idx_train = index_dicts(train_tags, special_tokens=["<PAD>"])

In [0]:
tag2idx_train

{'<PAD>': 0,
 'B-LOC': 8,
 'B-MISC': 3,
 'B-ORG': 6,
 'B-PER': 7,
 'I-LOC': 1,
 'I-MISC': 4,
 'I-ORG': 2,
 'I-PER': 5,
 'O': 9}

In [0]:
train_tokens_indexed = convert_to_index(train_tokens_sorted, word2idx_train)
train_tags_indexed = convert_to_index(train_tags_sorted, tag2idx_train)

Create the model. I decided to use a bi-directional LSTM as it is a standard choice for this sort of entity recognition. My model has an embedding layer, the recurrent neural network layer, and a linear hidden state layer. It's fairly simple and doesn't have a ton of parts.

In [0]:
import torch.nn as nn

class BidirectionalLSTM(nn.Module):

    def __init__(self, 
                 vocab_size, 
                 tag_size, 
                 embedding_dim, 
                 n_hidden, 
                 batch_size, 
                 dropout_p):
        """

        :param vocab_size:
        :param embedding_dim:
        :param n_hidden:
        """
        super(BidirectionalLSTM, self).__init__()
        self.embedding = nn.Embedding(num_embeddings=vocab_size,
                                      embedding_dim=embedding_dim) #,
                                      #padding_idx=pad_idx)

        self.rnn = nn.LSTM(input_size=embedding_dim,
                           hidden_size=n_hidden,
                           num_layers=2,
                           batch_first=True,
                           bidirectional=True)
        self.n_hidden = n_hidden
        self.batch_size = batch_size
        self.n_labels = tag_size
        self.hidden2label = nn.Linear(n_hidden * 2, tag_size)
        
        
        self.dropout = nn.Dropout(dropout_p)
        self.hidden = self.init_hidden()
        

    def init_hidden(self):
        # first is the hidden h
        # second is the cell c
        return (Variable(torch.zeros(2, self.batch_size, self.n_hidden)),
                Variable(torch.zeros(2, self.batch_size, self.n_hidden)))


    def forward(self, sentence):
        embeds = self.dropout(self.embedding(sentence))
        output, (hidden, cell) = self.rnn(embeds.view(len(sentence), 1, -1))
        hidden = self.dropout(torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim = 1))
        return self.hidden2label(hidden)



Here I'm setting up the parameters. I played around with these a little to get my current output. To calculate loss, I'm using cross entropy loss, and ignoring index 0 to calculate the loss, because index 0 is the PAD token I use for padding batches to the same length. For optimization I'm using Adam.

In [0]:
labels = torch.tensor([i for i in tag2idx_train.values()], dtype=torch.long)

batch_size = 32
vocab_size = len(word2idx_train)
tag_size = len(labels)
embedding_dim = 128
n_hidden = 256
dropout = 0.5
#pad_idx = 0
learning_rate = 0.001

bilstm = BidirectionalLSTM(vocab_size, tag_size, embedding_dim, n_hidden, batch_size, dropout)
criterion = nn.CrossEntropyLoss(ignore_index=0)

optimizer_bi = optim.Adam(bilstm.parameters(), lr=learning_rate)

idx2tag_train = {}
for k, v in tag2idx_train.items():
    idx2tag_train[v] = k

Here is my training loop. I didn't try to train it for a high number of epochs. You can see the epoch output below. If I had had more time, I would want to really try to optimize for my F1 score. I don't necessarily care about accuracy here, since 83% of my tags are O. What I want is a good balance of precision and recall on the tags.

In [0]:
bilstm.train()
for epoch in range(7):
    print("Epoch", epoch + 1)
    running_loss = 0.0
    batch_index = 0

    n = 0
    for batch_x, batch_y in generate_batches(batch_size, train_tokens_indexed, train_tags_indexed):

        # collect outputs for loss calc
        out_tensor = torch.zeros([batch_x.shape[0], batch_x.shape[-1], tag_size])

        optimizer_bi.zero_grad()
        for i in range(len(batch_x)):
            output = bilstm(batch_x[i])
            
            out_tensor[i] = output
            output_arg_max = [torch.argmax(i).item() for i in output]

        loss = criterion(out_tensor.permute(0, 2, 1), batch_y)

        loss_batch = loss.item()
        running_loss += (loss_batch - running_loss)

        loss.backward()

        optimizer_bi.step()
        n += 1
    
    y_pred =  predict(train_tokens_indexed, bilstm, idx2tag_train)
    print(f"Epoch {epoch + 1} Loss:", running_loss)
    print(f"Epoch {epoch + 1} F1:", f1_score(train_tags, y_pred))
    print(f"Epoch {epoch + 1} Acc:", accuracy_score(train_tags, y_pred))

Epoch 1
Epoch 1 Loss: 0.6271941661834717
Epoch 1 F1: 0.00573532261189692
Epoch 1 Acc: 0.8238232289497466
Epoch 2
Epoch 2 Loss: 0.5823267102241516
Epoch 2 F1: 0.01026282030196375
Epoch 2 Acc: 0.805130446740796
Epoch 3
Epoch 3 Loss: 0.5258107781410217
Epoch 3 F1: 0.014123163131110986
Epoch 3 Acc: 0.7860496640603513
Epoch 4
Epoch 4 Loss: 0.4870966970920563
Epoch 4 F1: 0.016008760494342182
Epoch 4 Acc: 0.7719834191190916
Epoch 5
Epoch 5 Loss: 0.4852295219898224
Epoch 5 F1: 0.015240443896424169
Epoch 5 Acc: 0.7625633570390161
Epoch 6
Epoch 6 Loss: 0.44016608595848083
Epoch 6 F1: 0.016926440510614288
Epoch 6 Acc: 0.7538505363247023
Epoch 7
Epoch 7 Loss: 0.4210341274738312
Epoch 7 F1: 0.017009502642142744
Epoch 7 Acc: 0.7465374641467919


Now that training is done, let's evaluate on the validation set. The model performs pretty well! The best-looking tag is "LOC", which is great since this "project" is for a travel broker. PER is the worst performing tag, which intuitively makes sense, since names are extremely varied.

In [0]:
valid_tokens_indexed = convert_to_index(valid_tokens, word2idx_train)
valid_tags_indexed = convert_to_index(valid_tags, tag2idx_train)

bilstm.eval()

with torch.no_grad():
    y_pred = predict(valid_tokens_indexed, bilstm, idx2tag_train)

print("F1 score:", f1_score(valid_tags, y_pred))
print("Accuracy:", accuracy_score(valid_tags, y_pred))
print(classification_report(valid_tags, y_pred))

F1 score: 0.5068839561674628
Accuracy: 0.9096149726371551
           precision    recall  f1-score   support

      LOC       0.81      0.68      0.74      1837
      PER       0.27      0.18      0.21      1842
      ORG       0.52      0.42      0.47      1339
     MISC       0.65      0.60      0.62       922

micro avg       0.57      0.46      0.51      5940
macro avg       0.55      0.46      0.50      5940



At this point I decided to save my model for use in the Flask app.

In [0]:
torch.save(bilstm.state_dict(), "bilstm.pt")

Some helper functions for writing and reading the token and tag indices.

In [0]:
import json

def dict_to_file(index: Dict[Any, Any], filepath: str):
    with open(filepath, "w") as f:
        json.dump(index, f)

    print("Done.")

def file_to_dict(filepath: str, int_key: bool = False) -> Dict[Any, Any]:
    """
    Convert json file into dictionary.

    :param filepath: Name of file to be converted.
    :param int_key: If the dict keys are integers, return them as such.
    :return: Dictionary!
    """
    with open(filepath) as f:
        if int_key is True:
            loaded = json.load(f)
            return {int(key): val for key, val in loaded.items()}
        else:
            return json.load(f)

def load_model(path, token_index, tag_index):
    """

    :param path:
    :param token_index:
    :param tag_index:
    :return:
    """
    batch_size = 32
    vocab_size = len(token_index)
    tag_size = len(tag_index)
    embedding_dim = 128
    n_hidden = 256
    dropout = 0.5

    model = BidirectionalLSTM(vocab_size, tag_size, embedding_dim, n_hidden, batch_size, dropout)
    model.load_state_dict(torch.load(path))
    return model

In [0]:
dict_to_file(word2idx_train, "train_index.json")
dict_to_file(idx2tag_train, "tag_index.json")

Done.
Done.


Let's load it back in and make sure everything works.

In [0]:
word2idx_load = file_to_dict("train_index.json")
idx2tag_load = file_to_dict("tag_index.json", int_key=True)

reloaded_model = load_model("bilstm.pt", word2idx_load, idx2tag_load)

Everything works!

In [20]:
reloaded_model.eval()

with torch.no_grad():
    y_pred_load = predict(valid_tokens_indexed, reloaded_model, idx2tag_load)

print("F1 score:", f1_score(valid_tags, y_pred_load))
print("Accuracy:", accuracy_score(valid_tags, y_pred_load))
print(classification_report(valid_tags, y_pred_load))

F1 score: 0.5068839561674628
Accuracy: 0.9096149726371551
           precision    recall  f1-score   support

      LOC       0.81      0.68      0.74      1837
      ORG       0.52      0.42      0.47      1339
      PER       0.27      0.18      0.21      1842
     MISC       0.65      0.60      0.62       922

micro avg       0.57      0.46      0.51      5940
macro avg       0.55      0.46      0.50      5940



At this point, you would want to incorporate the validation tokens into the training set. I haven't done that here for the sake of time, but that would give the model more information and more data to train over.

Finally, let's test it on the test data.

In [0]:
test_tokens_indexed = convert_to_index(test_tokens, word2idx_load)
test_tags_indexed = convert_to_index(test_tags, tag2idx_train)

In [26]:
reloaded_model.eval()

with torch.no_grad():
    y_pred_test = predict(test_tokens_indexed, reloaded_model, idx2tag_load)

print("F1 score:", f1_score(test_tags, y_pred_test))
print("Accuracy:", accuracy_score(test_tags, y_pred_test))
print(classification_report(test_tags, y_pred_test))

F1 score: 0.45670610407287193
Accuracy: 0.8919623992065887
           precision    recall  f1-score   support

      ORG       0.49      0.34      0.40      1660
      LOC       0.78      0.67      0.72      1666
      PER       0.19      0.08      0.12      1615
     MISC       0.56      0.52      0.54       701

micro avg       0.56      0.39      0.46      5642
macro avg       0.50      0.39      0.43      5642



Not bad. Location is still the best tag overall.

