In [5]:
from os.path import join as pj
from random import shuffle
from typing import List, Dict, Tuple

import pandas as pd
import torch
import torch.nn as nn

DATA_FOLDER = pj('thesis', 'NLP_Course', 'HW3', 'data')
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'


def read_data(filename: str) -> List[List[str]]:
    data = []

    with open(pj(DATA_FOLDER, filename), 'r') as f:
        output = f.read().splitlines()

    words, tags = [], []
    for line in output:
        if line == '':
            data.append((words, tags))
            words, tags = [], []
            continue

        line_splitted = line.strip().split(' ')
        words.append(line_splitted[0])
        tags.append(line_splitted[1])

    if len(words) > 0:
        data.append((words, tags))

    return data


_train, _dev, _test = read_data('connl03_train.txt'), read_data('connl03_dev.txt'), read_data('connl03_test.txt')

UNK_TOKEN = 0


class Vocab:
    def __init__(self):
        self.word2id: Dict[str, int] = {"__unk__": UNK_TOKEN}
        self.id2word: Dict[int, str] = {UNK_TOKEN: "__unk__"}
        self.n_words = 1

        self.tag2id: Dict[str, int] = {
            "O": 0,
            "B-PER": 1,
            "I-PER": 2,
            "B-LOC": 3,
            "I-LOC": 4,
            "B-ORG": 5,
            "I-ORG": 6
        }
        self.id2tag: Dict[int, str] = {
            0: "O",
            1: "B-PER",
            2: "I-PER",
            3: "B-LOC",
            4: "I-LOC",
            5: "B-ORG",
            6: "I-ORG"
        }

    def index_words(self, words: List[str]) -> List[int]:
        word_indexes = [self.index_word(w) for w in words]
        return word_indexes

    def index_tags(self, tags: List[str]) -> List[int]:
        tag_indexes = [self.tag2id[t] for t in tags]
        return tag_indexes

    def index_word(self, w: str) -> int:
        if w not in self.word2id:
            self.word2id[w] = self.n_words
            self.id2word[self.n_words] = w
            self.n_words += 1
        return self.word2id[w]


_vocab = Vocab()


def prepare_data(data: List[Tuple[List[str], List[str]]],
                 vocab: Vocab) -> Tuple[List[Tuple[torch.LongTensor, torch.LongTensor]], Vocab]:
    data_sequences = []

    for words, tags in data:
        words_indexes = torch.LongTensor(vocab.index_words(words))
        tags_indexes = torch.LongTensor(vocab.index_tags(tags))

        data_sequences.append((words_indexes, tags_indexes))

    return data_sequences, vocab


_train_sequences, _vocab = prepare_data(data=_train, vocab=_vocab)
_dev_sequences, _vocab = prepare_data(data=_dev, vocab=_vocab)
_test_sequences, _vocab = prepare_data(data=_test, vocab=_vocab)


class NERNet(nn.Module):
    def __init__(self, input_size: int, embedding_size: int, hidden_size: int, output_size: int,
                 n_layers: int, directions: int):
        super(NERNet, self).__init__()

        self.hidden_size = hidden_size
        self.n_layers = n_layers
        self.directions = directions

        self.embedding = nn.Embedding(input_size, embedding_size)
        # bidirectional if directions==2 else 1
        self.lstm = nn.LSTM(embedding_size, hidden_size, n_layers, bidirectional=(directions == 2))
        self.out = nn.Linear(in_features=hidden_size * directions, out_features=output_size)

    def forward(self, input_sentence: torch.LongTensor) -> torch.Tensor:
        embeds = self.embedding(input_sentence)
        lstm_out, _ = self.lstm(embeds.view(input_sentence.shape[0], 1, -1))
        output = self.out(lstm_out.view(input_sentence.shape[0], -1))

        return output


def train_loop(model: NERNet, n_epochs: int, train_sequences: List[Tuple[torch.LongTensor, torch.LongTensor]]):
    shuffle(train_sequences)

    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)

    sentence_gpu, tags_gpu = [], []
    for sentence, tags in train_sequences:
        sentence_gpu.append(sentence.to(DEVICE))
        tags_gpu.append(tags.to(DEVICE))

    for _ in range(n_epochs):
        for sentence, tags in zip(sentence_gpu, tags_gpu):
            model.zero_grad()
            scores = model(sentence)
            criterion(scores, tags).backward()
            optimizer.step()


def evaluate(model: NERNet, caption: str, vocab: Vocab, dev_sequences: List[Tuple[torch.LongTensor, torch.LongTensor]]
             , test_sequences: List[Tuple[torch.LongTensor, torch.LongTensor]]):
    n_labels = len(vocab.tag2id)

    dev_matrix = torch.zeros(n_labels, n_labels, dtype=torch.float32)
    test_matrix = torch.zeros(n_labels, n_labels, dtype=torch.float32)

    with torch.no_grad():
        for inputs, labels in dev_sequences:
            preds = model(inputs.to(DEVICE)).max(1).indices
            for label, pred in zip(labels, preds):
                dev_matrix[label, pred] += 1

        for inputs, labels in test_sequences:
            preds = model(inputs.to(DEVICE)).max(1).indices
            for label, pred in zip(labels, preds):
                test_matrix[label, pred] += 1

    # Calculate Precision.
    dev_precision = dev_matrix.diag() / dev_matrix.sum(1)
    test_precision = test_matrix.diag() / test_matrix.sum(1)

    # Calculate Recall.
    dev_recall = dev_matrix.diag() / dev_matrix.sum(0)
    test_recall = test_matrix.diag() / test_matrix.sum(0)

    ndigits = 3  # For displaying purposes.
    df = pd.DataFrame(columns=vocab.tag2id.keys())

    df.loc['dev, Precision'] = [round(x, ndigits) for x in dev_precision.tolist()]
    df.loc['test, Precision'] = [round(x, ndigits) for x in test_precision.tolist()]

    df.loc['dev, Recall'] = [round(x, ndigits) for x in dev_recall.tolist()]
    df.loc['test, Recall'] = [round(x, ndigits) for x in test_recall.tolist()]

    # Add labels.
    dev_precision_except_0 = dev_matrix[1:, 1:].diag().sum() / dev_matrix[1:, 1:].sum(1).sum()
    test_precision_except_0 = test_matrix[1:, 1:].diag().sum() / test_matrix[1:, 1:].sum(1).sum()

    dev_recall_except_0 = dev_matrix[1:, 1:].diag().sum() / dev_matrix[1:, 1:].sum(0).sum()
    test_recall_except_0 = test_matrix[1:, 1:].diag().sum() / test_matrix[1:, 1:].sum(0).sum()

    df.loc['dev, Precision', 'All Except O'] = round(float(dev_precision_except_0), ndigits)
    df.loc['test, Precision', 'All Except O'] = round(float(test_precision_except_0), ndigits)

    df.loc['dev, Recall', 'All Except O'] = round(float(dev_recall_except_0), ndigits)
    df.loc['test, ''Recall', 'All Except O'] = round(float(test_recall_except_0), ndigits)

    df = df.fillna(0)

    print(caption)
    print(df)
    print()


def get_glove_weights(vocab: Vocab) -> torch.FloatTensor:
    with open(pj(DATA_FOLDER, 'glove.6B.300d.txt'), encoding='utf-8') as f:
        embeddings = torch.zeros((len(vocab.word2id), 300), dtype=torch.float32)
        for line in f.readlines():
            vals = line.split()
            idx = vocab.word2id.get(vals[0])
            if idx:
                embeddings[idx] = torch.FloatTensor([float(x) for x in vals[1:]])
        return embeddings

In [2]:

def main(vocab: Vocab, load_glove_weights: bool):
    if load_glove_weights:
        embedding_weights = get_glove_weights(vocab=_vocab)
    else:
        embedding_weights = None

    model_i = 0
    for hidden_size in [500, 800]:
        for n_layers in [1, 2, 3]:
            for directions in [1, 2] if hidden_size == 500 else [2]:
                model = NERNet(input_size=len(vocab.word2id), embedding_size=300,
                               hidden_size=hidden_size, output_size=len(vocab.tag2id),
                               n_layers=n_layers, directions=directions)

                if embedding_weights is not None:
                    model.embedding = nn.Embedding.from_pretrained(embedding_weights, freeze=True)

                model.to(DEVICE)

                train_loop(model, n_epochs=10, train_sequences=_train_sequences)
                model_i += 1
                caption = (f'Model {model_i} {"(GloVe)" if embedding_weights is not None else ""},'
                           f' hidden_size: {model.hidden_size}, n_layers: {model.n_layers},'
                           f' directions: {model.directions})')

                evaluate(model, caption, vocab=_vocab, dev_sequences=_dev_sequences, test_sequences=_test_sequences)

In [3]:
main(vocab=_vocab, load_glove_weights=False)

Model 1 , hidden_size: 500, n_layers: 1, directions: 1)
                     O  B-PER  I-PER  B-LOC  I-LOC  B-ORG  I-ORG  All Except O
dev, Precision   0.957  0.645  0.637  0.678  0.522  0.625  0.440         0.865
test, Precision  0.954  0.636  0.703  0.717  0.509  0.557  0.350         0.847
dev, Recall      0.924  0.713  0.800  0.800  0.800  0.571  0.671         0.865
test, Recall     0.930  0.706  0.819  0.786  0.931  0.509  0.504         0.847

Model 2 , hidden_size: 500, n_layers: 1, directions: 2)
                     O  B-PER  I-PER  B-LOC  I-LOC  B-ORG  I-ORG  All Except O
dev, Precision   0.972  0.710  0.688  0.770  0.435  0.613  0.397         0.862
test, Precision  0.973  0.694  0.655  0.764  0.585  0.597  0.390         0.889
dev, Recall      0.935  0.789  0.871  0.731  1.000  0.669  0.730         0.862
test, Recall     0.932  0.768  0.878  0.787  0.912  0.701  0.729         0.889

Model 3 , hidden_size: 500, n_layers: 2, directions: 1)
                     O  B-PER  I-PER  B-

In [4]:
main(vocab=_vocab, load_glove_weights=True)

Model 1 (GloVe), hidden_size: 500, n_layers: 1, directions: 1)
                     O  B-PER  I-PER  B-LOC  I-LOC  B-ORG  I-ORG  All Except O
dev, Precision   0.952  0.545  0.828  0.628  0.478  0.613  0.293         0.742
test, Precision  0.952  0.491  0.811  0.633  0.623  0.646  0.340         0.722
dev, Recall      0.945  0.801  0.602  0.752  0.550  0.414  0.654         0.742
test, Recall     0.955  0.813  0.562  0.689  0.508  0.425  0.708         0.722

Model 2 (GloVe), hidden_size: 500, n_layers: 1, directions: 2)
                     O  B-PER  I-PER  B-LOC  I-LOC  B-ORG  I-ORG  All Except O
dev, Precision   0.984  0.680  0.803  0.639  0.348  0.595  0.569         0.835
test, Precision  0.975  0.622  0.736  0.662  0.377  0.614  0.595         0.839
dev, Recall      0.943  0.866  0.926  0.755  0.444  0.671  0.688         0.835
test, Recall     0.941  0.854  0.897  0.767  0.541  0.683  0.513         0.839

Model 3 (GloVe), hidden_size: 500, n_layers: 2, directions: 1)
                   