In [0]:
from google.colab import drive
drive.mount("/gdrive")

Drive already mounted at /gdrive; to attempt to forcibly remount, call drive.mount("/gdrive", force_remount=True).


In [0]:
!wget http://nlp.stanford.edu/data/glove.6B.zip

--2020-05-05 04:54:40--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2020-05-05 04:54:40--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2020-05-05 04:54:40--  http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


2020-0

In [0]:
!mkdir data/
!unzip glove.6B.zip
!mv glove.6B.50d.txt data/

Archive:  glove.6B.zip
  inflating: glove.6B.50d.txt        
  inflating: glove.6B.100d.txt       
  inflating: glove.6B.200d.txt       
  inflating: glove.6B.300d.txt       


In [0]:
!pip install conllu
!git clone https://github.com/pasinit/nlp2020_POStagging_data.git
!unzip nlp2020_POStagging_data/r2.2.zip  > /dev/null
!rm -rf nlp2020_POStagging_data/

Collecting conllu
  Downloading https://files.pythonhosted.org/packages/a8/03/4a952eb39cdc8da80a6a2416252e71784dda6bf9d726ab98065fff2aeb73/conllu-2.3.2-py2.py3-none-any.whl
Installing collected packages: conllu
Successfully installed conllu-2.3.2
Cloning into 'nlp2020_POStagging_data'...
remote: Enumerating objects: 5, done.[K
remote: Counting objects: 100% (5/5), done.[K
remote: Compressing objects: 100% (5/5), done.[K
remote: Total 5 (delta 0), reused 5 (delta 0), pack-reused 0[K
Unpacking objects: 100% (5/5), done.


In [0]:
import torch
from torch import nn
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import torch.optim as optim
# from torchtext import data
# from torchtext.vocab import Vectors
from collections import defaultdict
from conllu import parse as conllu_parse
from pprint import pprint
from tqdm import tqdm
# from torchtext.vocab import Vocab
from collections import Counter
import random
import numpy as np

SEED = 123456

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True
# from torchtext.vocab import Vocab

In [0]:
training_file = "drive/My Drive/NLP_Assignment/train.tsv"
dev_file = "drive/My Drive/NLP_Assignment/dev.tsv"
test_file = "drive/My Drive/NLP_Assignment/test.tsv"

In [0]:
class man_made_Vocab(object):
    
    UNK = "<unk>"
    
    def __init__(self, counter, min_freq=1, specials=['<unk>', '<pad>']):
        
        counter = counter.copy()
        self.itos = list()
        self.unk_index = None
        self.itos = list(specials)
        
        for tok in specials:
            del counter[tok]
            
        words_and_frequencies = sorted(counter.items(), key=lambda tup: tup[0])
        words_and_frequencies.sort(key=lambda tup: tup[1], reverse=True)
        
        for word, freq in words_and_frequencies:
            self.itos.append(word)
        
        if man_made_Vocab.UNK in specials:
            unk_index = specials.index(man_made_Vocab.UNK)
            self.unk_index = unk_index
            self.stoi = defaultdict(self._default_unk_index)
        else:
            self.stoi = defaultdict()
        
        self.stoi.update({tok: i for i, tok in enumerate(self.itos)})
        
    def _default_unk_index(self):
        return self.unk_index

    def __getstate__(self):
        # avoid picking defaultdict
        attrs = dict(self.__dict__)
        # cast to regular dict
        attrs['stoi'] = dict(self.stoi)
        return attrs

    def __setstate__(self, state):
        if state.get("unk_index", None) is None:
            stoi = defaultdict()
        else:
            stoi = defaultdict(self._default_unk_index)
        stoi.update(state['stoi'])
        state['stoi'] = stoi
        self.__dict__.update(state)
    
    def __getitem__(self, token):
        return self.stoi.get(token, self.stoi.get(man_made_Vocab.UNK))
        
    def __len__(self):
        return len(self.itos)

In [0]:
class POSTaggingDataset(Dataset):

    def __init__(self, 
                 input_file:str, 
                 window_size:int, 
                 window_shift:int=-1,
                 lowercase=True, 
                 device="cuda"):
        """
        We assume that the dataset pointed by input_file is already tokenized 
        and can fit in memory.
        Args:
            input_file (string): The path to the dataset to be loaded.
            window_size (integer): The maximum length of a sentence in terms of 
            number of tokens.
            window_shift (integer): The number of tokens we shift the window 
            over the sentence. Default value is -1 meaning that the window will
            be shifted by window_size.
            lowercase (boolean): whether the text has to be lowercased or not.
            device (string): device where to put tensors (cpu or cuda).
        """

        self.input_file = input_file
        self.window_size = window_size
        self.window_shift = window_shift if window_shift > 0 else window_size
        self.lowercase = lowercase
        with open(input_file) as reader:
            # read the entire file with reader.read() e parse it
            sentences = conllu_parse(reader.read())
        self.device = device
        self.data = self.create_windows(sentences)
        self.encoded_data = None
    
    def index_dataset(self, l_vocabulary, l_label_vocabulary):
        self.encoded_data = list()
        for i in range(len(self.data)):
            # for each window
            elem = self.data[i]
            encoded_elem = torch.LongTensor(self.encode_text(elem, l_vocabulary)).to(self.device)
            # for each element d in the elem window (d is a dictionary with the various fields from the CoNLL line) 
            encoded_labels = torch.LongTensor([l_label_vocabulary.stoi[d["lemma"]] if d is not None 
                              else l_label_vocabulary.stoi["<pad>"] for d in elem]).to(self.device)
            self.encoded_data.append({"inputs":encoded_elem, "outputs":encoded_labels})

    def create_windows(self, sentences):
        """ 
        Args:
            sentences (list of lists of dictionaries, 
                          where each dictionary represents a word occurrence parsed from a CoNLL line)
        """
        data = []
        for sentence in sentences:
            if self.lowercase:
                for d in sentence:
                    # lowers the inflected form
                    d["form"] = d["form"].lower()
            for i in range(0, len(sentence), self.window_shift):
                window = sentence[i:i+self.window_size]
                if len(window) < self.window_size:
                    window = window + [None]*(self.window_size - len(window))
                assert len(window) == self.window_size
                data.append(window)
        return data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        if self.encoded_data is None:
            raise RuntimeError("""Trying to retrieve elements but index_dataset
            has not been invoked yet! Be sure to invoce index_dataset on this object
            before trying to retrieve elements. In case you want to retrieve raw
            elements, use the method get_raw_element(idx)""")
        return self.encoded_data[idx]
    
    def get_raw_element(self, idx):
        return self.data[idx]

    @staticmethod
    def encode_text(sentence:list, l_vocabulary):
        """
        Args:
            sentences (list): list of OrderedDict, each carrying the information about
            one token.
            l_vocabulary (Vocab): vocabulary with mappings from words to indices and viceversa.
        Return:
            The method returns a list of indices corresponding to the input tokens.
        """
        indices = list()
        for w in sentence:
            if w is None:
                indices.append(l_vocabulary.stoi["<pad>"])
            elif w["form"] in l_vocabulary.stoi: # vocabulary string to integer
                indices.append(l_vocabulary.stoi[w["form"]])
            else:
                indices.append(l_vocabulary.stoi["<unk>"])
        return indices
    
    @staticmethod
    def decode_output(outputs:torch.Tensor,
                    l_label_vocabulary):
        """
        Args:
            outputs (Tensor): a Tensor with shape (batch_size, max_len, label_vocab_size)
                containing the logits outputed by the neural network.
            l_label_vocabulary (Vocab): is the vocabulary containing the mapping from
            a string label to its corresponding index and vice versa
        Output:
            The method returns a list of batch_size length where each element is a list
            of labels, one for each input token.
        """
        max_indices = torch.argmax(outputs, -1).tolist() # shape = (batch_size, max_len)
        predictions = list()
        for indices in max_indices:
            # vocabulary integer to string is used to obtain the corresponding word from the max index
            predictions.append([l_label_vocabulary.itos[i] for i in indices])
        return predictions

In [0]:
def build_vocab(dataset, min_freq=1):
    counter = Counter()
    for i in tqdm(range(len(dataset))):
        # for each token in the sentence viewed as a dictionary of items from the CoNLL line
        for token in dataset.get_raw_element(i):
            if token is not None:
                counter[token["form"]]+=1
    # we add special tokens for handling padding and unknown words at testing time.
    return man_made_Vocab(counter, min_freq=min_freq, specials=['<pad>', '<unk>'])

def build_label_vocab(dataset):
    counter = Counter()
    for i in tqdm(range(len(dataset))):
        for token in dataset.get_raw_element(i):
            if token is not None:
                counter[token["lemma"]]+=1
    # No <unk> token for labels.
    return man_made_Vocab(counter, specials=['<pad>'])

In [0]:
window_size, window_shift = 100, 100
dataset = POSTaggingDataset(training_file, window_size, window_shift)
vocabulary = build_vocab(dataset, min_freq=2)
label_vocabulary = build_label_vocab(dataset)
dataset.index_dataset(vocabulary, label_vocabulary)
print(len(vocabulary))
print(len(label_vocabulary))

100%|██████████| 100042/100042 [00:02<00:00, 49628.17it/s]
100%|██████████| 100042/100042 [00:01<00:00, 85494.22it/s]


91582
5


In [0]:
import six
from six.moves.urllib.request import urlretrieve

def reporthook(t):
    """https://github.com/tqdm/tqdm"""
    last_b = [0]

    def inner(b=1, bsize=1, tsize=None):
        """
        b: int, optional
        Number of blocks just transferred [default: 1].
        bsize: int, optional
        Size of each block (in tqdm units) [default: 1].
        tsize: int, optional
        Total size (in tqdm units). If [default: None] remains unchanged.
        """
        if tsize is not None:
            t.total = tsize
        t.update((b - last_b[0]) * bsize)
        last_b[0] = b
    return inner
  
def _infer_shape(f):
    num_lines, vector_dim = 0, None
    for line in f:
        if vector_dim is None:
            row = line.rstrip().split(b" ")
            vector = row[1:]
            # Assuming word, [vector] format
            if len(vector) > 2:
                # The header present in some (w2v) formats contains two elements.
                vector_dim = len(vector)
                num_lines += 1  # First element read
        else:
            num_lines += 1
    f.seek(0)
    return num_lines, vector_dim

class Vectorization(object):

    def __init__(self, name, cache=None,
                 url=None, unk_init=None, max_vectors=None):
        """
        Arguments:
           name: name of the file that contains the vectors
           cache: directory for cached vectors
           url: url for download if vectors not found in cache
           unk_init (callback): by default, initialize out-of-vocabulary word vectors
               to zero vectors; can be any function that takes in a Tensor and
               returns a Tensor of the same size
           max_vectors (int): this can be used to limit the number of
               pre-trained vectors loaded.
               Most pre-trained vector sets are sorted
               in the descending order of word frequency.
               Thus, in situations where the entire set doesn't fit in memory,
               or is not needed for another reason, passing `max_vectors`
               can limit the size of the loaded set.
        """
        cache = '.vector_cache' if cache is None else cache
        self.itos = None
        self.stoi = None
        self.vectors = None
        self.dim = None
        self.unk_init = torch.Tensor.zero_ if unk_init is None else unk_init
        self.cache(name, cache, url=url, max_vectors=max_vectors)

    def __getitem__(self, token):
        if token in self.stoi:
            return self.vectors[self.stoi[token]]
        else:
            return self.unk_init(torch.Tensor(self.dim))

    def cache(self, name, cache, url=None, max_vectors=None):
        import ssl
        ssl._create_default_https_context = ssl._create_unverified_context
        if os.path.isfile(name):
            path = name
            if max_vectors:
                file_suffix = '_{}.pt'.format(max_vectors)
            else:
                file_suffix = '.pt'
            path_pt = os.path.join(cache, os.path.basename(name)) + file_suffix
        else:
            path = os.path.join(cache, name)
            if max_vectors:
                file_suffix = '_{}.pt'.format(max_vectors)
            else:
                file_suffix = '.pt'
            path_pt = path + file_suffix

        if not os.path.isfile(path_pt):
            if not os.path.isfile(path) and url:
                # logger.info('Downloading vectors from {}'.format(url))
                if not os.path.exists(cache):
                    os.makedirs(cache)
                dest = os.path.join(cache, os.path.basename(url))
                if not os.path.isfile(dest):
                    with tqdm(unit='B', unit_scale=True, miniters=1, desc=dest) as t:
                        try:
                            urlretrieve(url, dest, reporthook=reporthook(t))
                        except KeyboardInterrupt as e:  # remove the partial zip file
                            os.remove(dest)
                            raise e
                # logger.info('Extracting vectors into {}'.format(cache))
                ext = os.path.splitext(dest)[1][1:]
                if ext == 'zip':
                    with zipfile.ZipFile(dest, "r") as zf:
                        zf.extractall(cache)
                elif ext == 'gz':
                    if dest.endswith('.tar.gz'):
                        with tarfile.open(dest, 'r:gz') as tar:
                            tar.extractall(path=cache)
            if not os.path.isfile(path):
                raise RuntimeError('no vectors found at {}'.format(path))

            # logger.info("Loading vectors from {}".format(path))
            ext = os.path.splitext(path)[1][1:]
            if ext == 'gz':
                open_file = gzip.open
            else:
                open_file = open

            vectors_loaded = 0
            with open_file(path, 'rb') as f:
                num_lines, dim = _infer_shape(f)
                if not max_vectors or max_vectors > num_lines:
                    max_vectors = num_lines

                itos, vectors, dim = [], torch.zeros((max_vectors, dim)), None

                for line in tqdm(f, total=max_vectors):
                    # Explicitly splitting on " " is important, so we don't
                    # get rid of Unicode non-breaking spaces in the vectors.
                    entries = line.rstrip().split(b" ")

                    word, entries = entries[0], entries[1:]
                    if dim is None and len(entries) > 1:
                        dim = len(entries)
                    elif len(entries) == 1:
                        # logger.warning("Skipping token {} with 1-dimensional "
                                      #  "vector {}; likely a header".format(word, entries))
                        continue
                    elif dim != len(entries):
                        raise RuntimeError(
                            "Vector for token {} has {} dimensions, but previously "
                            "read vectors have {} dimensions. All vectors must have "
                            "the same number of dimensions.".format(word, len(entries),
                                                                    dim))

                    try:
                        if isinstance(word, six.binary_type):
                            word = word.decode('utf-8')
                    except UnicodeDecodeError:
                        # logger.info("Skipping non-UTF8 token {}".format(repr(word)))
                        continue

                    vectors[vectors_loaded] = torch.tensor([float(x) for x in entries])
                    vectors_loaded += 1
                    itos.append(word)

                    if vectors_loaded == max_vectors:
                        break

            self.itos = itos
            self.stoi = {word: i for i, word in enumerate(itos)}
            self.vectors = torch.Tensor(vectors).view(-1, dim)
            self.dim = dim
            # logger.info('Saving vectors to {}'.format(path_pt))
            if not os.path.exists(cache):
                os.makedirs(cache)
            torch.save((self.itos, self.stoi, self.vectors, self.dim), path_pt)
        else:
            # logger.info('Loading vectors from {}'.format(path_pt))
            self.itos, self.stoi, self.vectors, self.dim = torch.load(path_pt)

    def __len__(self):
        return len(self.vectors)

    def get_vecs_by_tokens(self, tokens, lower_case_backup=False):
        """Look up embedding vectors of tokens.

        Arguments:
            tokens: a token or a list of tokens. if `tokens` is a string,
                returns a 1-D tensor of shape `self.dim`; if `tokens` is a
                list of strings, returns a 2-D tensor of shape=(len(tokens),
                self.dim).
            lower_case_backup : Whether to look up the token in the lower case.
                If False, each token in the original case will be looked up;
                if True, each token in the original case will be looked up first,
                if not found in the keys of the property `stoi`, the token in the
                lower case will be looked up. Default: False.

        Examples:
            >>> examples = ['chip', 'baby', 'Beautiful']
            >>> vec = text.vocab.GloVe(name='6B', dim=50)
            >>> ret = vec.get_vecs_by_tokens(tokens, lower_case_backup=True)
        """
        to_reduce = False

        if not isinstance(tokens, list):
            tokens = [tokens]
            to_reduce = True

        if not lower_case_backup:
            indices = [self[token] for token in tokens]
        else:
            indices = [self[token] if token in self.stoi
                       else self[token.lower()]
                       for token in tokens]

        vecs = torch.stack(indices)
        return vecs[0] if to_reduce else vecs


In [0]:
class POSTaggerModel(nn.Module):
    # we provide the hyperparameters as input
    def __init__(self, hparams):
        super(POSTaggerModel, self).__init__()
        # Embedding layer: a mat∂rix vocab_size x embedding_dim where each index 
        # correspond to a word in the vocabulary and the i-th row corresponds to 
        # a latent representation of the i-th word in the vocabulary.
        pprint(params)
        self.word_embedding = nn.Embedding(hparams.vocab_size, hparams.embedding_dim)
        if hparams.embeddings is not None:
            print("initializing embeddings from pretrained")
            self.word_embedding.weight.data.copy_(hparams.embeddings)

        # LSTM layer: an LSTM neural network that process the input text
        # (encoded with word embeddings) from left to right and outputs 
        # a new **contextual** representation of each word that depend
        # on the preciding words.
        self.lstm = nn.LSTM(hparams.embedding_dim, hparams.hidden_dim, 
                            bidirectional=hparams.bidirectional,
                            num_layers=hparams.num_layers, 
                            dropout = hparams.dropout if hparams.num_layers > 1 else 0)
        # Hidden layer: transforms the input value/scalar into
        # a hidden vector representation.
        lstm_output_dim = hparams.hidden_dim if hparams.bidirectional is False else hparams.hidden_dim * 2

        # During training, randomly zeroes some of the elements of the 
        # input tensor with probability hparams.dropout using samples 
        # from a Bernoulli distribution. Each channel will be zeroed out 
        # independently on every forward call.
        # This has proven to be an effective technique for regularization and 
        # preventing the co-adaptation of neurons
        self.dropout = nn.Dropout(hparams.dropout)
        self.classifier = nn.Linear(lstm_output_dim, hparams.num_classes)

    
    def forward(self, x):
        embeddings = self.word_embedding(x)
        embeddings = self.dropout(embeddings)
        o, (h, c) = self.lstm(embeddings)
        o = self.dropout(o)
        output = self.classifier(o)
        return output

In [0]:
class Trainer():
    """Utility class to train and evaluate a model."""

    def __init__(
        self,
        model: nn.Module,
        loss_function,
        optimizer,
        label_vocab,
        log_steps:int=10_000,
        log_level:int=2):
        """
        Args:
            model: the model we want to train.
            loss_function: the loss_function to minimize.
            optimizer: the optimizer used to minimize the loss_function.
        """
        self.model = model
        self.loss_function = loss_function
        self.optimizer = optimizer

        self.label_vocab = label_vocab
        self.log_steps = log_steps
        self.log_level = log_level
        self.label_vocab = label_vocab

    def train(self, train_dataset:Dataset, 
              valid_dataset:Dataset, 
              epochs:int=1):
        """
        Args:
            train_dataset: a Dataset or DatasetLoader instance containing
                the training instances.
            valid_dataset: a Dataset or DatasetLoader instance used to evaluate
                learning progress.
            epochs: the number of times to iterate over train_dataset.

        Returns:
            avg_train_loss: the average training loss on train_dataset over
                epochs.
        """
        assert epochs > 1 and isinstance(epochs, int)
        if self.log_level > 0:
            print('Training ...')
        train_loss = 0.0
        for epoch in range(epochs):
            if self.log_level > 0:
                print(' Epoch {:03d}'.format(epoch + 1))

            epoch_loss = 0.0
            self.model.train()

            for step, sample in enumerate(train_dataset):
                inputs = sample['inputs']
                labels = sample['outputs']
                self.optimizer.zero_grad()

                predictions = self.model(inputs)
                predictions = predictions.view(-1, predictions.shape[-1])
                labels = labels.view(-1)
                
                sample_loss = self.loss_function(predictions, labels)
                sample_loss.backward()
                self.optimizer.step()

                epoch_loss += sample_loss.tolist()

                if self.log_level > 1 and step % self.log_steps == self.log_steps - 1:
                    print('\t[E: {:2d} @ step {}] current avg loss = {:0.4f}'.format(epoch, step, epoch_loss / (step + 1)))
            
            avg_epoch_loss = epoch_loss / len(train_dataset)
            train_loss += avg_epoch_loss
            if self.log_level > 0:
                print('\t[E: {:2d}] train loss = {:0.4f}'.format(epoch, avg_epoch_loss))

            valid_loss = self.evaluate(valid_dataset)
            
            if self.log_level > 0:
                print('  [E: {:2d}] valid loss = {:0.4f}'.format(epoch, valid_loss))

        if self.log_level > 0:
            print('... Done!')
        
        avg_epoch_loss = train_loss / epochs
        return avg_epoch_loss
    

    def evaluate(self, valid_dataset):
        """
        Args:
            valid_dataset: the dataset to use to evaluate the model.

        Returns:
            avg_valid_loss: the average validation loss over valid_dataset.
        """
        valid_loss = 0.0
        # set dropout to 0!! Needed when we are in inference mode.
        self.model.eval()
        with torch.no_grad():
            for sample in valid_dataset:
                inputs = sample['inputs']
                labels = sample['outputs']

                predictions = self.model(inputs)
                predictions = predictions.view(-1, predictions.shape[-1])
                labels = labels.view(-1)
                sample_loss = self.loss_function(predictions, labels)
                valid_loss += sample_loss.tolist()
        
        return valid_loss / len(valid_dataset)

    def predict(self, x):
        """
        Args:
            x: a tensor of indices.
        Returns: 
            A list containing the predicted POS tag for each token in the
            input sentences.
        """
        self.model.eval()
        with torch.no_grad():
            logits = self.model(x)
            predictions = torch.argmax(logits, -1)
            return logits, predictions

In [0]:
class HParams():
    vocab_size = len(vocabulary)
    hidden_dim = 128
    embedding_dim = 100
    num_classes = len(label_vocabulary) # number of different universal POS tags
    bidirectional = True
    num_layers = 2
    dropout = 0.0
    embeddings = None
params = HParams()

In [0]:
import os
vectors = Vectorization("glove.6B.100d.txt")
pretrained_embeddings = torch.randn(len(vocabulary), vectors.dim)
initialised = 0
for i, w in enumerate(vocabulary.itos):
    if w in vectors.stoi:
        initialised += 1
        vec = vectors.get_vecs_by_tokens(w)
        pretrained_embeddings[i] = vec
    
pretrained_embeddings[vocabulary["<pad>"]] = torch.zeros(vectors.dim)
params.embedding_dim=vectors.dim
params.embeddings = pretrained_embeddings
params.vocab_size = len(vocabulary)

In [0]:
window_size, window_shift = 100, 100
device = "cuda"
trainingset = POSTaggingDataset(training_file, window_size, window_shift, device=device)
vocabulary = build_vocab(trainingset, min_freq=2)
label_vocabulary = build_label_vocab(trainingset)
trainingset.index_dataset(vocabulary, label_vocabulary)

devset = POSTaggingDataset(dev_file, window_size, window_shift, device=device)
vocabulary = build_vocab(devset, min_freq=2)
label_vocabulary = build_label_vocab(devset)
devset.index_dataset(vocabulary, label_vocabulary)

testset = POSTaggingDataset(test_file, window_size, window_shift, device=device)
vocabulary = build_vocab(testset, min_freq=2)
label_vocabulary = build_label_vocab(testset)
testset.index_dataset(vocabulary, label_vocabulary)

train_dataset = DataLoader(trainingset, batch_size=128)
valid_dataset = DataLoader(devset, batch_size=128)
test_dataset = DataLoader(testset, batch_size=128)

postagger = POSTaggerModel(params).cuda()

100%|██████████| 100042/100042 [00:02<00:00, 49371.11it/s]
100%|██████████| 100042/100042 [00:01<00:00, 85523.49it/s]
100%|██████████| 14439/14439 [00:00<00:00, 49562.05it/s]
100%|██████████| 14439/14439 [00:00<00:00, 76007.11it/s]
100%|██████████| 15480/15480 [00:00<00:00, 53056.97it/s]
100%|██████████| 15480/15480 [00:00<00:00, 81367.98it/s]


<__main__.HParams object at 0x7f5dc0179518>
initializing embeddings from pretrained


In [0]:
bilstm_trainer = Trainer(
    model = postagger,
    loss_function = nn.CrossEntropyLoss(ignore_index=label_vocabulary["<pad>"]),
    optimizer = optim.Adam(postagger.parameters()),
    label_vocab=label_vocabulary
)

In [0]:
bilstm_trainer.train(train_dataset, valid_dataset, 20) #EPOCH1:13:44

Training ...
 Epoch 001
	[E:  0] train loss = 0.1428
  [E:  0] valid loss = 0.7351
 Epoch 002
	[E:  1] train loss = 0.1291
  [E:  1] valid loss = 0.7931
 Epoch 003
	[E:  2] train loss = 0.1215
  [E:  2] valid loss = 0.8496
 Epoch 004
	[E:  3] train loss = 0.1158
  [E:  3] valid loss = 0.9181
 Epoch 005
	[E:  4] train loss = 0.1097
  [E:  4] valid loss = 1.0028
 Epoch 006
	[E:  5] train loss = 0.1016
  [E:  5] valid loss = 1.1282
 Epoch 007
	[E:  6] train loss = 0.0913
  [E:  6] valid loss = 1.2931
 Epoch 008
	[E:  7] train loss = 0.0797
  [E:  7] valid loss = 1.5219
 Epoch 009
	[E:  8] train loss = 0.0685
  [E:  8] valid loss = 1.7686
 Epoch 010
	[E:  9] train loss = 0.0602
  [E:  9] valid loss = 1.9682
 Epoch 011
	[E: 10] train loss = 0.0538
  [E: 10] valid loss = 2.2911
 Epoch 012
	[E: 11] train loss = 0.0462
  [E: 11] valid loss = 2.5962
 Epoch 013
	[E: 12] train loss = 0.0392
  [E: 12] valid loss = 2.8177
 Epoch 014
	[E: 13] train loss = 0.0340
  [E: 13] valid loss = 2.9848
 Epoch 

0.0662009265204139

In [0]:
from sklearn.metrics import precision_score as sk_precision
def compute_precision(model, l_dataset, l_label_vocab):
    all_predictions = list()
    all_labels = list()
    for indexed_elem in l_dataset:
        indexed_in = indexed_elem["inputs"]
        indexed_labels = indexed_elem["outputs"]
        predictions = model(indexed_in)
        predictions = torch.argmax(predictions, -1).view(-1)
        labels = indexed_labels.view(-1)
        valid_indices = labels != 0
        
        valid_predictions = predictions[valid_indices]
        valid_labels = labels[valid_indices]
        
        all_predictions.extend(valid_predictions.tolist())
        all_labels.extend(valid_labels.tolist())
    # global precision. Does take class imbalance into account.
    micro_precision = sk_precision(all_labels, all_predictions, average="micro", zero_division=0)
    # precision per class and arithmetic average of them. Does not take into account class imbalance.
    macro_precision = sk_precision(all_labels, all_predictions, average="macro", zero_division=0)
    per_class_precision = sk_precision(all_labels, all_predictions, labels = list(range(len(l_label_vocab))), average=None, zero_division=0)
    
    return {"micro_precision":micro_precision,
            "macro_precision":macro_precision, 
            "per_class_precision":per_class_precision}

In [0]:
precisions = compute_precision(postagger, test_dataset, label_vocabulary)
per_class_precision = precisions["per_class_precision"]
print("Micro Precision: {}\nMacro Precision: {}".format(precisions["micro_precision"], precisions["macro_precision"]))
print("Per class Precision:")
for idx_class, precision in sorted(enumerate(per_class_precision), key=lambda elem: -elem[1]):
    label = label_vocabulary.itos[idx_class]
    print(label, precision)

Micro Precision: 0.8323316472211231
Macro Precision: 0.28714675767783454
Per class Precision:
O 0.9092737950909858
PER 0.12721224218230207
LOC 0.0699726462543016
ORG 0.042128347183748846
<pad> 0.0


In [0]:
test_set_loss = bilstm_trainer.evaluate(test_dataset)
print("test set loss: {}".format(test_set_loss))

test set loss: 3.94698133547444


In [0]:
def print_outputs(l_trainer, l_testset, num_outputs, l_vocabulary, l_label_vocabulary):
    
    for i in range(num_outputs):
        print("sentence {}".format(i))
        print()
        test_elem = l_testset[i]

        test_x, test_y = test_elem["inputs"], test_elem["outputs"]
        
        logits, predictions = l_trainer.predict(test_x.unsqueeze(0))
        
        decoded_labels = POSTaggingDataset.decode_output(logits, l_label_vocabulary)[0]
        test_y = test_y.tolist()
        print("token\t\tinput\t\tgold\t\tprediction")
        print("-"*100)
        for raw_elem, idx, label, predicted_label in zip(l_testset.get_raw_element(i), test_x.tolist(), test_y, decoded_labels):
            if idx == 0:
                break
            print("{}\t\t{}\t\t{}\t\t{}".format(raw_elem["form"], l_vocabulary.itos[idx], l_label_vocabulary.itos[label], predicted_label))
        print("="*30)

print_outputs(bilstm_trainer, testset, 3, vocabulary, label_vocabulary)

sentence 0

token		input		gold		prediction
----------------------------------------------------------------------------------------------------
however		however		O		O
,		,		O		O
on		on		O		O
may		may		O		O
8th		8th		O		PER
,		,		O		O
2010		2010		O		O
,		,		O		O
a		a		O		O
sighting		sighting		O		O
of		of		O		O
a		a		O		O
gray		gray		O		O
whale		whale		O		O
was		was		O		O
confirmed		confirmed		O		O
off		off		O		O
the		the		O		O
coast		coast		O		O
of		of		O		O
israel		israel		LOC		O
in		in		O		O
the		the		O		O
mediterranean		mediterranean		LOC		O
sea		sea		LOC		O
.		.		O		O
,		,		O		O
leading		leading		O		O
some		some		O		O
scientists		scientists		O		O
to		to		O		O
think		think		O		LOC
they		they		O		O
might		might		O		O
be		be		O		O
repopulating		repopulating		O		O
old		old		O		O
breeding		breeding		O		PER
grounds		grounds		O		LOC
that		that		O		O
have		have		O		O
not		not		O		O
been		been		O		O
used		used		O		O
for		for		O		O
centuries		centuries		O		PER
.		.		O		O
sentence 1

token		in