In [0]:
!wget http://nlp.stanford.edu/data/glove.6B.zip

--2020-05-20 13:01:39--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2020-05-20 13:01:40--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2020-05-20 13:01:41--  http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


2020-0

In [0]:
!unzip glove.6B.zip

Archive:  glove.6B.zip
  inflating: glove.6B.50d.txt        
  inflating: glove.6B.100d.txt       
  inflating: glove.6B.200d.txt       
  inflating: glove.6B.300d.txt       


In [0]:
# !unzip glove.6B.zip
!pip install conllu
!pip install spacy
!pip install spacy-conll==1.3.0
# !git clone https://github.com/pasinit/nlp2020_POStagging_data.git
# !unzip nlp2020_POStagging_data/r2.2.zip  > /dev/null
# !rm -rf nlp2020_POStagging_data/

Collecting conllu
  Downloading https://files.pythonhosted.org/packages/a8/03/4a952eb39cdc8da80a6a2416252e71784dda6bf9d726ab98065fff2aeb73/conllu-2.3.2-py2.py3-none-any.whl
Installing collected packages: conllu
Successfully installed conllu-2.3.2
Collecting spacy-conll==1.3.0
  Downloading https://files.pythonhosted.org/packages/16/43/8e979338259a0e8e1881d7cfc01d4fbc16bc43d89a00151a367a1e821be8/spacy_conll-1.3.0-py3-none-any.whl
Installing collected packages: spacy-conll
Successfully installed spacy-conll-1.3.0


In [0]:
!python -m spacy download en_core_web_sm

[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')


In [0]:
import torch
from torch import nn
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import torch.optim as optim
# from torchtext import data
# from torchtext.vocab import Vectors
from collections import defaultdict
from conllu import parse as conllu_parse
from pprint import pprint
from tqdm import tqdm
# from torchtext.vocab import Vocab
from collections import Counter
import csv
import random
import numpy as np
import spacy
from spacy_conll import ConllFormatter
from spacy_conll import Spacy2ConllParser
SEED = 123456

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True
# from torchtext.vocab import Vocab

In [0]:
training_file = "drive/My Drive/NLP_Assignment/train.tsv"
dev_file = "drive/My Drive/NLP_Assignment/dev.tsv"
test_file = "drive/My Drive/NLP_Assignment/test.tsv"

In [0]:

# TSV & CONLLU
def file2str(file_o):
    input_str = list()
    with open(test_file) as tsvfile:
        tsvreader = csv.reader(tsvfile, delimiter="#")
        for line in tsvreader:
            if(len(line[1:]) != 0):
                input_str.append(line[1:][0][1:])
    return ' '.join(input_str)

train_str = file2str(training_file)
dev_str = file2str(dev_file)
test_str = file2str(test_file)

spacyconll = Spacy2ConllParser()
spacyconll.parseprint(input_str=''.join(train_str), output_file="train.conllu")
spacyconll.parseprint(input_str=''.join(dev_str), output_file="dev.conllu")
spacyconll.parseprint(input_str=''.join(test_str), output_file="test.conllu")

In [0]:
class man_made_Vocab(object):
    
    UNK = "<unk>"
    
    def __init__(self, counter, min_freq=1, specials=['<unk>', '<pad>']):
        
        counter = counter.copy()
        self.itos = list()
        self.unk_index = None
        self.itos = list(specials)
        
        for tok in specials:
            del counter[tok]
            
        words_and_frequencies = sorted(counter.items(), key=lambda tup: tup[0])
        words_and_frequencies.sort(key=lambda tup: tup[1], reverse=True)
        
        for word, freq in words_and_frequencies:
            self.itos.append(word)
        
        if man_made_Vocab.UNK in specials:
            unk_index = specials.index(man_made_Vocab.UNK)
            self.unk_index = unk_index
            self.stoi = defaultdict(self._default_unk_index)
        else:
            self.stoi = defaultdict()
        
        self.stoi.update({tok: i for i, tok in enumerate(self.itos)})
        
    def _default_unk_index(self):
        return self.unk_index

    def __getstate__(self):
        # avoid picking defaultdict
        attrs = dict(self.__dict__)
        # cast to regular dict
        attrs['stoi'] = dict(self.stoi)
        return attrs

    def __setstate__(self, state):
        if state.get("unk_index", None) is None:
            stoi = defaultdict()
        else:
            stoi = defaultdict(self._default_unk_index)
        stoi.update(state['stoi'])
        state['stoi'] = stoi
        self.__dict__.update(state)
    
    def __getitem__(self, token):
        return self.stoi.get(token, self.stoi.get(man_made_Vocab.UNK))
        
    def __len__(self):
        return len(self.itos)

In [0]:
class POSTaggingDataset(Dataset):

    def __init__(self, 
                 input_file:str, 
                 window_size:int, 
                 window_shift:int=-1,
                 lowercase=True, 
                 device="cpu"):
        """
        We assume that the dataset pointed by input_file is already tokenized 
        and can fit in memory.
        Args:
            input_file (string): The path to the dataset to be loaded.
            window_size (integer): The maximum length of a sentence in terms of 
            number of tokens.
            window_shift (integer): The number of tokens we shift the window 
            over the sentence. Default value is -1 meaning that the window will
            be shifted by window_size.
            lowercase (boolean): whether the text has to be lowercased or not.
            device (string): device where to put tensors (cpu or cuda).
        """

        self.input_file = input_file
        self.window_size = window_size
        self.window_shift = window_shift if window_shift > 0 else window_size
        self.lowercase = lowercase
        with open(input_file) as reader:
            # read the entire file with reader.read() e parse it
            sentences = conllu_parse(reader.read())
        self.device = device
        self.data = self.create_windows(sentences)
        self.encoded_data = None
    
    def index_dataset(self, l_vocabulary, l_label_vocabulary):
        self.encoded_data = list()
        for i in range(len(self.data)):
            # for each window
            elem = self.data[i]
            encoded_elem = torch.LongTensor(self.encode_text(elem, l_vocabulary)).to(self.device)
            # for each element d in the elem window (d is a dictionary with the various fields from the CoNLL line) 
            encoded_labels = torch.LongTensor([l_label_vocabulary.stoi[d["upostag"]] if d is not None 
                              else l_label_vocabulary.stoi["<pad>"] for d in elem]).to(self.device)
            self.encoded_data.append({"inputs":encoded_elem, "outputs":encoded_labels})

    def create_windows(self, sentences):
        """ 
        Args:
            sentences (list of lists of dictionaries, 
                          where each dictionary represents a word occurrence parsed from a CoNLL line)
        """
        data = []
        for sentence in sentences:
            if self.lowercase:
                for d in sentence:
                    # lowers the inflected form
                    d["form"] = d["form"].lower()
            for i in range(0, len(sentence), self.window_shift):
                window = sentence[i:i+self.window_size]
                if len(window) < self.window_size:
                    window = window + [None]*(self.window_size - len(window))
                assert len(window) == self.window_size
                data.append(window)
        return data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        if self.encoded_data is None:
            raise RuntimeError("""Trying to retrieve elements but index_dataset
            has not been invoked yet! Be sure to invoce index_dataset on this object
            before trying to retrieve elements. In case you want to retrieve raw
            elements, use the method get_raw_element(idx)""")
        return self.encoded_data[idx]
    
    def get_raw_element(self, idx):
        return self.data[idx]

    @staticmethod
    def encode_text(sentence:list, l_vocabulary):
        """
        Args:
            sentences (list): list of OrderedDict, each carrying the information about
            one token.
            l_vocabulary (Vocab): vocabulary with mappings from words to indices and viceversa.
        Return:
            The method returns a list of indices corresponding to the input tokens.
        """
        indices = list()
        for w in sentence:
            if w is None:
                indices.append(l_vocabulary.stoi["<pad>"])
            elif w["form"] in l_vocabulary.stoi: # vocabulary string to integer
                indices.append(l_vocabulary.stoi[w["form"]])
            else:
                indices.append(l_vocabulary.stoi["<unk>"])
        return indices
    
    @staticmethod
    def decode_output(outputs:torch.Tensor,
                    l_label_vocabulary):
        """
        Args:
            outputs (Tensor): a Tensor with shape (batch_size, max_len, label_vocab_size)
                containing the logits outputed by the neural network.
            l_label_vocabulary (Vocab): is the vocabulary containing the mapping from
            a string label to its corresponding index and vice versa
        Output:
            The method returns a list of batch_size length where each element is a list
            of labels, one for each input token.
        """
        max_indices = torch.argmax(outputs, -1).tolist() # shape = (batch_size, max_len)
        predictions = list()
        print(max_indices)
        for indices in max_indices:
            # vocabulary integer to string is used to obtain the corresponding word from the max index
            predictions.append([l_label_vocabulary.itos[i] for i in indices])
        print(predictions)
        return predictions

In [0]:
def build_vocab(dataset, min_freq=1):
    counter = Counter()
    for i in tqdm(range(len(dataset))):
        # for each token in the sentence viewed as a dictionary of items from the CoNLL line
        for token in dataset.get_raw_element(i):
            if token is not None:
                counter[token["form"]]+=1
    # we add special tokens for handling padding and unknown words at testing time.
    return man_made_Vocab(counter, min_freq=min_freq, specials=['<pad>', '<unk>'])

def build_label_vocab(dataset):
    counter = Counter()
    for i in tqdm(range(len(dataset))):
        for token in dataset.get_raw_element(i):
            if token is not None:
                counter[token["upostag"]]+=1
    # No <unk> token for labels.
    return man_made_Vocab(counter, specials=['<pad>'])

In [0]:
window_size, window_shift = 100, 100
dataset = POSTaggingDataset("train.conllu", window_size, window_shift)
vocabulary = build_vocab(dataset, min_freq=2)
label_vocabulary = build_label_vocab(dataset)
dataset.index_dataset(vocabulary, label_vocabulary)
print(len(vocabulary))
print(len(label_vocabulary))

100%|██████████| 16016/16016 [00:00<00:00, 49367.38it/s]
100%|██████████| 16016/16016 [00:00<00:00, 73467.43it/s]


34798
18


In [0]:
class POSTaggerModel(nn.Module):
    # we provide the hyperparameters as input
    def __init__(self, hparams):
        super(POSTaggerModel, self).__init__()
        # Embedding layer: a mat∂rix vocab_size x embedding_dim where each index 
        # correspond to a word in the vocabulary and the i-th row corresponds to 
        # a latent representation of the i-th word in the vocabulary.
        pprint(params)
        self.word_embedding = nn.Embedding(hparams.vocab_size, hparams.embedding_dim)
        if hparams.embeddings is not None:
            print("initializing embeddings from pretrained")
            self.word_embedding.weight.data.copy_(hparams.embeddings)

        # LSTM layer: an LSTM neural network that process the input text
        # (encoded with word embeddings) from left to right and outputs 
        # a new **contextual** representation of each word that depend
        # on the preciding words.
        self.lstm = nn.LSTM(hparams.embedding_dim, hparams.hidden_dim, 
                            bidirectional=hparams.bidirectional,
                            num_layers=hparams.num_layers, 
                            dropout = hparams.dropout if hparams.num_layers > 1 else 0)
        # Hidden layer: transforms the input value/scalar into
        # a hidden vector representation.
        lstm_output_dim = hparams.hidden_dim if hparams.bidirectional is False else hparams.hidden_dim * 2

        # During training, randomly zeroes some of the elements of the 
        # input tensor with probability hparams.dropout using samples 
        # from a Bernoulli distribution. Each channel will be zeroed out 
        # independently on every forward call.
        # This has proven to be an effective technique for regularization and 
        # preventing the co-adaptation of neurons
        self.dropout = nn.Dropout(hparams.dropout)
        self.classifier = nn.Linear(lstm_output_dim, hparams.num_classes)

    
    def forward(self, x):
        embeddings = self.word_embedding(x)
        embeddings = self.dropout(embeddings)
        o, (h, c) = self.lstm(embeddings)
        o = self.dropout(o)
        output = self.classifier(o)
        return output

In [0]:
class Trainer():
    """Utility class to train and evaluate a model."""

    def __init__(
        self,
        model: nn.Module,
        loss_function,
        optimizer,
        label_vocab,
        log_steps:int=10_000,
        log_level:int=2):
        """
        Args:
            model: the model we want to train.
            loss_function: the loss_function to minimize.
            optimizer: the optimizer used to minimize the loss_function.
        """
        self.model = model
        self.loss_function = loss_function
        self.optimizer = optimizer

        self.label_vocab = label_vocab
        self.log_steps = log_steps
        self.log_level = log_level
        self.label_vocab = label_vocab

    def train(self, train_dataset:Dataset, 
              valid_dataset:Dataset, 
              epochs:int=1):
        """
        Args:
            train_dataset: a Dataset or DatasetLoader instance containing
                the training instances.
            valid_dataset: a Dataset or DatasetLoader instance used to evaluate
                learning progress.
            epochs: the number of times to iterate over train_dataset.

        Returns:
            avg_train_loss: the average training loss on train_dataset over
                epochs.
        """
        assert epochs > 1 and isinstance(epochs, int)
        if self.log_level > 0:
            print('Training ...')
        train_loss = 0.0
        for epoch in range(epochs):
            if self.log_level > 0:
                print(' Epoch {:03d}'.format(epoch + 1))

            epoch_loss = 0.0
            self.model.train()

            for step, sample in enumerate(train_dataset):
                inputs = sample['inputs']
                labels = sample['outputs']
                self.optimizer.zero_grad()

                predictions = self.model(inputs)
                predictions = predictions.view(-1, predictions.shape[-1])
                labels = labels.view(-1)
                
                sample_loss = self.loss_function(predictions, labels)
                sample_loss.backward()
                self.optimizer.step()

                epoch_loss += sample_loss.tolist()

                if self.log_level > 1 and step % self.log_steps == self.log_steps - 1:
                    print('\t[E: {:2d} @ step {}] current avg loss = {:0.4f}'.format(epoch, step, epoch_loss / (step + 1)))
            
            avg_epoch_loss = epoch_loss / len(train_dataset)
            train_loss += avg_epoch_loss
            if self.log_level > 0:
                print('\t[E: {:2d}] train loss = {:0.4f}'.format(epoch, avg_epoch_loss))

            valid_loss = self.evaluate(valid_dataset)
            
            if self.log_level > 0:
                print('  [E: {:2d}] valid loss = {:0.4f}'.format(epoch, valid_loss))

        if self.log_level > 0:
            print('... Done!')
        
        avg_epoch_loss = train_loss / epochs
        return avg_epoch_loss
    

    def evaluate(self, valid_dataset):
        """
        Args:
            valid_dataset: the dataset to use to evaluate the model.

        Returns:
            avg_valid_loss: the average validation loss over valid_dataset.
        """
        valid_loss = 0.0
        # set dropout to 0!! Needed when we are in inference mode.
        self.model.eval()
        with torch.no_grad():
            for sample in valid_dataset:
                inputs = sample['inputs']
                labels = sample['outputs']

                predictions = self.model(inputs)
                predictions = predictions.view(-1, predictions.shape[-1])
                labels = labels.view(-1)
                sample_loss = self.loss_function(predictions, labels)
                valid_loss += sample_loss.tolist()
        
        return valid_loss / len(valid_dataset)

    def predict(self, x):
        """
        Args:
            x: a tensor of indices.
        Returns: 
            A list containing the predicted POS tag for each token in the
            input sentences.
        """
        self.model.eval()
        with torch.no_grad():
            logits = self.model(x)
            predictions = torch.argmax(logits, -1)
            return logits, predictions

In [0]:
class HParams():
    vocab_size = len(vocabulary)
    hidden_dim = 256
    embedding_dim = 100
    num_classes = len(label_vocabulary) # number of different universal POS tags
    bidirectional = True
    num_layers = 2
    dropout = 0.0
    embeddings = None
params = HParams()

In [0]:
# import os
# vectors = Vectorization("glove.6B.100d.txt")
embeddings_dict = {}
with open("glove.6B.100d.txt", 'r', encoding="utf-8") as f:
    for line in f:
        values = line.split()
        word = values[0]
        vector = np.asarray(values[1:], "float32")
        embeddings_dict[word] = vector

In [0]:
# len(embeddings_dict['feel'])
# embeddings_dict

In [0]:
# import os
# vectors = Vectorization("glove.6B.100d.txt")
pretrained_embeddings = torch.randn(len(vocabulary), len(embeddings_dict['a']))
initialised = 0
for i, w in enumerate(vocabulary.itos):
    if w in embeddings_dict:
        initialised += 1
        vec = torch.from_numpy(embeddings_dict[w]) #vectors.get_vecs_by_tokens(w)
        pretrained_embeddings[i] = vec
    
pretrained_embeddings[vocabulary["<pad>"]] = torch.zeros(len(embeddings_dict['a']))
params.embedding_dim=len(embeddings_dict['a'])
params.embeddings = pretrained_embeddings
params.vocab_size = len(vocabulary)

In [0]:
window_size, window_shift = 100, 100
device = "cpu"
trainingset = POSTaggingDataset("train.conllu", window_size, window_shift, device=device)
vocabulary = build_vocab(trainingset, min_freq=2)
label_vocabulary = build_label_vocab(trainingset)
trainingset.index_dataset(vocabulary, label_vocabulary)

devset = POSTaggingDataset("dev.conllu", window_size, window_shift, device=device)
vocabulary = build_vocab(devset, min_freq=2)
label_vocabulary = build_label_vocab(devset)
devset.index_dataset(vocabulary, label_vocabulary)

testset = POSTaggingDataset("test.conllu", window_size, window_shift, device=device)
vocabulary = build_vocab(testset, min_freq=2)
label_vocabulary = build_label_vocab(testset)
testset.index_dataset(vocabulary, label_vocabulary)

train_dataset = DataLoader(trainingset, batch_size=128)
valid_dataset = DataLoader(devset, batch_size=128)
test_dataset = DataLoader(testset, batch_size=128)

postagger = POSTaggerModel(params)#.cuda()

100%|██████████| 16016/16016 [00:00<00:00, 47510.04it/s]
100%|██████████| 16016/16016 [00:00<00:00, 66592.82it/s]
100%|██████████| 16016/16016 [00:00<00:00, 48276.81it/s]
100%|██████████| 16016/16016 [00:00<00:00, 67327.12it/s]
100%|██████████| 16016/16016 [00:00<00:00, 47111.31it/s]
100%|██████████| 16016/16016 [00:00<00:00, 68467.75it/s]


<__main__.HParams object at 0x7fefe3e6ec50>
initializing embeddings from pretrained


In [0]:
bilstm_trainer = Trainer(
    model = postagger,
    loss_function = nn.CrossEntropyLoss(ignore_index=label_vocabulary["<pad>"]),
    optimizer = optim.Adam(postagger.parameters()),
    label_vocab=label_vocabulary
)

In [0]:
bilstm_trainer.train(train_dataset, valid_dataset, 10)

Training ...
 Epoch 001
	[E:  0] train loss = 0.9793
  [E:  0] valid loss = 0.3146
 Epoch 002
	[E:  1] train loss = 0.2720
  [E:  1] valid loss = 0.2304
 Epoch 003
	[E:  2] train loss = 0.2235
  [E:  2] valid loss = 0.1990
 Epoch 004
	[E:  3] train loss = 0.2006
  [E:  3] valid loss = 0.1836
 Epoch 005
	[E:  4] train loss = 0.1850
  [E:  4] valid loss = 0.1705
 Epoch 006
	[E:  5] train loss = 0.1730
  [E:  5] valid loss = 0.1581
 Epoch 007
	[E:  6] train loss = 0.1616
  [E:  6] valid loss = 0.1466
 Epoch 008
	[E:  7] train loss = 0.1496
  [E:  7] valid loss = 0.1342
 Epoch 009
	[E:  8] train loss = 0.1363
  [E:  8] valid loss = 0.1208
 Epoch 010
	[E:  9] train loss = 0.1213
  [E:  9] valid loss = 0.1074
... Done!


0.2602300394622107

In [0]:
torch.save(postagger.state_dict(), "postagger.pth")

In [0]:
from sklearn.metrics import precision_score as sk_precision
def compute_precision(model, l_dataset, l_label_vocab):
    all_predictions = list()
    all_labels = list()
    for indexed_elem in l_dataset:
        indexed_in = indexed_elem["inputs"]
        indexed_labels = indexed_elem["outputs"]
        predictions = model(indexed_in)
        predictions = torch.argmax(predictions, -1).view(-1)
        labels = indexed_labels.view(-1)
        valid_indices = labels != 0
        
        valid_predictions = predictions[valid_indices]
        valid_labels = labels[valid_indices]
        
        all_predictions.extend(valid_predictions.tolist())
        all_labels.extend(valid_labels.tolist())
    # global precision. Does take class imbalance into account.
    micro_precision = sk_precision(all_labels, all_predictions, average="micro", zero_division=0)
    # precision per class and arithmetic average of them. Does not take into account class imbalance.
    macro_precision = sk_precision(all_labels, all_predictions, average="macro", zero_division=0)
    per_class_precision = sk_precision(all_labels, all_predictions, labels = list(range(len(l_label_vocab))), average=None, zero_division=0)
    
    return {"micro_precision":micro_precision,
            "macro_precision":macro_precision, 
            "per_class_precision":per_class_precision}

In [0]:
precisions = compute_precision(postagger, test_dataset, label_vocabulary)
per_class_precision = precisions["per_class_precision"]
print("Micro Precision: {}\nMacro Precision: {}".format(precisions["micro_precision"], precisions["macro_precision"]))
print("Per class Precision:")
for idx_class, precision in sorted(enumerate(per_class_precision), key=lambda elem: -elem[1]):
    label = label_vocabulary.itos[idx_class]
    print(label, precision)

In [0]:
test_set_loss = bilstm_trainer.evaluate(test_dataset)
print("test set loss: {}".format(test_set_loss))

In [0]:
def print_outputs(l_trainer, l_testset, num_outputs, l_vocabulary, l_label_vocabulary):
    
    for i in range(num_outputs):
        print("sentence {}".format(i))
        print()
        test_elem = l_testset[i]

        test_x, test_y = test_elem["inputs"], test_elem["outputs"]
        
        logits, predictions = l_trainer.predict(test_x.unsqueeze(0))
        print(l_label_vocabulary.itos)
        decoded_labels = POSTaggingDataset.decode_output(logits, l_label_vocabulary)[0]
        test_y = test_y.tolist()
        print("token\t\tinput\t\tgold\t\tprediction")
        print("-"*100)
        for raw_elem, idx, label, predicted_label in zip(l_testset.get_raw_element(i), test_x.tolist(), test_y, decoded_labels):
            if idx == 0:
                break
            print("{}\t\t{}\t\t{}\t\t{}".format(raw_elem["form"], l_vocabulary.itos[idx], l_label_vocabulary.itos[label], predicted_label))
        print("="*30)

print_outputs(bilstm_trainer, testset, 10, vocabulary, label_vocabulary)

In [0]:
def print_outputs(l_trainer, l_testset, l_vocabulary, l_label_vocabulary):

    test_elem = l_testset[0]

    # test_x, test_y = test_elem["inputs"], test_elem["outputs"]
    test_x = test_elem["inputs"]
    logits, predictions = l_trainer.predict(test_x.unsqueeze(0))
        
    decoded_labels = POSTaggingDataset.decode_output(logits, l_label_vocabulary)[0]
    # test_y = test_y.tolist()
    print("token\t\tinput\t\tprediction")
    print("-"*100)
    for raw_elem, idx, predicted_label in zip(l_testset.get_raw_element(0),test_x.tolist(), decoded_labels):  #, test_y,
        if idx == 0:
            break
        print("{}\t\t{}\t\t{}".format(raw_elem["form"], l_vocabulary.itos[idx], predicted_label)) # l_label_vocabulary.itos[label],
    print("="*30)

print_outputs(bilstm_trainer, testset, vocabulary, label_vocabulary)

token		input		prediction
----------------------------------------------------------------------------------------------------
 		 		DET
however		however		CCONJ
,		,		CCONJ
on		on		CCONJ
may		may		CCONJ
8th		8th		DET
,		,		CCONJ
2010		2010		DET
,		,		CCONJ
a		a		CCONJ
sighting		sighting		DET
of		of		CCONJ
a		a		CCONJ
gray		gray		CCONJ
whale		whale		DET
was		was		CCONJ
confirmed		confirmed		CCONJ
off		off		CCONJ
the		the		CCONJ
coast		coast		CCONJ
of		of		CCONJ
israel		israel		CCONJ
in		in		CCONJ
the		the		CCONJ
mediterranean		mediterranean		CCONJ
sea		sea		CCONJ
.		.		CCONJ
,		,		CCONJ
leading		leading		CCONJ
some		some		CCONJ
scientists		scientists		CCONJ
to		to		CCONJ
think		think		CCONJ
they		they		CCONJ
might		might		CCONJ
be		be		CCONJ
repopulating		repopulating		DET
old		old		CCONJ
breeding		breeding		DET
grounds		grounds		CCONJ
that		that		CCONJ
have		have		CCONJ
not		not		CCONJ
been		been		CCONJ
used		used		DET
for		for		CCONJ
centuries		centuries		CCONJ
.		.		CCONJ


In [0]:
def print_outputs(l_trainer, l_testset, l_vocabulary, l_label_vocabulary):
    
    for i in range(len(test_str)):
        print("sentence {}".format(i))
        print()
        test_elem = l_testset[i]

        test_x, test_y = test_elem["inputs"], test_elem["outputs"]
        
        logits, predictions = l_trainer.predict(test_x.unsqueeze(0))
        
        decoded_labels = POSTaggingDataset.decode_output(logits, l_label_vocabulary)[0]
        test_y = test_y.tolist()
        print("token\t\tinput\t\tgold\t\tprediction")
        print("-"*100)
        for raw_elem, idx, label, predicted_label in zip(l_testset.get_raw_element(i), test_x.tolist(), test_y, decoded_labels):
            if idx == 0:
                break
            print("{}\t\t{}\t\t{}\t\t{}".format(raw_elem["form"], l_vocabulary.itos[idx], l_label_vocabulary.itos[label], predicted_label))
        print("="*30)

print_outputs(bilstm_trainer, testset, vocabulary, label_vocabulary)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
opponents		opponents		NOUN		CCONJ
and		and		CCONJ		CCONJ
successfully		successfully		ADV		CCONJ
prevented		prevented		VERB		CCONJ
their		their		DET		CCONJ
violent		violent		ADJ		CCONJ
measures		measures		NOUN		CCONJ
.		.		PUNCT		CCONJ
sentence 15304

token		input		gold		prediction
----------------------------------------------------------------------------------------------------
economic		economic		ADJ		CCONJ
output		output		NOUN		CCONJ
in		in		ADP		CCONJ
the		the		DET		CCONJ
gaza		gaza		PROPN		CCONJ
strip		strip		PROPN		CCONJ
declined		declined		VERB		CCONJ
by		by		ADP		CCONJ
about		about		ADV		CCONJ
one-third		one-third		NOUN		DET
between		between		ADP		CCONJ
1992		1992		NUM		CCONJ
and		and		CCONJ		CCONJ
1996		1996		NUM		DET
.		.		PUNCT		CCONJ
sentence 15305

token		input		gold		prediction
----------------------------------------------------------------------------------------------------
the		the		DET		CCONJ
esoterist

In [0]:
exp_1 = [['Pontë', 'Incumbent', 'of', 'Haworth', 'Yorkshire', '.'],
 ['Constantine', 'called', 'the', 'bishops', 'to', 'his', 'court', ',','among', 'them','Eusebius','.']]
print(len(exp_1))

2


In [0]:
[' '.join(exp) for exp in exp_1]

['Pontë Incumbent of Haworth Yorkshire .',
 'Constantine called the bishops to his court , among them Eusebius .']

In [0]:
def print_outputs(l_trainer, l_testset, l_vocabulary, l_label_vocabulary):
    arr = []
    
    inp_string = ' '.join([' '.join(s) for s in l_testset])
    spacyconll = Spacy2ConllParser()
    spacyconll.parseprint(input_str=inp_string, output_file="inp_string.conllu")
    t_testset = POSTaggingDataset("inp_string.conllu", window_size, window_shift, device=device)
    t_testset.index_dataset(l_vocabulary, l_label_vocabulary) 

    for i in range(len([' '.join(e) for e in l_testset])): 
        arri = []
        test_elem = t_testset[i]
        test_x, test_y = test_elem["inputs"], test_elem["outputs"]
        
        logits, predictions = l_trainer.predict(test_x.unsqueeze(0))
        decoded_labels = POSTaggingDataset.decode_output(logits, l_label_vocabulary)[0]

        for raw_elem, idx, label, predicted_label in zip(t_testset.get_raw_element(i), test_x.tolist(), test_y.tolist(), decoded_labels):
            if idx == 0:
                break
            arri.append(predicted_label)

        arr.append(arri)
    return arr

arr = print_outputs(bilstm_trainer, exp_1, vocabulary, label_vocabulary)
print(arr)

[[12, 2, 2, 2, 12, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]]
[['PART', 'PROPN', 'PROPN', 'PROPN', 'PART', 'PROPN', 'PROPN', 'PROPN', 'PROPN', 'PROPN', 'PROPN', 'PROPN', 'PROPN', 'PROPN', 'PROPN', 'PROPN', 'PROPN', 'PROPN', 'PROPN', 'PROPN', 'PROPN', 'PROPN', 'PROPN', 'PROPN', 'PROPN', 'PROPN', 'PROPN', 'PROPN', 'PROPN', 'PROPN', 'PROPN', 'PROPN', 'PROPN', 'PROPN', 'PROPN', 'PROPN', 'PROPN', 'PROPN', 'PROPN', 'PROPN', 'PROPN', 'PROPN', 'PROPN', 'PROPN', 'PROPN', 'PROPN', 'PROPN', 'PROPN', 'PROPN', 'PROPN', 'PROPN', 'PROPN', 'PROPN', 'PROPN', 'PROPN', 'PROPN', 'PROPN', 'PROPN', 'PROPN', 'PROPN', 'PROPN', 'PROPN', 'PROPN', 'PROPN', 'PROPN', 'PROPN', 'PROPN', 'PROPN', 'PROPN', 'PROPN', 'PROPN', 'PROPN', 'PROPN', 'PROPN', 'PROPN', 'PROPN', 'PROPN', 'P