In [0]:
# Mount Google Drive inside the Colab runtime and make the homework folder the working directory,
# so the relative paths used further down ("model.th", "vocabulary") resolve inside that folder.
from google.colab import drive
drive.mount('/content/gdrive')
%cd /content/gdrive/My Drive/Colab Notebooks/NLP Labs/hw 04/

In [0]:
import sys

# Folder that holds this homework; putting it on sys.path makes modules stored there importable.
HW_DIR = '/content/gdrive/My Drive/Colab Notebooks/NLP Labs/hw 04/'

# Guard the insert so that re-running this cell does not stack duplicate entries on sys.path.
if HW_DIR not in sys.path:
  sys.path.insert(0, HW_DIR)

In [0]:
!pip install allennlp

In [0]:
#In AllenNLP we use type annotations for just about everything
from typing import Iterator, List, Dict

#AllenNLP is built on top of PyTorch, so we use its code freely
import torch
import torch.optim as optim
import numpy as np

#In AllenNLP we represent each training example as an Instance containing Fields of various types.
#Here each example will have a TextField containing the sentence, and a SequenceLabelField containing the corresponding part-of-speech tags.
from allennlp.data import Instance
from allennlp.data.fields import TextField, SequenceLabelField

#Typically to solve a problem like this using AllenNLP, you'll have to implement two classes.
#The first is a DatasetReader, which contains the logic for reading a file of data and producing a stream of Instances.
from allennlp.data.dataset_readers import DatasetReader

#Frequently we'll want to load datasets or models from URLs
#The cached_path helper downloads such files, caches them locally, and returns the local path.
#It also accepts local file paths (which it just returns as-is).
from allennlp.common.file_utils import cached_path

#There are various ways to represent a word as one or more indices.
#For example, you might maintain a vocabulary of unique words and give each word a corresponding id. 
#Or you might have one id per character in the word and represent each word as a sequence of ids.
#AllenNLP uses a TokenIndexer abstraction for this representation.
from allennlp.data.token_indexers import TokenIndexer, SingleIdTokenIndexer
from allennlp.data.tokenizers import Token

#Whereas a TokenIndexer represents a rule for how to turn a token into indices
#a Vocabulary contains the corresponding mappings from strings to integers.
from allennlp.data.vocabulary import Vocabulary

#Besides DatasetReader, the other class you'll typically need to implement is Model,
#which is a PyTorch Module that takes tensor inputs and produces a dict of tensor outputs
from allennlp.models import Model

#layers
from allennlp.modules.text_field_embedders import TextFieldEmbedder, BasicTextFieldEmbedder
from allennlp.modules.token_embedders import Embedding
from allennlp.modules.seq2seq_encoders import Seq2SeqEncoder, PytorchSeq2SeqWrapper
from allennlp.nn.util import get_text_field_mask, sequence_cross_entropy_with_logits

#track accuracy
from allennlp.training.metrics import CategoricalAccuracy

#In our training we'll need a DataIterator that can intelligently batch our data.
from allennlp.data.iterators import BucketIterator

#And we'll use AllenNLP's full-featured Trainer
from allennlp.training.trainer import Trainer

#make predictions on new inputs
from allennlp.predictors import SentenceTaggerPredictor

#seed PyTorch's random number generator so that runs of this notebook are repeatable
torch.manual_seed(1)

<torch._C.Generator at 0x7fa9753a2870>

In [0]:
#implementing the datareader subclass

class PosDatasetReader(DatasetReader):
  """
  DatasetReader for PoS tagging data, one sentence per line, like
  The###DET dog###NN ate###V the###DET apple###NN
  """

  def __init__(self, token_indexers: Dict[str, TokenIndexer] = None) -> None:
    """Create the reader.

    The only parameter is a dict of TokenIndexers that specify how to convert tokens into
    indices. When none is given, a single index per token is generated under the name
    "tokens" (the standard "word to index" mapping used in most NLP tasks).
    """
    super().__init__(lazy=False)
    self.token_indexers = token_indexers or {"tokens": SingleIdTokenIndexer()}

  def text_to_instance(self, tokens: List[Token], tags: List[str] = None) -> Instance:
    """Build an Instance from the tokens of one sentence and, optionally, their tags.

    The Instance always has a "sentence" TextField. A "labels" SequenceLabelField is added
    only when tags are supplied; tags are optional so that instances can also be created
    from unlabeled text in order to make predictions on it.
    """
    sentence_field = TextField(tokens, self.token_indexers)
    fields = {"sentence": sentence_field}

    if tags:
      label_field = SequenceLabelField(labels=tags, sequence_field=sentence_field)
      fields["labels"] = label_field

    return Instance(fields)

  def _read(self, file_path: str) -> Iterator[Instance]:
    """Yield one Instance per non-empty line of the file at file_path.

    Each line holds whitespace-separated word###TAG pairs. Most of the work is delegated
    to text_to_instance.
    """
    # An explicit encoding keeps parsing independent of the platform's default locale.
    with open(file_path, encoding="utf-8") as f:
      for line in f:
        pairs = line.split()
        # A blank line (for example a trailing newline at the end of the file) has no pairs;
        # unpacking zip(*()) would raise ValueError, so such lines are skipped.
        if not pairs:
          continue
        sentence, tags = zip(*(pair.split("###") for pair in pairs))
        # zip produces tuples; hand over a list to match the declared List[str].
        yield self.text_to_instance([Token(word) for word in sentence], list(tags))
    

In [0]:
class LstmTagger(Model):
  """Tagger that embeds the tokens, runs a sequence encoder and scores every tag per token.

  The embedder and the sequence encoder are passed in as constructor parameters, which
  makes it possible to experiment with different ones without changing the model code.
  """

  def __init__(self, word_embeddings: TextFieldEmbedder, encoder: Seq2SeqEncoder, vocab: Vocabulary) -> None:
    """Store the injected sub-modules and create the output layer and the accuracy metric."""
    super().__init__(vocab)
    self.word_embeddings = word_embeddings
    self.encoder = encoder
    # The output layer maps each encoder output vector to one score per entry of the 'labels' namespace.
    encoder_dim = encoder.get_output_dim()
    num_tags = vocab.get_vocab_size('labels')
    self.hidden2tag = torch.nn.Linear(in_features=encoder_dim, out_features=num_tags)
    self.accuracy = CategoricalAccuracy()

  def forward(self, sentence: Dict[str, torch.Tensor], labels: torch.Tensor = None) -> Dict[str, torch.Tensor]:
    """Run the computation for one batch.

    The argument names match the field names of the Instances ("sentence" and, when
    present, "labels"). The result always contains "tag_logits"; when labels are given
    the accuracy metric is updated and a "loss" entry is added.
    """
    token_mask = get_text_field_mask(sentence)
    embedded = self.word_embeddings(sentence)
    encoded = self.encoder(embedded, token_mask)
    tag_logits = self.hidden2tag(encoded)

    # Without gold labels there is nothing to score against: return the logits only.
    if labels is None:
      return {"tag_logits": tag_logits}

    self.accuracy(tag_logits, labels, token_mask)
    loss = sequence_cross_entropy_with_logits(tag_logits, labels, token_mask)
    return {"tag_logits": tag_logits, "loss": loss}

  def get_metrics(self, reset: bool = False) -> Dict[str, float]:
    """Return the accuracy metric, which is updated on each forward pass that receives labels."""
    current_accuracy = self.accuracy.get_metric(reset)
    return {"accuracy": current_accuracy}

In [0]:
#create the dataset reader; with no arguments it falls back to a single-id indexer registered under "tokens"

reader = PosDatasetReader()

In [0]:
# fetch the tutorial's training and validation files (cached_path returns a local path)
# and let the reader turn each of them into Instances

train_dataset = reader.read(
    cached_path('https://raw.githubusercontent.com/allenai/allennlp/master/tutorials/tagger/training.txt')
)
validation_dataset = reader.read(
    cached_path('https://raw.githubusercontent.com/allenai/allennlp/master/tutorials/tagger/validation.txt')
)

2it [00:00, 2781.37it/s]
2it [00:00, 2525.17it/s]


In [0]:
#Once we've read in the datasets, we use them to create our Vocabulary
#NOTE(review): the validation instances are included as well, so their words and tags also receive ids - confirm this is intended

vocab = Vocabulary.from_instances(train_dataset + validation_dataset)

100%|██████████| 4/4 [00:00<00:00, 8919.31it/s]


In [0]:
#construct the model. We'll choose a size for our embedding layer and for the hidden layer of our LSTM

EMBEDDING_DIM = 6  #length of each token embedding vector; also the LSTM input size
HIDDEN_DIM = 6  #size of the LSTM hidden state

In [0]:
#Embed the tokens with a BasicTextFieldEmbedder, which takes a mapping from index names to embeddings.
#The "tokens" key matches the indexer name used by the dataset reader; EMBEDDING_DIM sets the output dimension.

token_embedding = Embedding(
    num_embeddings=vocab.get_vocab_size('tokens'),
    embedding_dim=EMBEDDING_DIM,
)
word_embeddings = BasicTextFieldEmbedder({"tokens": token_embedding})

In [0]:
#specify the sequence encoder. In AllenNLP everything is batch first, so we pass batch_first=True to the LSTM as well
#the PyTorch LSTM is wrapped so that it can be handed to LstmTagger as its Seq2SeqEncoder

lstm = PytorchSeq2SeqWrapper(torch.nn.LSTM(EMBEDDING_DIM, HIDDEN_DIM, batch_first=True))

In [0]:
#create the model from the embedder, the encoder and the vocabulary (the vocabulary sizes the output layer)

model = LstmTagger(word_embeddings, lstm, vocab)

In [0]:
#pick the device index: GPU 0 when CUDA is available, otherwise -1 (which the later cells treat as "no GPU")
cuda_device = 0 if torch.cuda.is_available() else -1

#only move the model when a GPU was selected
if cuda_device >= 0:
  model = model.cuda(cuda_device)

In [0]:
#optimiser: plain SGD over all of the model's parameters with a learning rate of 0.1
optimizer = optim.SGD(model.parameters(), lr=0.1)

In [0]:
#batch the instances two at a time, using the number of tokens in the "sentence" field as the sorting key
iterator = BucketIterator(batch_size=2, sorting_keys=[("sentence", "num_tokens")])

In [0]:
#hand the vocabulary to the iterator (presumably so it can turn the instances' strings into ids when batching)
iterator.index_with(vocab)

In [0]:
#wire model, optimiser, iterator and both datasets into the Trainer
#training is capped at 1000 epochs, with a patience of 10 epochs
trainer = Trainer(
    model=model,
    optimizer=optimizer,
    iterator=iterator,
    train_dataset=train_dataset,
    validation_dataset=validation_dataset,
    patience=10,
    num_epochs=1000,
    cuda_device=cuda_device,
)

In [0]:
#run the training loop; the metrics dict it returns is displayed as the cell output
trainer.train()

accuracy: 0.2222, loss: 1.1758 ||: 100%|██████████| 1/1 [00:00<00:00,  9.04it/s]
accuracy: 0.2222, loss: 1.1670 ||: 100%|██████████| 1/1 [00:00<00:00, 165.31it/s]
accuracy: 0.3333, loss: 1.1674 ||: 100%|██████████| 1/1 [00:00<00:00, 101.74it/s]
accuracy: 0.3333, loss: 1.1591 ||: 100%|██████████| 1/1 [00:00<00:00, 219.06it/s]
accuracy: 0.3333, loss: 1.1595 ||: 100%|██████████| 1/1 [00:00<00:00, 112.49it/s]
accuracy: 0.3333, loss: 1.1517 ||: 100%|██████████| 1/1 [00:00<00:00, 239.78it/s]
accuracy: 0.3333, loss: 1.1521 ||: 100%|██████████| 1/1 [00:00<00:00, 138.39it/s]
accuracy: 0.3333, loss: 1.1448 ||: 100%|██████████| 1/1 [00:00<00:00, 273.53it/s]
accuracy: 0.3333, loss: 1.1453 ||: 100%|██████████| 1/1 [00:00<00:00, 120.97it/s]
accuracy: 0.3333, loss: 1.1384 ||: 100%|██████████| 1/1 [00:00<00:00, 194.41it/s]
accuracy: 0.3333, loss: 1.1388 ||: 100%|██████████| 1/1 [00:00<00:00, 126.02it/s]
accuracy: 0.3333, loss: 1.1324 ||: 100%|██████████| 1/1 [00:00<00:00, 181.34it/s]
accuracy: 0.3333,

{'best_epoch': 999,
 'best_validation_accuracy': 1.0,
 'best_validation_loss': 0.013995885848999023,
 'epoch': 999,
 'peak_cpu_memory_MB': 1954.616,
 'peak_gpu_0_memory_MB': 322,
 'training_accuracy': 1.0,
 'training_cpu_memory_MB': 1954.616,
 'training_duration': '00:01:24',
 'training_epochs': 999,
 'training_gpu_0_memory_MB': 322,
 'training_loss': 0.013832819648087025,
 'training_start_epoch': 0,
 'validation_accuracy': 1.0,
 'validation_loss': 0.013995885848999023}

In [0]:
#creating an instance of predictor
#AllenNLP contains a Predictor abstraction that takes inputs, converts them to instances,
#feeds them through your model, and returns JSON-serializable results

predictor = SentenceTaggerPredictor(model, dataset_reader=reader)

In [0]:
#tag_logits will be a (5, 3) array of logits, corresponding to the 3 possible tags for each of the 5 words
#"tag_logits" is the key under which LstmTagger.forward returns its scores

tag_logits = predictor.predict("The dog ate the apple")['tag_logits']

In [0]:
#for each word take the index of the highest logit along the last axis, i.e. the predicted tag id

tag_ids = np.argmax(tag_logits, axis=-1)

In [0]:
#map the predicted ids back to tag strings through the 'labels' namespace of the model's vocabulary
print([model.vocab.get_token_from_index(i, 'labels') for i in tag_ids])

['DET', 'NN', 'V', 'DET', 'NN']


In [0]:
# Here's how to save the model.
# Only the state dict (the weights) is written; the path is relative to the current working directory.
with open("model.th", 'wb') as f:
  torch.save(model.state_dict(), f)

In [0]:
#save the vocab as well: the state dict alone does not contain the string-to-id mappings

vocab.save_to_files("vocabulary")

In [0]:
# And here's how to reload the model.
# Start by restoring the vocabulary from the directory it was saved to above.
vocab2 = Vocabulary.from_files("vocabulary")

In [0]:
# Build model2 from freshly initialised modules. Reusing the already-trained `word_embeddings`
# and `lstm` objects would give model2 the trained embedding and LSTM weights before anything
# is loaded, so the save/reload round-trip would only exercise the final linear layer.
token_embedding2 = Embedding(num_embeddings=vocab2.get_vocab_size('tokens'), embedding_dim=EMBEDDING_DIM)
word_embeddings2 = BasicTextFieldEmbedder({"tokens": token_embedding2})
lstm2 = PytorchSeq2SeqWrapper(torch.nn.LSTM(EMBEDDING_DIM, HIDDEN_DIM, batch_first=True))
model2 = LstmTagger(word_embeddings2, lstm2, vocab2)

In [0]:
# Load the saved weights onto the CPU first: a checkpoint written from a GPU model would
# otherwise fail to load on a machine without CUDA. The following cell moves model2 to the
# GPU when one is available.
with open("model.th", 'rb') as f:
  model2.load_state_dict(torch.load(f, map_location="cpu"))

In [0]:
#move the reloaded model to the same GPU as the original one; cuda_device is -1 when no GPU was selected
if cuda_device > -1:
  model2.cuda(cuda_device)

In [0]:
#run the same sentence through the reloaded model
predictor2 = SentenceTaggerPredictor(model2, dataset_reader=reader)

tag_logits2 = predictor2.predict("The dog ate the apple")['tag_logits']

#the reloaded model must reproduce the original model's logits (up to floating point tolerance)
np.testing.assert_array_almost_equal(tag_logits2, tag_logits)

In [0]:
#predicted tag id per word for the reloaded model
tag_ids2 = np.argmax(tag_logits2, axis=-1)

In [0]:
#decode with the reloaded model's own vocabulary (vocab2), not the original model's,
#so that this cell checks the reloaded model end to end
print([model2.vocab.get_token_from_index(i, 'labels') for i in tag_ids2])

['DET', 'NN', 'V', 'DET', 'NN']
