In [None]:
# install allennlp packages
# ! pip install allennlp==2.5.0
# ! pip install allennlp-models==2.5.0

In [1]:
from itertools import chain
from typing import Dict
import numpy as np
import torch
import torch.optim as optim
from allennlp.data.data_loaders import MultiProcessDataLoader
from allennlp.data.samplers import BucketBatchSampler
from allennlp.data.vocabulary import Vocabulary
from allennlp.models import Model
from allennlp.modules.seq2vec_encoders import Seq2VecEncoder, PytorchSeq2VecWrapper
from allennlp.modules.text_field_embedders import TextFieldEmbedder, BasicTextFieldEmbedder
from allennlp.modules.token_embedders import Embedding
from allennlp.nn.util import get_text_field_mask
from allennlp.training import GradientDescentTrainer
from allennlp.training.metrics import CategoricalAccuracy, F1Measure
from allennlp_models.classification.dataset_readers.stanford_sentiment_tree_bank import StanfordSentimentTreeBankDatasetReader


In [2]:
EMBEDDING_DIM = 128
HIDDEN_DIM = 128

In [3]:
reader = StanfordSentimentTreeBankDatasetReader()
train_path = "https://s3.amazonaws.com/realworldnlpbook/data/stanfordSentimentTreebank/trees/train.txt"
dev_path = "https://s3.amazonaws.com/realworldnlpbook/data/stanfordSentimentTreebank/trees/dev.txt"

In [4]:
sampler = BucketBatchSampler(batch_size=32, sorting_keys=["tokens"])
train_data_loader = MultiProcessDataLoader(reader, train_path, batch_sampler=sampler)
dev_data_loader = MultiProcessDataLoader(reader, dev_path, batch_sampler=sampler)

loading instances: 0it [00:00, ?it/s]

loading instances: 0it [00:00, ?it/s]

In [5]:
vocab = Vocabulary.from_instances(chain(train_data_loader.iter_instances(), dev_data_loader.iter_instances()), 
                                  min_count={"tokens":3})

building vocab: 0it [00:00, ?it/s]

In [6]:
train_data_loader.index_with(vocab)
dev_data_loader.index_with(vocab)

In [7]:
token_embedding = Embedding(num_embeddings=vocab.get_vocab_size("tokens"), 
                           embedding_dim=EMBEDDING_DIM)

In [8]:
word_embeddings = BasicTextFieldEmbedder({"tokens":token_embedding})

In [9]:
encoder = PytorchSeq2VecWrapper(torch.nn.LSTM(EMBEDDING_DIM, HIDDEN_DIM, batch_first=True))

In [10]:
class LstmClassifier(Model):
    def __init__(self, 
                 word_embeddings: TextFieldEmbedder, 
                encoder: Seq2VecEncoder, 
                vocab: Vocabulary, 
                positive_label: str = '4') -> None:
        super().__init__(vocab)
        self.word_embeddings = word_embeddings
        self.encoder = encoder
        self.linear = torch.nn.Linear(in_features=encoder.get_output_dim(), 
                                     out_features=vocab.get_vocab_size("labels"))
        self.loss_function = torch.nn.CrossEntropyLoss()
        
        positive_index = vocab.get_token_index(positive_label, namespace="labels")
        self.accuracy = CategoricalAccuracy()
        self.f1_measure = F1Measure(positive_index)
        
    def forward(self, 
                tokens: Dict[str, torch.Tensor], 
                label: torch.Tensor = None) -> torch.Tensor:
        mask = get_text_field_mask(tokens)
        embeddings = self.word_embeddings(tokens)
        encoder_out = self.encoder(embeddings, mask)
        logits = self.linear(encoder_out)
        
        output = {"logits": logits}
        if label is not None:
            self.accuracy(logits, label)
            self.f1_measure(logits, label)
            output["loss"] = self.loss_function(logits, label)
            
        return output

In [11]:
model = LstmClassifier(word_embeddings, encoder, vocab)

In [12]:
optimizer = optim.Adam(model.parameters())

In [13]:
trainer = GradientDescentTrainer(model = model, 
                                optimizer=optimizer, 
                                data_loader=train_data_loader, 
                                validation_data_loader=dev_data_loader, 
                                patience=10, 
                                num_epochs=20, 
                                cuda_device=-1)

In [14]:
trainer.train()

  0%|          | 0/267 [00:00<?, ?it/s]

  0%|          | 0/35 [00:00<?, ?it/s]

  0%|          | 0/267 [00:00<?, ?it/s]

  0%|          | 0/35 [00:00<?, ?it/s]

  0%|          | 0/267 [00:00<?, ?it/s]

  0%|          | 0/35 [00:00<?, ?it/s]

  0%|          | 0/267 [00:00<?, ?it/s]

  0%|          | 0/35 [00:00<?, ?it/s]

  0%|          | 0/267 [00:00<?, ?it/s]

  0%|          | 0/35 [00:00<?, ?it/s]

  0%|          | 0/267 [00:00<?, ?it/s]

  0%|          | 0/35 [00:00<?, ?it/s]

  0%|          | 0/267 [00:00<?, ?it/s]

  0%|          | 0/35 [00:00<?, ?it/s]

  0%|          | 0/267 [00:00<?, ?it/s]

  0%|          | 0/35 [00:00<?, ?it/s]

  0%|          | 0/267 [00:00<?, ?it/s]

  0%|          | 0/35 [00:00<?, ?it/s]

  0%|          | 0/267 [00:00<?, ?it/s]

  0%|          | 0/35 [00:00<?, ?it/s]

  0%|          | 0/267 [00:00<?, ?it/s]

  0%|          | 0/35 [00:00<?, ?it/s]

  0%|          | 0/267 [00:00<?, ?it/s]

  0%|          | 0/35 [00:00<?, ?it/s]

{'best_epoch': 1,
 'peak_worker_0_memory_MB': 1288.57421875,
 'peak_gpu_0_memory_MB': 0,
 'training_duration': '0:01:11.894417',
 'epoch': 11,
 'training_loss': 0.15915893532391567,
 'training_worker_0_memory_MB': 1288.57421875,
 'training_gpu_0_memory_MB': 0.0,
 'validation_loss': 3.860519061769758,
 'best_validation_loss': 1.380574928011213}

In [15]:
from allennlp.predictors import Predictor
from allennlp.data import DatasetReader, Instance
from allennlp.data.tokenizers import SpacyTokenizer
from allennlp.common import JsonDict
from overrides import overrides

In [16]:
@Predictor.register("sentence_classifier_predictor")
class SentenceClassifierPredictor(Predictor):
    def __init__(self, model: Model, dataset_reader: DatasetReader) -> None:
        super().__init__(model, dataset_reader)
        self._tokenizer = dataset_reader._tokenizer or SpacyTokenizer()
        
    def predict(self, sentence: str) -> JsonDict:
        return self.predict_json({"sentence": sentence})
    
    @overrides
    def _json_to_instance(self, json_dict: JsonDict) -> Instance:
        sentence = json_dict["sentence"]
        tokens = self._tokenizer.tokenize(sentence)
        return self._dataset_reader.text_to_instance([str(t) for t in tokens])

In [20]:
predictor = SentenceClassifierPredictor(model, dataset_reader=reader)
logits = predictor.predict("Great movie!")["logits"]
label_id = np.argmax(logits)

print(model.vocab.get_token_from_index(label_id, "labels"))

4
