In [0]:
!pip install allennlp

In [3]:
from google.colab import drive
drive.mount('/content/gdrive')
%cd /content/gdrive/My Drive/Colab Notebooks/NLP Labs/hw 5

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).
/content/gdrive/My Drive/Colab Notebooks/NLP Labs/hw 5


In [0]:
import numpy as np
import torch
import torch.optim as optim
from allennlp.data.dataset_readers.stanford_sentiment_tree_bank import StanfordSentimentTreeBankDatasetReader
from allennlp.data.iterators import BucketIterator
from allennlp.data.vocabulary import Vocabulary
from allennlp.models import Model
from allennlp.modules.seq2vec_encoders import Seq2VecEncoder, PytorchSeq2VecWrapper
from allennlp.modules.text_field_embedders import TextFieldEmbedder, BasicTextFieldEmbedder
from allennlp.modules.token_embedders import Embedding
from allennlp.nn.util import get_text_field_mask
from allennlp.training.metrics import CategoricalAccuracy, F1Measure
from allennlp.training.trainer import Trainer

In [0]:
from allennlp.common import JsonDict
from allennlp.data import DatasetReader, Instance
from allennlp.data.tokenizers.word_splitter import SpacyWordSplitter
from allennlp.predictors import Predictor
from overrides import overrides
from typing import List
from typing import Dict

**MODEL**

In [0]:
# Model in AllenNLP represents a model that is trained.
@Model.register("lstm_classifier")
class LstmClassifier(Model):
    def __init__(self,
                 word_embeddings: TextFieldEmbedder,
                 encoder: Seq2VecEncoder,
                 vocab: Vocabulary,
                 positive_label: int = 4) -> None:
        super().__init__(vocab)
        # We need the embeddings to convert word IDs to their vector representations
        self.word_embeddings = word_embeddings

        self.encoder = encoder

        # After converting a sequence of vectors to a single vector, we feed it into
        # a fully-connected linear layer to reduce the dimension to the total number of labels.
        self.linear = torch.nn.Linear(in_features=encoder.get_output_dim(),
                                      out_features=vocab.get_vocab_size('labels'))

        # Monitor the metrics - we use accuracy, as well as prec, rec, f1 for 4 (very positive)
        self.accuracy = CategoricalAccuracy()
        self.f1_measure = F1Measure(positive_label)

        # We use the cross entropy loss because this is a classification task.
        # Note that PyTorch's CrossEntropyLoss combines softmax and log likelihood loss,
        # which makes it unnecessary to add a separate softmax layer.
        self.loss_function = torch.nn.CrossEntropyLoss()

    # Instances are fed to forward after batching.
    # Fields are passed through arguments with the same name.
    def forward(self,
                tokens: Dict[str, torch.Tensor],
                label: torch.Tensor = None) -> torch.Tensor:
        # In deep NLP, when sequences of tensors in different lengths are batched together,
        # shorter sequences get padded with zeros to make them equal length.
        # Masking is the process to ignore extra zeros added by padding
        mask = get_text_field_mask(tokens)

        # Forward pass
        embeddings = self.word_embeddings(tokens)
        encoder_out = self.encoder(embeddings, mask)
        logits = self.linear(encoder_out)

        # In AllenNLP, the output of forward() is a dictionary.
        # Your output dictionary must contain a "loss" key for your model to be trained.
        output = {"logits": logits}
        if label is not None:
            self.accuracy(logits, label)
            self.f1_measure(logits, label)
            output["loss"] = self.loss_function(logits, label)

        return output

    def get_metrics(self, reset: bool = False) -> Dict[str, float]:
        precision, recall, f1_measure = self.f1_measure.get_metric(reset)
        return {'accuracy': self.accuracy.get_metric(reset),
                'precision': precision,
                'recall': recall,
                'f1_measure': f1_measure}

In [0]:
EMBEDDING_DIM = 128
HIDDEN_DIM = 128

reader = StanfordSentimentTreeBankDatasetReader()

**BASELINE**

In [34]:
train_dataset = reader.read('trees/train.txt')
dev_dataset = reader.read('trees/dev.txt')

vocab = Vocabulary.from_instances(train_dataset + dev_dataset, min_count={'tokens': 3})

token_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'), embedding_dim=EMBEDDING_DIM)

word_embeddings = BasicTextFieldEmbedder({"tokens": token_embedding})

encoder = PytorchSeq2VecWrapper(torch.nn.LSTM(EMBEDDING_DIM, HIDDEN_DIM, batch_first=True))

model_class = LstmClassifier(word_embeddings, encoder, vocab)

optimizer = optim.Adam(model_class.parameters(), lr=1e-4, weight_decay=1e-5)

iterator = BucketIterator(batch_size=32, sorting_keys=[("tokens", "num_tokens")])
iterator.index_with(vocab)


0it [00:00, ?it/s][A
499it [00:00, 4985.86it/s][A
1041it [00:00, 5106.56it/s][A
1615it [00:00, 5280.92it/s][A
2164it [00:00, 5341.55it/s][A
2709it [00:00, 5371.45it/s][A
3242it [00:00, 5357.64it/s][A
3788it [00:00, 5384.75it/s][A
4338it [00:00, 5418.80it/s][A
4869it [00:00, 5382.29it/s][A
5388it [00:01, 3192.03it/s][A
5954it [00:01, 3671.95it/s][A
6505it [00:01, 4079.76it/s][A
7045it [00:01, 4401.27it/s][A
7589it [00:01, 4667.88it/s][A
8166it [00:01, 4950.36it/s][A
8544it [00:01, 4856.86it/s][A
0it [00:00, ?it/s][A
542it [00:00, 5412.40it/s][A
1068it [00:00, 5363.99it/s][A
1101it [00:00, 5149.15it/s][A
  0%|          | 0/9645 [00:00<?, ?it/s][A
 55%|█████▍    | 5297/9645 [00:00<00:00, 52960.96it/s][A
100%|██████████| 9645/9645 [00:00<00:00, 54336.57it/s][A

In [9]:
%time
trainer = Trainer(model=model_class,optimizer=optimizer,iterator=iterator,train_dataset=train_dataset,validation_dataset=dev_dataset,patience=5,num_epochs=20)
metrics_class = trainer.train()

  0%|          | 0/267 [00:00<?, ?it/s]

CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 6.91 µs


accuracy: 0.2644, precision: 0.0000, recall: 0.0000, f1_measure: 0.0000, loss: 1.5796 ||: 100%|██████████| 267/267 [00:10<00:00, 26.48it/s]
accuracy: 0.2698, precision: 0.0000, recall: 0.0000, f1_measure: 0.0000, loss: 1.5724 ||: 100%|██████████| 35/35 [00:00<00:00, 95.76it/s]
accuracy: 0.2739, precision: 0.0000, recall: 0.0000, f1_measure: 0.0000, loss: 1.5654 ||: 100%|██████████| 267/267 [00:09<00:00, 28.12it/s]
accuracy: 0.2743, precision: 0.0000, recall: 0.0000, f1_measure: 0.0000, loss: 1.5718 ||: 100%|██████████| 35/35 [00:00<00:00, 105.89it/s]
accuracy: 0.2848, precision: 0.0000, recall: 0.0000, f1_measure: 0.0000, loss: 1.5585 ||: 100%|██████████| 267/267 [00:09<00:00, 28.70it/s]
accuracy: 0.2752, precision: 0.0000, recall: 0.0000, f1_measure: 0.0000, loss: 1.5670 ||: 100%|██████████| 35/35 [00:00<00:00, 104.96it/s]
accuracy: 0.3112, precision: 0.0000, recall: 0.0000, f1_measure: 0.0000, loss: 1.5330 ||: 100%|██████████| 267/267 [00:09<00:00, 32.59it/s]
accuracy: 0.3206, precis

**ELMO**

In [0]:
from allennlp.data.token_indexers.elmo_indexer import ELMoTokenCharactersIndexer
from allennlp.modules.token_embedders import ElmoTokenEmbedder

In [36]:
elmo_token_indexer = ELMoTokenCharactersIndexer()
reader = StanfordSentimentTreeBankDatasetReader(
token_indexers={'tokens': elmo_token_indexer})

train_dataset = reader.read('trees/train.txt')
dev_dataset = reader.read('trees/dev.txt')

options_file = ('https://s3-us-west-2.amazonaws.com/allennlp/models/elmo'
                    '/2x1024_128_2048cnn_1xhighway/elmo_2x1024_128_2048cnn_1xhighway_options.json')
weight_file = ('https://s3-us-west-2.amazonaws.com/allennlp/models/elmo'
                   '/2x1024_128_2048cnn_1xhighway/elmo_2x1024_128_2048cnn_1xhighway_weights.hdf5')

elmo_embedder = ElmoTokenEmbedder(options_file, weight_file)

vocab = Vocabulary.from_instances(train_dataset + dev_dataset, min_count={'tokens': 3})

word_embeddings = BasicTextFieldEmbedder({"tokens": elmo_embedder})

elmo_embedding_dim = 256
lstm = PytorchSeq2VecWrapper(torch.nn.LSTM(elmo_embedding_dim, HIDDEN_DIM, batch_first=True))

model_elmo = LstmClassifier(word_embeddings, lstm, vocab)
optimizer = optim.Adam(model_elmo.parameters())

iterator = BucketIterator(batch_size=32, sorting_keys=[("tokens", "num_tokens")])

iterator.index_with(vocab)



0it [00:00, ?it/s][A[A

494it [00:00, 4938.46it/s][A[A

1007it [00:00, 4992.05it/s][A[A

1541it [00:00, 5091.20it/s][A[A

2078it [00:00, 5171.11it/s][A[A

2597it [00:00, 5171.78it/s][A[A

3144it [00:00, 5255.63it/s][A[A

3668it [00:00, 5248.21it/s][A[A

4209it [00:00, 5295.61it/s][A[A

4743it [00:00, 5306.77it/s][A[A

5264it [00:01, 5276.60it/s][A[A

5830it [00:01, 5384.77it/s][A[A

6376it [00:01, 5405.58it/s][A[A

6943it [00:01, 5479.14it/s][A[A

7487it [00:01, 5465.01it/s][A[A
8031it [00:01, 2297.27it/s][A
8544it [00:02, 4212.39it/s][A
0it [00:00, ?it/s][A
547it [00:00, 5467.44it/s][A
1101it [00:00, 5608.01it/s][A
  0%|          | 0/9645 [00:00<?, ?it/s][A
100%|██████████| 9645/9645 [00:00<00:00, 142565.86it/s][A

In [37]:
%time

trainer = Trainer(model=model_elmo, optimizer=optimizer, iterator=iterator, train_dataset=train_dataset, validation_dataset=dev_dataset, patience=5, num_epochs=20)
metrics_elmo = trainer.train()


  0%|          | 0/267 [00:00<?, ?it/s][A

CPU times: user 2 µs, sys: 1 µs, total: 3 µs
Wall time: 5.96 µs



accuracy: 0.2500, precision: 0.2000, recall: 0.1250, f1_measure: 0.1538, loss: 1.6027 ||:   0%|          | 1/267 [00:03<13:30,  3.05s/it][A
accuracy: 0.1719, precision: 0.0556, recall: 0.1111, f1_measure: 0.0741, loss: 1.6079 ||:   1%|          | 2/267 [00:04<10:44,  2.43s/it][A
accuracy: 0.1771, precision: 0.0435, recall: 0.0909, f1_measure: 0.0588, loss: 1.6129 ||:   1%|          | 3/267 [00:04<07:54,  1.80s/it][A
accuracy: 0.1719, precision: 0.0435, recall: 0.0500, f1_measure: 0.0465, loss: 1.6164 ||:   1%|▏         | 4/267 [00:05<07:06,  1.62s/it][A
accuracy: 0.1750, precision: 0.0435, recall: 0.0370, f1_measure: 0.0400, loss: 1.6128 ||:   2%|▏         | 5/267 [00:06<06:46,  1.55s/it][A
accuracy: 0.1823, precision: 0.0435, recall: 0.0333, f1_measure: 0.0377, loss: 1.6067 ||:   2%|▏         | 6/267 [00:07<05:31,  1.27s/it][A
accuracy: 0.1875, precision: 0.0435, recall: 0.0286, f1_measure: 0.0345, loss: 1.5983 ||:   3%|▎         | 7/267 [00:08<04:52,  1.13s/it][A
accuracy: 0.

**BERT**

In [0]:
from allennlp.modules.token_embedders.bert_token_embedder import PretrainedBertEmbedder

In [39]:
reader = StanfordSentimentTreeBankDatasetReader()

train_dataset = reader.read('trees/train.txt')
dev_dataset = reader.read('trees/dev.txt')

vocab = Vocabulary.from_instances(train_dataset + dev_dataset, min_count={'tokens': 3})

bert_embedder = PretrainedBertEmbedder(pretrained_model="bert-base-uncased", top_layer_only=True, )

word_embeddings = BasicTextFieldEmbedder({"tokens": bert_embedder})

bert_embedding_dim = word_embeddings.get_output_dim()
#bert_embedding_dim = 256

lstm = PytorchSeq2VecWrapper(torch.nn.LSTM(bert_embedding_dim, HIDDEN_DIM, batch_first=True))

model_bert = LstmClassifier(word_embeddings, lstm, vocab)

optimizer = optim.Adam(model_bert.parameters(), lr=1e-4, weight_decay=1e-5)

iterator = BucketIterator(batch_size=32, sorting_keys=[("tokens", "num_tokens")])
iterator.index_with(vocab)


0it [00:00, ?it/s][A
435it [00:00, 4346.79it/s][A
751it [00:00, 1406.05it/s][A
1201it [00:00, 1771.20it/s][A
1697it [00:00, 2194.10it/s][A
2168it [00:00, 2612.29it/s][A
2648it [00:01, 3025.25it/s][A
3123it [00:01, 3393.59it/s][A
3605it [00:01, 3722.64it/s][A
4105it [00:01, 4029.26it/s][A
4592it [00:01, 4247.15it/s][A
5078it [00:01, 4413.66it/s][A
5590it [00:01, 4602.32it/s][A
6087it [00:01, 4706.31it/s][A
6576it [00:01, 4756.93it/s][A
7071it [00:01, 4810.90it/s][A
7561it [00:02, 4778.94it/s][A
8045it [00:02, 4731.79it/s][A
8544it [00:02, 3774.01it/s][A
0it [00:00, ?it/s][A
451it [00:00, 4508.24it/s][A
925it [00:00, 4574.31it/s][A
1101it [00:00, 4520.73it/s][A
  0%|          | 0/9645 [00:00<?, ?it/s][A
 48%|████▊     | 4661/9645 [00:00<00:00, 46609.82it/s][A
100%|██████████| 9645/9645 [00:00<00:00, 48135.47it/s][A

In [40]:
%time

trainer = Trainer(model=model_bert, optimizer=optimizer, iterator=iterator, train_dataset=train_dataset, validation_dataset=dev_dataset, patience=5, num_epochs=20)
metrics_bert = trainer.train()


  0%|          | 0/267 [00:00<?, ?it/s][A

CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 5.01 µs



accuracy: 0.3125, precision: 0.0000, recall: 0.0000, f1_measure: 0.0000, loss: 1.5844 ||:   0%|          | 1/267 [00:03<15:02,  3.39s/it][A
accuracy: 0.2500, precision: 0.0000, recall: 0.0000, f1_measure: 0.0000, loss: 1.6152 ||:   1%|          | 2/267 [00:04<11:53,  2.69s/it][A
accuracy: 0.2708, precision: 0.0000, recall: 0.0000, f1_measure: 0.0000, loss: 1.6007 ||:   1%|          | 3/267 [00:07<12:25,  2.82s/it][A
accuracy: 0.2812, precision: 0.0000, recall: 0.0000, f1_measure: 0.0000, loss: 1.5950 ||:   1%|▏         | 4/267 [00:10<12:32,  2.86s/it][A
accuracy: 0.2750, precision: 0.0000, recall: 0.0000, f1_measure: 0.0000, loss: 1.5925 ||:   2%|▏         | 5/267 [00:12<10:56,  2.50s/it][A
accuracy: 0.2656, precision: 0.0000, recall: 0.0000, f1_measure: 0.0000, loss: 1.5974 ||:   2%|▏         | 6/267 [00:13<09:08,  2.10s/it][A
accuracy: 0.2723, precision: 0.0000, recall: 0.0000, f1_measure: 0.0000, loss: 1.5935 ||:   3%|▎         | 7/267 [00:16<10:03,  2.32s/it][A
accuracy: 0.

**PREDICT**

In [0]:
@Predictor.register("sentence_classifier_predictor")
class SentenceClassifierPredictor(Predictor):
    def __init__(self, model: Model, dataset_reader: DatasetReader) -> None:
        super().__init__(model, dataset_reader)
        self._tokenizer = SpacyWordSplitter(language='en_core_web_sm', pos_tags=True)

    def predict(self, sentence: str) -> JsonDict:
        return self.predict_json({"sentence" : sentence})

    @overrides
    def _json_to_instance(self, json_dict: JsonDict) -> Instance:
        sentence = json_dict["sentence"]
        tokens = self._tokenizer.split_words(sentence)
        return self._dataset_reader.text_to_instance([str(t) for t in tokens])

In [0]:
@Predictor.register("universal_pos_predictor")
class UniversalPOSPredictor(Predictor):
    def __init__(self, model: Model, dataset_reader: DatasetReader) -> None:
        super().__init__(model, dataset_reader)

    def predict(self, words: List[str]) -> JsonDict:
        return self.predict_json({"words" : words})

    @overrides
    def _json_to_instance(self, json_dict: JsonDict) -> Instance:
        words = json_dict["words"]
        return self._dataset_reader.text_to_instance(words, words)

In [60]:
predictor_bert = SentenceClassifierPredictor(model_bert, dataset_reader=reader)
logits = predictor_bert.predict('This is the best movie ever!')['logits']
bert_label_id = np.argmax(logits)


print(model_bert.vocab.get_token_from_index(bert_label_id, 'labels'))

4


REFERENCE

https://github.com/mhagiwara/realworldnlp