In [1]:
from typing import Iterator, List, Dict

import os, shutil
import torch
import torch.optim as optim
import numpy as np

from allennlp.data import Instance
from allennlp.data.fields import TextField, SequenceLabelField

from allennlp.data.dataset_readers import DatasetReader

from allennlp.common.file_utils import cached_path

from allennlp.data.token_indexers import TokenIndexer, SingleIdTokenIndexer
from allennlp.data.tokenizers import Token

from allennlp.data.vocabulary import Vocabulary

from allennlp.models import Model

from allennlp.modules.text_field_embedders import TextFieldEmbedder, BasicTextFieldEmbedder
from allennlp.modules.token_embedders import Embedding
from allennlp.modules.seq2seq_encoders import Seq2SeqEncoder, PytorchSeq2SeqWrapper
from allennlp.nn.util import get_text_field_mask, sequence_cross_entropy_with_logits

from allennlp.training.metrics import CategoricalAccuracy

from allennlp.data.iterators import BucketIterator

from allennlp.training.trainer import Trainer

from allennlp.predictors import SentenceTaggerPredictor

# Fix PyTorch's global random seed so repeated runs start from the same RNG state.
torch.manual_seed(1)

<torch._C.Generator at 0x7fbe5006e550>

# 数据集

In [3]:
class PosDatasetReader(DatasetReader):
    """
    DatasetReader for PoS tagging data, one sentence per line, like

        The###DET dog###NN ate###V the###DET apple###NN

    Each whitespace-separated item is a ``token###TAG`` pair. Blank lines are
    skipped. The ``print`` calls are kept on purpose: this notebook traces the
    intermediate objects for study.
    """

    def __init__(self, token_indexers: Dict[str, TokenIndexer] = None) -> None:
        """
        Args:
            token_indexers: mapping from indexer name to ``TokenIndexer``. When
                omitted (or empty), a single ``SingleIdTokenIndexer`` under the
                key ``"tokens"`` is used.
        """
        super().__init__(lazy=False)
        self.token_indexers = token_indexers or {"tokens": SingleIdTokenIndexer()}

    def text_to_instance(self, tokens: List[Token], tags: List[str] = None) -> Instance:
        """
        Build an ``Instance`` from one tokenized sentence.

        Args:
            tokens: the sentence tokens.
            tags: optional per-token tags; when missing or empty, the returned
                instance has no ``"labels"`` field.

        Returns:
            An ``Instance`` with a ``"sentence"`` field and, if tags were
            given, a ``"labels"`` field tied to that sentence field.
        """
        sentence_field = TextField(tokens, self.token_indexers)
        print(sentence_field)  # tutorial trace
        fields = {"sentence": sentence_field}

        if tags:
            label_field = SequenceLabelField(labels=tags, sequence_field=sentence_field)
            print(label_field)  # tutorial trace
            fields["labels"] = label_field

        return Instance(fields)

    def _read(self, file_path: str) -> Iterator[Instance]:
        """
        Yield one ``Instance`` per non-blank line of ``file_path``.

        Raises:
            ValueError: if an item on a line is not of the form ``token###TAG``.
        """
        # Explicit encoding so the result does not depend on the platform locale.
        with open(file_path, encoding="utf-8") as f:
            for line in f:
                pairs = line.split()
                if not pairs:
                    # A blank line would otherwise fail when unpacking the empty zip below.
                    continue

                split_pairs = []
                for pair in pairs:
                    # Split on the last separator only, so every item yields exactly
                    # (token, tag) and zip() cannot silently truncate mixed-length items.
                    parts = pair.rsplit("###", 1)
                    if len(parts) != 2:
                        raise ValueError(
                            f"Expected 'token###TAG' but got {pair!r} in {file_path}")
                    split_pairs.append(parts)

                sentence, tags = zip(*split_pairs)
                print(sentence, tags)  # tutorial trace
                yield self.text_to_instance([Token(word) for word in sentence], tags)

In [4]:
# List the working directory (shell command run through IPython's `!` escape).
!ls

LSTM_Part-of-Speech_Tagging.ipynb  reading_comprehension  tmp
data				   sentiment_analysis	  walk_through_allennlp


In [5]:
# Show the last lines of the training and validation files read by the dataset reader.
!tail data/training.txt data/validation.txt

==> data/training.txt <==
The###DET dog###NN ate###V the###DET apple###NN
Everybody###NN read###V that###DET book###NN

==> data/validation.txt <==
The###DET dog###NN read###V the###DET apple###NN
Everybody###NN ate###V that###DET book###NN


In [4]:
# Reader with its default token indexers.
reader = PosDatasetReader()

# Read the training split, then the validation split.
train_path = cached_path('data/training.txt')
train_dataset = reader.read(train_path)

validation_path = cached_path('data/validation.txt')
validation_dataset = reader.read(validation_path)

# The vocabulary is built from the instances of both splits together.
all_instances = train_dataset + validation_dataset
vocab = Vocabulary.from_instances(all_instances)

2it [00:00, 1375.86it/s]
2it [00:00, 994.26it/s]
100%|██████████| 4/4 [00:00<00:00, 30783.88it/s]

('The', 'dog', 'ate', 'the', 'apple') ('DET', 'NN', 'V', 'DET', 'NN')
TextField of length 5 with text: 
 		[The, dog, ate, the, apple]
 		and TokenIndexers : {'tokens': 'SingleIdTokenIndexer'}
SequenceLabelField of length 5 with labels:
 		('DET', 'NN', 'V', 'DET', 'NN')
 		in namespace: 'labels'.
('Everybody', 'read', 'that', 'book') ('NN', 'V', 'DET', 'NN')
TextField of length 4 with text: 
 		[Everybody, read, that, book]
 		and TokenIndexers : {'tokens': 'SingleIdTokenIndexer'}
SequenceLabelField of length 4 with labels:
 		('NN', 'V', 'DET', 'NN')
 		in namespace: 'labels'.
('The', 'dog', 'read', 'the', 'apple') ('DET', 'NN', 'V', 'DET', 'NN')
TextField of length 5 with text: 
 		[The, dog, read, the, apple]
 		and TokenIndexers : {'tokens': 'SingleIdTokenIndexer'}
SequenceLabelField of length 5 with labels:
 		('DET', 'NN', 'V', 'DET', 'NN')
 		in namespace: 'labels'.
('Everybody', 'ate', 'that', 'book') ('NN', 'V', 'DET', 'NN')
TextField of length 4 with text: 
 		[Everybody, at




In [14]:
# Read the training file again; for each instance print it, index its fields
# against `vocab`, and print the padding lengths it then reports.
train_dataset2 = reader.read(cached_path('data/training.txt'))
for instance in train_dataset2:
    print(instance)
    instance.index_fields(vocab)
    padding_lengths = instance.get_padding_lengths()
    print(padding_lengths)


2it [00:00, 730.59it/s]

('The', 'dog', 'ate', 'the', 'apple') ('DET', 'NN', 'V', 'DET', 'NN')
TextField of length 5 with text: 
 		[The, dog, ate, the, apple]
 		and TokenIndexers : {'tokens': 'SingleIdTokenIndexer'}
SequenceLabelField of length 5 with labels:
 		('DET', 'NN', 'V', 'DET', 'NN')
 		in namespace: 'labels'.
('Everybody', 'read', 'that', 'book') ('NN', 'V', 'DET', 'NN')
TextField of length 4 with text: 
 		[Everybody, read, that, book]
 		and TokenIndexers : {'tokens': 'SingleIdTokenIndexer'}
SequenceLabelField of length 4 with labels:
 		('NN', 'V', 'DET', 'NN')
 		in namespace: 'labels'.
Instance with fields:
 	 sentence: TextField of length 5 with text: 
 		[The, dog, ate, the, apple]
 		and TokenIndexers : {'tokens': 'SingleIdTokenIndexer'} 
 	 labels: SequenceLabelField of length 5 with labels:
 		('DET', 'NN', 'V', 'DET', 'NN')
 		in namespace: 'labels'. 

{'sentence': {'tokens_length': 5, 'num_tokens': 5}, 'labels': {'num_tokens': 5}}
Instance with fields:
 	 sentence: TextField of length 




In [7]:
# Print the vocabulary's own summary (most frequent / longest / shortest tokens per namespace).
vocab.print_statistics()



----Vocabulary Statistics----


Top 10 most frequent tokens in namespace 'tokens':
	Token: The		Frequency: 2
	Token: dog		Frequency: 2
	Token: ate		Frequency: 2
	Token: the		Frequency: 2
	Token: apple		Frequency: 2
	Token: Everybody		Frequency: 2
	Token: read		Frequency: 2
	Token: that		Frequency: 2
	Token: book		Frequency: 2

Top 10 longest tokens in namespace 'tokens':
	Token: Everybody		length: 9	Frequency: 2
	Token: apple		length: 5	Frequency: 2
	Token: read		length: 4	Frequency: 2
	Token: that		length: 4	Frequency: 2
	Token: book		length: 4	Frequency: 2
	Token: The		length: 3	Frequency: 2
	Token: dog		length: 3	Frequency: 2
	Token: ate		length: 3	Frequency: 2
	Token: the		length: 3	Frequency: 2

Top 10 shortest tokens in namespace 'tokens':
	Token: the		length: 3	Frequency: 2
	Token: ate		length: 3	Frequency: 2
	Token: dog		length: 3	Frequency: 2
	Token: The		length: 3	Frequency: 2
	Token: book		length: 4	Frequency: 2
	Token: that		length: 4	Frequency: 2
	Token: read		length: 4	

In [8]:
# Dump every token of the default namespace in index order, each followed by "; ".
token_count = vocab.get_vocab_size()
token_listing = ''.join(f"{vocab.get_token_from_index(idx)}; " for idx in range(token_count))
print(token_listing, end='')

@@PADDING@@; @@UNKNOWN@@; The; dog; ate; the; apple; Everybody; read; that; book; 

In [9]:
# Width of the token embeddings and of the LSTM hidden state, respectively.
EMBEDDING_DIM, HIDDEN_DIM = 6, 6

# One embedding row per entry of the 'tokens' vocabulary namespace.
token_embedding = Embedding(
    embedding_dim=EMBEDDING_DIM,
    num_embeddings=vocab.get_vocab_size('tokens'),
)

In [10]:
# Associate the "tokens" key with its embedding module.
token_embedders = {"tokens": token_embedding}
word_embeddings = BasicTextFieldEmbedder(token_embedders)
word_embeddings

BasicTextFieldEmbedder(
  (token_embedder_tokens): Embedding()
)

# 模型

In [11]:
class LstmTagger(Model):
    """
    Sequence tagger: embeds the sentence, runs it through the given encoder and
    projects every encoder output to one logit per entry of the 'labels'
    vocabulary namespace.
    """

    def __init__(self,
                 word_embeddings: TextFieldEmbedder,
                 encoder: Seq2SeqEncoder,
                 vocab: Vocabulary) -> None:
        """
        Args:
            word_embeddings: module that turns the indexed sentence into embeddings.
            encoder: sequence encoder applied to the embeddings and the mask.
            vocab: vocabulary; its 'labels' namespace size fixes the output width.
        """
        super().__init__(vocab)

        self.word_embeddings = word_embeddings
        self.encoder = encoder

        # Linear projection from the encoder's output size to the number of labels.
        encoder_dim = encoder.get_output_dim()
        label_count = vocab.get_vocab_size('labels')
        self.hidden2tag = torch.nn.Linear(in_features=encoder_dim,
                                          out_features=label_count)

        self.accuracy = CategoricalAccuracy()

    def forward(self,
                sentence: Dict[str, torch.Tensor],
                labels: torch.Tensor = None) -> Dict[str, torch.Tensor]:
        """
        Compute tag logits and, when labels are supplied, the loss.

        Returns:
            A dict with "tag_logits"; if ``labels`` is not None it also holds
            "loss", and the accuracy metric is updated as a side effect.
        """
        print("sentence:", sentence, "labels:", labels)
        mask = get_text_field_mask(sentence)

        embedded = self.word_embeddings(sentence)
        print("embeddings: len=", len(embedded), embedded)

        encoded = self.encoder(embedded, mask)
        print("encoder_out: len=", len(encoded), encoded)

        logits = self.hidden2tag(encoded)
        result = {"tag_logits": logits}

        # Without gold labels there is nothing to score.
        if labels is None:
            return result

        self.accuracy(logits, labels, mask)
        result["loss"] = sequence_cross_entropy_with_logits(logits, labels, mask)
        return result

    def get_metrics(self, reset: bool = False) -> Dict[str, float]:
        """Return the current accuracy under the key "accuracy"; ``reset`` is forwarded to the metric."""
        current_accuracy = self.accuracy.get_metric(reset)
        return {"accuracy": current_accuracy}

In [12]:
# Print each token of the default namespace, in index order, separated by "; ".
for token_index in range(vocab.get_vocab_size()):
    token = vocab.get_token_from_index(token_index)
    print(token, end='; ')

@@PADDING@@; @@UNKNOWN@@; The; dog; ate; the; apple; Everybody; read; that; book; 

In [13]:
# Wrap a PyTorch LSTM so it can be handed to the tagger as its encoder.
lstm_module = torch.nn.LSTM(EMBEDDING_DIM, HIDDEN_DIM, batch_first=True)
lstm = PytorchSeq2SeqWrapper(lstm_module)

model = LstmTagger(word_embeddings, lstm, vocab)

# Device 0 when CUDA is available, otherwise -1; the model is only moved in the CUDA case.
cuda_device = 0 if torch.cuda.is_available() else -1
if cuda_device >= 0:
    model = model.cuda(cuda_device)

optimizer = optim.SGD(model.parameters(), lr=0.1)

iterator = BucketIterator(batch_size=2, sorting_keys=[("sentence", "num_tokens")])
iterator.index_with(vocab)

trainer_settings = dict(
    model=model,
    optimizer=optimizer,
    iterator=iterator,
    train_dataset=train_dataset,
    validation_dataset=validation_dataset,
    patience=1,
    num_epochs=1,  # this notebook is mainly for studying the code, not for training
    cuda_device=cuda_device,
)
trainer = Trainer(**trainer_settings)

# Last expression of the cell, so the returned metrics dict is displayed.
trainer.train()

accuracy: 0.3333, loss: 1.1685 ||: 100%|██████████| 1/1 [00:00<00:00,  8.65it/s]
accuracy: 0.3333, loss: 1.1592 ||: 100%|██████████| 1/1 [00:00<00:00, 74.04it/s]

sentence: {'tokens': tensor([[ 7,  8,  9, 10,  0],
        [ 2,  3,  4,  5,  6]], device='cuda:0')} labels: tensor([[0, 2, 1, 0, 0],
        [1, 0, 2, 1, 0]], device='cuda:0')
embeddings: len= 2 tensor([[[ 0.4455,  0.0070, -0.3130,  0.3054, -0.3154,  0.1747],
         [-0.1715, -0.0651, -0.5711, -0.2833,  0.3224, -0.1444],
         [ 0.5917,  0.4762, -0.0278, -0.3965,  0.3618,  0.1844],
         [-0.3840,  0.3859,  0.3607,  0.5269, -0.3330, -0.0978],
         [ 0.3061, -0.2622, -0.1152,  0.2788, -0.5593,  0.3563]],

        [[ 0.2170, -0.2315, -0.0433, -0.0535,  0.0861, -0.0024],
         [ 0.5193,  0.1849, -0.2212, -0.3588, -0.0996, -0.2563],
         [-0.1904,  0.0284,  0.3542,  0.3229, -0.5808,  0.3683],
         [ 0.1660,  0.5635,  0.3921, -0.5413, -0.5649, -0.2865],
         [ 0.5217, -0.0990,  0.2542, -0.2761,  0.5829, -0.2514]]],
       device='cuda:0', grad_fn=<CatBackward>)
encoder_out: len= 2 tensor([[[-0.1066,  0.0424,  0.0585, -0.1004,  0.0489,  0.0285],
         [-0.1148, 




{'best_epoch': 0,
 'peak_cpu_memory_MB': 2608.14,
 'peak_gpu_0_memory_MB': 589,
 'training_duration': '0:00:00.231348',
 'training_start_epoch': 0,
 'training_epochs': 0,
 'epoch': 0,
 'training_accuracy': 0.3333333333333333,
 'training_loss': 1.1684999465942383,
 'training_cpu_memory_MB': 2608.14,
 'training_gpu_0_memory_MB': 589,
 'validation_accuracy': 0.3333333333333333,
 'validation_loss': 1.1591815948486328,
 'best_validation_accuracy': 0.3333333333333333,
 'best_validation_loss': 1.1591815948486328}

In [14]:
predictor = SentenceTaggerPredictor(model, dataset_reader=reader)

# Run one sentence through the predictor and keep only the logits.
prediction = predictor.predict("The dog ate the apple")
tag_logits = prediction['tag_logits']
print(tag_logits)

# Highest-scoring label index per token, mapped back to label strings.
tag_ids = np.argmax(tag_logits, axis=-1)
predicted_tags = [model.vocab.get_token_from_index(i, 'labels') for i in tag_ids]
print(predicted_tags)

TextField of length 5 with text: 
 		[The, dog, ate, the, apple]
 		and TokenIndexers : {'tokens': 'SingleIdTokenIndexer'}
sentence: {'tokens': tensor([[2, 3, 4, 5, 6]], device='cuda:0')} labels: None
embeddings: len= 1 tensor([[[ 0.2170, -0.2313, -0.0436, -0.0535,  0.0858, -0.0027],
         [ 0.5195,  0.1851, -0.2213, -0.3589, -0.0992, -0.2564],
         [-0.1904,  0.0281,  0.3543,  0.3230, -0.5807,  0.3687],
         [ 0.1660,  0.5637,  0.3919, -0.5414, -0.5650, -0.2869],
         [ 0.5220, -0.0986,  0.2541, -0.2761,  0.5832, -0.2516]]],
       device='cuda:0')
encoder_out: len= 1 tensor([[[-0.1049,  0.0474,  0.0080, -0.0844, -0.0095,  0.0397],
         [-0.2196,  0.0285,  0.1005, -0.1411,  0.0563,  0.0048],
         [-0.1389,  0.0706,  0.1534, -0.0566,  0.0330,  0.0947],
         [-0.2420,  0.0060,  0.2846, -0.0533,  0.0733,  0.0151],
         [-0.2939,  0.0692,  0.1309, -0.1365,  0.0416,  0.0565]]],
       device='cuda:0')
[[-0.30905815958976746, 0.2984865605831146, 0.023448895663

In [15]:
# NOTE: absolute, environment-specific location; adjust it when running elsewhere.
store_dir = '/notebook/LSTM_Part-of-Speech_Tagging/'
vocabulary_dir = os.path.join(store_dir, 'vocabulary')
model_path = os.path.join(store_dir, 'model.th')

# Make sure the target directory exists before anything is written into it.
os.makedirs(store_dir, exist_ok=True)

# Start from a clean vocabulary directory.
if os.path.exists(vocabulary_dir):
    shutil.rmtree(vocabulary_dir)

vocab.save_to_files(vocabulary_dir)

# Here's how to save the model.
with open(model_path, 'wb') as f:
    torch.save(model.state_dict(), f)

# And here's how to reload the model.
vocab2 = Vocabulary.from_files(vocabulary_dir)

# Build fresh embedding/encoder modules for the reloaded model. Passing the
# existing `word_embeddings` and `lstm` objects would make model2 share those
# parameters with `model`, so the comparison below would not exercise the
# saved weights for them.
token_embedding2 = Embedding(num_embeddings=vocab2.get_vocab_size('tokens'),
                             embedding_dim=EMBEDDING_DIM)
word_embeddings2 = BasicTextFieldEmbedder({"tokens": token_embedding2})
lstm2 = PytorchSeq2SeqWrapper(torch.nn.LSTM(EMBEDDING_DIM, HIDDEN_DIM, batch_first=True))

model2 = LstmTagger(word_embeddings2, lstm2, vocab2)

# Load onto the CPU first so the checkpoint does not depend on the device it
# was saved from; the model is moved to the GPU afterwards if one is in use.
with open(model_path, 'rb') as f:
    model2.load_state_dict(torch.load(f, map_location='cpu'))

if cuda_device > -1:
    model2.cuda(cuda_device)

predictor2 = SentenceTaggerPredictor(model2, dataset_reader=reader)
tag_logits2 = predictor2.predict("The dog ate the apple")['tag_logits']

print(tag_logits2)
tag_ids2 = np.argmax(tag_logits2, axis=-1)
print(tag_ids2)

# The reloaded model must reproduce the original model's logits.
np.testing.assert_array_almost_equal(tag_logits2, tag_logits)

TextField of length 5 with text: 
 		[The, dog, ate, the, apple]
 		and TokenIndexers : {'tokens': 'SingleIdTokenIndexer'}
sentence: {'tokens': tensor([[2, 3, 4, 5, 6]], device='cuda:0')} labels: None
embeddings: len= 1 tensor([[[ 0.2170, -0.2313, -0.0436, -0.0535,  0.0858, -0.0027],
         [ 0.5195,  0.1851, -0.2213, -0.3589, -0.0992, -0.2564],
         [-0.1904,  0.0281,  0.3543,  0.3230, -0.5807,  0.3687],
         [ 0.1660,  0.5637,  0.3919, -0.5414, -0.5650, -0.2869],
         [ 0.5220, -0.0986,  0.2541, -0.2761,  0.5832, -0.2516]]],
       device='cuda:0')
encoder_out: len= 1 tensor([[[-0.1049,  0.0474,  0.0080, -0.0844, -0.0095,  0.0397],
         [-0.2196,  0.0285,  0.1005, -0.1411,  0.0563,  0.0048],
         [-0.1389,  0.0706,  0.1534, -0.0566,  0.0330,  0.0947],
         [-0.2420,  0.0060,  0.2846, -0.0533,  0.0733,  0.0151],
         [-0.2939,  0.0692,  0.1309, -0.1365,  0.0416,  0.0565]]],
       device='cuda:0')
[[-0.30905815958976746, 0.2984865605831146, 0.023448895663