# Allen NLP

Tutorial on how to use the AllenNLP library

In [4]:
from typing import Dict, List, Union
import logging
import json
from overrides import overrides
from allennlp.common.file_utils import cached_path
from allennlp.data.dataset_readers.dataset_reader import DatasetReader
from allennlp.data.fields import LabelField, TextField, Field, ListField
from allennlp.data.instance import Instance
from allennlp.data.token_indexers import TokenIndexer, SingleIdTokenIndexer
from allennlp.data.tokenizers import Tokenizer, SpacyTokenizer
from allennlp.data.tokenizers.sentence_splitter import SpacySentenceSplitter

In [5]:
# Logger for this notebook, keyed on the current module's __name__.
logger = logging.getLogger(__name__)

# Fields

A `Field` contains one piece of data for one example that is passed through your model. `Fields` get converted to tensors in a model, either as an input or an output, after being converted to IDs, batched and padded.

There are many types of fields in AllenNLP, depending on the type of data that they represent. Among them, the most important is `TextField`, which represents a piece of tokenized text.

Other commonly used fields include:

* `LabelField`
* `MultiLabelField`
* `SequenceLabelField`
* `SpanField`
* `ListField`
* `ArrayField`

In [6]:
from collections import Counter, defaultdict
from typing import Dict

from allennlp.data.fields import TextField, LabelField, SequenceLabelField
from allennlp.data.token_indexers import TokenIndexer, SingleIdTokenIndexer
from allennlp.data.tokenizers import Token
from allennlp.data.vocabulary import Vocabulary

In [9]:
# A five-token sentence, wrapped in a TextField whose tokens are indexed with
# a single vocabulary id each.
tokens = [Token(word) for word in ("the", "best", "movie", "ever", "!")]
token_indexers: Dict[str, TokenIndexer] = {"tokens": SingleIdTokenIndexer()}
text_field = TextField(tokens, token_indexers=token_indexers)

# One label for the whole example.
label_field = LabelField("pos")

# One label per token; the TextField is passed as the sequence being labelled.
sequence_label_field = SequenceLabelField(
    ["DET", "ADJ", "NOUN", "ADV", "PUNKT"],
    text_field,
)

# Print the string form of each field, one after the other.
print(text_field, label_field, sequence_label_field, sep="\n")

TextField of length 5 with text: 
 		[the, best, movie, ever, !]
 		and TokenIndexers : {'tokens': 'SingleIdTokenIndexer'}
LabelField with label: pos in namespace: 'labels'.
SequenceLabelField of length 5 with labels:
 		['DET', 'ADJ', 'NOUN', 'ADV', 'PUNKT']
 		in namespace: 'labels'.


# Instances

An `Instance` is a collection of fields.

In [11]:
# An Instance is created from a mapping of field name -> Field; further
# fields can be attached afterwards with add_field().
fields: Dict[str, Field] = dict(tokens=text_field, label=label_field)
instance = Instance(fields)
instance.add_field("label_seq", sequence_label_field)

In [12]:
# Show the instance's string form, which lists each field by name.
print(instance)

Instance with fields:
 	 tokens: TextField of length 5 with text: 
 		[the, best, movie, ever, !]
 		and TokenIndexers : {'tokens': 'SingleIdTokenIndexer'} 
 	 label: LabelField with label: pos in namespace: 'labels'. 
 	 label_seq: SequenceLabelField of length 5 with labels:
 		['DET', 'ADJ', 'NOUN', 'ADV', 'PUNKT']
 		in namespace: 'labels'. 



In [13]:
# Create a vocabulary: the instance adds its string counts to `counter`
# (a dict of Counters, created on demand by the defaultdict), and the
# Vocabulary is then constructed from those counts.
counter: Dict[str, Dict[str, int]] = defaultdict(Counter)
instance.count_vocab_items(counter)
vocab = Vocabulary(counter)

In [15]:
# Convert all strings in all of the fields into integer IDs by calling
# index_fields() with the vocabulary built above.
instance.index_fields(vocab)

# Instances know how to convert themselves into a dict of tensors; the
# result is keyed by field name (see the printed output).
tensors = instance.as_tensor_dict()
print(tensors)

{'tokens': {'tokens': {'tokens': tensor([2, 3, 4, 5, 6])}}, 'label': tensor(0), 'label_seq': tensor([1, 2, 3, 4, 5])}


# Example

In [17]:
# A tokenized review plus its sentiment label; the label is registered under
# the 'tags' namespace rather than the default one.
review = TextField(
    [Token(word) for word in ("This", "movie", "was", "awful", "!")],
    token_indexers={"tokens": SingleIdTokenIndexer()},
)
review_sentiment = LabelField("negative", label_namespace="tags")

# The original tokens and label stay accessible as attributes of the fields.
print('Tokens in TextField: ', review.tokens)
print('Label of labelfield', review_sentiment.label)

Tokens in TextField:  [This, movie, was, awful, !]
Label of labelfield negative


Once we've made our fields, we need to pair them together to form an `Instance`.

In [18]:
from allennlp.data import Instance
# Pair the review text with its label under descriptive field names.
instance1 = Instance(dict(review=review, label=review_sentiment))

# `fields` exposes the name -> Field mapping held by the instance.
print('Fields in instances ', instance1.fields)

Fields in instances  {'review': <allennlp.data.fields.text_field.TextField object at 0x1425eb700>, 'label': <allennlp.data.fields.label_field.LabelField object at 0x1425eb2c0>}


In [None]:
# WhitespaceTokenizer is imported here because the import cell at the top of
# the notebook only brings in Tokenizer and SpacyTokenizer; without this line
# the cell raises a NameError when the notebook is run top to bottom on a
# fresh kernel.
from allennlp.data.tokenizers import WhitespaceTokenizer

# Tokenizer used to turn raw text into Token objects.
tokenizer: Tokenizer = WhitespaceTokenizer()

# Represents each token with a single ID from a vocabulary.
token_indexer: TokenIndexer = SingleIdTokenIndexer(namespace="token_vocab")



### TokenIndexers

Each `TokenIndexer` knows how to convert a `Token` into a representation that can be encoded by a corresponding piece of the model. For example, by:

- Mapping the token to an ID in a vocabulary
- Breaking up the token into characters or wordpieces and representing the token by a sequence of indexed characters

In [24]:
import warnings
from typing import Dict

import torch
from allennlp.data import Token, Vocabulary, TokenIndexer, Tokenizer
from allennlp.data.fields import ListField, TextField
from allennlp.data.token_indexers import (
    SingleIdTokenIndexer,
    TokenCharactersIndexer,
    ELMoTokenCharactersIndexer,
    PretrainedTransformerIndexer,
    PretrainedTransformerMismatchedIndexer,
)
from allennlp.data.tokenizers import (
    CharacterTokenizer,
    PretrainedTransformerTokenizer,
    SpacyTokenizer,
    WhitespaceTokenizer,
)
from allennlp.modules.seq2vec_encoders import CnnEncoder
from allennlp.modules.text_field_embedders import BasicTextFieldEmbedder
from allennlp.modules.token_embedders import (
    Embedding,
    TokenCharactersEncoder,
    ElmoTokenEmbedder,
    PretrainedTransformerEmbedder,
    PretrainedTransformerMismatchedEmbedder,
)
from allennlp.nn import util as nn_util

# Suppress every warning from here on (presumably to keep the cell outputs
# uncluttered — note this also hides warnings that might matter).
warnings.filterwarnings("ignore")


In [25]:
# Word-level tokenizer, paired with an indexer that takes one id per token
# from the "token_vocab" namespace.
tokenizer: Tokenizer = WhitespaceTokenizer()
token_indexer = SingleIdTokenIndexer(namespace="token_vocab")

# Hand-built vocabulary: one namespace for whole words, one for characters.
vocab = Vocabulary()
vocab.add_tokens_to_namespace("This is some text .".split(), namespace="token_vocab")
# Kept as the final expression so the returned list is shown as the cell output.
vocab.add_tokens_to_namespace(
    [
        "T", "h", "i", "s", " ",
        "o", "m", "e", "t", "x", ".",
    ],
    namespace="character_vocab",
)

[2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]

In [26]:
# NOTE(review): the sentence starts with lowercase "this", whereas the
# vocabulary built in the previous cell registers "This" (capitalised) in
# "token_vocab". The first token is therefore not found there — the single-id
# output further down shows id 1 for it instead of 2. Confirm whether this is
# intended as an out-of-vocabulary demonstration or is a typo.
text="this is some text ."
tokens = tokenizer.tokenize(text)
print('Word tokens ', tokens)

Word tokens  [this, is, some, text, .]


In [27]:
# Wrap the tokens in a TextField, registering the single-id indexer under the
# key "tokens".
text_field = TextField(tokens, {"tokens": token_indexer})


In [29]:
# Index the field's tokens using the vocabulary.
text_field.index(vocab)

In [30]:
# We typically batch things together when making tensors, which requires some
# padding computation.  Don't worry too much about the padding for now; the
# result is simply passed on to as_tensor() later.
padding_lengths = text_field.get_padding_lengths()

In [31]:
# Display the padding lengths computed in the previous cell.
padding_lengths

{'tokens___tokens': 5}

In [32]:
# Turn the indexed field into tensors, padded to the lengths computed above.
tensor_dict = text_field.as_tensor(padding_lengths)
# This output is pretty nested and might look complex.  The reason it is so
# nested is that we need to (1) align each indexer with a corresponding
# embedder in the model, and (2) pass a dictionary of arguments to the
# embedder by name.  This will be more clear when we get to the embedder.
print("With single id indexer:", tensor_dict)

With single id indexer: {'tokens': {'tokens': tensor([1, 3, 4, 5, 6])}}


In [33]:
# Switch to an indexer that works on the characters of each token, taking
# ids from the "character_vocab" namespace.  This rebinds `token_indexer`.
token_indexer = TokenCharactersIndexer(namespace="character_vocab")


In [34]:
# Rebuild the TextField over the same word tokens, this time with the
# character indexer registered under "token_characters", and index it.
text_field = TextField(tokens, {"token_characters": token_indexer})
text_field.index(vocab)

In [35]:
# Recompute the padding lengths for the character-indexed field.
padding_lengths = text_field.get_padding_lengths()


In [37]:
# Convert to tensors; the printed result has one row per token and one
# column per character position, with 0 filling the shorter tokens.
tensor_dict = text_field.as_tensor(padding_lengths)
print("With token characters indexer:", tensor_dict)


With token characters indexer: {'token_characters': {'token_characters': tensor([[10,  3,  4,  5],
        [ 4,  5,  0,  0],
        [ 5,  7,  8,  9],
        [10,  9, 11, 10],
        [12,  0,  0,  0]])}}


In [38]:
# Splits text into characters (instead of words or wordpieces).
tokenizer = CharacterTokenizer()

# Re-tokenize the same `text`; every character, spaces included, becomes a token.
tokens = tokenizer.tokenize(text)
print("Character tokens:", tokens)

# Represents each token (which is a character) as a single id from a vocabulary.
token_indexer = SingleIdTokenIndexer(namespace="character_vocab")

# Same field -> index -> pad -> tensor sequence as in the earlier cells.
text_field = TextField(tokens, {"token_characters": token_indexer})
text_field.index(vocab)

padding_lengths = text_field.get_padding_lengths()

tensor_dict = text_field.as_tensor(padding_lengths)
print("With single id indexer:", tensor_dict)

Character tokens: [t, h, i, s,  , i, s,  , s, o, m, e,  , t, e, x, t,  , .]
With single id indexer: {'token_characters': {'tokens': tensor([10,  3,  4,  5,  6,  4,  5,  6,  5,  7,  8,  9,  6, 10,  9, 11, 10,  6,
        12])}}


# Combining multiple TokenIndexers

In [39]:
# Word-level tokenization again (not wordpieces, not characters).
tokenizer: Tokenizer = WhitespaceTokenizer()

# Two indexers applied to the same tokens: a single id per word, and a
# sequence of character ids per word.
token_indexers: Dict[str, TokenIndexer] = dict(
    tokens=SingleIdTokenIndexer(namespace="token_vocab"),
    token_characters=TokenCharactersIndexer(namespace="character_vocab"),
)

# Fresh vocabulary with a word namespace and a character namespace.
vocab = Vocabulary()
vocab.add_tokens_to_namespace(
    ["This", "is", "some", "text", "."],
    namespace="token_vocab",
)
# Kept as the final expression so the returned list is shown as the cell output.
vocab.add_tokens_to_namespace(
    ["T", "h", "i", "s", " ", "o", "m", "e", "t", "x", "."],
    namespace="character_vocab",
)


[2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]

In [40]:
# Tokenize the sentence; here it starts with a capitalised "This", matching
# the entry added to "token_vocab" above.
text = "This is some text ."
tokens = tokenizer.tokenize(text)
print("Tokens:", tokens)

Tokens: [This, is, some, text, .]


In [41]:
# The setup here is the same as what we saw above: build the field, index it
# with the vocabulary, compute padding lengths, then convert to tensors.
# The only difference is that `token_indexers` now holds two indexers.
text_field = TextField(tokens, token_indexers)
text_field.index(vocab)
padding_lengths = text_field.get_padding_lengths()
tensor_dict = text_field.as_tensor(padding_lengths)

In [42]:
# The printed dict has one top-level entry per indexer key.
print("Combined tensor dictionary:", tensor_dict)


Combined tensor dictionary: {'tokens': {'tokens': tensor([2, 3, 4, 5, 6])}, 'token_characters': {'token_characters': tensor([[ 2,  3,  4,  5],
        [ 4,  5,  0,  0],
        [ 5,  7,  8,  9],
        [10,  9, 11, 10],
        [12,  0,  0,  0]])}}


In [43]:
# Now we split text into words with part-of-speech tags, using Spacy's POS tagger.
# This will result in the `tag_` variable being set on each `Token` object, which
# we will read in the indexer.
tokenizer = SpacyTokenizer(pos_tags=True)
# Register the POS tags in their own namespace of the existing vocabulary;
# as the final expression, the call's return value is shown as the cell output.
vocab.add_tokens_to_namespace(["DT", "VBZ", "NN", "."], namespace="pos_tag_vocab")


[2, 3, 4, 5]

In [44]:
# Three views of every token:
#   (1) a single id from the word vocabulary,
#   (2) a sequence of character ids,
#   (3) a single id for its part-of-speech tag, read from the token's `tag_`.
token_indexers = dict(
    tokens=SingleIdTokenIndexer(namespace="token_vocab"),
    token_characters=TokenCharactersIndexer(namespace="character_vocab"),
    pos_tags=SingleIdTokenIndexer(namespace="pos_tag_vocab", feature_name="tag_"),
)


In [47]:
# Tokenize with the spaCy tokenizer, then show each token next to the tag
# stored on it.
tokens = tokenizer.tokenize(text)
print("Spacy tokens:", tokens)
print("POS tags:", [(tok, tok.tag_) for tok in tokens])

Spacy tokens: [This, is, some, text, .]
POS tags: [(This, 'DT'), (is, 'VBZ'), (some, 'DT'), (text, 'NN'), (., '.')]


In [48]:
# Build the field from the POS-tagged tokens with all three indexers, then
# index it against the vocabulary.
text_field = TextField(tokens, token_indexers)
text_field.index(vocab)

In [49]:
# Recompute the padding lengths for the field with three indexers.
padding_lengths = text_field.get_padding_lengths()


In [50]:
# Convert to tensors; the printed dict now also contains a "pos_tags" entry
# alongside "tokens" and "token_characters".
tensor_dict = text_field.as_tensor(padding_lengths)
print("Tensor dict with POS tags:", tensor_dict)

Tensor dict with POS tags: {'tokens': {'tokens': tensor([2, 3, 4, 5, 6])}, 'token_characters': {'token_characters': tensor([[ 2,  3,  4,  5],
        [ 4,  5,  0,  0],
        [ 5,  7,  8,  9],
        [10,  9, 11, 10],
        [12,  0,  0,  0]])}, 'pos_tags': {'tokens': tensor([2, 3, 2, 4, 5])}}


# Text Field Embedders

As a reminder, there are three main steps:
1. Tokenization (Text -> Tokens)
2. Representing each token as some kind of ID using `TextField`s and `TokenIndexer`s
3. Embedding those IDs into a vector space using `TextFieldEmbedder`s
