In [5]:
# Tokenizers and models are both loaded with the `from_pretrained` function;
# this cell only loads the tokenizer (AutoModel is imported but not instantiated here).
# The opposite function is `save_pretrained`, which saves the model and tokenizer to disk.
# The tokenizer converts the text into tokens, which are then fed into the model.
# The model is the actual neural network that will be trained.

from transformers import AutoTokenizer, AutoModel
model_name = "bert-base-uncased"  # checkpoint identifier handed to from_pretrained below
tokenizer = AutoTokenizer.from_pretrained(model_name)


Downloading (…)okenizer_config.json: 100%|██████████| 28.0/28.0 [00:00<?, ?B/s]
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Downloading (…)lve/main/config.json: 100%|██████████| 570/570 [00:00<?, ?B/s] 
Downloading (…)solve/main/vocab.txt: 100%|██████████| 232k/232k [00:00<00:00, 7.98MB/s]
Downloading (…)/main/tokenizer.json: 100%|██████████| 466k/466k [00:00<00:00, 9.70MB/s]


A tokenizer applies the following steps:
1. Preprocesses the text and tokenizes it into subwords
2. Associates every subword with an input_id, which is used to fetch its embedding from the embedding layer
3. Adds attention_mask and token_type_ids

In [10]:
# Only the last expression of a cell is echoed, so the result of this first call
# does not appear in the cell output.
tokenizer.tokenize("I love Luna", add_special_tokens=True)
# expected result of the call above (per the original note):
# ['[CLS]', 'i', 'love', 'luna', '[SEP]']
# Calling the tokenizer directly returns input_ids, token_type_ids and attention_mask.
tokenizer("I love Luna")

{'input_ids': [101, 1045, 2293, 12909, 102], 'token_type_ids': [0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1]}

In [11]:
# some other functions that the tokenizer provides:
# mapping between tokens and ids; this cell demonstrates the ids -> tokens direction

output = tokenizer("I love Luna")
# map every input id back to its token string (special tokens included, see output)
tokenizer.convert_ids_to_tokens(output["input_ids"])

['[CLS]', 'i', 'love', 'luna', '[SEP]']

In [12]:
# tokenizer.decode turns a list of input ids back into a single string

output = tokenizer("I love Luna")
# the decoded text is lowercased and still contains the [CLS]/[SEP] special tokens (see output)
tokenizer.decode(output["input_ids"])

'[CLS] i love luna [SEP]'

In [14]:
# attention masks tell the model which tokens to pay attention to and which to ignore
# the attention mask is a binary tensor with the same shape as the tokenized input
# it holds a 1 for every real token and a 0 for every padding token
sentences = ["I love Luna", "I love Marco", "I love Giove"]
output = tokenizer(sentences, padding=True, truncation=True, return_tensors="pt")
print(output["input_ids"])
print(output["attention_mask"])

# commenting the result of input_ids and attention_mask
# tensor([[  101,  1045,  2293, 12909,   102,     0],
#         [  101,  1045,  2293,  8879,   102,     0],
#         [  101,  1045,  2293, 21025, 21818,   102]])
# the third sentence is the longest (its last word maps to two ids, 21025 and 21818),
# so the first two sentences get one padding id (0) appended to reach the same length
# tensor([[1, 1, 1, 1, 1, 0],
#         [1, 1, 1, 1, 1, 0],
#         [1, 1, 1, 1, 1, 1]])
# means: attend to the first five positions of the first two sentences and ignore their
# final padding position; attend to all six positions of the third sentence


tensor([[  101,  1045,  2293, 12909,   102,     0],
        [  101,  1045,  2293,  8879,   102,     0],
        [  101,  1045,  2293, 21025, 21818,   102]])
tensor([[1, 1, 1, 1, 1, 0],
        [1, 1, 1, 1, 1, 0],
        [1, 1, 1, 1, 1, 1]])


The input_ids 

Each subword is associated with an input_id, which tells the model which embedding to fetch for that subword

The attention_mask

Suppose we have 2 sentences of different lengths.
If the sentences are in the same batch, the shortest one needs to be padded: we need to append [PAD] tokens to the shortest sentence so that they have the same length.
The tokenizer handles all of this for us (including telling the model not to pay attention to the [PAD] tokens).

The token_type_ids
Input embeddings to a transformer are the result of a sum of three elements:
 1. token embeddings: the embeddings that are extracted from the embedding matrix using input_ids
 2. positional embeddings: these are sinusoidal or learned, and give the transformer the position information,
 i.e. they tell the transformer the position of each subword in the sequence.
 3. segment embeddings: used when we are doing a sentence-pair task,
 i.e. when the input is made of 2 sentences; by adding the segment embedding we tell the model, for every subword, which sentence it originates from

In [16]:
# Passing two texts encodes them together as one sentence-pair input.
output = tokenizer("The sun is shining today", "Today it's rainy")
# A bare `output` line used to sit here; it displayed nothing, because only the
# last expression of a cell is echoed, so it has been dropped.
# Decoding shows both sentences in a single sequence, each followed by [SEP].
tokenizer.decode(output["input_ids"])

"[CLS] the sun is shining today [SEP] today it's rainy [SEP]"

BERT Architecture

In [None]:
# Reference only: the string below is a layer-by-layer listing of a BertModel;
# evaluating this cell loads nothing.
# NOTE(review): presumably the printed repr of AutoModel.from_pretrained(model_name) — confirm.
# It lists an embeddings block, an encoder made of 12 identical BertLayer modules
# (768-wide attention projections, 768 -> 3072 -> 768 feed-forward), and a pooler.
"""
BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
        )
        (intermediate): BertIntermediate(
          (dense): Linear(in_features=768, out_features=3072, bias=True)
          (intermediate_act_fn): GELUActivation()
        )
        (output): BertOutput(
          (dense): Linear(in_features=3072, out_features=768, bias=True)
          (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
  )
  (pooler): BertPooler(
    (dense): Linear(in_features=768, out_features=768, bias=True)
    (activation): Tanh()
  )
)
"""

Feeding a batch to a transformer


In [19]:
# Tokenize three sentences of different lengths as one padded batch.
# The misspellings inside the strings are kept as-is: they are runtime inputs
# and the recorded output depends on them.
sequences = [
    "Using tranformers is quite simple",
    "Natural Langugage Processing is the coolest area of AI",
    "BERT is an encoder-only model",
]
batch = tokenizer(sequences, padding=True, truncation=True, return_tensors="pt")

# Print each tensor of the batch, separated and followed by a blank line.
print(
    *(batch[field] for field in ("input_ids", "attention_mask", "token_type_ids")),
    sep="\n\n",
    end="\n\n",
)
# Decode every row back to text; padding tokens are kept in the decoded strings.
print(tokenizer.batch_decode(batch["input_ids"]))

tensor([[  101,  2478, 25283, 14192,  2545,  2003,  3243,  3722,   102,     0,
             0,     0,     0,     0],
        [  101,  3019, 11374, 16377,  3351,  6364,  2003,  1996,  4658,  4355,
          2181,  1997,  9932,   102],
        [  101, 14324,  2003,  2019,  4372, 16044,  2099,  1011,  2069,  2944,
           102,     0,     0,     0]])

tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0]])

tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])

['[CLS] using tranformers is quite simple [SEP] [PAD] [PAD] [PAD] [PAD] [PAD]', '[CLS] natural langugage processing is the coolest area of ai [SEP]', '[CLS] bert is an encoder - only model [SEP] [PAD] [PAD] [PAD]']


Task: given two sentences, assign the positive class (1) if they are paraphrases of one another, and the negative class (0) otherwise

In [20]:
from datasets import load_dataset
# Load the "mrpc" configuration of the "glue" dataset.
mrpc_dataset = load_dataset("glue", "mrpc")
# Echo the returned object to inspect its splits, features and row counts.
mrpc_dataset

Downloading builder script: 100%|██████████| 28.8k/28.8k [00:00<00:00, 20.7MB/s]
Downloading metadata: 100%|██████████| 28.7k/28.7k [00:00<00:00, 28.7MB/s]
Downloading readme: 100%|██████████| 27.9k/27.9k [00:00<00:00, 27.9MB/s]


Downloading and preparing dataset glue/mrpc to C:/Users/roven/.cache/huggingface/datasets/glue/mrpc/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad...


Downloading data: 6.22kB [00:00, 3.10MB/s]/3 [00:00<?, ?it/s]
Downloading data: 1.05MB [00:00, 20.8MB/s]/3 [00:00<00:00,  3.42it/s]
Downloading data: 441kB [00:00, 16.3MB/s]2/3 [00:00<00:00,  3.79it/s]
Downloading data files: 100%|██████████| 3/3 [00:01<00:00,  2.14it/s]
                                                                                     

Dataset glue downloaded and prepared to C:/Users/roven/.cache/huggingface/datasets/glue/mrpc/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad. Subsequent calls will reuse this data.


100%|██████████| 3/3 [00:00<00:00, 33.59it/s]


DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 408
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 1725
    })
})

Check the dataset features and examples

In [21]:
# Show the column schema of the train split, then its first example
# followed by a blank line.
print(
    mrpc_dataset["train"].features,
    mrpc_dataset["train"][0],
    sep="\n",
    end="\n\n",
)
# Second example, for comparison with the first.
print(mrpc_dataset["train"][1])

{'sentence1': Value(dtype='string', id=None), 'sentence2': Value(dtype='string', id=None), 'label': ClassLabel(names=['not_equivalent', 'equivalent'], id=None), 'idx': Value(dtype='int32', id=None)}
{'sentence1': 'Amrozi accused his brother , whom he called " the witness " , of deliberately distorting his evidence .', 'sentence2': 'Referring to him as only " the witness " , Amrozi accused his brother of deliberately distorting his evidence .', 'label': 1, 'idx': 0}

{'sentence1': "Yucaipa owned Dominick 's before selling the chain to Safeway in 1998 for $ 2.5 billion .", 'sentence2': "Yucaipa bought Dominick 's in 1995 for $ 693 million and sold it to Safeway for $ 1.8 billion in 1998 .", 'label': 0, 'idx': 1}


In [22]:
# keep only the training rows whose first sentence starts with "This"
filtered_data = mrpc_dataset["train"].filter(lambda example: example["sentence1"].startswith("This"))
# slicing the filtered dataset returns a dict of column lists (see output)
# NOTE(review): [:-1] leaves out the last matching row — confirm this is intended
filtered_data[:-1]

                                                                    

{'sentence1': ['This Palm OS smart phone is the last product the company will release before it becomes a part of palmOne .',
  "This week 's tour will take Bush to Senegal , South Africa , Botswana , Uganda and Nigeria , and is aimed at softening his warrior image at home and abroad .",
  'This was around the time Congress was debating a resolution granting the President broad authority to wage war .',
  "This morning , at UM 's New York office , Coen revised his expectations downward , saying that spending would instead rise 4.6 percent to $ 247 billion .",
  'This is the only planet that has been found in orbit around a binary star system .',
  'This year , local health departments hired part-time water samplers and purchased testing equipment with a $ 282,355 grant from the Environmental Protection Agency .',
  'This was double the $ 818 million reported for the first three months of 2001 .',
  'This change in attitude gave upscale purveyors including Neiman Marcus , the parent of 

In [23]:
# train test split: hold out 10% of the training rows as a test set
# a fixed seed makes the shuffled split reproducible across kernel restarts
# the result is only displayed here; it is not assigned to a variable

mrpc_dataset["train"].train_test_split(test_size=0.1, seed=42)

DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 3301
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 367
    })
})