In [74]:
try:
  from transformer import pipeline
except Exception as e:
  print(e)
  print("installing...")
  !pip install datasets transformers[sentencepiece] -q
  from transformers import pipeline

No module named 'transformer'
installing...


In [75]:
# Mount Google Drive into the Colab filesystem at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [76]:
# Checkpoint name from which the tokenizers below are loaded.
model_name = "bert-base-cased"

In [77]:
from transformers import BertTokenizer, AutoTokenizer

In [78]:
# Load the same checkpoint twice: once through the concrete BertTokenizer class
# and once through AutoTokenizer, so the two can be compared in later cells.
tokenizer = BertTokenizer.from_pretrained(model_name)
auto_tokenizer = AutoTokenizer.from_pretrained(model_name)

In [79]:
# Sample sentence used for the tokenizer experiments that follow.
example = "Transformer is an interesting library"

In [80]:
# Calling the tokenizer directly returns input_ids, token_type_ids and attention_mask.
tokenizer(example)

{'input_ids': [101, 13809, 23763, 1110, 1126, 5426, 3340, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1]}

In [81]:
# Same call through the AutoTokenizer-loaded tokenizer, for comparison.
auto_tokenizer(example)

{'input_ids': [101, 13809, 23763, 1110, 1126, 5426, 3340, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1]}

In [82]:
# Write the tokenizer's files under ./sample_tokenizer; the written paths are returned.
tokenizer.save_pretrained("./sample_tokenizer")

('./sample_tokenizer/tokenizer_config.json',
 './sample_tokenizer/special_tokens_map.json',
 './sample_tokenizer/vocab.txt',
 './sample_tokenizer/added_tokens.json')

In [83]:
# Reload the tokenizer from the directory written by save_pretrained.
# from_pretrained is called on the BertTokenizer class rather than on the
# `tokenizer` variable, so the loaded type does not depend on whatever
# `tokenizer` happens to be bound to when this cell is (re-)run.
loaded_tokenizer = BertTokenizer.from_pretrained("./sample_tokenizer")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [84]:
# Apply the reloaded tokenizer to the same sentence.
loaded_tokenizer(example)

{'input_ids': [101, 13809, 23763, 1110, 1126, 5426, 3340, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1]}

In [85]:
# tokenize() returns only the token strings for the sentence.
auto_tokenizer.tokenize(example)

['Trans', '##former', 'is', 'an', 'interesting', 'library']

In [86]:
# Keep the token strings so they can be converted to ids in the next cell.
tokens = tokenizer.tokenize(example)
tokens

['Trans', '##former', 'is', 'an', 'interesting', 'library']

In [87]:
# Map each token string to its vocabulary id.
ids = tokenizer.convert_tokens_to_ids(tokens)
ids

[13809, 23763, 1110, 1126, 5426, 3340]

In [88]:
# Inverse mapping: ids back to token strings.
tokenizer.convert_ids_to_tokens(ids)

['Trans', '##former', 'is', 'an', 'interesting', 'library']

In [89]:
# The same ids as above, with 101 added at the start and 102 at the end.
updated_ids = [101, 13809, 23763, 1110, 1126, 5426, 3340, 102]

In [90]:
# Show which tokens the id list that includes 101 and 102 maps to.
tokenizer.convert_ids_to_tokens(updated_ids)

['[CLS]', 'Trans', '##former', 'is', 'an', 'interesting', 'library', '[SEP]']

In [91]:
# Extend the id list with two trailing 0 ids.
updated_ids = [101, 13809, 23763, 1110, 1126, 5426, 3340, 102, 0, 0]

In [92]:
# Convert the extended id list back to tokens.
tokenizer.convert_ids_to_tokens(updated_ids)

['[CLS]',
 'Trans',
 '##former',
 'is',
 'an',
 'interesting',
 'library',
 '[SEP]',
 '[PAD]',
 '[PAD]']

In [93]:
# decode() turns the id list back into a single string.
tokenizer.decode(updated_ids)

'[CLS] Transformer is an interesting library [SEP] [PAD] [PAD]'

In [94]:
# Decode the id list produced by convert_tokens_to_ids.
tokenizer.decode(ids)

'Transformer is an interesting library'

## Handling multiple sequences

In [95]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

In [96]:
# Load a tokenizer and a sequence-classification model from the same checkpoint.
# NOTE: this rebinds `tokenizer`, which earlier in the notebook held the
# BertTokenizer loaded from `model_name`.
ckpt = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(ckpt)
model = AutoModelForSequenceClassification.from_pretrained(ckpt)

In [97]:
# New sample sentence for the model experiments (rebinds `example`).
example = "I've been exploring transformers for Natural Language Processing"

In [98]:
# Tokenize by hand, convert the tokens to ids, and wrap the ids in a 1-D tensor.
tokens = tokenizer.tokenize(example)
ids = tokenizer.convert_tokens_to_ids(tokens)
input_ids = torch.tensor(ids)
input_ids

tensor([ 1045,  1005,  2310,  2042, 11131, 19081,  2005,  3019,  2653,  6364])

In [99]:
# Calling the model on the 1-D id tensor raises IndexError; the following cells
# add a leading dimension with torch.unsqueeze to make the call work.
# The error is caught and printed so the notebook still runs top to bottom.
try:
  model(input_ids)
except IndexError as err:
  print(f"IndexError: {err}")

IndexError: ignored

In [101]:
# Add a leading dimension of size 1 to the id tensor.
torch.unsqueeze(input_ids, dim=0)

tensor([[ 1045,  1005,  2310,  2042, 11131, 19081,  2005,  3019,  2653,  6364]])

In [102]:
# With the leading dimension added, the model call succeeds and returns logits.
model(torch.unsqueeze(input_ids, dim=0))

SequenceClassifierOutput(loss=None, logits=tensor([[ 1.0319, -0.8272]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

In [103]:
# Let the tokenizer build the tensors itself by requesting return_tensors='pt'.
tokenized_inputs = tokenizer(example, return_tensors='pt')
tokenized_inputs

{'input_ids': tensor([[  101,  1045,  1005,  2310,  2042, 11131, 19081,  2005,  3019,  2653,
          6364,   102]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [104]:
# Inspect only the input_ids entry of the tokenizer output.
tokenized_inputs["input_ids"]

tensor([[  101,  1045,  1005,  2310,  2042, 11131, 19081,  2005,  3019,  2653,
          6364,   102]])

In [105]:
# Feed the tokenizer-built ids to the model. These ids start with 101 and end
# with 102, unlike the hand-built tensor used earlier, and the logits differ.
model(tokenized_inputs["input_ids"])

SequenceClassifierOutput(loss=None, logits=tensor([[-0.3594,  0.4591]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

In [106]:
# Id that this tokenizer uses for padding.
tokenizer.pad_token_id

0

In [107]:
# Two sentences that tokenize to different lengths, used to build a batch below.
example_1 = "I've been exploring Transformers for Natural Language Processing"
example_2 = "I've been exploring Transformers for fun"

In [109]:
# Tokenize both sentences, then print the two token lists on separate lines.
token_1, token_2 = map(tokenizer.tokenize, (example_1, example_2))
print(token_1, token_2, sep="\n")

['i', "'", 've', 'been', 'exploring', 'transformers', 'for', 'natural', 'language', 'processing']
['i', "'", 've', 'been', 'exploring', 'transformers', 'for', 'fun']


In [129]:
# Convert each token list to its list of vocabulary ids.
id_1, id_2 = map(tokenizer.convert_tokens_to_ids, (token_1, token_2))

In [130]:
# Naive batch: the two id lists side by side, still of different lengths.
batch_id = [id_1, id_2]
batch_id

[[1045, 1005, 2310, 2042, 11131, 19081, 2005, 3019, 2653, 6364],
 [1045, 1005, 2310, 2042, 11131, 19081, 2005, 4569]]

In [131]:
# Wrap the single id list in an outer list so the tensor has a batch of one.
model(torch.tensor([id_1])).logits

tensor([[ 1.0319, -0.8272]], grad_fn=<AddmmBackward0>)

In [132]:
# Reference logits for the second sentence on its own (batch of one).
model(torch.tensor([id_2])).logits

tensor([[-0.1267,  0.3749]], grad_fn=<AddmmBackward0>)

In [133]:
# The two id lists have different lengths, so building a tensor from them
# raises ValueError. The error is caught and printed so the notebook still runs
# top to bottom. `batch_id` is passed directly (no extra list wrapper), matching
# how the equal-length and padded batches are built in later cells.
try:
  model(torch.tensor(batch_id)).logits
except ValueError as err:
  print(f"ValueError: {err}")

ValueError: ignored

In [134]:
# A batch whose two rows are both id_1, hence of equal length.
batch_id_with_same_len = [id_1] * 2

In [135]:
# Equal-length rows form a valid batch; each row gets the same logits as id_1 alone.
model(torch.tensor(batch_id_with_same_len)).logits

tensor([[ 1.0319, -0.8272],
        [ 1.0319, -0.8272]], grad_fn=<AddmmBackward0>)

In [136]:
# Length of the longer sequence, i.e. the length to pad the shorter one to.
len(id_1)

10

In [137]:
# Right-pad id_2 with the tokenizer's pad id up to the length of id_1.
# - The target length comes from id_1 instead of a hard-coded 10.
# - Padding is computed in one step; the former `while len(id_2) != max_len`
#   loop would never terminate if id_2 were longer than max_len. A negative
#   repeat count yields an empty list, so that case is now a no-op.
# - A new list is bound to id_2 rather than appending in place, so lists that
#   already reference the old id_2 object (such as an earlier batch_id) are not
#   silently changed, and re-running the cell leaves a padded id_2 unchanged.
max_len = len(id_1)
current_len = len(id_2)

id_2 = id_2 + [tokenizer.pad_token_id] * (max_len - current_len)

In [138]:
# Confirm id_2 now has the target length.
len(id_2)

10

In [139]:
# Show the padded ids; the trailing entries are the pad id.
id_2

[1045, 1005, 2310, 2042, 11131, 19081, 2005, 4569, 0, 0]

In [140]:
# Rebuild the batch now that both rows have the same length.
batch_id = [id_1, id_2]
batch_id

[[1045, 1005, 2310, 2042, 11131, 19081, 2005, 3019, 2653, 6364],
 [1045, 1005, 2310, 2042, 11131, 19081, 2005, 4569, 0, 0]]

In [141]:
# Run the padded batch WITHOUT an attention mask. The second row's logits no
# longer match those obtained for id_2 alone, and the library warns about it.
model(torch.tensor(batch_id)).logits

We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


tensor([[ 1.0319, -0.8272],
        [ 1.6357, -1.3086]], grad_fn=<AddmmBackward0>)

In [142]:
# Mask pattern for the padded row: 1 for its eight real ids, 0 for its two pad ids.
[1]*8 + [0]*2

[1, 1, 1, 1, 1, 1, 1, 1, 0, 0]

In [143]:
# Build the attention mask from the batch itself instead of hard-coded lengths:
# 1 for every real token id, 0 for every padding id. This stays correct if the
# sentences (and therefore the sequence lengths) change.
# Assumes the pad id occurs in batch_id only as padding.
attention_mask = [
    [int(token_id != tokenizer.pad_token_id) for token_id in sequence]
    for sequence in batch_id
]
attention_mask

[[1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 0, 0]]

In [145]:
# Run the padded batch together with its attention mask. With the mask applied,
# the second row's logits match those obtained for the unpadded id_2 alone.
model(input_ids=torch.tensor(batch_id), attention_mask=torch.tensor(attention_mask)).logits

tensor([[ 1.0319, -0.8272],
        [-0.1267,  0.3749]], grad_fn=<AddmmBackward0>)