In [1]:
# Word-based tokenization: every distinct word is mapped to its own ID.
# Words missing from the vocabulary are lost information for the model.
# Splitting on whitespace is the simplest form; punctuation stays glued to words.
sentence = "Karpathy is a legit researcher!"
tokenized_sentence = sentence.split()
print(tokenized_sentence)

['Karpathy', 'is', 'a', 'legit', 'researcher!']


In [13]:
# A character-based tokenizer is one way to cut down on unknown tokens.
# Subword-based tokenization is the middle ground: frequently used words stay whole,
# while rare words are decomposed into meaningful subwords.
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
sample_text = "trying bert felt cute. might delete later"
# Leave the call as the cell's last expression so the encoding is displayed.
tokenizer(sample_text)

{'input_ids': [101, 1774, 1129, 3740, 1464, 10509, 119, 1547, 3687, 16618, 1224, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [25]:
# Checkpoint-agnostic way to do the same thing: load through AutoTokenizer
# instead of naming the tokenizer class explicitly.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
sample_text = "i'm trying the tokenizer right now. can't talk baby!"
# Leave the call as the cell's last expression so the encoding is displayed.
tokenizer(sample_text)

{'input_ids': [101, 1045, 1005, 1049, 2667, 1996, 19204, 17629, 2157, 2085, 1012, 2064, 1005, 1056, 2831, 3336, 999, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [None]:
# Write the tokenizer files to disk; the path is relative to the kernel's working directory.
tokenizer.save_pretrained(save_directory="desktop/weights-in-bio")

In [26]:
# How does it really work under the hood?
# Step 1: split the text into tokens (subwords carry a '##' prefix, see the output).
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
raw_text = "i'm trying the tokenizer right now. can't talk baby!"
tokens = tokenizer.tokenize(raw_text)
print(tokens)

['i', "'", 'm', 'trying', 'the', 'token', '##izer', 'right', 'now', '.', 'can', "'", 't', 'talk', 'baby', '!']


In [27]:
# Step 2: look up the vocabulary ID of every token produced by the previous cell.
inputs_ids = [tokenizer.convert_tokens_to_ids(token) for token in tokens]
print(inputs_ids)

[1045, 1005, 1049, 2667, 1996, 19204, 17629, 2157, 2085, 1012, 2064, 1005, 1056, 2831, 3336, 999]


In [36]:
# Round trip: encode a sentence, then decode the IDs back to text.
# The decoded string includes the special tokens the tokenizer added (see output).
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
inputs = tokenizer("i'm trying the tokenizer right now. can't talk baby!")
decoded_text = tokenizer.decode(inputs["input_ids"])
print(decoded_text)


[CLS] i'm trying the tokenizer right now. can't talk baby! [SEP]


In [None]:
# Handling multiple sequences
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

sequence = "DeepMind and OpenAI and xAI and groq are one of the most legit companies i've ever seen!"

tokens = tokenizer.tokenize(sequence)
ids = tokenizer.convert_tokens_to_ids(tokens)
inputs_ids = torch.tensor(ids)  # 1-D tensor: there is no batch dimension

# Feeding an un-batched tensor to the model is expected to fail. The failure is the
# point of this cell, so catch and display it instead of letting the exception abort
# a "Restart Kernel -> Run All".
try:
    model(inputs_ids)
except Exception as err:  # broad on purpose: the raised class is not pinned down here
    print(f"{type(err).__name__}: {err}")

In [3]:
# The previous cell failed because the model received a single, un-batched sequence,
# whereas 🤗 Transformers models expect a batch of sequences by default.
tokenized_inputs = tokenizer(sequence, return_tensors="pt")
batch_of_ids = tokenized_inputs["input_ids"]
print(batch_of_ids)
# Note the nested brackets in the output: the tokenizer did not just convert the list
# of input IDs into a tensor, it also added a dimension on top of it.

tensor([[  101,  2784, 23356,  1998,  2330,  4886,  1998,  1060,  4886,  1998,
         24665,  2080,  4160,  2024,  2028,  1997,  1996,  2087,  4190,  4183,
          3316,  1045,  1005,  2310,  2412,  2464,   999,   102]])


In [4]:
# Second attempt: give the ID tensor a leading batch dimension before calling the model.
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

sequence = "DeepMind and OpenAI and xAI and groq are one of the most legit companies i've ever seen!"

# Manual pipeline: text -> tokens -> vocabulary IDs.
# Unlike calling tokenizer(sequence) directly, this path does not add the leading 101 /
# trailing 102 IDs (compare the printed IDs with the output of the tokenizer(...) cell).
tokens = tokenizer.tokenize(sequence)
ids = tokenizer.convert_tokens_to_ids(tokens)

# Shape (1, number_of_tokens): a batch that contains exactly one sequence.
input_ids = torch.tensor(ids).unsqueeze(0)
print("Input IDs:", input_ids)

output = model(input_ids)
print("Logits:", output.logits)

Input IDs: tensor([[ 2784, 23356,  1998,  2330,  4886,  1998,  1060,  4886,  1998, 24665,
          2080,  4160,  2024,  2028,  1997,  1996,  2087,  4190,  4183,  3316,
          1045,  1005,  2310,  2412,  2464,   999]])
Logits: tensor([[-3.3088,  3.5042]], grad_fn=<AddmmBackward0>)


In [6]:
# Batching means sending several sentences through the model in a single call;
# it is what lets the model work when you feed it multiple sentences at once.
# Here the batch is simply the same ID list twice.
batched_ids = [ids] * 2

In [5]:
# Texts usually differ in length, so their ID lists cannot be stacked into a tensor directly.
# Padding solves this.
batched_ids = [[200, 200, 200], [200, 200]]  # ragged rows: this can't be converted to a tensor!

# Give every sentence the same length by appending the 'padding token' to the shorter ones.
# The ID below is only an illustrative value.
padding_id = 100
batched_ids = [
    [200, 200, 200],
    [200, 200] + [padding_id],
]

In [8]:
# Compare the logits of each sequence run alone with the logits of the padded batch.
# No attention mask is passed here; in the printed output the padded second row does
# not match the logits of the same sequence run on its own.
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

sequence1_ids = [[200, 200, 200]]
sequence2_ids = [[200, 200]]
batched_ids = [
    [200, 200, 200],
    [200, 200, tokenizer.pad_token_id],
]
print(batched_ids)

# Same forward pass for the two single-sequence inputs and then the padded batch.
for id_rows in (sequence1_ids, sequence2_ids, batched_ids):
    print(model(torch.tensor(id_rows)).logits)

[[200, 200, 200], [200, 200, 0]]
tensor([[ 1.5694, -1.3895]], grad_fn=<AddmmBackward0>)
tensor([[ 0.5803, -0.4125]], grad_fn=<AddmmBackward0>)
tensor([[ 1.5694, -1.3895],
        [ 1.3374, -1.2163]], grad_fn=<AddmmBackward0>)


In [9]:
# Attention masks
# An attention mask is a binary mask with the same layout as the input IDs:
# 1 marks a token that should be attended to, 0 marks a token that should be ignored.
batched_ids = [
    [200, 200, 200],
    [200, 200, tokenizer.pad_token_id],
]
# Only the padded last position of the second row is masked out.
attention_mask = [[1, 1, 1], [1, 1, 0]]

ids_tensor = torch.tensor(batched_ids)
mask_tensor = torch.tensor(attention_mask)
outputs = model(ids_tensor, attention_mask=mask_tensor)
print(outputs.logits)

tensor([[ 1.5694, -1.3895],
        [ 0.5803, -0.4125]], grad_fn=<AddmmBackward0>)


In [10]:
# Most models cap the sequence length (commonly 512 or 1024 tokens) and fail on longer
# inputs. The options are a model that supports longer sequences, or truncation.
#
# The limit is counted in tokens, not characters, so the truncation has to happen on the
# encoded input: slicing the raw string would not bound the number of tokens.
max_sequence_length = 512  # position limit of the DistilBERT checkpoint used in this notebook

# `sequence` is left untouched so the cell stays idempotent; the truncated encoding is
# stored under its own name.
truncated_inputs = tokenizer(
    sequence,
    truncation=True,
    max_length=max_sequence_length,
    return_tensors="pt",
)