In [102]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from plotly import graph_objs as go
from tqdm import tqdm

## Basic Example: Classify Sentences

In [103]:
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
sequences = [
    "I've been waiting for a HuggingFace course my whole life.", # 1
    "This course is disappointing.", # 0
    "This course is amazing!", # 1
    "This course is the worst.", # 0
]
batch = tokenizer(sequences, padding=True, truncation=True, return_tensors="pt")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [104]:
display(tokenizer('hi'))
display(tokenizer('hi', 'there'))
tokenizer('hi', 'there', 'put') # 2 is the max, then it's counted as label (?)

{'input_ids': [101, 7632, 102], 'token_type_ids': [0, 0, 0], 'attention_mask': [1, 1, 1]}

{'input_ids': [101, 7632, 102, 2045, 102], 'token_type_ids': [0, 0, 0, 1, 1], 'attention_mask': [1, 1, 1, 1, 1]}

{'input_ids': [101, 7632, 102, 2045, 102], 'token_type_ids': [0, 0, 0, 1, 1], 'attention_mask': [1, 1, 1, 1, 1], 'labels': [101, 2404, 102]}

In [105]:
for i, (input_id, token_type_id, attention_mask) in enumerate(zip(batch["input_ids"], batch["token_type_ids"], batch["attention_mask"])):
    print(f"Sequence {i}:")
    print(f"input_id: {input_id.tolist()}")
    print(f"token_type_id: {token_type_id.tolist()}")
    print(f"attention_mask: {attention_mask.tolist()}")
    print()

Sequence 0:
input_id: [101, 1045, 1005, 2310, 2042, 3403, 2005, 1037, 17662, 12172, 2607, 2026, 2878, 2166, 1012, 102]
token_type_id: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
attention_mask: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]

Sequence 1:
input_id: [101, 2023, 2607, 2003, 15640, 1012, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0]
token_type_id: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
attention_mask: [1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]

Sequence 2:
input_id: [101, 2023, 2607, 2003, 6429, 999, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0]
token_type_id: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
attention_mask: [1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]

Sequence 3:
input_id: [101, 2023, 2607, 2003, 1996, 5409, 1012, 102, 0, 0, 0, 0, 0, 0, 0, 0]
token_type_id: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
attention_mask: [1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]



input_ids: encoded tokens

token_type_ids: binary mask; which tokens belong to text element 1 or 2 (here, we only have one element per sequence so all are 0)

attention_mask: which tokens should be attended to (if we have different length sequences in a batch, shorter ones contain zeros)

In [None]:
batch["labels"] = torch.tensor([1, 0, 1, 0]) # 1 is positive, 0 is negative

batch.to("mps")

model = model.to("mps")

optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)
loss = model(**batch).loss

print(f"Initial loss: {loss.item()}")

losses = []
for i in tqdm(range(30), desc=f"Training..."):
    optimizer.zero_grad()
    loss = model(**batch).loss
    loss.backward()
    optimizer.step()
    losses.append(loss.item())

print(f"Final loss: {loss.item()}")

fig = go.Figure()
fig.add_trace(go.Scatter(x=list(range(len(losses))), y=losses))
fig.show()

Initial loss: 0.6963284015655518


Training...: 100%|██████████| 30/30 [00:04<00:00,  6.75it/s]

Final loss: 0.04599013179540634



