# Handling multiple sequences (PyTorch)

Install the Transformers, Datasets, and Evaluate libraries to run this notebook.

In [1]:
# Instalasi library yang diperlukan
!pip install datasets evaluate transformers[sentencepiece]

# Import library yang diperlukan
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 480.6/480.6 kB 875.9 kB/s eta 0:00:00
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 84.0/84.0 kB

In [12]:
# Name of the pre-trained checkpoint used throughout the notebook
checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"

# Load the tokenizer and the sequence-classification model from that checkpoint
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

# Input text
sequence = "I've been waiting for a HuggingFace course my whole life."

# Tokenize by hand: text -> tokens -> numeric IDs
tokens = tokenizer.tokenize(sequence)  # Split the text into tokens
ids = tokenizer.convert_tokens_to_ids(tokens)  # Map each token to its numeric ID

# Build the input-IDs tensor directly from the flat ID list (no batch dimension)
input_ids = torch.tensor(ids)  # One-dimensional tensor of token IDs
# The call below is intentionally left disabled: it is expected to fail because the
# model needs a two-dimensional (batched) input. Later cells build a batched tensor.
#model(input_ids)

In [3]:
# Let the tokenizer build the input itself: with return_tensors="pt" it returns
# PyTorch tensors that already carry the batch dimension.
tokenized_inputs = tokenizer(sequence, return_tensors="pt")  # Batched tensors from the raw text
print(tokenized_inputs["input_ids"])  # Show the correctly shaped input-IDs tensor


tensor([[  101,  1045,  1005,  2310,  2042,  3403,  2005,  1037, 17662, 12172,
          2607,  2026,  2878,  2166,  1012,   102]])


In [4]:
# Turn the flat list of token IDs into a batch of one sequence by adding a leading
# batch dimension (no padding is involved here).
input_ids = torch.tensor(ids).unsqueeze(0)
print("Input IDs:", input_ids)

# Forward pass through the model, then show the raw classification logits.
output = model(input_ids)
print("Logits:", output.logits)


Input IDs: tensor([[ 1045,  1005,  2310,  2042,  3403,  2005,  1037, 17662, 12172,  2607,
          2026,  2878,  2166,  1012]])
Logits: tensor([[-2.7276,  2.8789]], grad_fn=<AddmmBackward0>)


In [5]:
# A batch whose rows have different lengths (3 IDs vs. 2 IDs): the second row
# still needs padding before the rows line up.
batched_ids = [[200] * 3, [200] * 2]


In [6]:
# Pad the shorter row with a placeholder ID so that both rows have the same length.
padding_id = 100
batched_ids = [[200] * 3, [200] * 2 + [padding_id]]

In [7]:
# Reload the sequence-classification model from the checkpoint.
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

# Each sequence on its own (as a batch of one) ...
sequence1_ids = [[200] * 3]
sequence2_ids = [[200] * 2]
# ... and both together, with the shorter row padded using the tokenizer's pad token ID.
batched_ids = [
    [200] * 3,
    [200] * 2 + [tokenizer.pad_token_id],
]

# Print the logits for sequence 1, sequence 2, and the padded batch, in that order.
# No attention mask is passed, so the padding token in the batch is treated as
# ordinary input.
for token_id_rows in (sequence1_ids, sequence2_ids, batched_ids):
    print(model(torch.tensor(token_id_rows)).logits)

We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


tensor([[ 1.5694, -1.3895]], grad_fn=<AddmmBackward0>)
tensor([[ 0.5803, -0.4125]], grad_fn=<AddmmBackward0>)
tensor([[ 1.5694, -1.3895],
        [ 1.3374, -1.2163]], grad_fn=<AddmmBackward0>)


In [8]:
# The padded batch again, this time paired with an attention mask that marks which
# positions are real tokens (1) and which are padding to be ignored (0).
batched_ids = [[200, 200, 200], [200, 200, tokenizer.pad_token_id]]
attention_mask = [[1, 1, 1], [1, 1, 0]]

# Convert both to tensors, run the model with the mask supplied, and show the logits.
batch_tensor = torch.tensor(batched_ids)
mask_tensor = torch.tensor(attention_mask)
outputs = model(batch_tensor, attention_mask=mask_tensor)
print(outputs.logits)

tensor([[ 1.5694, -1.3895],
        [ 0.5803, -0.4125]], grad_fn=<AddmmBackward0>)


In [9]:
# Cap the input at the maximum sequence length, measured in TOKENS.
# `sequence` is a plain string, so slicing it (sequence[:512]) would cut after 512
# characters rather than 512 tokens; that does not enforce the token limit and can
# cut in the middle of a word. Let the tokenizer truncate instead, and keep the
# original text untouched so this cell can be re-run safely.
max_sequence_length = 512  # Example maximum sequence length, in tokens
truncated_inputs = tokenizer(
    sequence,
    truncation=True,  # Drop tokens beyond max_length
    max_length=max_sequence_length,
    return_tensors="pt",
)
