# Putting it all together (PyTorch)

Install the Transformers, Datasets, and Evaluate libraries to run this notebook.

In [1]:
# Instalasi library yang diperlukan
!pip install datasets evaluate transformers[sentencepiece]
# Mengimpor AutoTokenizer
from transformers import AutoTokenizer

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m24.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m6

In [2]:
# Pretrained model checkpoint to use
checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
# Load the tokenizer from the checkpoint
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# Example input text
sequence = "I've been waiting for a HuggingFace course my whole life."

# Tokenize the text into the model's input format
model_inputs = tokenizer(sequence)  # Produces numeric inputs from the text

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

In [3]:
# Tokenize several sequences at once
sequences = ["I've been waiting for a HuggingFace course my whole life.", "So have I!"]

# Tokenize without padding
model_inputs = tokenizer(sequences)

In [4]:
# Each call below overwrites model_inputs, so only the last result is kept.

# Pad the sequences up to the longest sequence in the batch
model_inputs = tokenizer(sequences, padding="longest")

# Pad the sequences up to the model's maximum length (512 for BERT or DistilBERT)
model_inputs = tokenizer(sequences, padding="max_length")

# Pad the sequences up to a user-specified maximum length
model_inputs = tokenizer(sequences, padding="max_length", max_length=8)

In [5]:
# Same two-sentence batch as in the earlier cells
sequences = ["I've been waiting for a HuggingFace course my whole life.", "So have I!"]

# Truncation: cut sequences that are longer than the model's maximum length
model_inputs = tokenizer(sequences, truncation=True)

# Truncation: cut sequences that are longer than the specified maximum length
model_inputs = tokenizer(sequences, max_length=8, truncation=True)

In [6]:
# Same two-sentence batch as in the earlier cells
sequences = ["I've been waiting for a HuggingFace course my whole life.", "So have I!"]

# Each call below overwrites model_inputs, so only the last result is kept.

# Return the encoded sequences as PyTorch tensors
model_inputs = tokenizer(sequences, padding=True, return_tensors="pt")

# Return the encoded sequences as TensorFlow tensors
# NOTE(review): presumably requires TensorFlow to be installed in this environment — confirm
model_inputs = tokenizer(sequences, padding=True, return_tensors="tf")

# Return the encoded sequences as NumPy arrays
model_inputs = tokenizer(sequences, padding=True, return_tensors="np")

In [7]:
# Example input text again
sequence = "I've been waiting for a HuggingFace course my whole life."

# Tokenize the sequence by calling the tokenizer directly
model_inputs = tokenizer(sequence)
print(model_inputs["input_ids"])  # Show the numeric inputs produced from the text

# Tokenize step by step: text -> tokens -> IDs
tokens = tokenizer.tokenize(sequence)  # Split the text into sub-word tokens
ids = tokenizer.convert_tokens_to_ids(tokens)  # Convert the tokens to numeric IDs
print(ids)  # Show the token IDs

[101, 1045, 1005, 2310, 2042, 3403, 2005, 1037, 17662, 12172, 2607, 2026, 2878, 2166, 1012, 102]
[1045, 1005, 2310, 2042, 3403, 2005, 1037, 17662, 12172, 2607, 2026, 2878, 2166, 1012]


In [8]:
# Decode both ID lists back to text: first the IDs produced by calling the
# tokenizer directly, then the IDs obtained via tokenize + convert_tokens_to_ids.
for id_list in (model_inputs["input_ids"], ids):
    print(tokenizer.decode(id_list))

[CLS] i've been waiting for a huggingface course my whole life. [SEP]
i've been waiting for a huggingface course my whole life.


In [9]:
# Import PyTorch plus the tokenizer and sequence-classification model loaders
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Load the tokenizer and the model from the checkpoint (defined in an earlier cell)
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

# Batch of input sequences
sequences = ["I've been waiting for a HuggingFace course my whole life.", "So have I!"]

# Tokenize the batch with padding and truncation, returning PyTorch tensors.
# NOTE: this rebinds `tokens`, which an earlier cell used for the list of
# sub-word token strings; from here on it holds the tokenizer's batch output.
tokens = tokenizer(sequences, padding=True, truncation=True, return_tensors="pt")

# Run the model on the tokenized batch. This cell only performs inference, so
# gradient tracking is disabled to avoid building an autograd graph.
with torch.no_grad():
    output = model(**tokens)

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]