# Tokenizers (PyTorch)

Install the Transformers, Datasets, and Evaluate libraries to run this notebook.

In [1]:
# Instalasi library yang diperlukan
!pip install datasets evaluate transformers[sentencepiece]

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m4.

In [2]:
# Naive manual tokenization: break the sentence apart wherever whitespace occurs.
sentence = "Jim Henson was a puppeteer"
tokenized_text = sentence.split()

# Show the resulting list of word-level tokens.
print(tokenized_text)

['Jim', 'Henson', 'was', 'a', 'puppeteer']


In [3]:
# Import the BERT-specific tokenizer class.
from transformers import BertTokenizer

# Load the pretrained tokenizer for the "bert-base-cased" checkpoint.
tokenizer = BertTokenizer.from_pretrained("bert-base-cased")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [4]:
# Import AutoTokenizer, a more generic option that works across different models.
from transformers import AutoTokenizer

# Load the tokenizer for the "bert-base-cased" checkpoint through AutoTokenizer.
# This rebinds `tokenizer`, replacing the BertTokenizer instance loaded earlier.
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

In [5]:
# Run the tokenizer on a sentence to turn it into numeric model inputs.
print(tokenizer("Using a Transformer network is simple"))  # Prints input_ids, token_type_ids, and attention_mask

{'input_ids': [101, 7993, 170, 13809, 23763, 2443, 1110, 3014, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1]}


In [6]:
# Save the tokenizer to a local directory.
tokenizer.save_pretrained("directory_on_my_computer")  # Writes the tokenizer files into the "directory_on_my_computer" folder

('directory_on_my_computer/tokenizer_config.json',
 'directory_on_my_computer/special_tokens_map.json',
 'directory_on_my_computer/vocab.txt',
 'directory_on_my_computer/added_tokens.json',
 'directory_on_my_computer/tokenizer.json')

In [7]:
# Tokenize text with the BERT tokenizer.
from transformers import AutoTokenizer

# Load the tokenizer again (same "bert-base-cased" checkpoint as the earlier cells).
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

# Text to tokenize.
sequence = "Using a Transformer network is simple"
# Split the text into a list of tokens; pieces prefixed with "##" in the
# output continue the preceding token rather than starting a new word.
tokens = tokenizer.tokenize(sequence)
print(tokens)  # Display the resulting token list

['Using', 'a', 'Trans', '##former', 'network', 'is', 'simple']


In [8]:
# Convert the tokens into their numeric token IDs.
ids = tokenizer.convert_tokens_to_ids(tokens)
print(ids)  # Display the list of IDs, one per token

[7993, 170, 13809, 23763, 2443, 1110, 3014]


In [9]:
# Decode the token IDs back into a string.
# Decode the `ids` produced by convert_tokens_to_ids rather than a hard-coded
# list: the previous literal IDs did not match this tokenizer's output for
# `sequence`, so the decoded text differed from the original sentence.
decoded_string = tokenizer.decode(ids)
print(decoded_string)  # Display the decoded string


Using a transformer network is simple
