# Processing the data (PyTorch)



Install the Transformers, Datasets, and Evaluate libraries to run this notebook.

In [1]:
# Instalasi library yang diperlukan
!pip install datasets evaluate transformers[sentencepiece]

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m5.

In [2]:
# Import the required libraries.
# AdamW comes from PyTorch: the copy that used to live in `transformers` is
# deprecated in favour of `torch.optim.AdamW`, so importing it from
# `transformers` is not reliable across library versions.
import torch
from torch.optim import AdamW
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Pretrained model checkpoint
checkpoint = "bert-base-uncased"
# Load the tokenizer and the model from the checkpoint
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

# Example input data
sequences = [
    "I've been waiting for a HuggingFace course my whole life.",  # Positive
    "This course is amazing!",  # Positive
]
# Tokenize with padding and truncation, returning PyTorch tensors
batch = tokenizer(sequences, padding=True, truncation=True, return_tensors="pt")

# Attach the labels (1 = positive for both examples here)
batch["labels"] = torch.tensor([1, 1])  # Label tensor

# Optimizer that updates the model weights
optimizer = AdamW(model.parameters())
# Passing "labels" along with the inputs makes the model output carry a loss
loss = model(**batch).loss
loss.backward()  # Backpropagation
optimizer.step()  # Update the model parameters

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [3]:

# Load the MRPC task of the GLUE benchmark
from datasets import load_dataset

# `path` selects the benchmark, `name` selects the task configuration
raw_datasets = load_dataset(path="glue", name="mrpc")
# Show the splits and their columns
print(raw_datasets)

README.md:   0%|          | 0.00/35.3k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/649k [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/75.7k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/308k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/3668 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/408 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1725 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 408
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 1725
    })
})


In [4]:
# Grab the training split of the dataset
raw_train_dataset = raw_datasets["train"]

# Print the first training example, then the feature schema, one per line
print(raw_train_dataset[0], raw_train_dataset.features, sep="\n")

{'sentence1': 'Amrozi accused his brother , whom he called " the witness " , of deliberately distorting his evidence .', 'sentence2': 'Referring to him as only " the witness " , Amrozi accused his brother of deliberately distorting his evidence .', 'label': 1, 'idx': 0}
{'sentence1': Value(dtype='string', id=None), 'sentence2': Value(dtype='string', id=None), 'label': ClassLabel(names=['not_equivalent', 'equivalent'], id=None), 'idx': Value(dtype='int32', id=None)}


In [5]:
# Display the feature schema of the training split (column names and types)
raw_train_dataset.features

{'sentence1': Value(dtype='string', id=None),
 'sentence2': Value(dtype='string', id=None),
 'label': ClassLabel(names=['not_equivalent', 'equivalent'], id=None),
 'idx': Value(dtype='int32', id=None)}

In [6]:
# Tokenize the sentences with the tokenizer
from transformers import AutoTokenizer

# Load the tokenizer
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# Tokenize each sentence column on its own.
# The columns are materialized with list(...) first: the tokenizer expects a
# plain list of strings, and depending on the installed `datasets` version a
# column access may return a lazy column object instead of a list.
tokenized_sentences_1 = tokenizer(list(raw_datasets["train"]["sentence1"]))
tokenized_sentences_2 = tokenizer(list(raw_datasets["train"]["sentence2"]))


In [7]:
# Tokenize a pair of sentences in a single call
inputs = tokenizer(
    text="This is the first sentence.",
    text_pair="This is the second one.",
)
# Show the tokenization result
print(inputs)

{'input_ids': [101, 2023, 2003, 1996, 2034, 6251, 1012, 102, 2023, 2003, 1996, 2117, 2028, 1012, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


In [8]:
# Convert the token IDs back into their token strings and print them
print(tokenizer.convert_ids_to_tokens(inputs["input_ids"]))

['[CLS]', 'this', 'is', 'the', 'first', 'sentence', '.', '[SEP]', 'this', 'is', 'the', 'second', 'one', '.', '[SEP]']


In [9]:
# Tokenize the whole training split as sentence pairs, with padding and
# truncation.
# The columns are materialized with list(...) first: the tokenizer expects
# plain lists of strings, and depending on the installed `datasets` version a
# column access may return a lazy column object instead of a list.
tokenized_dataset = tokenizer(
    list(raw_datasets["train"]["sentence1"]),
    list(raw_datasets["train"]["sentence2"]),
    padding=True,
    truncation=True,
)

In [10]:
# Tokenization helper intended to be applied to the dataset in batches
def tokenize_function(example):
    """Tokenize the "sentence1"/"sentence2" fields of ``example`` as pairs.

    Truncation is enabled and no padding is requested here. The value returned
    is whatever the module-level ``tokenizer`` returns for the pair.
    """
    first_sentences = example["sentence1"]
    second_sentences = example["sentence2"]
    return tokenizer(first_sentences, second_sentences, truncation=True)

In [11]:
# Apply the tokenization helper to every split, processing examples in batches
tokenized_datasets = raw_datasets.map(
    function=tokenize_function,
    batched=True,
)
# Show the tokenized dataset
print(tokenized_datasets)

Map:   0%|          | 0/3668 [00:00<?, ? examples/s]

Map:   0%|          | 0/408 [00:00<?, ? examples/s]

Map:   0%|          | 0/1725 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 408
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1725
    })
})


In [12]:
# Use a data collator so padding is applied dynamically, per batch
from transformers import DataCollatorWithPadding

# The collator is built around the tokenizer loaded earlier in the notebook
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [13]:
# Take the first 8 training examples for the demonstration and keep only the
# columns the collator should see (the index and raw-text columns are dropped)
samples = {
    column: values
    for column, values in tokenized_datasets["train"][:8].items()
    if column not in ("idx", "sentence1", "sentence2")
}

In [14]:
# Show the length of each example's input IDs before collation
print(list(map(len, samples["input_ids"])))

# Build a batch with the data collator
batch = data_collator(samples)
# Show the shape of every entry in the batch
print({name: tensor.shape for name, tensor in batch.items()})

[50, 59, 47, 67, 59, 50, 62, 32]
{'input_ids': torch.Size([8, 67]), 'token_type_ids': torch.Size([8, 67]), 'attention_mask': torch.Size([8, 67]), 'labels': torch.Size([8])}
