In [1]:
pip install transformers[torch] datasets evaluate rouge_score tqdm wandb accelerate openpyxl sentencepiece protobuf ipywidgets

Collecting evaluate
  Downloading evaluate-0.4.6-py3-none-any.whl.metadata (9.5 kB)
Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting jedi>=0.16 (from ipython>=4.0.0->ipywidgets)
  Downloading jedi-0.19.2-py2.py3-none-any.whl.metadata (22 kB)
Downloading evaluate-0.4.6-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading jedi-0.19.2-py2.py3-none-any.whl (1.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m50.6 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=351e1a0df37e808d5a6b361c63d66ebb52ac6e740e5f887369e121e192078103
  Stored in directory: /root/.cache/pip/whe

In [4]:
import torch
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from transformers import DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer
import numpy as np

# 1. Konfigurasi
MODEL_CHECKPOINT = "t5-base"  # Model pre-trained
MAX_INPUT_LENGTH = 512        # Panjang maksimal input (konteks + pertanyaan)
MAX_TARGET_LENGTH = 32        # Panjang maksimal output (jawaban)

# 2. Load Dataset SQuAD
# SQuAD berisi kolom: 'id', 'title', 'context', 'question', 'answers'
raw_datasets = load_dataset("squad")
print("Contoh data:", raw_datasets["train"][0])

# 3. Load Tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)
print("Berhasil import dan load dataset!")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

plain_text/train-00000-of-00001.parquet:   0%|          | 0.00/14.5M [00:00<?, ?B/s]

plain_text/validation-00000-of-00001.par(…):   0%|          | 0.00/1.82M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/87599 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10570 [00:00<?, ? examples/s]

Contoh data: {'id': '5733be284776f41900661182', 'title': 'University_of_Notre_Dame', 'context': 'Architecturally, the school has a Catholic character. Atop the Main Building\'s gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.', 'question': 'To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?', 'answers': {'text': ['Saint Bernadette Soubirous'], 'answer_start': [515]}}


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

Berhasil import dan load dataset!


In [5]:
def preprocess_function(examples):
    inputs = []
    targets = []

    # Loop melalui setiap data
    for i in range(len(examples["context"])):
        # Format input khusus T5
        input_text = f"question: {examples['question'][i]} context: {examples['context'][i]}"
        inputs.append(input_text)

        # Ambil teks jawaban pertama (SQuAD bisa punya beberapa referensi jawaban, ambil yang pertama untuk training)
        targets.append(examples["answers"][i]["text"][0])

    # Tokenisasi Input
    model_inputs = tokenizer(inputs, max_length=MAX_INPUT_LENGTH, truncation=True, padding="max_length")

    # Tokenisasi Target (Jawaban)
    labels = tokenizer(targets, max_length=MAX_TARGET_LENGTH, truncation=True, padding="max_length")

    # Ganti padding token di label menjadi -100 agar tidak dihitung dalam loss
    labels["input_ids"] = [
        [(l if l != tokenizer.pad_token_id else -100) for l in label] for label in labels["input_ids"]
    ]

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Terapkan preprocessing ke seluruh dataset (Train & Validation)
tokenized_datasets = raw_datasets.map(preprocess_function, batched=True, remove_columns=raw_datasets["train"].column_names)

Map:   0%|          | 0/87599 [00:00<?, ? examples/s]

Map:   0%|          | 0/10570 [00:00<?, ? examples/s]

In [6]:
# 1. Load Model T5
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_CHECKPOINT)

# 2. Setup Argumen Training
args = Seq2SeqTrainingArguments(
    output_dir="./t5-finetuned-squad",
    # evaluation_strategy="epoch",      # Evaluasi setiap akhir epoch (removed due to error)
    # save_strategy="epoch",          # Simpan model setiap akhir epoch (removed due to error)
    learning_rate=2e-5,               # Learning rate standar untuk fine-tuning
    per_device_train_batch_size=8,   # Sesuaikan dengan memori GPU (bisa diturunkan ke 8 atau 4)
    per_device_eval_batch_size=8,
    weight_decay=0.01,
    save_total_limit=3,               # Hanya simpan 3 checkpoint terakhir
    num_train_epochs=2,               # Jumlah putaran latihan
    predict_with_generate=True,       # Penting: Generate teks saat evaluasi untuk hitung metrik
    fp16=True,                        # Gunakan Mixed Precision (lebih cepat di GPU modern)
    push_to_hub=False,
    # load_best_model_at_end=True,      # Muat model terbaik di akhir training (removed due to error)
)

# 3. Data Collator (Mengurus batching secara dinamis)
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# 4. Inisialisasi Trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
)

model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

  trainer = Seq2SeqTrainer(


In [5]:
trainer.train()

  | |_| | '_ \/ _` / _` |  _/ -_)
[34m[1mwandb[0m: (1) Create a W&B account
[34m[1mwandb[0m: (2) Use an existing W&B account
[34m[1mwandb[0m: (3) Don't visualize my results
[34m[1mwandb[0m: Enter your choice:

 3


[34m[1mwandb[0m: You chose "Don't visualize my results"


Step,Training Loss
500,0.2422
1000,0.2385
1500,0.2374
2000,0.236
2500,0.2397
3000,0.236
3500,0.2289
4000,0.2465
4500,0.238
5000,0.2502


TrainOutput(global_step=21900, training_loss=0.25392941966993077, metrics={'train_runtime': 16094.7938, 'train_samples_per_second': 10.885, 'train_steps_per_second': 1.361, 'total_flos': 1.0668820451033088e+17, 'train_loss': 0.25392941966993077, 'epoch': 2.0})

In [6]:
# Fungsi untuk prediksi manual
def ask_question(question, context):
    # 1. Format input
    input_text = f"question: {question} context: {context}"

    # 2. Tokenisasi
    inputs = tokenizer(input_text, return_tensors="pt").input_ids.to("cuda") # Pastikan ke GPU jika pakai GPU

    # 3. Generate Jawaban
    outputs = model.generate(inputs, max_length=32)

    # 4. Decode hasil token menjadi teks
    answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return answer

# --- CONTOH PENGGUNAAN ---
my_context = """
Monas atau Monumen Nasional adalah ikon kota Jakarta yang terletak di pusat kota.
Tugu ini dibangun untuk mengenang perlawanan dan perjuangan rakyat Indonesia untuk merebut kemerdekaan
dari pemerintahan kolonial Hindia Belanda. Pembangunan dimulai pada tanggal 17 Agustus 1961.
"""

my_question = "Kapan pembangunan Monas dimulai?"

print("Konteks:", my_context)
print("Pertanyaan:", my_question)
print("-" * 30)
print("Jawaban Model:", ask_question(my_question, my_context))

Konteks: 
Monas atau Monumen Nasional adalah ikon kota Jakarta yang terletak di pusat kota.
Tugu ini dibangun untuk mengenang perlawanan dan perjuangan rakyat Indonesia untuk merebut kemerdekaan
dari pemerintahan kolonial Hindia Belanda. Pembangunan dimulai pada tanggal 17 Agustus 1961.

Pertanyaan: Kapan pembangunan Monas dimulai?
------------------------------
Jawaban Model: pada tanggal 17 Agustus 1961
