In [1]:
pip install transformers[torch] datasets evaluate rouge_score tqdm wandb accelerate openpyxl sentencepiece protobuf ipywidgets

Collecting evaluate
  Downloading evaluate-0.4.6-py3-none-any.whl.metadata (9.5 kB)
Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting jedi>=0.16 (from ipython>=4.0.0->ipywidgets)
  Downloading jedi-0.19.2-py2.py3-none-any.whl.metadata (22 kB)
Downloading evaluate-0.4.6-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading jedi-0.19.2-py2.py3-none-any.whl (1.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m37.6 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=d5a214f05e046c7a3a782753fbeeeed34f858c305e2353ee59f478f487d876d4
  Stored in directory: /root/.cache/pip/whe

In [2]:
import torch
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from transformers import DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer
import numpy as np

In [4]:
# Cek ketersediaan GPU (misalnya T4 di Google Colab)
if torch.cuda.is_available():                                                   # cek apakah CUDA/GPU aktif [web:2][web:6]
    device = torch.device("cuda")                                               # set device ke GPU [web:6]
    print("CUDA tersedia, menggunakan device:", device)                         # info penggunaan GPU
    print("Nama GPU :", torch.cuda.get_device_name(0))                          # tampilkan nama GPU (contoh: Tesla T4) [web:7][web:9]
else:
    device = torch.device("cpu")                                                # fallback ke CPU jika tidak ada GPU [web:6][web:12]
    print("CUDA tidak tersedia, menggunakan CPU:", device)                      # info penggunaan CPU

CUDA tersedia, menggunakan device: cuda
Nama GPU : Tesla T4


In [5]:
# 1. Konfigurasi dasar
BASE_MODEL = "t5-base"                      # nama model pre-trained yang akan digunakan
MAX_SOURCE_LEN = 512                        # panjang maksimum teks input (context + question)
MAX_TARGET_LEN = 32                         # panjang maksimum teks output (jawaban)

# 2. Memuat dataset SQuAD
# SQuAD berisi: 'id', 'title', 'context', 'question', 'answers'
dataset_squad = load_dataset("squad")       # download dan load dataset SQuAD
print("Contoh satu sampel:", dataset_squad["train"][0])  # tampilkan contoh baris pertama

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

plain_text/train-00000-of-00001.parquet:   0%|          | 0.00/14.5M [00:00<?, ?B/s]

plain_text/validation-00000-of-00001.par(…):   0%|          | 0.00/1.82M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/87599 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10570 [00:00<?, ? examples/s]

Contoh satu sampel: {'id': '5733be284776f41900661182', 'title': 'University_of_Notre_Dame', 'context': 'Architecturally, the school has a Catholic character. Atop the Main Building\'s gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.', 'question': 'To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?', 'answers': {'text': ['Saint Bernadette Soubirous'], 'answer_start': [515]}}


In [6]:
# 3. Memuat tokenizer
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)    # inisialisasi tokenizer dari model dasar
print("Tokenizer dan dataset berhasil dimuat!")          # pesan konfirmasi

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

Tokenizer dan dataset berhasil dimuat!


In [7]:
def preprocess_samples(batch):                                                    # fungsi untuk preprocessing satu batch data
    input_texts = []                                                              # list untuk menampung teks input ke model
    target_texts = []                                                             # list untuk menampung teks jawaban (label)

    # loop setiap contoh pada batch
    for idx in range(len(batch["context"])):                                      # iterasi sepanjang jumlah context
        # format input sesuai gaya T5: gabung question + context
        src = f"question: {batch['question'][idx]} context: {batch['context'][idx]}"
        input_texts.append(src)                                                   # simpan string input

        # ambil jawaban pertama dari list answers SQuAD sebagai target
        answer_text = batch["answers"][idx]["text"][0]                            # SQuAD bisa punya beberapa jawaban
        target_texts.append(answer_text)                                          # simpan jawaban ke list target

    # tokenisasi input (pertanyaan + konteks)
    model_inputs = tokenizer(                                                     # tokenisasi teks input
        input_texts,
        max_length=MAX_SOURCE_LEN,                                                # batas panjang input
        truncation=True,                                                          # potong jika melebihi max_length
        padding="max_length"                                                      # pad hingga max_length
    )

    # tokenisasi target (jawaban)
    label_tokens = tokenizer(                                                     # tokenisasi teks jawaban
        target_texts,
        max_length=MAX_TARGET_LEN,                                                # batas panjang output
        truncation=True,
        padding="max_length"
    )

    # ganti padding token di label menjadi -100 agar diabaikan saat menghitung loss
    label_tokens["input_ids"] = [                                                 # ubah setiap sequence label
        [(tok if tok != tokenizer.pad_token_id else -100)                         # pad_token -> -100
         for tok in seq]
        for seq in label_tokens["input_ids"]
    ]

    model_inputs["labels"] = label_tokens["input_ids"]                            # tambahkan kolom labels ke input
    return model_inputs                                                           # kembalikan dict siap untuk training

In [10]:
# Terapkan preprocessing ke seluruh dataset (train & validation)
tokenized_datasets = dataset_squad.map(                                           # mapping fungsi preprocessing ke dataset [web:23][web:26]
    preprocess_samples,                                                          # fungsi yang didefinisikan di atas
    batched=True,                                                                # proses per batch, bukan per satu contoh
    remove_columns=dataset_squad["train"].column_names                            # hapus kolom asli karena sudah diproses [web:23][web:26]
)

Map:   0%|          | 0/87599 [00:00<?, ? examples/s]

Map:   0%|          | 0/10570 [00:00<?, ? examples/s]

In [11]:
# 1. Memuat model T5
model = AutoModelForSeq2SeqLM.from_pretrained(BASE_MODEL)          # load model seq2seq T5 dari checkpoint pre-trained [web:35]
model = model.to(device)                                         # opsional: pindahkan model ke GPU/CPU yang sudah dicek sebelumnya

model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [12]:
# 2. Menyiapkan argumen pelatihan
training_args = Seq2SeqTrainingArguments(
    output_dir="./t5-squad-finetune",                              # folder untuk menyimpan checkpoint & log
    # evaluation_strategy="epoch",                                 # bisa diaktifkan jika ingin evaluasi tiap epoch [web:41]
    # save_strategy="epoch",                                       # bisa simpan model tiap epoch
    learning_rate=2e-5,                                            # laju belajar umum untuk fine-tuning T5 [web:35]
    per_device_train_batch_size=8,                                 # batch size per device saat training
    per_device_eval_batch_size=8,                                  # batch size per device saat evaluasi
    weight_decay=0.01,                                             # regularisasi untuk mencegah overfitting
    save_total_limit=3,                                            # hanya simpan maksimal 3 checkpoint terbaru
    num_train_epochs=2,                                            # jumlah epoch pelatihan
    predict_with_generate=True,                                    # pakai generate() saat evaluasi (untuk teks seq2seq) [web:32][web:39]
    fp16=True,                                                     # mixed precision, mempercepat di GPU yang mendukung [web:39][web:45]
    push_to_hub=False,                                             # tidak upload ke Hugging Face Hub
    # load_best_model_at_end=True,                                 # aktifkan jika pakai evaluation/save strategy [web:41]
)

In [13]:
# 3. Data collator untuk seq2seq
data_collator = DataCollatorForSeq2Seq(                            # helper untuk padding dinamis input & label [web:40][web:43]
    tokenizer=tokenizer,                                           # tokenizer yang dipakai model
    model=model                                                    # model, supaya collator tahu panjang & label ignore_index
)

In [14]:
# 4. Inisialisasi Trainer
trainer = Seq2SeqTrainer(
    model=model,                                                   # model T5 yang akan di-fine-tune [web:42]
    args=training_args,                                            # konfigurasi pelatihan
    train_dataset=tokenized_datasets["train"],                     # dataset latih yang sudah di-tokenisasi
    eval_dataset=tokenized_datasets["validation"],                 # dataset validasi yang sudah di-tokenisasi
    data_collator=data_collator,                                   # collator untuk membuat batch siap pakai
    tokenizer=tokenizer,                                           # tokenizer untuk decoding hasil generate
    # compute_metrics=compute_metrics,                             # opsional: fungsi untuk hitung metrik evaluasi [web:42][web:44]
)

  trainer = Seq2SeqTrainer(


In [15]:
trainer.train()

  | |_| | '_ \/ _` / _` |  _/ -_)
[34m[1mwandb[0m: (1) Create a W&B account
[34m[1mwandb[0m: (2) Use an existing W&B account
[34m[1mwandb[0m: (3) Don't visualize my results
[34m[1mwandb[0m: Enter your choice:

 2


[34m[1mwandb[0m: You chose 'Use an existing W&B account'
[34m[1mwandb[0m: Logging into https://api.wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: Find your API key here: https://wandb.ai/authorize?ref=models
[34m[1mwandb[0m: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mrylsatriaa[0m ([33mrylsatriaa-telkom-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss
500,0.2422
1000,0.2385
1500,0.2374
2000,0.236
2500,0.2397
3000,0.236
3500,0.2289
4000,0.2465
4500,0.238
5000,0.2502


TrainOutput(global_step=21900, training_loss=0.25392941966993077, metrics={'train_runtime': 16527.2935, 'train_samples_per_second': 10.601, 'train_steps_per_second': 1.325, 'total_flos': 1.0668820451033088e+17, 'train_loss': 0.25392941966993077, 'epoch': 2.0})

In [16]:
# Fungsi untuk prediksi manual
def ask_question(question, context):                                              # fungsi helper untuk tanya-jawab ke model
    # 1. Format input
    input_text = f"question: {question} context: {context}"                       # susun input sesuai format T5

    # 2. Tokenisasi
    inputs = tokenizer(                                                           # ubah teks ke token id
        input_text,
        return_tensors="pt"
    ).input_ids.to(device)                                                        # kirim ke CPU/GPU sesuai variabel device [web:20][web:57]

    # 3. Generate Jawaban
    outputs = model.generate(                                                     # generate jawaban dari model [web:20]
        inputs,
        max_length=32
    )

    # 4. Decode hasil token menjadi teks
    answer = tokenizer.decode(                                                    # ubah kembali token ke string
        outputs[0],
        skip_special_tokens=True                                                  # buang token spesial (seperti </s>) [web:54][web:57]
    )
    return answer

In [17]:
# --- CONTOH PENGGUNAAN BARU ---
demo_context = """
Gunung Tangkuban Parahu adalah salah satu gunung berapi yang terletak di utara Kota Bandung.
Gunung ini menjadi tujuan wisata populer karena pemandangannya yang indah dan akses yang relatif mudah.
Letusan terakhirnya tercatat terjadi pada awal abad ke-21.
"""                                                                              # konteks baru tentang Tangkuban Parahu

demo_question = "Di mana letak Gunung Tangkuban Parahu?"                         # pertanyaan baru terkait konteks

print("Konteks:", demo_context)
print("Pertanyaan:", demo_question)
print("-" * 30)
print("Jawaban Model:", ask_question(demo_question, demo_context))               # panggil fungsi tanya-jawab


Konteks: 
Gunung Tangkuban Parahu adalah salah satu gunung berapi yang terletak di utara Kota Bandung.
Gunung ini menjadi tujuan wisata populer karena pemandangannya yang indah dan akses yang relatif mudah.
Letusan terakhirnya tercatat terjadi pada awal abad ke-21.

Pertanyaan: Di mana letak Gunung Tangkuban Parahu?
------------------------------
Jawaban Model: utara Kota Bandung
