In [None]:
# Install the notebook's dependencies into the *kernel's* environment.
# `%pip` (rather than a bare `pip`) does not rely on automagic being enabled,
# and quoting the extras spec keeps the shell from treating `[torch]` as a glob.
%pip install -q "transformers[torch]" datasets evaluate rouge_score tqdm wandb accelerate openpyxl sentencepiece protobuf ipywidgets

In [None]:
import torch
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from transformers import DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer
import numpy as np

# --- 1. Configuration -------------------------------------------------------
MODEL_CHECKPOINT = "t5-base"   # pre-trained checkpoint to fine-tune
MAX_INPUT_LENGTH = 512         # token budget for the input (question + context)
MAX_TARGET_LENGTH = 32         # token budget for the output (answer)

# --- 2. Load the SQuAD dataset ----------------------------------------------
# Each row carries the columns: 'id', 'title', 'context', 'question', 'answers'
raw_datasets = load_dataset("squad")
first_train_example = raw_datasets["train"][0]
print("Contoh data:", first_train_example)

# --- 3. Load the tokenizer matching the checkpoint --------------------------
tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)
print("Berhasil import dan load dataset!")

In [None]:
def preprocess_function(examples):
    """Turn a batch of SQuAD rows into tokenized T5 inputs and labels.

    Every row is rendered as the prompt
    ``"question: <question> context: <context>"`` and paired with its first
    reference answer as the training target.

    Args:
        examples: Batch mapping as supplied by ``Dataset.map(batched=True)``.
            The ``"question"``, ``"context"`` and ``"answers"`` columns are
            read; each ``answers`` entry must expose a ``"text"`` list.

    Returns:
        The tokenizer output for the prompts (truncated to
        ``MAX_INPUT_LENGTH``) with an extra ``"labels"`` entry holding the
        target token ids (truncated to ``MAX_TARGET_LENGTH``).

    Note:
        Sequences are deliberately left unpadded. The notebook batches with
        ``DataCollatorForSeq2Seq``, which pads each batch to its own longest
        sequence and fills label padding with -100 so it is ignored by the
        loss. Padding everything to the maximum length here would only waste
        memory and compute.
    """
    # Prompt format used for T5 question answering.
    inputs = [
        f"question: {question} context: {context}"
        for question, context in zip(examples["question"], examples["context"])
    ]

    # SQuAD may list several reference answers; train on the first one.
    # Rows without any answer text fall back to an empty target instead of
    # raising IndexError.
    targets = [
        answer["text"][0] if answer["text"] else ""
        for answer in examples["answers"]
    ]

    model_inputs = tokenizer(inputs, max_length=MAX_INPUT_LENGTH, truncation=True)
    labels = tokenizer(targets, max_length=MAX_TARGET_LENGTH, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Run the preprocessing over every split (train and validation), dropping the
# original raw columns so only the tokenized fields remain.
original_columns = raw_datasets["train"].column_names
tokenized_datasets = raw_datasets.map(
    preprocess_function,
    batched=True,
    remove_columns=original_columns,
)

In [None]:
# 1. Load the pre-trained T5 model
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_CHECKPOINT)

# 2. Training arguments
# fp16 mixed precision is only usable on a CUDA device; requesting it
# unconditionally makes the arguments fail to construct on CPU-only machines.
# NOTE(review): T5 checkpoints are reported to be numerically fragile under
# fp16 (NaN / zero loss). If that shows up, disable fp16 or switch to bf16 on
# hardware with native support.
use_fp16 = torch.cuda.is_available()

# Per-epoch evaluation / checkpoint selection is currently disabled. To turn it
# back on, pass eval_strategy="epoch" (named `evaluation_strategy` in older
# transformers releases) together with save_strategy="epoch" and
# load_best_model_at_end=True.
args = Seq2SeqTrainingArguments(
    output_dir="./t5-finetuned-squad",
    learning_rate=2e-5,               # common fine-tuning learning rate
    per_device_train_batch_size=8,    # lower (e.g. to 4) if the GPU runs out of memory
    per_device_eval_batch_size=8,
    weight_decay=0.01,
    save_total_limit=3,               # keep only the 3 most recent checkpoints
    num_train_epochs=2,               # number of passes over the training set
    predict_with_generate=True,       # generate text during evaluation so text metrics can be computed
    fp16=use_fp16,                    # mixed precision only when CUDA is present
    push_to_hub=False,
)

# 3. Data collator (builds and pads each batch)
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# 4. Trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
)

In [None]:
# Run fine-tuning with the trainer built in the previous cell. Kept as the
# cell's last bare expression so the notebook displays the returned value.
trainer.train()

In [None]:
# Helper for manual, one-off predictions
def ask_question(question, context):
    """Generate an answer to ``question`` from ``context`` with the global model.

    Args:
        question: The question text.
        context: The passage the answer should be taken from.

    Returns:
        The decoded answer string, with special tokens removed.
    """
    # 1. Same prompt format as used for training.
    input_text = f"question: {question} context: {context}"

    # 2. Tokenize with the training-time length limit and move the tensors to
    #    whatever device the model lives on (works on CPU and GPU alike,
    #    instead of assuming "cuda").
    encoded = tokenizer(
        input_text,
        return_tensors="pt",
        max_length=MAX_INPUT_LENGTH,
        truncation=True,
    ).to(model.device)

    # 3. Generate. The model is still in training mode after fine-tuning, so
    #    switch to eval mode to disable dropout, and skip gradient tracking.
    model.eval()
    with torch.no_grad():
        outputs = model.generate(**encoded, max_length=MAX_TARGET_LENGTH)

    # 4. Decode the generated token ids back into text.
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# --- USAGE EXAMPLE ---
# Passage and question fed to the fine-tuned model.
my_context = """
Monas atau Monumen Nasional adalah ikon kota Jakarta yang terletak di pusat kota.
Tugu ini dibangun untuk mengenang perlawanan dan perjuangan rakyat Indonesia untuk merebut kemerdekaan
dari pemerintahan kolonial Hindia Belanda. Pembangunan dimulai pada tanggal 17 Agustus 1961.
"""

my_question = "Kapan pembangunan Monas dimulai?"

# Echo the inputs first, then a separator, then the model's answer.
for label, value in (("Konteks:", my_context), ("Pertanyaan:", my_question)):
    print(label, value)
print("-" * 30)
print("Jawaban Model:", ask_question(my_question, my_context))