# **Step 0: Setup & Installation**

In [None]:
import torch

# Sanity-check the runtime: report the installed PyTorch version, whether CUDA
# is usable, and which device would be picked (GPU when available, else CPU).
print("PyTorch version:", torch.__version__)
print("CUDA available:", torch.cuda.is_available())
print("Device:", torch.device("cuda" if torch.cuda.is_available() else "cpu"))

PyTorch version: 2.9.0+cu126
CUDA available: True
Device: cuda


In [None]:
!pip install -q transformers datasets accelerate evaluate sentencepiece

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/84.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
from datasets import load_dataset
from transformers import (
    T5ForConditionalGeneration,
    T5TokenizerFast,
    DataCollatorForSeq2Seq,
    TrainingArguments,
    Trainer,
)
import evaluate
import torch
import numpy as np

In [None]:
from datasets import load_dataset

# Load the SQuAD dataset and keep handles to its two splits.
squad = load_dataset("rajpurkar/squad")
train_ds = squad["train"]
valid_ds = squad["validation"]

# Work on small leading subsets of each split (first 5000 / first 1000 rows).
small_train = train_ds.select(range(5000))      # or any other size you want
small_valid = valid_ds.select(range(1000))


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

plain_text/train-00000-of-00001.parquet:   0%|          | 0.00/14.5M [00:00<?, ?B/s]

plain_text/validation-00000-of-00001.par(…):   0%|          | 0.00/1.82M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/87599 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10570 [00:00<?, ? examples/s]

# Step 1: Load Dataset **SQuAD**

In [None]:
from datasets import load_dataset

# Load SQuAD and display the DatasetDict summary (splits, features, row counts).
# NOTE(review): this repeats the load_dataset call from the setup section and
# rebinds `squad`.
squad = load_dataset("rajpurkar/squad")
squad

DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 87599
    })
    validation: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 10570
    })
})

In [None]:
# Keep only a leading slice of each split.
small_train = squad["train"].select(range(5000))      # first 5k examples
small_valid = squad["validation"].select(range(1000)) # first 1k examples

# Displayed as a (train_size, valid_size) tuple.
len(small_train), len(small_valid)

(5000, 1000)

In [None]:
# Grab the first training row and display it as the cell output.
example = squad["train"][0]
example

{'id': '5733be284776f41900661182',
 'title': 'University_of_Notre_Dame',
 'context': 'Architecturally, the school has a Catholic character. Atop the Main Building\'s gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.',
 'question': 'To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?',
 'answers': {'text': ['Saint Bernadette Soubirous'], 'answer_start': [515]}}

In [None]:
# Print the fields of `example` one per line; the context is cut to its first
# 400 characters to keep the output short.
print("ID      :", example["id"])
print("Title   :", example["title"])
print("Question:", example["question"])
print("Context :", example["context"][:400], "...")
print("Answers :", example["answers"])

ID      : 5733be284776f41900661182
Title   : University_of_Notre_Dame
Question: To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?
Context : Architecturally, the school has a Catholic character. Atop the Main Building's gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of p ...
Answers : {'text': ['Saint Bernadette Soubirous'], 'answer_start': [515]}


# Step 2: Exploratory Data Analysis (EDA)

In [None]:
# Overview of the dataset: the DatasetDict summary, the number of rows in each
# split, and the column names of the training split.
print(squad)

print("Train size      :", len(squad["train"]))
print("Validation size :", len(squad["validation"]))

print(squad["train"].column_names)

DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 87599
    })
    validation: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 10570
    })
})
Train size      : 87599
Validation size : 10570
['id', 'title', 'context', 'question', 'answers']


In [None]:
# `randrange` stays imported so any other cell that relies on it keeps working.
from random import Random, randrange

# A dedicated, seeded generator makes the three sampled rows identical on every
# "Restart & Run All" without touching the global `random` state.
sample_rng = Random(42)

# Print three randomly chosen training rows; the context is cut to its first
# 300 characters to keep the output short.
for _ in range(3):
    idx = sample_rng.randrange(len(squad["train"]))
    ex = squad["train"][idx]
    print("=" * 80)
    print("Question:", ex["question"])
    print("Context :", ex["context"][:300], "...")
    print("Answers :", ex["answers"])

Question: Is the public given incentive to report Endangered Species Act violations?
Context : A reward will be paid to any person who furnishes information which leads to an arrest, conviction, or revocation of a license, so long as they are not a local, state, or federal employee in the performance of official duties. The Secretary may also provide reasonable and necessary costs incurred fo ...
Answers : {'text': ['A reward will be paid to any person who furnishes information which leads to an arrest, conviction, or revocation of a license'], 'answer_start': [0]}
Question: What is the study of algae called?
Context : The strictest definition of "plant" includes only the "land plants" or embryophytes, which include seed plants (gymnosperms, including the pines, and flowering plants) and the free-sporing cryptogams including ferns, clubmosses, liverworts, hornworts and mosses. Embryophytes are multicellular eukary ...
Answers : {'text': ['phycology'], 'answer_start': [921]}
Question: H

In [None]:
def get_lengths(dataset, num_samples=1000):
    """Count whitespace-separated words in the leading rows of a dataset.

    Args:
        dataset: Collection supporting ``len()`` and integer indexing whose
            items are mappings with string ``"question"`` and ``"context"``
            entries.
        num_samples: Maximum number of leading rows to inspect. Values larger
            than ``len(dataset)`` are clamped to the dataset size instead of
            raising ``IndexError``; ``None`` inspects every row.

    Returns:
        A tuple ``(questions_len, contexts_len)`` of two lists of ints holding
        the per-row word counts, in dataset order.
    """
    total = len(dataset)
    # Never index past the end of the dataset.
    limit = total if num_samples is None else min(num_samples, total)

    questions_len = []
    contexts_len = []
    for i in range(limit):
        ex = dataset[i]
        # str.split() with no argument splits on runs of whitespace.
        questions_len.append(len(ex["question"].split()))
        contexts_len.append(len(ex["context"].split()))
    return questions_len, contexts_len

# Word-count statistics over the first 1000 training rows.
q_len, c_len = get_lengths(squad["train"], num_samples=1000)

# The averages divide by the list length, so both lists must be non-empty.
print("Avg question length:", sum(q_len) / len(q_len))
print("Avg context length :", sum(c_len) / len(c_len))
print("Max question length:", max(q_len))
print("Max context length :", max(c_len))

Avg question length: 10.361
Avg context length : 142.815
Max question length: 26
Max context length : 326


In [None]:
# Show the text-to-text format on one row: the model input is
# "question: ... context: ..." and the target is the first answer text.
ex = squad["train"][0]
input_str = f"question: {ex['question']} context: {ex['context']}"
target_str = ex["answers"]["text"][0]
print(input_str[:200], "...")  # preview only the first 200 characters
print("TARGET:", target_str)

question: To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France? context: Architecturally, the school has a Catholic character. Atop the Main Building's gold dome is a golden statue o ...
TARGET: Saint Bernadette Soubirous


# Step 3: Preprocessing & Tokenization (with T5)

In [None]:
from transformers import T5TokenizerFast

# Load the fast tokenizer that belongs to the chosen T5 checkpoint.
model_checkpoint = "t5-base"
tokenizer = T5TokenizerFast.from_pretrained(model_checkpoint)

# Length limits passed as `max_length` to the tokenizer in preprocess_function.
max_input_length = 512   # maximum length for question+context
max_target_length = 32   # maximum answer length

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

In [None]:
def preprocess_function(examples):
    """Turn a batch of SQuAD rows into tokenized T5 inputs and labels.

    Every row becomes the text-to-text prompt
    ``"question: <question> context: <context>"``; its target is the first
    reference answer, or an empty string when the row has no answer.

    Args:
        examples: Batch mapping with parallel ``"question"``, ``"context"`` and
            ``"answers"`` lists, where each ``answers`` entry holds a ``"text"``
            list.

    Returns:
        The tokenizer output for the prompts (padded/truncated to
        ``max_input_length``) with an added ``"labels"`` entry: the target ids
        padded/truncated to ``max_target_length``, where every padding position
        is set to -100 so it is ignored by the loss. Code that decodes these
        labels must map -100 back to ``tokenizer.pad_token_id`` first.
    """
    inputs = []
    targets = []
    for question, context, answers in zip(
        examples["question"],
        examples["context"],
        examples["answers"],
    ):
        # T5 text-to-text prompt format.
        inputs.append(f"question: {question} context: {context}")
        # Use the first reference answer; fall back to "" when there is none.
        targets.append(answers["text"][0] if len(answers["text"]) > 0 else "")

    # Tokenize the prompts.
    model_inputs = tokenizer(
        inputs,
        max_length=max_input_length,
        padding="max_length",
        truncation=True,
    )

    # Tokenize the targets. `text_target=` replaces the deprecated
    # `tokenizer.as_target_tokenizer()` context manager.
    labels = tokenizer(
        text_target=targets,
        max_length=max_target_length,
        padding="max_length",
        truncation=True,
    )

    # Padding must not contribute to the loss: replace pad ids with -100, the
    # index ignored by the cross-entropy loss.
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [token_id if token_id != pad_id else -100 for token_id in label_ids]
        for label_ids in labels["input_ids"]
    ]
    return model_inputs

In [None]:
# Tokenize both subsets with the same settings (train first, then validation).
# The raw text columns are dropped so only the tokenizer outputs remain.
tokenized_train, tokenized_valid = (
    split.map(
        preprocess_function,
        batched=True,
        remove_columns=split.column_names,
    )
    for split in (small_train, small_valid)
)

# Display the first tokenized training row.
tokenized_train[0]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]



Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

{'input_ids': [822,
  10,
  304,
  4068,
  410,
  8,
  16823,
  3790,
  3,
  18280,
  2385,
  16,
  507,
  3449,
  16,
  301,
  1211,
  1395,
  1410,
  58,
  2625,
  10,
  30797,
  120,
  6,
  8,
  496,
  65,
  3,
  9,
  6502,
  1848,
  5,
  71,
  2916,
  8,
  5140,
  5450,
  31,
  7,
  2045,
  22161,
  19,
  3,
  9,
  7069,
  12647,
  13,
  8,
  16823,
  3790,
  5,
  3,
  29167,
  16,
  851,
  13,
  8,
  5140,
  5450,
  11,
  5008,
  34,
  6,
  19,
  3,
  9,
  8658,
  12647,
  13,
  2144,
  28,
  6026,
  3,
  76,
  24266,
  28,
  8,
  9503,
  96,
  553,
  15,
  7980,
  1980,
  1212,
  13285,
  1496,
  1280,
  3021,
  12,
  8,
  5140,
  5450,
  19,
  8,
  23711,
  2617,
  13,
  8,
  3,
  24756,
  6219,
  5,
  3,
  29167,
  1187,
  8,
  20605,
  2617,
  19,
  8,
  8554,
  17,
  235,
  6,
  3,
  9,
  17535,
  286,
  13,
  7029,
  11,
  9619,
  5,
  94,
  19,
  3,
  9,
  16455,
  13,
  8,
  3,
  3844,
  17,
  235,
  44,
  301,
  1211,
  1395,
  6,
  1410,
  213,
  8,
  16823,
  3790,
  3,

In [None]:
# Mount Google Drive at /content/drive (uses the Colab-provided
# `google.colab` module).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Persist both tokenized splits under one folder on the mounted Drive, each in
# its own sub-directory.
save_path = "/content/drive/MyDrive/T5_QA_SQuAD_tokenized"

tokenized_train.save_to_disk(f"{save_path}/train")
tokenized_valid.save_to_disk(f"{save_path}/valid")

Saving the dataset (0/1 shards):   0%|          | 0/5000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]