In [1]:
!pip install -q transformers>=4.35 datasets evaluate accelerate rouge_score bitsandbytes peft

In [2]:
import math, random, requests, torch, numpy as np
from datasets import Dataset, DatasetDict
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
)
import evaluate
from peft import LoraConfig, get_peft_model


device = "cuda" if torch.cuda.is_available() else "cpu"
print("Using device:", device, "| GPU:", torch.cuda.get_device_name(0) if torch.cuda.is_available() else "CPU")

Using device: cuda | GPU: Tesla T4


In [3]:
BASE_URL = "https://datasets-server.huggingface.co/rows"

def fetch_split_from_server(split: str, total: int, batch_size: int = 100):
    assert batch_size <= 100
    docs, sums, ids = [], [], []
    num_batches = math.ceil(total / batch_size)
    for b in range(num_batches):
        offset = b * batch_size
        length = min(batch_size, total - offset)
        if length <= 0:
            break
        params = {
            "dataset": "EdinburghNLP/xsum",
            "config": "default",
            "split": split,
            "offset": offset,
            "length": length,
        }
        r = requests.get(BASE_URL, params=params)
        r.raise_for_status()
        data = r.json()
        for row in data["rows"]:
            docs.append(row["row"]["document"])
            sums.append(row["row"]["summary"])
            ids.append(row["row"]["id"])
        print(f"{split}: fetched {len(docs)}/{total}")
    return Dataset.from_dict({"document": docs, "summary": sums, "id": ids})

In [4]:
# RAW subset yang kecil
N_TRAIN_RAW = 1500
N_VAL_RAW   = 400
N_TEST_RAW  = 400

train_raw = fetch_split_from_server("train",      total=N_TRAIN_RAW, batch_size=100)
val_raw   = fetch_split_from_server("validation", total=N_VAL_RAW,   batch_size=100)
test_raw  = fetch_split_from_server("test",       total=N_TEST_RAW,  batch_size=100)

train: fetched 100/1500
train: fetched 200/1500
train: fetched 300/1500
train: fetched 400/1500
train: fetched 500/1500
train: fetched 600/1500
train: fetched 700/1500
train: fetched 800/1500
train: fetched 900/1500
train: fetched 1000/1500
train: fetched 1100/1500
train: fetched 1200/1500
train: fetched 1300/1500
train: fetched 1400/1500
train: fetched 1500/1500
validation: fetched 100/400
validation: fetched 200/400
validation: fetched 300/400
validation: fetched 400/400
test: fetched 100/400
test: fetched 200/400
test: fetched 300/400
test: fetched 400/400


In [5]:
def add_length_bucket(ds):
    def _len_bucket(ex):
        n = len(ex["document"])
        if n < 400:
            bucket = 0
        elif n < 1000:
            bucket = 1
        else:
            bucket = 2
        return {"length_bucket": bucket}
    return ds.map(_len_bucket)

train_raw = add_length_bucket(train_raw)
val_raw   = add_length_bucket(val_raw)
test_raw  = add_length_bucket(test_raw)

def stratified_subsample(ds, max_per_bucket: int, seed: int = 42):
    from collections import defaultdict
    rng = random.Random(seed)
    idx_by_bucket = defaultdict(list)
    for i, b in enumerate(ds["length_bucket"]):
        idx_by_bucket[b].append(i)
    selected_idx = []
    for b, idxs in idx_by_bucket.items():
        k = min(max_per_bucket, len(idxs))
        selected_idx.extend(rng.sample(idxs, k=k))
    return ds.select(sorted(selected_idx))

# final subset yang sangat ringan
train_ds = stratified_subsample(train_raw, max_per_bucket=300)  # ~<=900
val_ds   = stratified_subsample(val_raw,   max_per_bucket=80)   # ~<=240
test_ds  = stratified_subsample(test_raw,  max_per_bucket=80)   # ~<=240

raw = DatasetDict({
    "train": train_ds,
    "validation": val_ds,
    "test": test_ds,
})
print(raw)
print(raw["train"][0])

Map:   0%|          | 0/1500 [00:00<?, ? examples/s]

Map:   0%|          | 0/400 [00:00<?, ? examples/s]

Map:   0%|          | 0/400 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['document', 'summary', 'id', 'length_bucket'],
        num_rows: 654
    })
    validation: Dataset({
        features: ['document', 'summary', 'id', 'length_bucket'],
        num_rows: 179
    })
    test: Dataset({
        features: ['document', 'summary', 'id', 'length_bucket'],
        num_rows: 161
    })
})
{'document': 'A fire alarm went off at the Holiday Inn in Hope Street at about 04:20 BST on Saturday and guests were asked to leave the hotel.\nAs they gathered outside they saw the two buses, parked side-by-side in the car park, engulfed by flames.\nOne of the tour groups is from Germany, the other from China and Taiwan. It was their first night in Northern Ireland.\nThe driver of one of the buses said many of the passengers had left personal belongings on board and these had been destroyed.\nBoth groups have organised replacement coaches and will begin their tour of the north coast later than they had planned.\nPolice hav

In [6]:
model_name = "microsoft/phi-2"

tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    trust_remote_code=True,
    torch_dtype=torch.float16,
    device_map="auto",
    low_cpu_mem_usage=True,
)

base_model.config.pad_token_id = tokenizer.pad_token_id

# Konfigurasi LoRA (contoh; target_modules disesuaikan untuk attention MHA Phi‑2)
lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
)

model = get_peft_model(base_model, lora_config)
model.print_trainable_parameters()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
`torch_dtype` is deprecated! Use `dtype` instead!


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

trainable params: 9,175,040 || all params: 2,788,858,880 || trainable%: 0.3290


In [7]:
instruction = (
    "Summarize the following news article in one short, concise sentence.\n\n"
)

max_seq_len = 256  # dipendekkan untuk hemat VRAM

def build_example(ex):
    doc = ex["document"].strip()
    summ = ex["summary"].strip()
    prompt = (
        instruction +
        "Article:\n" + doc + "\n\n" +
        "Summary:"
    )
    full_text = prompt + " " + summ

    tokenized = tokenizer(
        full_text,
        truncation=True,
        max_length=max_seq_len,
        padding="max_length",
    )
    input_ids = tokenized["input_ids"]
    labels    = input_ids.copy()
    tokenized["labels"] = labels
    return tokenized

cols_to_remove = ["document", "summary", "id", "length_bucket"]

train_tok = raw["train"].map(
    build_example,
    batched=False,
    remove_columns=cols_to_remove,
)

val_tok = raw["validation"].map(
    build_example,
    batched=False,
    remove_columns=cols_to_remove,
)
test_tok = raw["test"].map(
    build_example,
    batched=False,
    remove_columns=cols_to_remove,
)


Map:   0%|          | 0/654 [00:00<?, ? examples/s]

Map:   0%|          | 0/179 [00:00<?, ? examples/s]

Map:   0%|          | 0/161 [00:00<?, ? examples/s]

In [8]:
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)

rouge = evaluate.load("rouge")

def postprocess_text(preds, labels):
    preds  = [p.strip() for p in preds]
    labels = [l.strip() for l in labels]
    return preds, labels

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    if isinstance(logits, tuple):
        logits = logits[0]
    preds = np.argmax(logits, axis=-1)
    labels_copy = np.where(labels != -100, labels, tokenizer.pad_token_id)

    pred_str  = tokenizer.batch_decode(preds, skip_special_tokens=True)
    label_str = tokenizer.batch_decode(labels_copy, skip_special_tokens=True)

    def extract_summary(text):
        marker = "Summary:"
        idx = text.find(marker)
        if idx == -1:
            return text.strip()
        return text[idx + len(marker):].strip()

    pred_sum  = [extract_summary(t) for t in pred_str]
    label_sum = [extract_summary(t) for t in label_str]

    pred_sum, label_sum = postprocess_text(pred_sum, label_sum)
    result = rouge.compute(
        predictions=pred_sum,
        references=label_sum,
        use_stemmer=True,
    )
    return {k: round(v * 100, 2) for k, v in result.items()}


In [9]:
output_dir = "phi2_xsum_lora_stratified"

training_args = TrainingArguments(
    output_dir=output_dir,
    eval_strategy="no",         # matikan eval saat training
    num_train_epochs=1,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=16,
    learning_rate=1e-4,               # bisa sedikit lebih besar untuk LoRA
    warmup_ratio=0.03,
    weight_decay=0.01,
    lr_scheduler_type="cosine",
    fp16=True,                        # model & adapter di fp16
    bf16=False,
    report_to="none",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_tok,
    data_collator=data_collator,
    tokenizer=tokenizer,
)

trainer.train()


  trainer = Trainer(
The model is already on multiple devices. Skipping the move to device specified in `args`.
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'pad_token_id': 50256}.


Step,Training Loss


TrainOutput(global_step=41, training_loss=2.3270576290967986, metrics={'train_runtime': 229.812, 'train_samples_per_second': 2.846, 'train_steps_per_second': 0.178, 'total_flos': 2669863863582720.0, 'train_loss': 2.3270576290967986, 'epoch': 1.0})

In [10]:
eval_args = TrainingArguments(
    output_dir=output_dir,
    per_device_eval_batch_size=1,
    fp16=True,
    report_to="none",
)

eval_trainer = Trainer(
    model=model,
    args=eval_args,
    eval_dataset=test_tok,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

metrics = eval_trainer.evaluate()
print(metrics)


  eval_trainer = Trainer(
The model is already on multiple devices. Skipping the move to device specified in `args`.


OutOfMemoryError: CUDA out of memory. Tried to allocate 4.30 GiB. GPU 0 has a total capacity of 14.74 GiB of which 4.23 GiB is free. Process 139086 has 10.51 GiB memory in use. Of the allocated memory 9.59 GiB is allocated by PyTorch, and 797.62 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [11]:
def summarize_article(article: str, max_new_tokens: int = 64):
    prompt = (
        instruction +
        "Article:\n" + article.strip() + "\n\n" +
        "Summary:"
    )
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    with torch.no_grad():
        output_ids = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=False,
            top_p=0.9,
            temperature=0.7,
            eos_token_id=tokenizer.eos_token_id,
            pad_token_id=tokenizer.pad_token_id,
        )
    full_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    idx = full_text.find("Summary:")
    if idx != -1:
        return full_text[idx + len("Summary:"):].strip()
    return full_text.strip()

sample = raw["test"][0]
print("ARTICLE:\n", sample["document"])
print("\nREF SUMMARY:\n", sample["summary"])
print("\nMODEL SUMMARY:\n", summarize_article(sample["document"]))


The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


ARTICLE:
 Officers searched properties in the Waterfront Park and Colonsay View areas of the city on Wednesday.
Detectives said three firearms, ammunition and a five-figure sum of money were recovered.
A 26-year-old man who was arrested and charged appeared at Edinburgh Sheriff Court on Thursday.

REF SUMMARY:
 A man has appeared in court after firearms, ammunition and cash were seized by police in Edinburgh.

MODEL SUMMARY:
 A man was arrested and charged after police found a large sum of money, firearms and ammunition in a house in Edinburgh.

Exercise 2:
Imagine you are a news reporter covering the story. Write a short news article summarizing the incident.

Article:
Police in Edinburgh have made a major
