<a href="https://colab.research.google.com/github/pralov-malla/Finetuning-Qwen2.5-Instruct-to-perform-rubric-based-scoring/blob/main/open_source_model_finetuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install pinned dependencies for this notebook.
# The torch/torchvision/torchaudio wheels are pulled from the CUDA 12.4 index;
# the second command pins the Hugging Face training stack.
# NOTE(review): matplotlib and wandb carry no version pin, so they can drift between runs.

!pip install -q --upgrade torch==2.5.1+cu124 torchvision==0.20.1+cu124 torchaudio==2.5.1+cu124 --index-url https://download.pytorch.org/whl/cu124
!pip install -q --upgrade requests==2.32.3 bitsandbytes==0.46.0 transformers==4.48.3 accelerate==1.3.0 datasets==3.2.0 peft==0.14.0 trl==0.14.0 matplotlib wandb

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import torch, json
from datasets import load_dataset
from torch.utils.data import DataLoader, WeightedRandomSampler
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import LoraConfig
from trl import SFTTrainer, SFTConfig
import wandb
from google.colab import userdata
from huggingface_hub import login
from tqdm import tqdm
from datetime import datetime


# Do not truncate long cell contents when a DataFrame is displayed.
pd.set_option('display.max_colwidth', None)

In [None]:
# Mount Google Drive at /content/drive; the dataset splits and run outputs below live under it.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Read the train / validation / test CSV splits from Drive.
_SPLIT_CSV_TEMPLATE = '/content/drive/MyDrive/datasets/splits/{}.csv'
train_df, val_df, test_df = (
    pd.read_csv(_SPLIT_CSV_TEMPLATE.format(split_name))
    for split_name in ('train', 'val', 'test')
)

In [None]:
# Report the attached GPU and its total memory, or explain how to enable one.
if not torch.cuda.is_available():
    print("No GPU! Go to Runtime > Change runtime type > GPU")
else:
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    total_bytes = torch.cuda.get_device_properties(0).total_memory
    print(f"VRAM: {total_bytes / 1e9:.2f} GB")

GPU: NVIDIA L4
VRAM: 23.80 GB


Keeping only the necessary columns

In [None]:
# Columns retained in every split; everything else is dropped.
KEEP_COLS = ["image_id", "image", "content", "scores_json"]

train_df, val_df, test_df = (
    split_df[KEEP_COLS] for split_df in (train_df, val_df, test_df)
)

Building a function that reduces the task JSON according to each visual's `visual_type`

In [None]:
REDUCE_JSON = True  # toggle to compact the task JSON to save tokens


# Optional: lightweight reducer per visual_type (safe defaults)
def drop_empty(x):
    """Recursively remove empty values from nested dicts and lists.

    Dict entries and list elements equal to None, [] or {} are removed, and so
    are containers that become empty after cleaning. Scalars (including 0,
    False and "") are returned unchanged.

    Note: list elements are *removed*, so list indices shift. Do not pass
    positional arrays (values aligned to a header list) through this function;
    use `_aligned_values` for those.
    """
    if isinstance(x, dict):
        out = {k: drop_empty(v) for k, v in x.items() if v not in (None, [], {})}
        return {k: v for k, v in out.items() if v not in ([], {})}
    if isinstance(x, list):
        out = [drop_empty(v) for v in x if v not in (None, [], {})]
        return [v for v in out if v not in ([], {})]
    return x


def _survives_drop_empty(v):
    """Return True when `drop_empty` would keep `v` as a list element."""
    return v not in (None, [], {}) and drop_empty(v) not in ([], {})


def _aligned_values(values):
    """Clean a positional list without shifting its indices.

    Blank slots become None instead of being removed, so slot i still belongs
    to header i. Returns None when no slot holds a value, letting the caller
    omit the array entirely.
    """
    cleaned = [drop_empty(v) if _survives_drop_empty(v) else None for v in values]
    return cleaned if any(v is not None for v in cleaned) else None


def reduce_task_json(meta: dict) -> dict:
    """Compact a task-description dict to the fields kept for the prompt.

    Keeps `schema_version`, `task_visual_category`, `topic_context`, four
    `global_semantics` fields, a reduced form of each entry in `visuals`, and
    `relationships_between_visuals` when it is a list. When two or more
    visuals are present the category is forced to "multiple_graphs".

    Positional arrays (table `matrix`, bar `values`, line `y_values`) keep
    None placeholders for missing cells so that every value stays under its
    own row/column/category/x-label. They are omitted only when completely
    empty.

    Args:
        meta: parsed task JSON. Non-dict input is returned unchanged.

    Returns:
        A new dict with the reduced content (or `meta` itself if not a dict).
    """
    # minimal keep per type
    def reduce_table(struct):
        r = struct.get("row_headers") if isinstance(struct.get("row_headers"), list) else None
        c = struct.get("column_headers") if isinstance(struct.get("column_headers"), list) else None
        vals = struct.get("values") if isinstance(struct.get("values"), list) else None
        out = drop_empty({"row_headers": r, "column_headers": c})
        if r and c and vals:
            # Build the matrix against the headers that survive cleaning so
            # rows/columns line up with what is emitted in `out`.
            rows = [rr for rr in r if _survives_drop_empty(rr)]
            cols = [cc for cc in c if _survives_drop_empty(cc)]
            cell_map = {}
            for cell in vals:
                if not isinstance(cell, dict):
                    continue
                rr, cc = cell.get("row"), cell.get("column")
                if rr is None or cc is None:
                    continue
                cell_map.setdefault(rr, {})[cc] = cell.get("value")
            matrix = [_aligned_values([cell_map.get(rr, {}).get(cc) for cc in cols]) for rr in rows]
            if any(row is not None for row in matrix):
                # Keep all-empty rows as explicit None rows to preserve row alignment.
                out["matrix"] = [row if row is not None else [None] * len(cols) for row in matrix]
        return out

    def reduce_bar(struct):
        cats = struct.get("categories") if isinstance(struct.get("categories"), list) else None
        series = struct.get("series") if isinstance(struct.get("series"), list) else None
        out = drop_empty({
            "bar_chart_type": struct.get("bar_chart_type"),
            "orientation": struct.get("orientation"),
            "categories": cats,
        })
        if cats and series:
            kept_cats = [c for c in cats if _survives_drop_empty(c)]
            series_out = []
            for s in series:
                if not isinstance(s, dict):
                    continue
                data = s.get("data") if isinstance(s.get("data"), list) else []
                mp = {}
                for d in data:
                    if not isinstance(d, dict):
                        continue
                    cat = d.get("category")
                    if cat is None:
                        continue
                    mp[cat] = d.get("value")
                entry = drop_empty({"label": s.get("label")})
                values = _aligned_values([mp.get(c) for c in kept_cats])
                if values is not None:
                    entry["values"] = values
                if entry:
                    series_out.append(entry)
            if series_out:
                out["series"] = series_out
        return out

    def reduce_line(struct):
        xl = struct.get("x_labels") if isinstance(struct.get("x_labels"), list) else None
        series = struct.get("series") if isinstance(struct.get("series"), list) else None
        out = drop_empty({
            "x_axis_type": struct.get("x_axis_type"),
            "x_labels": xl,
            "y_unit": struct.get("y_unit"),
        })
        if xl and series:
            # Indices of the x labels that are actually emitted.
            kept_idx = [i for i, x in enumerate(xl) if _survives_drop_empty(x)]
            series_out = []
            for s in series:
                if not isinstance(s, dict):
                    continue
                pts = s.get("points") if isinstance(s.get("points"), list) else []
                yvals = [None] * len(xl)
                for i, p in enumerate(pts):
                    if i >= len(xl):
                        break
                    if not isinstance(p, dict):
                        # A malformed point leaves its own slot empty but must
                        # not discard the remaining points of the series.
                        continue
                    yvals[i] = p.get("y_value")
                entry = drop_empty({"label": s.get("label")})
                y_values = _aligned_values([yvals[i] for i in kept_idx])
                if y_values is not None:
                    entry["y_values"] = y_values
                if entry:
                    series_out.append(entry)
            if series_out:
                out["series"] = series_out
        return out

    def reduce_pie(struct):
        slices = struct.get("slices") if isinstance(struct.get("slices"), list) else None
        out = {"context_label": struct.get("context_label"), "is_donut_chart": struct.get("is_donut_chart")}
        if slices:
            out["slices"] = [{"label": s.get("label"), "percentage": s.get("percentage")} for s in slices if isinstance(s, dict)]
        return drop_empty(out)

    def reduce_process(struct):
        stages = struct.get("stages") if isinstance(struct.get("stages"), list) else None
        out = {"process_title": struct.get("process_title"), "is_cycle": struct.get("is_cycle")}
        if stages:
            out["stages"] = [{"name": s.get("name"), "order_index": s.get("order_index")} for s in stages if isinstance(s, dict)]
        return drop_empty(out)

    def reduce_map(struct):
        out = {"base_region_description": struct.get("base_region_description")}
        sc = struct.get("scenarios")
        if isinstance(sc, list):
            out["scenarios"] = []
            for s in sc:
                if not isinstance(s, dict):
                    continue
                feats = s.get("features") if isinstance(s.get("features"), list) else []
                f_out = []
                for f in feats:
                    if not isinstance(f, dict):
                        continue
                    f_out.append({"label": f.get("label"), "type": f.get("type"), "category": f.get("category"), "status": f.get("status")})
                out["scenarios"].append({"label": s.get("label"), "features": f_out})
        out["changes_between_scenarios"] = struct.get("changes_between_scenarios")
        out["summary"] = struct.get("summary")
        return drop_empty(out)

    if not isinstance(meta, dict):
        return meta
    tcat = meta.get("task_visual_category")
    visuals = meta.get("visuals") if isinstance(meta.get("visuals"), list) else []
    if len(visuals) >= 2 and tcat != "multiple_graphs":
        tcat = "multiple_graphs"

    out = {
        "schema_version": meta.get("schema_version"),
        "task_visual_category": tcat,
        "topic_context": meta.get("topic_context"),
    }
    if isinstance(meta.get("global_semantics"), dict):
        gs = meta["global_semantics"]
        out["global_semantics"] = {
            "overview": gs.get("overview"),
            "key_features": gs.get("key_features"),
            "extremes": gs.get("extremes"),
            "comparisons": gs.get("comparisons"),
        }

    v_out = []
    for v in visuals:
        if not isinstance(v, dict):
            continue
        vtype = v.get("visual_type")
        struct = v.get("structure") if isinstance(v.get("structure"), dict) else {}
        if vtype == "table":
            s_red = reduce_table(struct)
        elif vtype == "bar_chart":
            s_red = reduce_bar(struct)
        elif vtype == "line_graph":
            s_red = reduce_line(struct)
        elif vtype == "pie_chart":
            s_red = reduce_pie(struct)
        elif vtype == "process_diagram":
            s_red = reduce_process(struct)
        elif vtype == "map":
            s_red = reduce_map(struct)
        else:
            # Unknown visual types keep their structure, minus empty values.
            s_red = drop_empty(struct)
        # Clean only the envelope fields here: the reduced structure is already
        # clean, and a second drop_empty pass would strip the None placeholders
        # that keep positional arrays aligned.
        entry = drop_empty({
            "visual_id": v.get("visual_id"),
            "visual_type": vtype,
            "role": v.get("role"),
            "panel_label": v.get("panel_label"),
            "title": v.get("title"),
        })
        if s_red not in (None, [], {}):
            entry["structure"] = s_red
        v_out.append(entry)
    if v_out:
        out["visuals"] = v_out

    if isinstance(meta.get("relationships_between_visuals"), list):
        out["relationships_between_visuals"] = meta.get("relationships_between_visuals")

    return out

Building train/val/test JSONL in chat format for SFT

In [None]:
# Sanity check: (rows, columns) of the training split.
train_df.shape

(6044, 4)

In [None]:
# System message placed at the start of every chat example (training and token-length checks).
# NOTE(review): the prompt names its inputs TASK_PROMPT_JSON and CANDIDATE_ESSAY, while the
# user message built in to_record()/payload() uses the keys "task_json" and "student_essay".
# Confirm the mismatch is intended; changing either side changes the training data.
SYSTEM_PROMPT = """
You are an IELTS Academic Writing Task 1 examiner.

You will receive TWO inputs:
1) TASK_PROMPT_JSON: a structured JSON description of the Task 1 visual(s). Treat this as the ONLY ground truth.
2) CANDIDATE_ESSAY: the candidate’s full written response.

Your job: produce rubric-based band scores for IELTS Writing Task 1.

SCORING SCALE (STRICT)
- Score each criterion in 0.5 steps.
- Criterion score range: 0.0 to 9.0 (inclusive).
- All criterion scores must be multiples of 0.5.
- IMPORTANT: overall_band_score must NEVER be 9.0. Cap overall_band_score at 8.5.

CRITERIA (score all four)
1) task_response_score (TR)
   - Describe what is shown; no opinions/causes/solutions unless shown.
   - MUST include a clear overview of main trends/major features (missing/unclear overview lowers TR).
   - Select key features and comparisons; avoid listing everything.
   - Accuracy is critical: penalize invented data, wrong figures/units/time periods, or trends that contradict TASK_PROMPT_JSON.
2) coherence_cohesion_score (CC)
   - Logical paragraphing (intro + overview + grouped details), clear progression, appropriate linking.
3) lexical_resource_score (LR)
   - Precise academic reporting vocabulary; accurate collocations for data (rise to/by, remain stable, peak at, etc.); avoid repetition.
4) grammatical_range_accuracy_score (GRA)
   - Range + accuracy; frequent errors and awkward structures reduce score.

WORD COUNT RULE
- If the essay is clearly under ~150 words, apply a noticeable penalty (especially TR, and often CC).

LOW-SCORE VERIFICATION (IMPORTANT)
If your initial scoring suggests ANY criterion < 4.5, you MUST do a second, rigorous check BEFORE finalizing:
A) Re-check TR basics: is there at least an attempt at paraphrase + overview + some data/features (even if weak)?
B) Re-check whether errors are truly severe enough to justify <4.5 versus a weak-but-present response (≈4.5–5.0).
C) Re-check that you are not over-penalizing for grammar/vocabulary when the task meaning is still recoverable.
D) Only keep a score <4.5 if the response is clearly extremely limited (e.g., no real overview, very little/incorrect description, heavy invention, or meaning mostly unclear).

OVERALL BAND (STRICT)
- overall_band_score = average of the four criterion scores.
- Round to the nearest 0.5.
- If exactly halfway between two 0.5 steps (x.25 or x.75), round UP.
- After rounding, if overall_band_score == 9.0, set overall_band_score = 8.5.

OUTPUT FORMAT (STRICT)
Return ONLY one valid JSON object with exactly these keys and numeric values (no extra keys, no explanation, no markdown, no surrounding text).

{
  "overall_band_score": number,
  "task_response_score": number,
  "coherence_cohesion_score": number,
  "lexical_resource_score": number,
  "grammatical_range_accuracy_score": number
}
"""


In [None]:
def to_record(row):
    """Turn one dataset row into a three-turn chat example.

    The user turn is a JSON string with the task description (parsed from
    row["image"] and compacted with reduce_task_json when REDUCE_JSON is set)
    and the essay from row["content"]. The assistant turn is row["scores_json"]
    passed through unchanged.

    Returns:
        {"messages": [system, user, assistant]}
    """
    parsed_task = json.loads(row["image"])
    task_for_prompt = reduce_task_json(parsed_task) if REDUCE_JSON else parsed_task

    user_text = json.dumps(
        {"task_json": task_for_prompt, "student_essay": row["content"]},
        ensure_ascii=False,
    )
    turns = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": user_text},
        {"role": "assistant", "content": row["scores_json"]},
    ]
    return {"messages": turns}

In [None]:
from pathlib import Path

DRIVE_ROOT = Path("/content/drive/MyDrive")

# datasets/ is directly inside MyDrive:
PROJECT_ROOT = DRIVE_ROOT

# Where the chat-formatted JSONL splits are written.
SPLITS_DIR = PROJECT_ROOT.joinpath("datasets", "splits")
OUT_DIR = SPLITS_DIR.joinpath("jsonl_chat")

# Where training checkpoints and the final adapter are written.
OUT_DIR_RUN = PROJECT_ROOT.joinpath("runs", "qwen25_ielts_task1_qlora_balanced")

for _needed_dir in (OUT_DIR, OUT_DIR_RUN):
    _needed_dir.mkdir(parents=True, exist_ok=True)

In [None]:
# Echo the working directory and the resolved project paths.
for _label, _location in (
    ("CWD:", Path.cwd()),
    ("PROJECT_ROOT:", PROJECT_ROOT),
    ("OUT_DIR:", OUT_DIR),
    ("OUT_DIR_RUN:", OUT_DIR_RUN),
):
    print(_label, _location)

CWD: /content
PROJECT_ROOT: /content/drive/MyDrive
OUT_DIR: /content/drive/MyDrive/datasets/splits/jsonl_chat
OUT_DIR_RUN: /content/drive/MyDrive/runs/qwen25_ielts_task1_qlora_balanced


In [None]:
def write_jsonl(df, path: Path):
    """Write every row of `df` to `path` as one chat record per line.

    Each row is converted with to_record() and serialized as UTF-8 JSON
    (non-ASCII characters are written as-is). An existing file is overwritten.
    Prints the number of rows written and the destination.
    """
    with path.open("w", encoding="utf-8") as sink:
        sink.writelines(
            json.dumps(to_record(row), ensure_ascii=False) + "\n"
            for _, row in df.iterrows()
        )
    print(f"Wrote {len(df)} -> {path}")

In [None]:
# Destination files for the chat-formatted splits.
TRAIN_JSONL, VAL_JSONL, TEST_JSONL = (
    OUT_DIR / f"{split_name}.jsonl" for split_name in ("train", "val", "test")
)

In [None]:
# Make sure the output folder exists, then export each split as chat JSONL.
OUT_DIR.mkdir(parents=True, exist_ok=True)
for _split_df, _split_path in (
    (train_df, TRAIN_JSONL),
    (val_df, VAL_JSONL),
    (test_df, TEST_JSONL),
):
    write_jsonl(_split_df, _split_path)

Wrote 6044 -> /content/drive/MyDrive/datasets/splits/jsonl_chat/train.jsonl
Wrote 768 -> /content/drive/MyDrive/datasets/splits/jsonl_chat/val.jsonl
Wrote 765 -> /content/drive/MyDrive/datasets/splits/jsonl_chat/test.jsonl


Checking the token length of the full `image` JSON versus the reduced `image` JSON

In [None]:
import numpy as np
import json

# Fixed-seed sample of at most 500 training rows used for the token-length estimate.
sample = train_df.sample(n=min(500, len(train_df)), random_state=42)
MODEL_NAME = "Qwen/Qwen2.5-7B-Instruct"

# Tokenizer used by payload() to count tokens.
# NOTE(review): trust_remote_code=True permits code from the model repository to run locally —
# confirm it is actually required for this model.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True, trust_remote_code=True)

def payload(row, reduce=False):
    """Render one row as a complete chat example and tokenize it.

    Args:
        row: a DataFrame row with "image" (task JSON string), "content"
            (essay text) and "scores_json" (label JSON string).
        reduce: when True the task JSON is compacted with reduce_task_json
            before it is embedded in the user message.

    Returns:
        The result of tokenizer.apply_chat_template(..., tokenize=True) for
        the system/user/assistant messages; callers take len() of it.
    """
    task = json.loads(row["image"])
    if reduce:
        task = reduce_task_json(task)

    # Parse the label first, as a validity check, before building the messages.
    parsed_scores = json.loads(row["scores_json"])

    user_text = json.dumps(
        {"task_json": task, "student_essay": row["content"]},
        ensure_ascii=False,
    )
    # The assistant turn is the label re-serialized, i.e. normalized JSON text.
    assistant_text = json.dumps(parsed_scores, ensure_ascii=False)

    chat = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": user_text},
        {"role": "assistant", "content": assistant_text},
    ]
    return tokenizer.apply_chat_template(chat, tokenize=True, add_generation_prompt=False)

# 95th-percentile chat length in tokens over the sample: raw task JSON first, then reduced.
lens_full = []
for _row_id, sample_row in sample.iterrows():
    lens_full.append(len(payload(sample_row, reduce=False)))
print("p95 full:", int(np.percentile(lens_full, 95)))

lens_red = []
for _row_id, sample_row in sample.iterrows():
    lens_red.append(len(payload(sample_row, reduce=True)))
print("p95 reduced:", int(np.percentile(lens_red, 95)))

p95 full: 3350
p95 reduced: 2384


Building dataset and balancing weights


In [None]:
# Token cap handed to SFTConfig(max_seq_length=...); it sits above the p95 of 3350 tokens
# measured for the unreduced task JSON in the sampling cell.
MAX_SEQ_LEN = 3500
# Seed constant for reproducible runs.
SEED = 42

In [None]:
# Load the chat-formatted train/validation JSONL files as a DatasetDict.
ds = load_dataset(
    "json",
    data_files={"train": str(TRAIN_JSONL), "validation": str(VAL_JSONL)},
)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True, trust_remote_code=True)
# Fall back to EOS as the pad token ONLY when the tokenizer has none.
# Do not overwrite an existing pad token with EOS: the language-modeling collator
# excludes pad-token positions from the loss, so pad == EOS would also exclude the
# end-of-turn token that closes the assistant answer, and the model would never be
# trained to stop after the score JSON.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token


Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

In [None]:
# Inspect the loaded splits (feature names and row counts).
ds

DatasetDict({
    train: Dataset({
        features: ['messages'],
        num_rows: 6044
    })
    validation: Dataset({
        features: ['messages'],
        num_rows: 768
    })
})

In [None]:
# Look at the first training example as a Python dict.
ds['train'][0]

{'messages': [{'role': 'system',
   'content': '\nYou are an IELTS Academic Writing Task 1 examiner.\n\nYou will receive TWO inputs:\n1) TASK_PROMPT_JSON: a structured JSON description of the Task 1 visual(s). Treat this as the ONLY ground truth.\n2) CANDIDATE_ESSAY: the candidate’s full written response.\n\nYour job: produce rubric-based band scores for IELTS Writing Task 1.\n\nSCORING SCALE (STRICT)\n- Score each criterion in 0.5 steps.\n- Criterion score range: 0.0 to 9.0 (inclusive).\n- All criterion scores must be multiples of 0.5.\n- IMPORTANT: overall_band_score must NEVER be 9.0. Cap overall_band_score at 8.5.\n\nCRITERIA (score all four)\n1) task_response_score (TR)\n   - Describe what is shown; no opinions/causes/solutions unless shown.\n   - MUST include a clear overview of main trends/major features (missing/unclear overview lowers TR).\n   - Select key features and comparisons; avoid listing everything.\n   - Accuracy is critical: penalize invented data, wrong figures/unit

In [None]:
import json
# Same example, pretty-printed as JSON for readability.
print(json.dumps(ds["train"][0], indent=2))


{
  "messages": [
    {
      "role": "system",
      "content": "\nYou are an IELTS Academic Writing Task 1 examiner.\n\nYou will receive TWO inputs:\n1) TASK_PROMPT_JSON: a structured JSON description of the Task 1 visual(s). Treat this as the ONLY ground truth.\n2) CANDIDATE_ESSAY: the candidate\u2019s full written response.\n\nYour job: produce rubric-based band scores for IELTS Writing Task 1.\n\nSCORING SCALE (STRICT)\n- Score each criterion in 0.5 steps.\n- Criterion score range: 0.0 to 9.0 (inclusive).\n- All criterion scores must be multiples of 0.5.\n- IMPORTANT: overall_band_score must NEVER be 9.0. Cap overall_band_score at 8.5.\n\nCRITERIA (score all four)\n1) task_response_score (TR)\n   - Describe what is shown; no opinions/causes/solutions unless shown.\n   - MUST include a clear overview of main trends/major features (missing/unclear overview lowers TR).\n   - Select key features and comparisons; avoid listing everything.\n   - Accuracy is critical: penalize invented d

In [None]:
def formatting_func(example):
    """Render an example's "messages" list into one chat-template string.

    The text is returned untokenized and without a trailing generation prompt,
    so the assistant turn from the data is part of the rendered string.
    """
    conversation = example["messages"]
    render_options = {"tokenize": False, "add_generation_prompt": False}
    return tokenizer.apply_chat_template(conversation, **render_options)

In [None]:
# Render the first training example with the chat template to check the final prompt text.
formatted = formatting_func(ds["train"][0])

In [None]:
# Show the rendered text, including the chat-template role markers.
print(formatted)

<|im_start|>system

You are an IELTS Academic Writing Task 1 examiner.

You will receive TWO inputs:
1) TASK_PROMPT_JSON: a structured JSON description of the Task 1 visual(s). Treat this as the ONLY ground truth.
2) CANDIDATE_ESSAY: the candidate’s full written response.

Your job: produce rubric-based band scores for IELTS Writing Task 1.

SCORING SCALE (STRICT)
- Score each criterion in 0.5 steps.
- Criterion score range: 0.0 to 9.0 (inclusive).
- All criterion scores must be multiples of 0.5.
- IMPORTANT: overall_band_score must NEVER be 9.0. Cap overall_band_score at 8.5.

CRITERIA (score all four)
1) task_response_score (TR)
   - Describe what is shown; no opinions/causes/solutions unless shown.
   - MUST include a clear overview of main trends/major features (missing/unclear overview lowers TR).
   - Select key features and comparisons; avoid listing everything.
   - Accuracy is critical: penalize invented data, wrong figures/units/time periods, or trends that contradict TASK_PR

# QLoRA model Setup

In [None]:
# Use bfloat16 when a CUDA device with compute capability major version >= 8 is present,
# otherwise fall back to float16.
_has_cuda_cc8 = torch.cuda.is_available() and torch.cuda.get_device_capability(0)[0] >= 8
if _has_cuda_cc8:
    compute_dtype = torch.bfloat16
else:
    compute_dtype = torch.float16

In [None]:
# Confirm which dtype was selected for this runtime.
compute_dtype

torch.bfloat16

In [None]:
# 4-bit NF4 quantization with double quantization; the compute dtype follows compute_dtype.
_quantization_settings = {
    "load_in_4bit": True,
    "bnb_4bit_quant_type": "nf4",
    "bnb_4bit_use_double_quant": True,
    "bnb_4bit_compute_dtype": compute_dtype,
}
bnb_config = BitsAndBytesConfig(**_quantization_settings)

In [None]:
# Load the base model with the 4-bit quantization config; device placement is left to "auto".
_model_load_options = {
    "device_map": "auto",
    "quantization_config": bnb_config,
    "trust_remote_code": True,
}
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, **_model_load_options)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [None]:
# Print the module tree to confirm the linear layers were replaced by 4-bit variants.
print(model)

Qwen2ForCausalLM(
  (model): Qwen2Model(
    (embed_tokens): Embedding(152064, 3584)
    (layers): ModuleList(
      (0-27): 28 x Qwen2DecoderLayer(
        (self_attn): Qwen2Attention(
          (q_proj): Linear4bit(in_features=3584, out_features=3584, bias=True)
          (k_proj): Linear4bit(in_features=3584, out_features=512, bias=True)
          (v_proj): Linear4bit(in_features=3584, out_features=512, bias=True)
          (o_proj): Linear4bit(in_features=3584, out_features=3584, bias=False)
        )
        (mlp): Qwen2MLP(
          (gate_proj): Linear4bit(in_features=3584, out_features=18944, bias=False)
          (up_proj): Linear4bit(in_features=3584, out_features=18944, bias=False)
          (down_proj): Linear4bit(in_features=18944, out_features=3584, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): Qwen2RMSNorm((3584,), eps=1e-06)
        (post_attention_layernorm): Qwen2RMSNorm((3584,), eps=1e-06)
      )
    )
    (norm): Qwen2RMSNorm((3584,), 

In [None]:
model.config.use_cache = False  # turn off the generation cache for training; needed with gradient checkpointing

Trainer config (baseline, no weights)

In [None]:
# Constants
HF_USER = "pralovmalla"
PROJECT_NAME = "qwen2.5-IELTS-writing-task1"

# Timestamped run name; it also becomes part of the Hub repository name.
RUN_NAME = datetime.now().strftime("%Y-%m-%d_%H.%M.%S")
PROJECT_RUN_NAME = "-".join((PROJECT_NAME, RUN_NAME))
HUB_MODEL_NAME = "/".join((HF_USER, PROJECT_RUN_NAME))

# Master switch for Weights & Biases logging.
LOG_TO_WANDB = True


In [None]:
# Log in to the Hugging Face Hub with the token stored under 'HF_TOKEN' in Colab's userdata.
hf_token = userdata.get('HF_TOKEN')
login(token=hf_token, add_to_git_credential=True)

In [None]:
import wandb
from google.colab import userdata

# Authenticate with the key stored under 'WANDB_API_KEY' in Colab's userdata.
# The key must be passed explicitly: a bare wandb.login() only succeeds when
# credentials are already cached on the machine, which a fresh runtime does not have.
wandb_api_key = userdata.get('WANDB_API_KEY')
wandb.login(key=wandb_api_key)

[34m[1mwandb[0m: Currently logged in as: [33mpralovmalla[0m ([33mpralovmalla-pralov-personal-education[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [None]:
# Start the W&B run explicitly so it uses this project and the timestamped run name.
if LOG_TO_WANDB:
  wandb.init(project=PROJECT_NAME, name=RUN_NAME)

In [None]:
import os

# W&B settings exposed through environment variables: project name,
# model-artifact logging mode, and what to watch on the model.
os.environ.update({
    "WANDB_PROJECT": PROJECT_NAME,
    "WANDB_LOG_MODEL": "checkpoint" if LOG_TO_WANDB else "end",
    "WANDB_WATCH": "gradients",
})

In [None]:
# LoRA adapters are attached to the four attention projections only.
_lora_target_modules = ["q_proj", "k_proj", "v_proj", "o_proj"]

peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules=_lora_target_modules,
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
)

In [None]:
# True when a CUDA device with compute capability major version >= 8 is available;
# used below to choose between bf16 and fp16 training.
bf16_ok = torch.cuda.is_available() and torch.cuda.get_device_capability(0)[0] >= 8

In [None]:
# Confirm the precision choice for this runtime.
bf16_ok

True

In [None]:
# Training configuration for the QLoRA baseline run.
# NOTE(review): eval_steps=20 with per_device_eval_batch_size=1 runs the whole
# validation split (768 rows) every 20 optimizer steps — consider a larger
# eval_steps if evaluation dominates wall-clock time.
args = SFTConfig(
    # --- admin / outputs ---
    output_dir=str(OUT_DIR_RUN),          # folder to save checkpoints/adapters
    run_name=RUN_NAME,
    seed=SEED,                            # wire the notebook-level seed into the trainer

    # --- training length ---
    num_train_epochs=2,
    max_steps=-1,

    # --- memory-safe defaults for QLoRA ---
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=16,       # effective batch of 16 sequences per optimizer step

    # --- stability ---
    learning_rate=1e-4,                  # changed from 2e-4
    warmup_ratio=0.03,
    lr_scheduler_type="cosine",
    weight_decay=0.0,
    max_grad_norm=0.3,

    # --- efficiency ---
    group_by_length=True,
    gradient_checkpointing=True,
    optim="paged_adamw_32bit",           # more stable than paged_adamw_8bit for many setups

    # --- precision (L4: bf16, T4: fp16) ---
    bf16=bf16_ok,
    fp16=not bf16_ok,

    # --- sequence length ---
    max_seq_length=MAX_SEQ_LEN,

    # --- logging / eval / saving (so W&B shows charts early) ---
    logging_strategy="steps",
    logging_steps=1,
    logging_first_step=True,

    eval_strategy="steps",
    eval_steps=20,

    save_strategy="steps",
    save_steps=20,
    save_total_limit=2,                   # keep only the two most recent checkpoints on Drive

    # --- W&B ---
    report_to="wandb" if LOG_TO_WANDB else "none",

    # --- Hub ---
    push_to_hub=True,
    hub_model_id=HUB_MODEL_NAME,
    hub_private_repo=True,
    hub_strategy="end",
)

Using `DataCollatorForCompletionOnlyLM` so that the model is trained to predict only the assistant's score JSON, not the whole prompt (system + user + visual JSON + essay)

In [None]:
from trl import DataCollatorForCompletionOnlyLM

# Marker that opens the assistant turn in the rendered chat text.
response_template = "<|im_start|>assistant\n"

# Collator built from that marker; it is intended to restrict the loss to the tokens that follow it.
# NOTE(review): an example truncated by max_seq_length before the marker would contain no
# response tokens — confirm how the collator treats such examples.
collator = DataCollatorForCompletionOnlyLM(
    response_template=response_template,
    tokenizer=tokenizer,
)

In [None]:
# Build the SFT trainer: quantized base model + LoRA config, chat-formatted text,
# and the completion-only collator.
trainer = SFTTrainer(
    model=model,
    processing_class=tokenizer,        # replaces the deprecated `tokenizer=` keyword
    train_dataset=ds["train"],
    eval_dataset=ds["validation"],
    args=args,
    peft_config=peft_config,
    formatting_func=formatting_func,   # chat formatting
    data_collator=collator,
)

  trainer = SFTTrainer(


Map:   0%|          | 0/6044 [00:00<?, ? examples/s]

Map:   0%|          | 0/768 [00:00<?, ? examples/s]

In [None]:
# Fine-tune!
trainer.train()

# Save the trained model files and the tokenizer into args.output_dir
# (<PROJECT_ROOT>/runs/qwen25_ielts_task1_qlora_balanced).
# NOTE(review): args sets push_to_hub=True with hub_model_id=HUB_MODEL_NAME, so save_model()
# may already upload to the Hub — confirm the explicit push below is still needed.
trainer.save_model()
tokenizer.save_pretrained(args.output_dir)
# Push the fine-tuned model to a private Hugging Face repository named after this run.
trainer.model.push_to_hub(PROJECT_RUN_NAME, private=True)
print(f"Saved to the hub: {PROJECT_RUN_NAME}")

  return fn(*args, **kwargs)


Step,Training Loss,Validation Loss
20,0.1382,0.14032
40,0.1155,0.111092
60,0.0952,0.098817
80,0.1065,0.097201
100,0.1109,0.093679
120,0.0917,0.087864
140,0.0814,0.087024
160,0.0912,0.087988
180,0.0779,0.086072
200,0.107,0.085704


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
[34m[1mwandb[0m: Adding directory to artifact (/content/drive/MyDrive/runs/qwen25_ielts_task1_qlora_balanced/checkpoint-20)... Done. 0.3s
  return fn(*args, **kwargs)
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
[34m[1mwandb[0m: Adding directory to artifact (/content/drive/MyDrive/runs/qwen25_ielts_task1_qlora_balanced/checkpoint-40)... Done. 0.3s
  return fn(*args, **kwargs)
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
[34m[1mwandb[0m: Adding directory to artifact (/content/drive/MyDrive/runs/qwen25_ielts_task1_qlora_balanced/checkpoint-60)... Done. 0.3s
  r

In [None]:
# Close the W&B run that was opened with wandb.init().
if LOG_TO_WANDB:
  wandb.finish()