<a href="https://colab.research.google.com/github/pralov-malla/Finetuning-Qwen2.5-Instruct-to-perform-rubric-based-scoring/blob/main/qlora_adapters.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os, shutil

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; the checkpoint and dataset paths used
# in this notebook all live under that mount point.
drive.mount("/content/drive")

In [None]:
# Source checkpoint directory and the Drive folder that receives the exported copy.
best_ckpt = "/content/drive/MyDrive/runs/qwen25_ielts_task1_qlora_balanced/checkpoint-480"
export_dir = "/content/drive/MyDrive/runs/qwen25_ielts_task1_qlora_balanced/best_adapter_ckpt480"
# Create the export folder if it does not exist yet; an existing folder is fine.
os.makedirs(export_dir, exist_ok=True)

In [None]:
# File names to copy from the checkpoint folder into the export folder.
# Names that do not exist in the checkpoint are skipped by the copy loop.
keep = [
    "adapter_model.safetensors",
    "adapter_config.json",
    "tokenizer.json",
    "tokenizer_config.json",
    "special_tokens_map.json",
    "added_tokens.json",
    "vocab.json",
    "merges.txt",
    "training_args.bin",
]

In [None]:
# Copy every listed file that is present in the checkpoint; absent files are
# skipped without any message.
for f in keep:
    src = os.path.join(best_ckpt, f)
    if not os.path.exists(src):
        continue
    shutil.copy2(src, os.path.join(export_dir, f))

In [None]:
# print("Exported files:", os.listdir(export_dir))

In [None]:
# Quietly (-q) install or upgrade (-U) the listed packages in the notebook runtime.
!pip -q install -U transformers accelerate peft bitsandbytes safetensors tqdm scikit-learn

In [None]:
from google.colab import userdata

In [None]:
from huggingface_hub import login, HfApi

In [None]:
# Read the Hugging Face token stored under the Colab secret name HF_TOKEN.
hf_token = userdata.get('HF_TOKEN')
# Log in to the Hub with that token.
# NOTE(review): add_to_git_credential=True presumably also hands the token to
# git's credential store — confirm against the huggingface_hub docs.
login(hf_token, add_to_git_credential=True)

In [None]:
# Constants identifying the Hub repository: "<user>/<project>" is the repo id
# used for both the upload and the later adapter download.
HF_USER = "pralovmalla"
PROJECT_NAME = "qwen2.5-IELTS-writing-task1-best-checkpoint-480"
repo_id = f"{HF_USER}/{PROJECT_NAME}"

In [None]:
# Create the private model repo on the Hub; exist_ok=True makes re-running
# this cell harmless when the repo is already there.
api = HfApi()
repo_id = f"{HF_USER}/{PROJECT_NAME}"
api.create_repo(repo_id=repo_id, private=True, exist_ok=True)

In [None]:
# Upload the contents of the export folder to the model repo created above.
api.upload_folder(folder_path=export_dir, repo_id=repo_id, repo_type="model")
print("Uploaded:", repo_id)

## Using Hugging Face for inference

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import PeftModel

In [None]:
# Base model the adapter is applied to, and the Hub repo that holds the adapter
# (the same "<user>/<project>" id that the export was uploaded to).
BASE_MODEL = "Qwen/Qwen2.5-7B-Instruct"
ADAPTER_REPO = f"{HF_USER}/{PROJECT_NAME}"

In [None]:
# Quantization settings for loading the base model: 4-bit weights, NF4 quant
# type, double quantization enabled, float16 as the compute dtype.
# NOTE(review): presumably these mirror the settings used during QLoRA
# training — confirm against the training notebook.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)

In [None]:
# The tokenizer is loaded from the base model repo, not from the adapter repo.
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True)
# If an input has to be truncated, drop tokens from the left (start) side.
tokenizer.truncation_side = "left"

In [None]:
# Load the base causal LM with the 4-bit quantization config defined earlier;
# device_map="auto" leaves device placement to the library.
base = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    device_map="auto",
    quantization_config=bnb_config,
    trust_remote_code=True,
)

In [None]:
# Wrap the quantized base model with the adapter downloaded from ADAPTER_REPO.
model = PeftModel.from_pretrained(base, ADAPTER_REPO, device_map="auto")

In [None]:
# Put the model into evaluation mode before running generation.
model.eval()

Next, set up the test data, helper functions, and prompt used for inference.

In [None]:
import pandas as pd
import numpy as np
# Do not truncate long cell contents when DataFrames are displayed.
pd.set_option('display.max_colwidth', None)

In [None]:
from google.colab import drive
# Drive is mounted again here (it is also mounted near the top of the notebook).
# NOTE(review): presumably repeated so the inference section can be run on its
# own in a fresh runtime — confirm.
drive.mount("/content/drive")

In [None]:
# Path of the test split on Drive; it is loaded into a DataFrame. The inference
# loop reads its "image" (task JSON string) and "content" (essay) columns.
CSV_PATH = "/content/drive/MyDrive/datasets/splits/test.csv"
test_df = pd.read_csv(CSV_PATH)

# Destination CSV for the predictions written by the inference loop.
OUT_CSV = "/content/drive/MyDrive/datasets/splits/pred_test.csv"

In [None]:
# Score keys kept from each parsed model output, in this order. They are the
# same five key names that SYSTEM_PROMPT asks the model to return.
KEYS = [
    "overall_band_score",
    "task_response_score",
    "coherence_cohesion_score",
    "lexical_resource_score",
    "grammatical_range_accuracy_score",
]

In [None]:

# Optional: lightweight reducer per visual_type (safe defaults)
def drop_empty(x):
    """Recursively strip empty values out of nested dicts and lists.

    An entry is removed when it equals ``None``, ``[]`` or ``{}``, or when it
    becomes ``[]`` or ``{}`` after its own contents have been cleaned.  Other
    falsy values such as ``0``, ``False`` and ``""`` are kept.  Anything that
    is neither a dict nor a list is returned unchanged.
    """
    if isinstance(x, dict):
        cleaned = {}
        for key, value in x.items():
            if value in (None, [], {}):
                continue
            pruned = drop_empty(value)
            if pruned in ([], {}):
                # The container emptied out once its own children were pruned.
                continue
            cleaned[key] = pruned
        return cleaned

    if isinstance(x, list):
        kept = []
        for value in x:
            if value in (None, [], {}):
                continue
            pruned = drop_empty(value)
            if pruned in ([], {}):
                continue
            kept.append(pruned)
        return kept

    return x


def reduce_task_json(meta: dict) -> dict:
    """Return a slimmed-down copy of a parsed Task 1 description.

    Only the fields read below survive: ``schema_version``,
    ``task_visual_category``, ``topic_context``, four ``global_semantics``
    entries, a reduced form of every dict entry in ``visuals`` and, when it is
    a list, ``relationships_between_visuals``.  When two or more visuals are
    present the category is forced to ``"multiple_graphs"``.

    Args:
        meta: Parsed task JSON.

    Returns:
        A new dict with the reduced content.  A ``meta`` that is not a dict is
        returned unchanged.
    """

    def as_list(value):
        """Return *value* when it is a list, otherwise ``None``."""
        return value if isinstance(value, list) else None

    def reduce_table(struct):
        """Keep the headers and rebuild the cells as a row-major matrix."""
        r = as_list(struct.get("row_headers"))
        c = as_list(struct.get("column_headers"))
        vals = as_list(struct.get("values"))
        out = {"row_headers": r, "column_headers": c}
        if r and c and vals:
            cell_map = {}
            for cell in vals:
                if not isinstance(cell, dict):
                    continue
                rr, cc = cell.get("row"), cell.get("column")
                if rr is None or cc is None:
                    continue
                cell_map.setdefault(rr, {})[cc] = cell.get("value")
            out["matrix"] = [[cell_map.get(rr, {}).get(cc) for cc in c] for rr in r]
        return drop_empty(out)

    def reduce_bar(struct):
        """Keep chart type, orientation, categories and per-series values."""
        cats = as_list(struct.get("categories"))
        series = as_list(struct.get("series"))
        out = {
            "bar_chart_type": struct.get("bar_chart_type"),
            "orientation": struct.get("orientation"),
            "categories": cats,
        }
        if cats and series:
            out["series"] = []
            for s in series:
                if not isinstance(s, dict):
                    continue
                mp = {}
                for d in as_list(s.get("data")) or []:
                    if not isinstance(d, dict):
                        continue
                    cat = d.get("category")
                    if cat is None:
                        continue
                    mp[cat] = d.get("value")
                # Values are emitted in the order of the chart's category list.
                out["series"].append({"label": s.get("label"), "values": [mp.get(c) for c in cats]})
        return drop_empty(out)

    def reduce_line(struct):
        """Keep axis info and the y-values of each series, by point position."""
        xl = as_list(struct.get("x_labels"))
        series = as_list(struct.get("series"))
        out = {"x_axis_type": struct.get("x_axis_type"), "x_labels": xl, "y_unit": struct.get("y_unit")}
        if xl and series:
            out["series"] = []
            for s in series:
                if not isinstance(s, dict):
                    continue
                pts = as_list(s.get("points")) or []
                yvals = [None] * len(xl)
                # Points beyond the number of x labels are ignored.
                for i, p in enumerate(pts[:len(xl)]):
                    if not isinstance(p, dict):
                        # Skip a malformed point but keep reading the rest, as
                        # the other reducers do; previously one bad entry
                        # discarded every point after it.
                        continue
                    yvals[i] = p.get("y_value")
                # NOTE(review): drop_empty() removes None entries from lists, so
                # the placeholders above do not survive into the result —
                # confirm positional alignment with x_labels is not needed.
                out["series"].append({"label": s.get("label"), "y_values": yvals})
        return drop_empty(out)

    def reduce_pie(struct):
        """Keep the context label, donut flag and label/percentage per slice."""
        slices = as_list(struct.get("slices"))
        out = {"context_label": struct.get("context_label"), "is_donut_chart": struct.get("is_donut_chart")}
        if slices:
            out["slices"] = [
                {"label": s.get("label"), "percentage": s.get("percentage")}
                for s in slices
                if isinstance(s, dict)
            ]
        return drop_empty(out)

    def reduce_process(struct):
        """Keep the title, cycle flag and name/order of each stage."""
        stages = as_list(struct.get("stages"))
        out = {"process_title": struct.get("process_title"), "is_cycle": struct.get("is_cycle")}
        if stages:
            out["stages"] = [
                {"name": s.get("name"), "order_index": s.get("order_index")}
                for s in stages
                if isinstance(s, dict)
            ]
        return drop_empty(out)

    def reduce_map(struct):
        """Keep the region description, per-scenario features and summaries."""
        out = {"base_region_description": struct.get("base_region_description")}
        sc = struct.get("scenarios")
        if isinstance(sc, list):
            out["scenarios"] = []
            for s in sc:
                if not isinstance(s, dict):
                    continue
                f_out = [
                    {
                        "label": f.get("label"),
                        "type": f.get("type"),
                        "category": f.get("category"),
                        "status": f.get("status"),
                    }
                    for f in as_list(s.get("features")) or []
                    if isinstance(f, dict)
                ]
                out["scenarios"].append({"label": s.get("label"), "features": f_out})
        out["changes_between_scenarios"] = struct.get("changes_between_scenarios")
        out["summary"] = struct.get("summary")
        return drop_empty(out)

    # visual_type -> reducer; unknown types keep their structure untouched.
    reducers = {
        "table": reduce_table,
        "bar_chart": reduce_bar,
        "line_graph": reduce_line,
        "pie_chart": reduce_pie,
        "process_diagram": reduce_process,
        "map": reduce_map,
    }

    if not isinstance(meta, dict):
        return meta
    tcat = meta.get("task_visual_category")
    visuals = as_list(meta.get("visuals")) or []
    if len(visuals) >= 2 and tcat != "multiple_graphs":
        tcat = "multiple_graphs"

    out = {
        "schema_version": meta.get("schema_version"),
        "task_visual_category": tcat,
        "topic_context": meta.get("topic_context"),
    }
    if isinstance(meta.get("global_semantics"), dict):
        gs = meta["global_semantics"]
        out["global_semantics"] = {
            "overview": gs.get("overview"),
            "key_features": gs.get("key_features"),
            "extremes": gs.get("extremes"),
            "comparisons": gs.get("comparisons"),
        }

    v_out = []
    for v in visuals:
        if not isinstance(v, dict):
            continue
        vtype = v.get("visual_type")
        struct = v.get("structure") if isinstance(v.get("structure"), dict) else {}
        # Only string types are looked up, so an unhashable value cannot raise.
        reducer = reducers.get(vtype) if isinstance(vtype, str) else None
        s_red = reducer(struct) if reducer is not None else struct
        v_out.append(drop_empty({
            "visual_id": v.get("visual_id"),
            "visual_type": vtype,
            "role": v.get("role"),
            "panel_label": v.get("panel_label"),
            "title": v.get("title"),
            "structure": s_red,
        }))
    if v_out:
        out["visuals"] = v_out

    if isinstance(meta.get("relationships_between_visuals"), list):
        out["relationships_between_visuals"] = meta.get("relationships_between_visuals")

    return out

In [None]:
# When True, build_user() shrinks the task JSON with reduce_task_json() first.
REDUCE_JSON = True


def build_user(task_json_str: str, essay: str) -> str:
    """Build the user message from a task JSON string and the essay text.

    The JSON string is parsed, reduced with ``reduce_task_json`` when
    ``REDUCE_JSON`` is set, serialised again without ASCII escaping, and
    placed under the ``TASK_PROMPT_JSON`` header ahead of the essay.
    """
    parsed = json.loads(task_json_str)
    payload = reduce_task_json(parsed) if REDUCE_JSON else parsed
    small_str = json.dumps(payload, ensure_ascii=False)
    return f"TASK_PROMPT_JSON:\n{small_str}\n\nCANDIDATE_ESSAY:\n{essay}\n"

In [None]:
import json

def extract_json(text: str):
    """Return the first JSON object embedded in *text*, or ``None``.

    Decoding starts at the first ``{`` and stops at the end of that JSON
    value, so text before the brace and after the object is ignored.

    Args:
        text: Raw text, e.g. decoded model output.

    Returns:
        The decoded object, or ``None`` when *text* contains no ``{`` or the
        text from that brace onward does not start with valid JSON.
    """
    text = text.strip()
    start = text.find("{")
    if start == -1:
        return None
    try:
        obj, _ = json.JSONDecoder().raw_decode(text[start:])
    except (json.JSONDecodeError, RecursionError):
        # Only decoding failures count as "no usable JSON". A bare except
        # would also swallow KeyboardInterrupt/SystemExit during a long run.
        return None
    return obj

In [None]:
# QWK needs integer labels. 0.0..9.0 step 0.5 => 0..18
def round_half(x: float) -> float:
    """Round *x* to the nearest multiple of 0.5, with exact ties going up.

    The built-in ``round`` rounds ties to the nearest even integer, so
    ``round(6.25 * 2) / 2`` gives 6.0.  The scoring rule in SYSTEM_PROMPT says
    x.25 and x.75 must round UP, which is what flooring ``2x + 0.5`` does.

    Args:
        x: Score to round.

    Returns:
        The nearest multiple of 0.5 as a float; multiply by 2 for an integer
        label.
    """
    import math  # local import keeps this cell self-contained

    return math.floor(x * 2 + 0.5) / 2

In [None]:
# System message sent with every scoring request. It describes the two inputs
# (TASK_PROMPT_JSON and CANDIDATE_ESSAY), the four criteria, the rounding rule
# for the overall band, and requires a single JSON object whose five keys are
# the same names listed in KEYS.
SYSTEM_PROMPT = """
You are an IELTS Academic Writing Task 1 examiner.

You will receive TWO inputs:
1) TASK_PROMPT_JSON: a structured JSON description of the Task 1 visual(s). Treat this as the ONLY ground truth.
2) CANDIDATE_ESSAY: the candidate’s full written response.

Your job: produce rubric-based band scores for IELTS Writing Task 1.

SCORING SCALE (STRICT)
- Score each criterion in 0.5 steps.
- Criterion score range: 0.0 to 9.0 (inclusive).
- All criterion scores must be multiples of 0.5.
- IMPORTANT: overall_band_score must NEVER be 9.0. Cap overall_band_score at 8.5.

CRITERIA (score all four)
1) task_response_score (TR)
   - Describe what is shown; no opinions/causes/solutions unless shown.
   - MUST include a clear overview of main trends/major features (missing/unclear overview lowers TR).
   - Select key features and comparisons; avoid listing everything.
   - Accuracy is critical: penalize invented data, wrong figures/units/time periods, or trends that contradict TASK_PROMPT_JSON.
2) coherence_cohesion_score (CC)
   - Logical paragraphing (intro + overview + grouped details), clear progression, appropriate linking.
3) lexical_resource_score (LR)
   - Precise academic reporting vocabulary; accurate collocations for data (rise to/by, remain stable, peak at, etc.); avoid repetition.
4) grammatical_range_accuracy_score (GRA)
   - Range + accuracy; frequent errors and awkward structures reduce score.

WORD COUNT RULE
- If the essay is clearly under ~150 words, apply a noticeable penalty (especially TR, and often CC).

LOW-SCORE VERIFICATION (IMPORTANT)
If your initial scoring suggests ANY criterion < 4.5, you MUST do a second, rigorous check BEFORE finalizing:
A) Re-check TR basics: is there at least an attempt at paraphrase + overview + some data/features (even if weak)?
B) Re-check whether errors are truly severe enough to justify <4.5 versus a weak-but-present response (≈4.5–5.0).
C) Re-check that you are not over-penalizing for grammar/vocabulary when the task meaning is still recoverable.
D) Only keep a score <4.5 if the response is clearly extremely limited (e.g., no real overview, very little/incorrect description, heavy invention, or meaning mostly unclear).

OVERALL BAND (STRICT)
- overall_band_score = average of the four criterion scores.
- Round to the nearest 0.5.
- If exactly halfway between two 0.5 steps (x.25 or x.75), round UP.
- After rounding, if overall_band_score == 9.0, set overall_band_score = 8.5.

OUTPUT FORMAT (STRICT)
Return ONLY one valid JSON object with exactly these keys and numeric values (no extra keys, no explanation, no markdown, no surrounding text).

{
  "overall_band_score": number,
  "task_response_score": number,
  "coherence_cohesion_score": number,
  "lexical_resource_score": number,
  "grammatical_range_accuracy_score": number
}
"""


In [None]:
import json
from tqdm import tqdm
import torch

# Batch inference over the test split: one greedy generation per row.
pred_scores = []   # one JSON string per row, or "" when the output had no JSON
bad_outputs = []   # (row index, first 500 chars of output) for unparsable rows

for idx, row in tqdm(test_df.iterrows(), total=len(test_df)):
    # build_user() parses (and, if enabled, reduces) the raw JSON string itself.
    task_json_string = row["image"]
    essay = row["content"]

    messages = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": build_user(task_json_string, essay)},
    ]

    prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True).to(model.device)

    with torch.inference_mode():
        out = model.generate(**inputs, max_new_tokens=250, do_sample=False)

    # Slice off the prompt so only the newly generated tokens are decoded.
    gen_ids = out[0][inputs["input_ids"].shape[-1]:]
    decoded = tokenizer.decode(gen_ids, skip_special_tokens=True).strip()

    # Parse once (the output was previously parsed twice in a row).
    pred = extract_json(decoded)
    if isinstance(pred, dict):
        # Keep only the expected score keys; a missing key becomes null.
        pred = {k: pred.get(k) for k in KEYS}
        pred_scores.append(json.dumps(pred, ensure_ascii=False))
    else:
        pred_scores.append("")
        bad_outputs.append((idx, decoded[:500]))

# Store the predictions next to the inputs they were generated from.
pred_test_df = test_df[["image", "content"]].copy()
pred_test_df["pred_scores"] = pred_scores

OUT_CSV = "/content/drive/MyDrive/datasets/splits/pred_test.csv"
pred_test_df.to_csv(OUT_CSV, index=False)
print("Saved:", OUT_CSV)

print("Failed JSON rows:", len(bad_outputs))
print("Example failed output:", bad_outputs[0] if bad_outputs else "None")

In [None]:
import json

# Same truncation side as set when the tokenizer was loaded.
tokenizer.truncation_side = "left"

# Single-row check: take the first test row and build a reduced copy of its
# task JSON for inspection.
row = test_df.iloc[0]
task_meta = json.loads(row["image"])
task_small = reduce_task_json(task_meta)
# NOTE(review): task_json_str holds the ALREADY reduced JSON. build_user()
# applies reduce_task_json() again when REDUCE_JSON is True, so passing this
# string to build_user() reduces twice; pass row["image"] to it instead.
task_json_str = json.dumps(task_small, ensure_ascii=False)
essay = row["content"]

In [None]:
# build_user() expects the raw task JSON string and applies reduce_task_json()
# itself when REDUCE_JSON is set. Passing an already reduced string reduces it
# a second time, and that second pass drops table/bar/line values because the
# reduced form no longer carries the "values"/"data"/"points" fields. Use the
# original column value so this prompt matches the one built in the batch loop.
messages = [
    {"role": "system", "content": SYSTEM_PROMPT},
    {"role": "user", "content": build_user(row["image"], essay)},
]

prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = tokenizer(prompt, return_tensors="pt", truncation=True).to(model.device)

In [None]:
# Generate for the single prompt: greedy decoding (do_sample=False), at most
# 250 new tokens, inside torch's inference mode.
with torch.inference_mode():
    out = model.generate(**inputs, max_new_tokens=250, do_sample=False)

In [None]:
# Drop the prompt tokens from the generated sequence, decode the remainder and
# try to parse a JSON object out of it (pred is None when parsing fails).
gen_ids = out[0][inputs["input_ids"].shape[-1]:]   # only new tokens
decoded = tokenizer.decode(gen_ids, skip_special_tokens=True).strip()
pred = extract_json(decoded)

In [None]:
# Display the raw generated text for inspection.
decoded

In [None]:
# Display the parsed result (a dict, or None if no JSON could be extracted).
pred