<a href="https://colab.research.google.com/github/shuyu-M/Chain_of_Thought/blob/main/Lora.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:


# 1. Files to upload before running
# Need to upload the following files to the working directory (same folder as this notebook):
#
# - rubric_simple_fixed.csv
# - scored_outputs_preview_long.csv
# The fine-tuning code will merge these files and save a clean version as:
#     finetune_data_clean.csv
#
# 2. Fine-tuning part
# The fine-tuning section (LoRA training) should be run on their GPU machine.
# It trains for 20 epochs and saves the LoRA adapter weights automatically to:
#     outputs/qwen3-lora-awe/
#
# 3. Inference + Evaluation
# After training, you can run the inference section to generate predictions
# and save them to:
#     pred_qwen3_like_single_all.csv
#


In [5]:
from google.colab import files
files.upload()  # 手动上传 CSV 文件



Saving finetune_data_clean.csv to finetune_data_clean.csv
Saving pred_qwen3_like_single_all.csv to pred_qwen3_like_single_all (2).csv


{'finetune_data_clean.csv': b'prompt,completion\n"Question: To segment the rose petals [4 marks]:      \n\nRubric:\n**Score 2 (Correct):** Clearly and specifically addresses the key requirement by covering narrow range around 0 360 degress, all primary colors pure colors, and broad range around 0.5 with accurate details.\n**Score 1 (Partial):** Covers some key elements (at least 2 of: narrow range around 0 360 degress, all primary colors pure colors, and broad range around 0.5) but lacks clarity or misses essential detail.\n**Score 0 (Incorrect):** Does not address the key requirement or fails to mention narrow range around 0 360 degress, all primary colors pure colors, and broad range around 0.5.\n\nStudent Response: A narrow range around 0/360 degree for H (due to red color of petals). A broad range of I centered around 0.5. A broad range of S centered around 0.5.\n\nPlease assign a score (0,1,2) and briefly explain your reasoning.","Score: 2\nFeedback: "\n"Question: Two      advanta

In [27]:

import os, re, glob
import pandas as pd
# Helper function to locate files within /mnt/data
def find_file(name):
    for p in [f"./{name}", f"/mnt/data/{name}"]:
        if os.path.exists(p): return p
    hits = glob.glob(f"**/{name}", recursive=True)
    return hits[0] if hits else None

rubric_path = find_file("rubrics_simple_fixed.csv")
score_path  = find_file("scored_outputs_preview_long.csv")
assert rubric_path and score_path, f"找不到文件：{rubric_path=}, {score_path=}"
print("✅ Found:", rubric_path, score_path)

rubric_df = pd.read_csv(rubric_path)
score_df  = pd.read_csv(score_path)

# Verify required columns
need_rubric_cols = {"question","score2","score1","score0"}
need_score_cols  = {"question","answer"}
assert need_rubric_cols.issubset(rubric_df.columns), rubric_df.columns
assert need_score_cols.issubset(score_df.columns), score_df.columns

# Merge rubric and score dataframes by question
merged = score_df.merge(rubric_df, on="question", how="left", suffixes=("", "_rubric"))

# Combine rubric text for score levels 0/1/2 into one paragraph
def colpick(df, base):
    for c in (f"{base}_rubric", base):
        if c in df.columns: return df[c].fillna("").astype(str)
    return ""

merged["Rubric_Text"] = (
    "**Score 2 (Correct):** " + colpick(merged, "score2") + "\n" +
    "**Score 1 (Partial):** " + colpick(merged, "score1") + "\n" +
    "**Score 0 (Incorrect):** " + colpick(merged, "score0")
)


# Determine the gold score for each item (prefer human 'score', fallback to 'pred')
if "score" in merged.columns:
    merged["Final_Score"] = merged["score"]
else:
    merged["Final_Score"] = None

if "pred" in merged.columns:
    merged["Final_Score"] = merged["Final_Score"].where(merged["Final_Score"].notna(), merged["pred"])

merged = merged[merged["Final_Score"].notna()].copy()
merged["Final_Score"] = merged["Final_Score"].astype(float).round().astype(int).clip(0,2)

# Construct prompt text used for fine-tuning
merged["prompt"] = (
    "Question: " + merged["question"].astype(str) + "\n\n"
    "Rubric:\n" + merged["Rubric_Text"] + "\n\n"
    "Student Response: " + merged["answer"].astype(str) + "\n\n"
    "Please assign a score (0,1,2) and briefly explain your reasoning."
)

# Regular expressions for cleaning system or think tags
SYSTEM_PAT  = re.compile(r"(?i)(You are a strict.*|Output only.*|No extra.*)")
THINK_BLOCK = re.compile(r"<think>.*?</think>", re.S | re.I)
PROMPT_PAT  = re.compile(r"(?is)(Question:|Rubric:|Student Response:).*")  # 截断误拼的prompt尾巴
TAG_PAT     = re.compile(r"(?i)^(assistant|system)\s*:?\s*")
# Function to extract clean "Score" and "Feedback" from model output
def clean_completion(raw, gold_score):
    txt = (raw or "")
    txt = THINK_BLOCK.sub("", txt)
    txt = SYSTEM_PAT.sub("", txt)
    txt = PROMPT_PAT.sub("", txt)
    txt = TAG_PAT.sub("", txt)
    lines = [l.strip() for l in txt.splitlines() if l.strip()]
    score_line = next((l for l in lines if l.lower().startswith("score")),  None)
    feed_line  = next((l for l in lines if l.lower().startswith("feedback")), None)
    if not score_line:
        score_line = f"Score: {int(gold_score)}"
    else:
        m = re.search(r"(-?\d+)", score_line)
        score_line = f"Score: {min(2,max(0,int(m.group(1))))}" if m else f"Score: {int(gold_score)}"
    if not feed_line:
        feed_line = "Feedback: "
    words = feed_line.split()
    if len(words) > 120:
        feed_line = " ".join(words[:120]) + " …"
    return score_line + "\n" + feed_line

# Clean generated completions
gen_text = merged["gen_text"] if "gen_text" in merged.columns else pd.Series([""] * len(merged))
def _to_two_lines(raw, gold):
    try:
        return clean_completion(raw, gold)
    except NameError:
        fb = (raw or "").strip().splitlines()[0] if isinstance(raw, str) else ""
        fb = fb[:250]
        return f"Score: {int(gold)}\nFeedback: {fb}"

merged["completion"] = [
    _to_two_lines(gen_text.iloc[i], int(merged["Final_Score"].iloc[i]))
    for i in range(len(merged))
]
ft_cols = [c for c in ["prompt", "completion"] if c in merged.columns]
ft = merged[ft_cols].dropna().copy() if len(ft_cols) == 2 else pd.DataFrame(columns=["prompt","completion"])
ft.to_csv("finetune_data_clean.csv", index=False)
print("saved: finetune_data_clean.csv | rows:", len(ft))




✅ Found: ./rubrics_simple_fixed.csv ./scored_outputs_preview_long.csv
✅ saved: finetune_data_clean.csv | rows: 212


In [None]:
pip install -U transformers==4.40.2 trl==0.9.4 peft==0.10.0 accelerate bitsandbytes datasets


Collecting transformers==4.40.2
  Downloading transformers-4.40.2-py3-none-any.whl.metadata (137 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/138.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m138.0/138.0 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting trl==0.9.4
  Downloading trl-0.9.4-py3-none-any.whl.metadata (11 kB)
Collecting peft==0.10.0
  Downloading peft-0.10.0-py3-none-any.whl.metadata (13 kB)
Collecting accelerate
  Downloading accelerate-1.11.0-py3-none-any.whl.metadata (19 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.48.1-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Collecting datasets
  Downloading datasets-4.2.0-py3-none-any.whl.metadata (18 kB)
Collecting tokenizers<0.20,>=0.19 (from transformers==4.40.2)
  Downloading tokenizers-0.19.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Collecting tyro>=0.5.11 (from trl==0.9.4)

In [None]:
pip install -U "transformers>=4.46.2" "trl>=0.9.6" "peft>=0.11.1" "accelerate>=0.33.0" "tokenizers>=0.20.1" bitsandbytes datasets


Collecting transformers>=4.46.2
  Downloading transformers-4.57.1-py3-none-any.whl.metadata (43 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/44.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.0/44.0 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting trl>=0.9.6
  Downloading trl-0.24.0-py3-none-any.whl.metadata (11 kB)
Collecting peft>=0.11.1
  Downloading peft-0.17.1-py3-none-any.whl.metadata (14 kB)
Collecting tokenizers>=0.20.1
  Downloading tokenizers-0.22.1-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.8 kB)
Downloading transformers-4.57.1-py3-none-any.whl (12.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.0/12.0 MB[0m [31m49.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading trl-0.24.0-py3-none-any.whl (423 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m423.1/423.1 kB[0m [31m18.2 MB/s[0m eta [36m0:00:00[0m
[?2

In [None]:
pip install -U "trl==0.9.6" "transformers>=4.46.2" "peft>=0.11.1" "tokenizers>=0.20.1" accelerate bitsandbytes datasets


Collecting trl==0.9.6
  Downloading trl-0.9.6-py3-none-any.whl.metadata (12 kB)
Collecting numpy<2.0.0,>=1.18.2 (from trl==0.9.6)
  Downloading numpy-1.26.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
Downloading trl-0.9.6-py3-none-any.whl (245 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m245.8/245.8 kB[0m [31m12.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading numpy-1.26.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m18.0/18.0 MB[0m [31m79.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: numpy, trl
  Attempting uninstall: numpy
    Found existing installation: numpy 2.0.2
    Uninstalling numpy-2.0.2:
      Successfully uninstalled numpy-2.0.2
  Attempting uninstall: trl
    Found existing installation: tr

In [None]:


import inspect
# Define a text formatting function — must return a list of strings
def fmt(ex):
    p = ex["prompt"]
    c = ex["completion"]
    return [f"{p}\n{c}"]

# Configure the SFT training parameters
sft_config = SFTConfig(
    output_dir=OUTPUT_DIR,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    num_train_epochs=20,
    learning_rate=2e-4,
    logging_steps=10,
    save_steps=200,
    report_to=[]
)

# Dynamically check SFTTrainer's signature for compatibility
sig = inspect.signature(SFTTrainer.__init__)
use_processing = "processing_class" in sig.parameters
use_args = "args" in sig.parameters
supports_text_field = "dataset_text_field" in sig.parameters
supports_max_seq = "max_seq_length" in sig.parameters

# Build trainer configuration dictionary
trainer_kwargs = dict(
    model=model,
    train_dataset=ds["train"],
    peft_config=lora_cfg,
)


# Attach tokenizer or processing class depending on trainer version
if use_processing:
    trainer_kwargs["processing_class"] = tokenizer
else:
    trainer_kwargs["tokenizer"] = tokenizer

# Add the training configuration
if use_args:
    trainer_kwargs["args"] = sft_config
else:
    trainer_kwargs["config"] = sft_config

# Pass the text formatting function
if supports_text_field:
    trainer_kwargs["formatting_func"] = fmt
else:
    trainer_kwargs["formatting_func"] = fmt

if supports_max_seq:
    trainer_kwargs["max_seq_length"] = 1024


trainer = SFTTrainer(**trainer_kwargs)
trainer.train()

# Save the LoRA adapter and tokenizer
os.makedirs(OUTPUT_DIR, exist_ok=True)
trainer.save_model(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)

print(f" LoRA adapter saved to {OUTPUT_DIR}")





Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.


Map:   0%|          | 0/212 [00:00<?, ? examples/s]

  super().__init__(
The model is already on multiple devices. Skipping the move to device specified in `args`.
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': None, 'pad_token_id': 151645}.


Step,Training Loss


✅ LoRA adapter saved to outputs/qwen3-lora-awe


In [None]:

base_model = "Qwen/Qwen3-1.7B"
adapter_dir = "outputs/qwen3-lora-awe"

import torch, gc
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import PeftModel

# Clear CUDA cache to prevent out-of-memory errors
gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()

bnb_cfg = BitsAndBytesConfig(
    load_in_8bit=True,
    llm_int8_threshold=6.0,
)

tokenizer = AutoTokenizer.from_pretrained(base_model, use_fast=False, trust_remote_code=True)

base = AutoModelForCausalLM.from_pretrained(
    base_model,
    trust_remote_code=True,
    quantization_config=bnb_cfg,
    device_map="auto"
)

# Set padding and EOS tokens if missing
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
base.config.pad_token_id = tokenizer.pad_token_id
base.config.eos_token_id = tokenizer.eos_token_id

# Load and merge the LoRA adapter with the base model
model = PeftModel.from_pretrained(base, adapter_dir)
model = model.merge_and_unload()
model.eval()

print(f"Inference model ready with LoRA from: {adapter_dir}")

# Example prompt for inference
prompt = """Question: What are the advantages of using a lazy allocation policy in an OS?
Student Response: It avoids allocating memory until it is actually needed, reducing waste.
"""

inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

with torch.inference_mode():
    out = model.generate(
        **inputs,
        max_new_tokens=128,
        do_sample=False,
    )

print(tokenizer.decode(out[0], skip_special_tokens=True))




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


✅ Inference model ready with LoRA from: outputs/qwen3-lora-awe
Question: What are the advantages of using a lazy allocation policy in an OS?
Student Response: It avoids allocating memory until it is actually needed, reducing waste.
Correct Response: It reduces the number of page faults by allowing the page to be loaded into memory only when it is needed.
The student response is correct. The correct response is also correct. The student response is correct. The correct response is also correct. The student response is correct. The correct response is also correct. The student response is correct. The correct response is also correct. The student response is correct. The correct response is also correct. The student response is correct. The correct response is also correct. The student response is correct. The correct response is also correct. The student response is correct. The correct response is also correct


In [None]:

import torch, re
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import PeftModel

BASE_MODEL = "Qwen/Qwen3-1.7B"
ADAPTER_DIR = "outputs/qwen3-lora-awe"

# Function to load the LoRA-fine-tuned model
def load_lora_model(
    base=BASE_MODEL,
    adapter=ADAPTER_DIR,
    load_in_8bit=True,
):
    if load_in_8bit:
        bnb = BitsAndBytesConfig(load_in_8bit=True)
    else:
        bnb = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.float16)
    # Load tokenizer and base model
    tok = AutoTokenizer.from_pretrained(base, use_fast=False, trust_remote_code=True)
    mdl = AutoModelForCausalLM.from_pretrained(
        base,
        quantization_config=bnb,
        trust_remote_code=True,
        device_map="auto",
    )

    # Ensure pad / eos tokens are defined
    if tok.pad_token is None:
        tok.pad_token = tok.eos_token
    mdl.config.pad_token_id = tok.pad_token_id
    mdl.config.eos_token_id = tok.eos_token_id

   # Load LoRA adapter and merge into main model
    mdl = PeftModel.from_pretrained(mdl, adapter)
    mdl = mdl.merge_and_unload()
    mdl.eval()
    return tok, mdl

# Load tokenizer and model once
tokenizer, model = load_lora_model()
print("LoRA model is ready.")

# Instruction header given to the model before every prompt
STRICT_HEADER = (
    "You are a grading assistant.\n"
    "Output exactly TWO lines, nothing else.\n"
    "Line 1: Score: <0 or 1 or 2>\n"
    "Line 2: Feedback: <one concise sentence>\n"
    "Begin output now:\n"
)


# Convert any model output to a clean two-line response
def normalize_to_two_lines(text: str):
    text = re.sub(r"\s+", " ", text).strip()
    m1 = re.search(r"Score\s*:\s*(\-?\d+)", text, re.I)
    score = max(0, min(2, int(m1.group(1)))) if m1 else 1
    m2 = re.search(r"Feedback\s*:\s*(.+)", text, re.I|re.S)
    fb = m2.group(1).strip() if m2 else ""
    if not fb:
        fb = {2:"Correct and clearly justified.",
              1:"Partially correct; missing key detail.",
              0:"Incorrect or unsupported."}.get(score, "")
    return f"Score: {score}\nFeedback: {fb}", score



# Single-prompt inference with retry for malformed output
def predict_one(raw_prompt: str, max_new_tokens=120, retry=True):
    full = STRICT_HEADER + "\n" + raw_prompt.strip()
    inputs = tokenizer(full, return_tensors="pt").to(model.device)
    out = model.generate(**inputs, max_new_tokens=max_new_tokens, do_sample=False, temperature=0.0)
    text = tokenizer.decode(out[0], skip_special_tokens=True)
    gen = text[len(full):].strip()
    gen_ids = model.generate(
    inputs,
    max_new_tokens=32,
    do_sample=False,
    temperature=0.0,
    top_p=1.0,
    repetition_penalty=1.0,
    eos_token_id=tokenizer.eos_token_id,
    pad_token_id=tokenizer.eos_token_id
)

    # Retry if model forgot to include "Score:"
    if "Score:" not in gen and retry:
        follow = "\nReturn ONLY:\nScore: <0/1/2>\nFeedback: <one sentence>"
        inputs = tokenizer(full + follow, return_tensors="pt").to(model.device)
        out = model.generate(**inputs, max_new_tokens=80, do_sample=False, temperature=0.0)
        text = tokenizer.decode(out[0], skip_special_tokens=True)
        gen = text[len(full + follow):].strip()

    gen_clean, score = normalize_to_two_lines(gen)
    return gen_clean, score


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



✅ LoRA model is ready.


In [50]:
# Inference cell: hard-format, deterministic decode, robust parsing
import re
import numpy as np

# Ensure tokenizer and model have padding tokens set
if getattr(tokenizer, "pad_token", None) is None:
    tokenizer.pad_token = tokenizer.eos_token
if getattr(model.config, "pad_token_id", None) is None:
    model.config.pad_token_id = tokenizer.pad_token_id

# Select inference mode (either "Thinking" or "Non-thinking")
MODE = globals().get("MODE", "Non-thinking")


# Instruction header provided to the model before every scoring task
STRICT_HEADER = (
    "You are a strict grading assistant.\n"
    "You must output exactly TWO lines:\n"
    "Line 1: Score: <0 or 1 or 2>\n"
    "Line 2: Feedback: <one concise sentence>\n\n"
    "Scoring rules:\n"
    "- Score 2: Answer is fully correct, covers all rubric requirements clearly.\n"
    "- Score 1: Partially correct OR missing some key details.\n"
    "- Score 0: Incorrect, irrelevant, or fails to address the question.\n"
    "Be especially strict—if unsure, choose a LOWER score.\n"
    "Begin output now:\n"
)

# Generation hyperparameters for stable, deterministic decoding
GEN_KW = dict(
    max_new_tokens=64,
    do_sample=False,
    temperature=0.0,
    top_p=1.0,
    repetition_penalty=1.0,
    eos_token_id=tokenizer.eos_token_id,
    pad_token_id=tokenizer.eos_token_id,
    early_stopping=True
)


# Extract numeric score from text
def parse_score(text: str) -> int:
    m = re.search(r"[012]", str(text))
    return int(m.group()) if m else 0

# Convert raw text to “Score + Feedback” two-line format
def to_two_lines(raw_text: str) -> str:
    t = (raw_text or "").strip()
    m_begin = re.search(r"Begin output now\s*:?", t, flags=re.I)
    if m_begin:
        t = t[m_begin.end():].strip()

    m_scores = list(re.finditer(r"Score\s*:\s*([012])", t, flags=re.I))
    m_feeds  = list(re.finditer(r"Feedback\s*:\s*(.+)", t, flags=re.I | re.S))

    s  = int(m_scores[-1].group(1)) if m_scores else 0
    fb = m_feeds[-1].group(1).strip() if m_feeds else t.strip()

    fb = re.sub(r"\s+", " ", fb)
    words = fb.split()
    if len(words) > 60:
        fb = " ".join(words[:60]) + " …"

    return f"Score: {s}\nFeedback: {fb}"

# Final cleanup for punctuation and grammar
def finalize_feedback(two_lines: str) -> str:
    lines = two_lines.splitlines()
    if len(lines) < 2:
        return two_lines

    fb = lines[1]
    fb = re.sub(r"\s+", " ", fb)
    m = list(re.finditer(r"[\.!\?。！？]", fb))
    if m:
        fb = fb[:m[-1].end()]
    fb = re.sub(r"(?:\b(or|and|but)\b|[,;:（\[{\-])\s*$", "", fb, flags=re.I)
    if not re.search(r"[\.!\?。！？]$", fb):
        fb = fb + "."

    return f"{lines[0]}\n{fb}"

# Construct final prompt (with rubric, question, and answer)
def build_prompt(question: str, rubric_summary: str, answer: str, mode: str = MODE) -> str:
    return (
        f"Mode: {mode}\n"
        f"Question: {question}\n"
        f"Rubric (summary): {rubric_summary}\n"
        f"Student Response: {answer}\n\n"
        + STRICT_HEADER
    )

# Single-example prediction
def predict_one(sample, rubric_summary: str):
    """
    sample: dict，至少包含 {"question":..., "answer":...}
    rubric_summary: 一句或两句rubric要点（建议精炼为 2-3 条规则）
    返回: {"raw":原始解码,"clean":两行化,"score":0/1/2}
    """
    prompt = build_prompt(sample["question"], rubric_summary, sample["answer"])

    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    with torch.no_grad():
        gen_ids = model.generate(**inputs, **GEN_KW)

    raw = tokenizer.decode(gen_ids[0], skip_special_tokens=True)
    clean = finalize_feedback(to_two_lines(raw))
    return {"raw": raw, "clean": clean, "score": parse_score(clean)}

# Example run
rubric_summary = "1) 必含要点A与B；2) 缺A或B→1分；3) 事实错误/跑题→0分。"
sample = {
    "question": "What are the advantages of using a lazy allocation policy in an OS?",
    "answer":   "It avoids allocating memory until it is actually needed, reducing waste."
}

out = predict_one(sample, rubric_summary)
print(out["clean"])
print("Parsed score:", out["score"])



Score: 1
Feedback: The response correctly identifies one advantage of lazy allocation, but fails to mention the other key advantage (e.g., reduced overhead or faster allocation). It also lacks a clear explanation of why this advantage is important in an OS context.
Parsed score: 1


In [63]:
import re

def parse_qra_from_prompt(p: str):
    """
    从一条 prompt 文本中抽取:
      - Question:  ...   在 'Rubric:' 之前
      - Rubric:    ...   在 'Student Response:' 之前
      - Answer:    ...   到文本结尾
    任一抽不到返回空串。
    """
    if not isinstance(p, str):
        return "", "", ""

    mq = re.search(r"Question\s*:\s*(.*?)(?:\n\s*Rubric\s*:)", p, flags=re.S|re.I)
    mr = re.search(r"Rubric\s*:\s*(.*?)(?:\n\s*Student\s*Response\s*:)", p, flags=re.S|re.I)
    ma = re.search(r"Student\s*Response\s*:\s*(.*)\Z", p, flags=re.S|re.I)

    q = mq.group(1).strip() if mq else ""
    r = mr.group(1).strip() if mr else ""
    a = ma.group(1).strip() if ma else ""

    # 折叠多余空白，避免超长&乱序
    def norm(s): return re.sub(r"\s+", " ", s)
    return norm(q), norm(r), norm(a)


In [66]:
import pandas as pd
from tqdm.auto import tqdm

# Neutral instruction header (lighter than STRICT_HEADER; avoids heavy constraints)
NEUTRAL_HEADER = (
    "You are a grading assistant.\n"
    "Output exactly two lines:\n"
    "Score: <0 or 1 or 2>\n"
    "Feedback: <one concise sentence>\n"
    "Begin output now:\n"
)

def build_prompt_full(q: str, r: str, a: str, mode: str = "Non-thinking"):
    """
    Compose the full prompt using Question + trimmed Rubric + Answer.
    Rubric is trimmed to avoid exceeding context length.
    """
    rubric_trim = r if len(rubric_text := r) < 1200 else rubric_text[:1200] + " …"
    return (
        f"[Mode: {mode}]\n"
        f"Question: {q}\n"
        f"Rubric: {rubric_trim}\n"
        f"Student Response: {a}\n\n"
        f"{NEUTRAL_HEADER}"
    )

def predict_one_from_prompt(p: str):
    """
    Accept a *stored* prompt string. Parse out (Question, Rubric, Answer),
    rebuild a clean prompt (to normalize headers), then run deterministic decode.
    """
    q, r, a = parse_qra_from_prompt(p)
    rubric_summary = r
    if not (q and r and a):
        return predict_one({"question": q or "", "answer": a or ""}, rubric_summary)

    prompt = build_prompt_full(q, r, a, mode=globals().get("MODE", "Non-thinking"))
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    with torch.no_grad():
        gen_ids = model.generate(
            **inputs,
            max_new_tokens=64,
            do_sample=False,
            temperature=0.0,
            top_p=1.0,
            eos_token_id=tokenizer.eos_token_id,
            pad_token_id=tokenizer.eos_token_id,
            early_stopping=True
        )

    raw = tokenizer.decode(gen_ids[0], skip_special_tokens=True)
    clean = finalize_feedback(to_two_lines(raw))  # your normalizers
    score = parse_score(clean)
    return {"prompt": p, "pred_text": clean, "pred_score": int(score)}

def batch_predict_like_single(
    input_csv="finetune_data_clean.csv",
    out_csv="pred_qwen3_like_single.csv",
    n=None
):
    """
    Batch scoring that mirrors the single-example path:
    - reads prompts from CSV (column 'prompt')
    - rebuilds normalized full prompt
    - runs deterministic decoding
    - writes pred_text/pred_score to CSV
    """
    df = pd.read_csv(input_csv)
    if n is not None:
        df = df.head(n)

    outs = []
    it = df["prompt"].tolist()
    for i, p in enumerate(tqdm(it, desc="Generating", total=len(it))):
        result = predict_one_from_prompt(p)
        if i < 3:
            print("— SAMPLE —")
            print(result["clean"], "\n", result["score"])
        outs.append({"prompt": p, "pred_text": result["clean"], "pred_score": int(result["score"])})

    out_df = pd.DataFrame(outs)
    out_df.to_csv(out_csv, index=False)
    print(f"Saved {len(out_df)} predictions → {out_csv}")
    print("Pred score distribution:", out_df["pred_score"].value_counts().sort_index().to_dict())

# Example: run first 5 only
batch_predict_like_single(n=5, out_csv="pred_qwen3_like_single_5.csv")
# Full run:
batch_predict_like_single(out_csv="pred_qwen3_like_single_all.csv")


Generating:   0%|          | 0/212 [00:00<?, ?it/s]

—— SAMPLE ——
Score: 2
Feedback: The student correctly identified that the narrow range around 0-360 degrees covers all primary colors (red, blue, yellow) and the broad ranges around 0.5 for I and S, which are likely the secondary colors (green and orange). 
score= 2
—— SAMPLE ——
Score: 1
Feedback: The student response partially addresses the key requirements by mentioning readability and global variables, but lacks clarity and misses essential details about the advantages of separating declaration from definition, such as modularity and abstraction separation. The explanation is vague and does not clearly connect the concepts to the benefits of code organization. 
score= 1
—— SAMPLE ——
Score: 2
Feedback: The user process does not have the capability to initiate a context switch because the scheduler is in kernel space and the kernel is responsible for scheduling processes. The user process cannot directly manipulate the scheduler to switch processes, as the scheduler is not accessible 

In [6]:
import pandas as pd
import re
from sklearn.metrics import accuracy_score, cohen_kappa_score
from sklearn.metrics import confusion_matrix

# Read gold and predicted results
gold = pd.read_csv("finetune_data_clean.csv")
pred = pd.read_csv("pred_qwen3_like_single_all.csv")
# Normalize prompt keys
def norm_prompt(s):
    if not isinstance(s, str): return ""
    s = s.strip()
    s = re.sub(r"\s+", " ", s)
    return s

gold["prompt_key"] = gold["prompt"].map(norm_prompt)
pred["prompt_key"] = pred["prompt"].map(norm_prompt)

# Extract numeric gold scores from completion text
def extract_score(s):
    m = re.search(r"Score\s*:\s*([012])", str(s))
    return int(m.group(1)) if m else None

gold["true_score"] = gold["completion"].map(extract_score)

# Merge gold and prediction tables using prompt_key
df = pd.merge(
    gold[["prompt_key","true_score"]],
    pred[["prompt_key","pred_score"]],
    on="prompt_key",
    how="inner"
).dropna()

# Prepare numeric arrays for evaluation
y_true = df["true_score"].astype(int).clip(0,2).values
y_pred = df["pred_score"].astype(int).clip(0,2).values

print("Aligned pairs:", len(df))
print("Label distribution (y_true):", pd.Series(y_true).value_counts().to_dict())
print("Pred  distribution (y_pred):", pd.Series(y_pred).value_counts().to_dict())


# Compute performance metrics
acc  = accuracy_score(y_true, y_pred)
kap  = cohen_kappa_score(y_true, y_pred)
qwk  = cohen_kappa_score(y_true, y_pred, weights="quadratic")

print(f"Accuracy     : {acc:.4f}")
print(f"Cohen's kappa: {kap:.4f}")
print(f"QWK          : {qwk:.4f}")
print(df.head(3))

confusion_matrix(y_true,y_pred)

Aligned pairs: 424
Label distribution (y_true): {2: 172, 1: 168, 0: 84}
Pred  distribution (y_pred): {2: 188, 1: 132, 0: 104}
Accuracy     : 0.4906
Cohen's kappa: 0.2141
QWK          : 0.2750
                                          prompt_key  true_score  pred_score
0  Question: To segment the rose petals [4 marks]...           2           2
1  Question: To segment the rose petals [4 marks]...           2           2
2  Question: Two advantages of separating declara...           1           1


array([[ 40,  12,  32],
       [ 44,  68,  56],
       [ 20,  52, 100]])