In [None]:
# Colab-only step: open the browser upload dialog and keep the returned mapping
# in `uploaded`. The recorded run uploaded train.csv and rubrics_simple_fixed.csv;
# later cells read them from /content.
from google.colab import files
uploaded = files.upload()

Saving rubrics_simple_fixed.csv to rubrics_simple_fixed.csv
Saving train.csv to train.csv


In [None]:

# Input/output file locations inside the Colab runtime and the Hugging Face model id.
TRAIN_PATH   = "/content/train.csv"
RUBRIC_PATH  = "/content/rubrics_simple_fixed.csv"
OUT_PATH     = "/content/scored_outputs_preview.csv"
MODEL_NAME   = "Qwen/Qwen3-1.7B"

# Upper bound on the number of merged rows sampled for the preview run.
SUBSET_N     = 24
BATCH_NO     = 1         # generation batch size for the no-thinking prompts
BATCH_TH     = 1        # generation batch size for the thinking prompts

!pip install bitsandbytes
import os, re, math, random, pandas as pd, numpy as np, torch
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForCausalLM
import bitsandbytes

def norm_q(x):
    """Return ``x`` converted to ``str``, stripped of surrounding whitespace, lower-cased.

    The result is used as the join key (``q_norm``) between the training
    questions and the rubric table.
    """
    as_text = str(x)
    trimmed = as_text.strip()
    return trimmed.lower()

# Load the data. Both input files must already exist in the runtime.
assert os.path.exists(TRAIN_PATH), f"Train file not found: {TRAIN_PATH}"
assert os.path.exists(RUBRIC_PATH), f"Rubric file not found: {RUBRIC_PATH}"
train_df   = pd.read_csv(TRAIN_PATH)
rubrics_df = pd.read_csv(RUBRIC_PATH)

# Normalize column names (lower-case names are used internally).
train = train_df.rename(columns={
    "Question":"question","Student Answer":"answer","output_label":"score"
}).copy()
# rename() silently ignores absent source columns, so fail here with a clear
# message instead of a bare KeyError further down.
missing_cols = {"question", "answer", "score"} - set(train.columns)
assert not missing_cols, f"Train file is missing columns: {sorted(missing_cols)}"
train["q_norm"] = train["question"].map(norm_q)

# Build the join key on the rubric side unless the file already ships one.
if "q_norm" not in rubrics_df.columns:
    rubrics_df["q_norm"] = rubrics_df["question"].map(norm_q)

# Keep only complete rubrics (all three score descriptions present).
rub_small = rubrics_df[["q_norm","score2","score1","score0"]].dropna()

# An inner merge emits one row per (answer, rubric) pair. If the rubric table
# held several rows for the same question, every student answer would be
# duplicated and the evaluation set inflated, so keep one rubric per question.
n_dup_rubrics = int(rub_small.duplicated("q_norm").sum())
if n_dup_rubrics:
    print(f"Warning: dropping {n_dup_rubrics} duplicate rubric row(s); keeping the first per question.")
    rub_small = rub_small.drop_duplicates("q_norm", keep="first")

merged = train.merge(rub_small, on="q_norm", how="inner")
print("可评分样本数(全量):", len(merged))
assert len(merged)>0, "合并后没有样本，请检查 question 和 rubric 是否对应。"

# Sample a small subset for the preview run. The draw is made reproducible by
# random_state; random.seed only seeds Python's own RNG for any later use.
random.seed(42)
preview = merged.sample(min(SUBSET_N, len(merged)), random_state=42).reset_index(drop=True)
print("本次预览条数:", len(preview))

# Load tokenizer and model (GPU, left padding).
# Imported here because the import cell does not bring BitsAndBytesConfig into scope.
from transformers import BitsAndBytesConfig

# Left padding keeps every prompt flush against the generated tokens, which is
# what decoder-only models need for batched generation.
# NOTE(review): trust_remote_code=True lets repository-supplied Python run at
# load time; confirm it is actually required for this model.
tokenizer = AutoTokenizer.from_pretrained(
    MODEL_NAME, padding_side="left", trust_remote_code=True
)
# Fall back to EOS as the padding token when the tokenizer defines none.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# The bare `load_in_4bit=True` keyword is deprecated in transformers; the
# supported way to request 4-bit bitsandbytes weights is a quantization config.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    device_map="auto",
    quantization_config=BitsAndBytesConfig(load_in_4bit=True),
    low_cpu_mem_usage=True,
    trust_remote_code=True,
)

# Keep the model config in sync with the tokenizer's padding token.
model.config.pad_token_id = tokenizer.pad_token_id
print("cuda available:", torch.cuda.is_available())

# Batched generation helper (the progress bar advances once per batch)
def batched_generate(msgs, batch_size=8, max_new_tokens=16,
                     do_sample=False, temperature=0.0, top_p=1.0,
                     input_max_len=1024):
    """Generate one completion per chat conversation, in mini-batches.

    Uses the module-level ``tokenizer`` and ``model``.

    Args:
        msgs: list of conversations; each conversation is a list of
            ``{"role": ..., "content": ...}`` dicts rendered with the
            tokenizer's chat template.
        batch_size: conversations per ``model.generate`` call (must be >= 1).
        max_new_tokens: generation budget per conversation.
        do_sample: sample instead of decoding greedily.
        temperature: sampling temperature; forwarded only when ``do_sample``.
        top_p: nucleus-sampling mass; forwarded only when ``do_sample``.
        input_max_len: prompts are truncated to this many tokens.

    Returns:
        list[str]: the newly generated text for each conversation, in input
        order. The prompt is NOT included, so callers that parse a score out
        of the text cannot pick up digits that belong to the instructions.

    Raises:
        ValueError: if ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    gen_kwargs = dict(
        max_new_tokens=max_new_tokens,
        do_sample=do_sample,
        pad_token_id=tokenizer.pad_token_id,
        eos_token_id=tokenizer.eos_token_id,
    )
    # Sampling knobs are meaningless for greedy decoding; passing them anyway
    # makes transformers warn that the flags are invalid and may be ignored.
    if do_sample:
        gen_kwargs.update(temperature=temperature, top_p=top_p)

    outs = []
    num_batches = math.ceil(len(msgs)/batch_size)
    for bi in tqdm(range(num_batches), desc="Generating (batches)"):
        batch = msgs[bi*batch_size:(bi+1)*batch_size]
        texts = [
            tokenizer.apply_chat_template(m, tokenize=False, add_generation_prompt=True)
            for m in batch
        ]
        enc = tokenizer(texts, return_tensors="pt", padding=True,
                        truncation=True, max_length=input_max_len).to(model.device)
        # Every row of the padded batch has the same width, so the generated
        # part of each returned sequence starts at this column.
        prompt_len = enc["input_ids"].shape[1]
        with torch.no_grad():
            out = model.generate(**enc, **gen_kwargs)
        # Decode only what the model produced, not the echoed prompt.
        outs.extend(tokenizer.batch_decode(out[:, prompt_len:], skip_special_tokens=True))
        if torch.cuda.is_available(): torch.cuda.empty_cache()
    return outs

# Scoring prompt templates. The {R2}/{R1}/{R0} slots receive the rubric text for
# scores 2/1/0; {QUESTION} and {RESP} receive the question and student answer.
# NO_THINK_PROMPT asks for the bare digit only.
NO_THINK_PROMPT = """Question: "{QUESTION}"

Rubric:
Score 2: {R2}
Score 1: {R1}
Score 0: {R0}

Student Response: "{RESP}"

Output only the final score as a single number: 0 or 1 or 2. No extra text."""

# THINK_PROMPT asks for step-by-step reasoning followed by a final
# "SCORE: <digit>" line, which is the form parse_score(..., thinking=True) looks for.
THINK_PROMPT = """Question: "{QUESTION}"

Rubric:
Score 2: {R2}
Score 1: {R1}
Score 0: {R0}

Student Response: "{RESP}"

Think step by step which rubric conditions are met.
On the last line output exactly:
SCORE: <0/1/2>"""
# Build one chat conversation per row (thinking or no-thinking template)
def build_scoring_msgs(df: pd.DataFrame, thinking: bool):
    """Turn each row of ``df`` into a two-message chat (system + user) for grading.

    Args:
        df: frame with ``question``, ``answer``, ``score2``, ``score1`` and
            ``score0`` columns.
        thinking: choose ``THINK_PROMPT`` when true, ``NO_THINK_PROMPT`` otherwise.

    Returns:
        A list with one conversation per row, in row order. Question and
        rubric texts are capped at 300 characters, the answer at 400, and each
        rubric text is cut at its first "." before being capped.
    """
    if thinking:
        template = THINK_PROMPT
    else:
        template = NO_THINK_PROMPT

    def clip(value, limit):
        # Stringify first so non-string cells (e.g. NaN) still format.
        return str(value)[:limit]

    def rubric_head(value):
        # Only the text before the first period is kept.
        return clip(str(value).split(".")[0], 300)

    conversations = []
    for _, record in df.iterrows():
        user_text = template.format(
            QUESTION=clip(record["question"], 300),
            R2=rubric_head(record["score2"]),
            R1=rubric_head(record["score1"]),
            R0=rubric_head(record["score0"]),
            RESP=clip(record["answer"], 400),
        )
        system_msg = {"role":"system","content":"You are a strict, deterministic grader."}
        user_msg = {"role":"user",  "content": user_text}
        conversations.append([system_msg, user_msg])
    return conversations

def parse_score(text: str, thinking: bool):
    """Extract the final 0/1/2 score from a model generation.

    Only the last 200 characters of ``text`` are inspected and the LAST
    candidate found wins.

    Args:
        text: generated text; anything that is not a ``str`` yields ``None``.
        thinking: when true, only digits introduced by ``SCORE:``
            (case-insensitive) count; otherwise any stand-alone 0, 1 or 2 does.

    Returns:
        The score as ``int``, or ``None`` when no candidate is present.
    """
    if not isinstance(text, str):
        return None
    tail = text[-200:]
    if thinking:
        pattern, flags = r"SCORE\s*:\s*([012])\b", re.I
    else:
        # Look-arounds instead of consuming the neighbouring characters: a
        # consumed separator cannot also open the next match, which made
        # candidates one character apart ("1 2", "0,1") lose the later digit.
        # (?<!\d\.) keeps the fractional digit of "2.0" from counting as a score.
        pattern, flags = r"(?<!\w)(?<!\d\.)([012])(?!\w)", 0
    hits = re.findall(pattern, tail, flags=flags)
    return int(hits[-1]) if hits else None

# Build both prompt variants for the same preview rows so the two modes are
# compared on identical inputs.
msgs_no = build_scoring_msgs(preview, thinking=False)
msgs_th = build_scoring_msgs(preview, thinking=True)
print("msgs_no:", len(msgs_no), "msgs_th:", len(msgs_th))
# Show the first 300 characters of the first no-thinking user prompt as a sanity check.
print("示例prompt(无思考)片段：\n", msgs_no[0][1]["content"][:300])

# Generation and parsing (greedy decoding in both modes)
# NOTE(review): the thinking run gets only 48 new tokens for its reasoning plus
# the final SCORE line; in the recorded run 8 of 24 thinking rows had no
# parseable score. Consider a larger budget.
pred_no = batched_generate(msgs_no, batch_size=BATCH_NO, max_new_tokens=8,
                           do_sample=False, temperature=0.0, top_p=1.0)
pred_th = batched_generate(msgs_th, batch_size=BATCH_TH, max_new_tokens=48,
                           do_sample=False, temperature=0.0, top_p=1.0)

# parse_score returns None when no score can be found in a generation.
scores_no = [parse_score(t, False) for t in pred_no]
scores_th = [parse_score(t, True ) for t in pred_th]

# Evaluation and export
out_df = preview.copy()
out_df["pred_no_think"] = scores_no
out_df["pred_think"]    = scores_th

# Accuracy over ALL preview rows: a missing prediction never equals the gold
# score, so unparsed rows count as wrong here.
acc_no = (out_df["pred_no_think"]==out_df["score"]).mean()
acc_th = (out_df["pred_think"]   ==out_df["score"]).mean()
print(f"Acc (no thinking) = {acc_no:.3f} | Acc (thinking) = {acc_th:.3f} | Δ = {acc_th-acc_no:+.3f}")

# Wide table: one row per sampled answer with both predictions.
out_df.to_csv(OUT_PATH, index=False)
print("Saved preview scores ->", OUT_PATH)


Collecting bitsandbytes
  Downloading bitsandbytes-0.48.1-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Downloading bitsandbytes-0.48.1-py3-none-manylinux_2_24_x86_64.whl (60.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.1/60.1 MB[0m [31m13.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bitsandbytes
Successfully installed bitsandbytes-0.48.1
可评分样本数(全量): 3662
本次预览条数: 24


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/726 [00:00<?, ?B/s]

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/622M [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/3.44G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

cuda available: True
msgs_no: 24 msgs_th: 24
示例prompt(无思考)片段：
 Question: "To segment the rose petals [4 marks]:      "

Rubric:
Score 2: Clearly and specifically addresses the key requirement by covering narrow range around 0 360 degress, all primary colors pure colors, and broad range around 0
Score 1: Covers some key elements (at least 2 of: narrow range arou


Generating (batches):   0%|          | 0/24 [00:00<?, ?it/s]The following generation flags are not valid and may be ignored: ['temperature', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Generating (batches): 100%|██████████| 24/24 [00:16<00:00,  1.42it/s]
Generating (batches): 100%|██████████| 24/24 [00:43<00:00,  1.80s/it]

Acc (no thinking) = 0.333 | Acc (thinking) = 0.125 | Δ = -0.208
Saved preview scores -> /content/scored_outputs_preview.csv





In [None]:

import pandas as pd
from datetime import datetime, timezone

# Keep the raw generations next to the parsed scores.
out_df["gen_no"] = pred_no
out_df["gen_th"] = pred_th

# Re-save the wide table (one row per answer, both predictions) now that it
# also carries the generated text.
out_df.to_csv(OUT_PATH, index=False)

# Experiment metadata repeated on every long-format row.
meta = {
    "model_name": MODEL_NAME,
    "subset_n": SUBSET_N,
    "batch_no": BATCH_NO,
    "batch_th": BATCH_TH,
    "temperature": 0.0,
    "top_p": 1.0,
    # datetime.utcnow() is deprecated; an aware UTC timestamp formatted this
    # way yields the same "YYYY-MM-DDTHH:MM:SSZ" text as before.
    "ts_utc": datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ"),
}

# Columns shared by both modes in the long table.
_LONG_BASE_COLS = ["question","answer","score","score2","score1","score0","q_norm"]


def _rows_for_mode(wide, pred_col, gen_col, mode, cot_enabled):
    """Return one mode's slice of ``wide`` with unified ``pred``/``gen_text`` columns."""
    return (
        wide[_LONG_BASE_COLS + [pred_col, gen_col]]
        .rename(columns={pred_col: "pred", gen_col: "gen_text"})
        .assign(mode=mode, cot_enabled=cot_enabled)
    )


# Long table: two rows per answer, one per prompting mode.
long_no = _rows_for_mode(out_df, "pred_no_think", "gen_no", "no_think", False)
long_th = _rows_for_mode(out_df, "pred_think", "gen_th", "think", True)

long_df = pd.concat([long_no, long_th], ignore_index=True)

# Attach the shared experiment metadata columns.
for k, v in meta.items():
    long_df[k] = v

OUT_PATH_LONG = OUT_PATH.replace(".csv", "_long.csv")
long_df.to_csv(OUT_PATH_LONG, index=False)
print(" Saved long-format with mode flag ->", OUT_PATH_LONG)


 Saved long-format with mode flag -> /content/scored_outputs_preview_long.csv


  "ts_utc": datetime.utcnow().isoformat(timespec="seconds") + "Z",


In [None]:
import pandas as pd
from sklearn.metrics import cohen_kappa_score
import numpy as np

df = pd.read_csv("/content/scored_outputs_preview_long.csv")
assert {"score","pred","mode"}.issubset(df.columns)

# Make sure both columns are numeric; unparseable cells become NaN.
df["score"] = pd.to_numeric(df["score"], errors="coerce")
df["pred"]  = pd.to_numeric(df["pred"],  errors="coerce")

rows = []
diag = []  # diagnostics: number of missing predictions per mode

for mode, g in df.groupby("mode"):
    n_total = len(g)
    n_valid = int(g["pred"].notna().sum())
    snb = n_valid / n_total if n_total else np.nan  # Score Non-Blank

    # The remaining metrics use only rows that have a prediction. Rows whose
    # gold score is missing are skipped too, since astype(int) would raise on NaN.
    gg = g[g["pred"].notna() & g["score"].notna()]

    acc = uwk = snd = np.nan
    if len(gg) > 0:
        y_true = gg["score"].astype(int)
        y_pred = gg["pred"].astype(int)

        acc = (y_pred == y_true).mean()
        # Share of predictions within one point of the gold score.
        snd = (np.abs(y_pred - y_true) <= 1).mean()

        # Quadratic weights are derived from label POSITIONS. Without an
        # explicit label list, a subset where some score never occurs
        # (e.g. only 0 and 2) is re-indexed and 0-vs-2 is weighted like
        # 0-vs-1. Fix the scale to a contiguous range covering at least 0..2.
        lo = min(0, int(y_true.min()), int(y_pred.min()))
        hi = max(2, int(y_true.max()), int(y_pred.max()))
        # NOTE(review): the column is named "UWK" but the kappa is
        # quadratic-weighted; confirm which of the two is intended.
        uwk = cohen_kappa_score(y_true, y_pred,
                                labels=list(range(lo, hi + 1)),
                                weights="quadratic")

    rows.append({
        "mode": mode,
        "n_total": n_total,
        "n_valid": n_valid,
        "SNB": snb,
        "Accuracy": acc,
        "UWK": uwk,
        "SND": snd,
    })

    # Diagnostics: how many rows of this mode have no prediction.
    diag.append({
        "mode": mode,
        "nan_in_pred": n_total - n_valid,
    })

res_df = pd.DataFrame(rows).sort_values("mode")
diag_df = pd.DataFrame(diag).sort_values("mode")
print("Metrics summary:")
display(res_df)
print("\nDiagnostics (NaN counts):")
display(diag_df)



Metrics summary:


Unnamed: 0,mode,n_total,n_valid,SNB,Accuracy,UWK,SND
0,no_think,24,24,1.0,0.333333,-0.081081,0.666667
1,think,24,16,0.666667,0.1875,-0.037037,0.6875



Diagnostics (NaN counts):


Unnamed: 0,mode,nan_in_pred
0,no_think,0
1,think,8
