In [19]:
!pip install -q sentence-transformers transformers datasets accelerate evaluate scikit-learn

In [20]:
TRAIN_CE = True

MODEL_NAME_CE = "microsoft/deberta-v3-small"

# Training
CE_EPOCHS = 10
CE_BATCH_SIZE = 16
GRAD_ACCUM_STEPS = 4
LEARNING_RATE = 2e-5
WEIGHT_DECAY = 0.01

# Data
SAMPLE_LIMIT = None
N_SYNTHETIC = 500

# Output & reproducibility
CE_OUTPUT_DIR = "cross_encoder_best"
RANDOM_STATE = 42

# Trainer / perf
FP16 = True
SAVE_TOTAL_LIMIT = 2
EVAL_EVERY_N_EPOCHS = 1


USE_WANDB = False


In [21]:
from google.colab import files
import pandas as pd, numpy as np, re, os, json, warnings, math
warnings.filterwarnings("ignore")
from pathlib import Path
import torch

In [22]:
print("Upload your dataset (CSV/TSV/XLSX).")
uploaded = files.upload()
if len(uploaded) == 0:
    raise RuntimeError("No file uploaded.")
uploaded_filename = list(uploaded.keys())[0]
print("Uploaded:", uploaded_filename)

def load_dataset(path):
    try:
        if path.lower().endswith(".tsv"):
            return pd.read_csv(path, sep="\t", engine="python", encoding="utf-8")
        if path.lower().endswith(".xlsx") or path.lower().endswith(".xls"):
            return pd.read_excel(path)
        return pd.read_csv(path, engine="python", encoding="utf-8")
    except Exception:
        try:
            return pd.read_csv(path, engine="python", encoding="latin-1")
        except Exception:
            return pd.read_csv(path, engine="python", sep=None, encoding="latin-1")

df = load_dataset(uploaded_filename)
print("Loaded shape:", df.shape)
print("Columns:", df.columns.tolist())

Upload your dataset (CSV/TSV/XLSX).


Saving Mohler_Mihalcea_dataset.csv to Mohler_Mihalcea_dataset.csv
Saving scientsbank_train.csv to scientsbank_train.csv
Saving STS_Benchmark_dataset.csv to STS_Benchmark_dataset.csv
Saving training_set_rel3.tsv to training_set_rel3.tsv
Uploaded: Mohler_Mihalcea_dataset.csv
Loaded shape: (2273, 7)
Columns: ['id', 'question', 'instructor_answer', 'student_answer', 'score_grader_1', 'score_grader_2', 'score_avg']


In [23]:
cols = df.columns.tolist()
cols_lower = [c.lower() for c in cols]

def pick_first(candidates):
    for cand in candidates:
        if cand.lower() in cols_lower:
            return cols[cols_lower.index(cand.lower())]
    return None

student_candidates   = ["student_answer","studentresponse","student_response","answer","answer_text","student","response"]
reference_candidates = ["instructor_answer","reference_answer","reference","model_answer","ideal_answer","correct_answer","answer_key"]
label_candidates     = ["score_avg","score_mean","score","label","grade","domain1_score","domain_score","score_grader_1","score_grader_2"]

student_col = pick_first(student_candidates)
ref_col     = pick_first(reference_candidates)
label_col   = pick_first(label_candidates)

# average graders if needed
if label_col is None:
    g1 = pick_first(["score_grader_1","grader1_score","grader_1"])
    g2 = pick_first(["score_grader_2","grader2_score","grader_2"])
    if g1 is not None and g2 is not None:
        df["score_avg"] = pd.to_numeric(df[g1], errors="coerce") + pd.to_numeric(df[g2], errors="coerce")
        df["score_avg"] = df["score_avg"] / 2.0
        label_col = "score_avg"

print("Auto-detected -> student:", student_col, "| reference:", ref_col, "| label:", label_col)
if student_col is None:
    raise RuntimeError("Could not auto-detect student-answer column.")
if label_col is None:
    raise RuntimeError("Could not auto-detect score/label column (or graders to average).")

# build canonical df_small
df_small = df[[student_col, label_col] + ([ref_col] if ref_col is not None else [])].copy()
df_small = df_small.rename(columns={student_col:"student_answer", label_col:"score"})
if ref_col is not None:
    df_small = df_small.rename(columns={ref_col:"reference_answer"})
else:
    df_small["reference_answer"] = ""

# clean & numeric
df_small = df_small.dropna(subset=["student_answer","score"])
df_small["score"] = pd.to_numeric(df_small["score"], errors="coerce")
df_small = df_small.dropna(subset=["score"]).reset_index(drop=True)
print("Prepared data:", df_small.shape)
print(df_small["score"].describe())

# optional small-sample for quick runs
if SAMPLE_LIMIT is not None and SAMPLE_LIMIT > 0 and len(df_small) > SAMPLE_LIMIT:
    df_small = df_small.sample(SAMPLE_LIMIT, random_state=RANDOM_STATE).reset_index(drop=True)
    print(f"Using SAMPLE_LIMIT={SAMPLE_LIMIT} rows for quick runs. New shape:", df_small.shape)


Auto-detected -> student: student_answer | reference: instructor_answer | label: score_avg
Prepared data: (2273, 3)
count    2273.000000
mean        4.183898
std         1.096596
min         0.000000
25%         3.500000
50%         4.500000
75%         5.000000
max         5.000000
Name: score, dtype: float64


In [24]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error

def clean_text(s):
    return re.sub(r"\s+"," ", str(s).strip()) if pd.notna(s) else ""

df_small["student_answer"] = df_small["student_answer"].apply(clean_text)
df_small["reference_answer"] = df_small["reference_answer"].apply(clean_text)

y = df_small["score"].values
# stratify on limited label sets
unique_scores = np.unique(y)
stratify = y.astype(int) if unique_scores.size <= 10 else None
train_df, val_df = train_test_split(df_small, test_size=0.15, random_state=RANDOM_STATE, stratify=stratify)
print("Train / Val:", len(train_df), len(val_df))

tfidf = TfidfVectorizer(max_features=10000, ngram_range=(1,2))
tfidf.fit(pd.concat([train_df["student_answer"], train_df["reference_answer"]]).astype(str))

X_train = tfidf.transform(train_df["student_answer"].values)
X_val = tfidf.transform(val_df["student_answer"].values)

reg = Ridge(alpha=1.0).fit(X_train, train_df["score"].values)
val_pred_reg = reg.predict(X_val)

data_min, data_max = float(df_small["score"].min()), float(df_small["score"].max())
print("Score range:", data_min, "->", data_max)


Train / Val: 1932 341
Score range: 0.0 -> 5.0


In [25]:
import numpy as np, pandas as _pd
USE_SBERT = False
sbert_model = None
try:
    from sentence_transformers import SentenceTransformer, util
    try:
        sbert_model = SentenceTransformer("all-mpnet-base-v2")
        USE_SBERT = True
        print(" SBERT loaded")
    except Exception as e:
        print(" SBERT load error; fallback to TF-IDF. Err:", e)
        sbert_model = None
except Exception as e:
    print(" sentence-transformers not installed; fallback to TF-IDF. Err:", e)
    sbert_model = None
    USE_SBERT = False

def compute_similarity_array(students, references, batch_size=64):
    students = ["" if _pd.isna(s) else str(s) for s in students]
    references = ["" if _pd.isna(r) else str(r) for r in references]
    if len(students) != len(references):
        raise ValueError("students and references must have same length")
    if all((r.strip() == "") for r in references):
        sims_raw = np.zeros(len(students), dtype=float)
        return sims_raw.copy(), sims_raw
    if USE_SBERT and sbert_model is not None:
        sims_parts = []
        for i in range(0, len(students), batch_size):
            s_batch = students[i:i+batch_size]
            r_batch = references[i:i+batch_size]
            s_emb = sbert_model.encode(s_batch, convert_to_tensor=True, show_progress_bar=False)
            r_emb = sbert_model.encode(r_batch, convert_to_tensor=True, show_progress_bar=False)
            cos_mat = util.cos_sim(s_emb, r_emb).cpu().numpy()
            sims_parts.append(np.diag(cos_mat))
        sims_raw = np.concatenate(sims_parts, axis=0)
        sims01 = (sims_raw + 1.0) / 2.0
        return sims01, sims_raw
    # TF-IDF fallback (vectorized)
    vecs_s = tfidf.transform(students)
    vecs_r = tfidf.transform(references)
    dot = vecs_s.multiply(vecs_r).sum(axis=1).A1
    s_norm = np.sqrt(vecs_s.multiply(vecs_s).sum(axis=1).A1)
    r_norm = np.sqrt(vecs_r.multiply(vecs_r).sum(axis=1).A1)
    denom = s_norm * r_norm
    sims = np.zeros_like(dot, dtype=float)
    nonzero = denom > 0
    sims[nonzero] = dot[nonzero] / denom[nonzero]
    sims = np.clip(sims, 0.0, 1.0)
    return sims, sims


 SBERT loaded


In [26]:
from sklearn.isotonic import IsotonicRegression
# --- Compute similarity arrays ---
train_sims01, train_sims_raw = compute_similarity_array(train_df["student_answer"], train_df["reference_answer"])
val_sims01, val_sims_raw     = compute_similarity_array(val_df["student_answer"], val_df["reference_answer"])

# --- Enhance contrast of similarity (sharpens distinction between good/bad answers) ---
def enhance_contrast(score_array, power=1.8):
    """Boost contrast: makes low sims lower and high sims higher."""
    return np.clip(np.power(score_array, power), 0, 1)

train_sims01 = enhance_contrast(train_sims01)
val_sims01   = enhance_contrast(val_sims01)


if data_max != data_min:
    train_score_norm = (train_df["score"].values - data_min) / (data_max - data_min)
else:
    train_score_norm = np.zeros(len(train_df))

iso = IsotonicRegression(out_of_bounds="clip")
iso.fit(train_sims01, train_score_norm)
val_calib_norm = iso.predict(val_sims01)
val_calib_score = val_calib_norm * (data_max - data_min) + data_min

from sklearn.metrics import mean_squared_error
def qwk_numpy(y_true, y_pred, min_rating=None, max_rating=None):
    import numpy as _np
    if min_rating is None: min_rating = int(_np.min(y_true))
    if max_rating is None: max_rating = int(_np.max(y_true))
    y_true = _np.asarray(y_true, dtype=int)
    y_pred_rounded = _np.asarray(_np.round(y_pred), dtype=int)
    y_pred_rounded = _np.clip(y_pred_rounded, min_rating, max_rating)
    N = len(y_true)
    num_ratings = max_rating - min_rating + 1
    conf_mat = _np.zeros((num_ratings, num_ratings), dtype=float)
    for a, p in zip(y_true, y_pred_rounded):
        conf_mat[a - min_rating, p - min_rating] += 1
    hist_true = conf_mat.sum(axis=1)
    hist_pred = conf_mat.sum(axis=0)
    expected = _np.outer(hist_true, hist_pred) / N
    W = _np.zeros((num_ratings, num_ratings))
    for i in range(num_ratings):
        for j in range(num_ratings):
            W[i, j] = ((i - j) ** 2) / float((num_ratings - 1) ** 2)
    O = conf_mat / N
    E = expected / N
    num = (W * O).sum()
    den = (W * E).sum()
    return 1.0 - num / den if den != 0 else 1.0

mse_reg = mean_squared_error(val_df["score"].values, val_pred_reg)
mse_sim = mean_squared_error(val_df["score"].values, val_calib_score)
qwk_reg = qwk_numpy(val_df["score"].values, val_pred_reg, min_rating=int(data_min), max_rating=int(data_max))
qwk_sim = qwk_numpy(val_df["score"].values, val_calib_score, min_rating=int(data_min), max_rating=int(data_max))

print("Ridge baseline -> MSE:", mse_reg, "QWK:", qwk_reg)
print("Calibrated sim -> MSE:", mse_sim, "QWK:", qwk_sim)


Ridge baseline -> MSE: 0.8403068007440956 QWK: 0.40498572136056676
Calibrated sim -> MSE: 0.9573377929838561 QWK: 0.25363718295352145


In [27]:
def normalize_arr(arr, amin, amax):
    if amax == amin:
        return np.zeros_like(arr, dtype=float) + 0.5
    return (arr - amin) / (amax - amin)

val_reg_norm = normalize_arr(val_pred_reg, data_min, data_max)
val_sim_norm = normalize_arr(val_calib_score, data_min, data_max)

best = None
for w in np.linspace(0,1,11):
    final_norm = val_reg_norm * w + val_sim_norm * (1 - w)
    final_score = final_norm * (data_max - data_min) + data_min
    q = qwk_numpy(val_df["score"].values, final_score, min_rating=int(data_min), max_rating=int(data_max))
    if best is None or q > best[2]:
        best = (w, 1-w, q)
print("Best mix (reg_weight, sim_weight, qwk):", best)

def compute_final_score(reference, student, weights=None, explicit_range=None):
    a,b = (data_min, data_max) if explicit_range is None else explicit_range
    reg_score = float(reg.predict(tfidf.transform([student]))[0])
    sim01, simraw = compute_similarity_array([student], [reference])
    sim_norm = float(iso.predict([sim01[0]])[0])
    sim_score = sim_norm * (b - a) + a
    if weights is None:
        w_reg, w_sim = best[0], best[1]
    else:
        w_reg, w_sim = float(weights.get('reg', best[0])), float(weights.get('sim', best[1]))
    reg_n = (reg_score - a)/(b - a) if b!=a else 0.5
    sim_n = (sim_score - a)/(b - a) if b!=a else 0.5
    total = w_reg + w_sim if (w_reg + w_sim) > 0 else 1.0
    final_norm = (reg_n*w_reg + sim_n*w_sim) / total
    final_score = final_norm*(b - a) + a
    final_round = int(round(final_score))
    details = {"reg_score":reg_score, "sim_score":sim_score, "sim_raw": float(simraw[0]), "reg_norm":reg_n, "sim_norm":sim_n, "weights":{"reg":w_reg,"sim":w_sim}, "final_norm":final_norm}
    return float(final_score), final_round, details

Best mix (reg_weight, sim_weight, qwk): (np.float64(1.0), np.float64(0.0), np.float64(0.40498572136056676))


In [28]:
best = (0.05, 0.95, best[2])
print(f"Adjusted ensemble weights -> Ridge: {best[0]}, Similarity: {best[1]}")


Adjusted ensemble weights -> Ridge: 0.05, Similarity: 0.95


In [29]:
# -------------------- Robust Cross-Encoder --------------------
if TRAIN_CE:

    import os, inspect, math
    os.environ["WANDB_DISABLED"] = "true"
    os.environ["WANDB_MODE"] = "disabled"

    from transformers import (
        AutoTokenizer, AutoModelForSequenceClassification,
        TrainingArguments, Trainer, DataCollatorWithPadding, AutoConfig
    )
    from datasets import Dataset, DatasetDict
    import numpy as np
    import pandas as pd

    # 1) Normalize labels
    df_small["score_norm"] = (df_small["score"] - data_min) / (data_max - data_min)
    df_small["score_norm"] = df_small["score_norm"].clip(0.0, 1.0)

    # 2) Build df_ce and add synthetic perfect / bad examples to teach extremes
    have_ref = not df_small['reference_answer'].astype(str).str.strip().eq("").all()
    if have_ref:
        df_ce = df_small[['reference_answer','student_answer','score_norm']].rename(
            columns={'reference_answer':'reference','student_answer':'student','score_norm':'score'})
    else:
        if 'question' in df_small.columns:
            df_ce = df_small[['question','student_answer','score_norm']].rename(
                columns={'question':'reference','student_answer':'student','score_norm':'score'})
        else:
            df_ce = df_small.rename(columns={'student_answer':'student'})
            df_ce['reference'] = ""
            df_ce['score'] = df_small['score_norm']
            df_ce = df_ce[['reference','student','score']]

    df_ce = df_ce.dropna(subset=['student','score']).reset_index(drop=True)

    # create synthetic extremes
    n_synth = min(N_SYNTHETIC, len(df_ce))
    refs = df_ce['reference'].astype(str).tolist()[:n_synth]
    if len(refs) == 0:
        refs = df_ce['student'].astype(str).tolist()[:n_synth]  # fallback

    synth_perfect = pd.DataFrame({'reference': refs, 'student': refs, 'score': [1.0]*len(refs)})
    synth_bad     = pd.DataFrame({'reference': refs, 'student': [""]*len(refs), 'score': [0.0]*len(refs)})

    df_ce = pd.concat([df_ce, synth_perfect, synth_bad], ignore_index=True).sample(frac=1, random_state=RANDOM_STATE).reset_index(drop=True)
    print("Prepared CE dataset rows (with synthetic extremes):", len(df_ce))

    # 3) Convert to HF Dataset and split
    ds = Dataset.from_pandas(df_ce)
    ds = ds.train_test_split(test_size=0.15, seed=RANDOM_STATE)
    dataset_ce = DatasetDict({"train": ds['train'], "validation": ds['test']})

    # 4) Tokenizer & preprocessing
    tokenizer_ce = AutoTokenizer.from_pretrained(MODEL_NAME_CE)

    def preprocess_fn(examples):
        tokenized = tokenizer_ce(
            examples["reference"], examples["student"],
            truncation=True, padding=False, max_length=256
        )
        tokenized["score"] = examples["score"]
        return tokenized

    dataset_ce = dataset_ce.map(preprocess_fn, batched=True)

    # 5) Add labels field (float)
    def add_label(examples):
        examples["labels"] = [float(x) for x in examples["score"]]
        return examples

    dataset_ce = dataset_ce.map(add_label, batched=True)

    # 6) Model and collator
    config = AutoConfig.from_pretrained(MODEL_NAME_CE, problem_type="regression", num_labels=1)
    model_ce = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME_CE, config=config)
    data_collator = DataCollatorWithPadding(tokenizer_ce)

    # 7) Metrics: convert preds/labels back to original scale before scoring
    def compute_metrics(eval_pred):
        preds, labels = eval_pred
        preds = np.asarray(preds).reshape(-1)
        labels = np.asarray(labels).reshape(-1).astype(float)
        # preds and labels are in [0,1] (normalized). Map back to original score range
        preds_orig = preds * (data_max - data_min) + data_min
        labels_orig = labels * (data_max - data_min) + data_min
        mse = float(((preds_orig - labels_orig) ** 2).mean())
        preds_orig = np.atleast_1d(preds_orig)
        q = float(qwk_numpy(labels_orig, preds_orig, min_rating=int(data_min), max_rating=int(data_max)))
        return {"mse": mse, "qwk": q}

    # 8) TrainingArguments — compatibility-safe and W&B disabled
    train_examples = len(dataset_ce["train"])
    batch = CE_BATCH_SIZE
    grad = GRAD_ACCUM_STEPS if 'GRAD_ACCUM_STEPS' in globals() else 1

    steps_per_epoch = math.ceil(train_examples / batch)
    updates_per_epoch = math.ceil(steps_per_epoch / grad)
    total_updates = CE_EPOCHS * updates_per_epoch

    # Build kwargs
    kwargs = dict(
        output_dir=CE_OUTPUT_DIR,
        per_device_train_batch_size=CE_BATCH_SIZE,
        per_device_eval_batch_size=CE_BATCH_SIZE,
        gradient_accumulation_steps=GRAD_ACCUM_STEPS,
        learning_rate=LEARNING_RATE,
        num_train_epochs=CE_EPOCHS,
        max_steps = int(total_updates),
        weight_decay=WEIGHT_DECAY,
        logging_steps=50,
        save_total_limit=SAVE_TOTAL_LIMIT,
        load_best_model_at_end=True,
        metric_for_best_model="qwk",
        fp16=FP16
    )

    sig = inspect.signature(TrainingArguments.__init__)
    argnames = list(sig.parameters.keys())
    if "evaluation_strategy" in argnames:
        kwargs["evaluation_strategy"] = "epoch"
        kwargs["save_strategy"] = "epoch"
    elif "eval_strategy" in argnames:
        kwargs["eval_strategy"] = "epoch"
        kwargs["save_strategy"] = "epoch"
    else:
        kwargs["do_eval"] = True

    if "report_to" in argnames:
        kwargs["report_to"] = "none"
    if "metric_for_best_model" in argnames:
        kwargs["metric_for_best_model"] = "qwk"; kwargs["greater_is_better"] = True

    training_args = TrainingArguments(**kwargs)
    print("Enforced total optimizer updates (max_steps):", training_args.max_steps)

    # 9) Trainer and training
    trainer = Trainer(
        model=model_ce,
        args=training_args,
        train_dataset=dataset_ce["train"],
        eval_dataset=dataset_ce["validation"],
        tokenizer=tokenizer_ce,
        data_collator=data_collator,
        compute_metrics=compute_metrics
    )

    print("Starting cross-encoder training (normalized labels, synthetic extremes)...")
    trainer.train()
    trainer.save_model(CE_OUTPUT_DIR)
    tokenizer_ce.save_pretrained(CE_OUTPUT_DIR)

    # 10) Load CE for inference and provide a helper that returns original-scale score
    model_ce = AutoModelForSequenceClassification.from_pretrained(CE_OUTPUT_DIR)
    tokenizer_ce = AutoTokenizer.from_pretrained(CE_OUTPUT_DIR)
    model_ce.to("cuda" if torch.cuda.is_available() else "cpu")

    def cross_encoder_predict(reference, student):
        inputs = tokenizer_ce(reference, student, return_tensors="pt", truncation=True, padding=True, max_length=256)
        device = model_ce.device
        inputs = {k: v.to(device) for k, v in inputs.items()}
        model_ce.eval()
        with torch.no_grad():
            out = model_ce(**inputs)
            pred = out.logits.squeeze().cpu().numpy()
            if pred.ndim == 0:
                pred = pred.reshape(1)
        pred = np.clip(pred, 0.0, 1.0)
        pred_orig = pred * (data_max - data_min) + data_min
        return float(pred_orig[0]) if pred_orig.size == 1 else pred_orig.tolist()


    # 11) Quick validation check
    preds, labs = [], []
    val_ce_df = dataset_ce["validation"].to_pandas()
    for i in range(min(200, len(val_ce_df))):
        r = str(val_ce_df.iloc[i]["reference"])
        s = str(val_ce_df.iloc[i]["student"])
        preds.append(cross_encoder_predict(r, s))
        labs.append(float(val_ce_df.iloc[i]["score"] * (data_max - data_min) + data_min))  # val_ce's score was normalized
    preds, labs = np.array(preds), np.array(labs)
    print(f" Cross-encoder sample eval — MSE: {np.mean((preds-labs)**2):.4f} | QWK: {qwk_numpy(labs, preds):.4f}")

else:
    print(" Cross-encoder training skipped (TRAIN_CE=False).")

Prepared CE dataset rows (with synthetic extremes): 3273


tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/578 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

Map:   0%|          | 0/2782 [00:00<?, ? examples/s]

Map:   0%|          | 0/491 [00:00<?, ? examples/s]

Map:   0%|          | 0/2782 [00:00<?, ? examples/s]

Map:   0%|          | 0/491 [00:00<?, ? examples/s]

pytorch_model.bin:   0%|          | 0.00/286M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/286M [00:00<?, ?B/s]

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-small and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Enforced total optimizer updates (max_steps): 440


The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'eos_token_id': 2, 'bos_token_id': 1}.


Starting cross-encoder training (normalized labels, synthetic extremes)...


Epoch,Training Loss,Validation Loss,Mse,Qwk
1,No log,0.038692,0.967294,0.764763
2,0.161800,0.03469,0.867262,0.848483
3,0.035500,0.022484,0.562102,0.889394
4,0.023800,0.019203,0.480083,0.892122
5,0.018700,0.017546,0.438652,0.907353
6,0.015900,0.02387,0.596745,0.885768
7,0.014900,0.024983,0.624576,0.882828
8,0.013100,0.015841,0.396026,0.911922
9,0.013100,0.019683,0.492082,0.890971
10,0.011300,0.018449,0.461227,0.902932


 Cross-encoder sample eval — MSE: 0.3431 | QWK: 0.9210


In [30]:
# =============================================
#  Final Grading Examples (Pretty Output)
# =============================================

pairs = [
    ("Water boils at 100 degrees Celsius under standard atmospheric pressure.", "abcd."),
    ("Water boils at 100 degrees Celsius under standard atmospheric pressure.", "Water boils at 100 degrees Celsius under standard atmospheric pressure."),
    ("The capital of France is Paris.", "Paris is the capital of France."),
    ("Photosynthesis converts light energy into chemical energy.", "Photosynthesis is when plants make food using sunlight.")
]

def scale_to_10(score, data_min, data_max):
    """Scale raw model score to 0–10."""
    if data_max == data_min:
        return 0.0
    scaled = (score - data_min) / (data_max - data_min) * 10
    return max(0.0, min(10.0, scaled))

print("\n==============================")
print(" FINAL GRADING EXAMPLES (OUT OF 10)")
print("==============================")

for i, (ref, stu) in enumerate(pairs, start=1):
    # Ensemble prediction
    ensemble_score, _, _ = compute_final_score(ref, stu)
    final_ensemble_10 = scale_to_10(ensemble_score, data_min, data_max)

    # Cross-Encoder prediction if available
    ce_score_10 = None
    if TRAIN_CE and 'cross_encoder_predict' in globals():
        try:
            ce_pred = cross_encoder_predict(ref, stu)
            ce_pred_clamped = max(data_min, min(data_max, float(ce_pred)))
            ce_score_10 = scale_to_10(ce_pred_clamped, data_min, data_max)
        except Exception as e:
            ce_score_10 = None

    print(f"\n Example {i}")
    print(" Teacher Statement :", ref)
    print(" Student Statement :", stu)
    print(f" Ensemble Final Score : {final_ensemble_10:.2f}/10")
    if ce_score_10 is not None:
        print(f" Cross-Encoder Final Score : {ce_score_10:.2f}/10")

print("\n Done! All scores shown out of 10.")



 FINAL GRADING EXAMPLES (OUT OF 10)

 Example 1
 Teacher Statement : Water boils at 100 degrees Celsius under standard atmospheric pressure.
 Student Statement : abcd.
 Ensemble Final Score : 7.25/10
 Cross-Encoder Final Score : 2.14/10

 Example 2
 Teacher Statement : Water boils at 100 degrees Celsius under standard atmospheric pressure.
 Student Statement : Water boils at 100 degrees Celsius under standard atmospheric pressure.
 Ensemble Final Score : 9.80/10
 Cross-Encoder Final Score : 10.00/10

 Example 3
 Teacher Statement : The capital of France is Paris.
 Student Statement : Paris is the capital of France.
 Ensemble Final Score : 9.79/10
 Cross-Encoder Final Score : 9.33/10

 Example 4
 Teacher Statement : Photosynthesis converts light energy into chemical energy.
 Student Statement : Photosynthesis is when plants make food using sunlight.
 Ensemble Final Score : 9.37/10
 Cross-Encoder Final Score : 6.26/10

 Done! All scores shown out of 10.


In [31]:
# 1) force similarity-heavy ensemble
best = (0.01, 0.99, best[2])
print("Forced ensemble weights:", best)

# 2) gibberish detector
def gibberish_score_hint(text):
    if text is None: return True
    s = str(text).strip()
    if len(s) == 0: return True
    words = re.findall(r"[A-Za-z0-9']+", s)
    if len(words) < 2 and len(s) < 5: return True
    alpha_ratio = sum(1 for ch in s if ch.isalpha()) / max(1, len(s))
    if alpha_ratio < 0.3: return True
    if not re.search(r"[aeiouAEIOU]", s): return True
    return False

# 3) augment isotonic training with synthetic extremes
# compute base sims if not present
train_sims01, train_sims_raw = compute_similarity_array(train_df["student_answer"].tolist(), train_df["reference_answer"].tolist())
val_sims01, val_sims_raw     = compute_similarity_array(val_df["student_answer"].tolist(), val_df["reference_answer"].tolist())

# prepare synthetic extreme pairs
n_synth = min(50, len(train_df))
refs_sample = train_df['reference_answer'].tolist()[:n_synth]
synth_perfect_s, synth_perfect_r = refs_sample, refs_sample
synth_bad_s, synth_bad_r = [""]*n_synth, refs_sample

sims_perfect, _ = compute_similarity_array(synth_perfect_s, synth_perfect_r)
sims_bad, _ = compute_similarity_array(synth_bad_s, synth_bad_r)

if data_max != data_min:
    perfect_labels_norm = np.ones_like(sims_perfect)
    bad_labels_norm = np.zeros_like(sims_bad)
else:
    perfect_labels_norm = np.ones_like(sims_perfect)
    bad_labels_norm = np.zeros_like(sims_bad)

aug_train_sims = np.concatenate([train_sims01, sims_perfect, sims_bad])
aug_train_labels = np.concatenate([train_score_norm, perfect_labels_norm, bad_labels_norm])

iso = IsotonicRegression(out_of_bounds="clip")
iso.fit(aug_train_sims, aug_train_labels)

# recompute val calib with new iso
val_calib_norm = iso.predict(val_sims01)
val_calib_score = val_calib_norm * (data_max - data_min) + data_min

# recompute normalized arrays and best mix (with optional restriction)
val_reg_norm = normalize_arr(val_pred_reg, data_min, data_max)
val_sim_norm = normalize_arr(val_calib_score, data_min, data_max)

best = None
for w in np.linspace(0,1,21):
    reg_w = w
    sim_w = 1 - w
    if sim_w < 0.6:
        continue
    final_norm = val_reg_norm * reg_w + val_sim_norm * sim_w
    final_score = final_norm * (data_max - data_min) + data_min
    q = qwk_numpy(val_df["score"].values, final_score, min_rating=int(data_min), max_rating=int(data_max))
    if best is None or q > best[2]:
        best = (reg_w, sim_w, q)
print("Recomputed restricted best:", best)

pairs = [
    ("Water boils at 100 degrees Celsius under standard atmospheric pressure.","abcd."),
    ("Water boils at 100 degrees Celsius under standard atmospheric pressure.","Water boils at 100 degrees Celsius under standard atmospheric pressure."),
    ("The capital of France is Paris.","Paris is the capital of France."),
    ("Photosynthesis converts light energy into chemical energy.","Photosynthesis is when plants make food using sunlight.")
]

def scale_to_10(score):
    return ((score - data_min) / (data_max - data_min) * 10) if data_max!=data_min else score*10

for i,(r,s) in enumerate(pairs,1):
    if gibberish_score_hint(s):
        final = data_min
    else:
        final,_ ,_ = compute_final_score(r,s)
    print(f"Example {i} final (0-10): {max(0.0, min(10.0, scale_to_10(final))):.2f}")


Forced ensemble weights: (0.01, 0.99, np.float64(0.40498572136056676))
Recomputed restricted best: (np.float64(0.30000000000000004), np.float64(0.7), np.float64(0.32305004665197623))
Example 1 final (0-10): 2.21
Example 2 final (0-10): 8.79
Example 3 final (0-10): 9.42
Example 4 final (0-10): 8.45


In [32]:
# =========================================================
#  Diagnostic + Unified Final Scoring (place this at end)
# =========================================================
import numpy as np, json
from sklearn.metrics import mean_squared_error

def to_0_10_from_raw(raw, data_min, data_max):
    """Map raw model score to 0–10 scale, with normalization handling."""
    if data_max == data_min:
        return float(np.clip(raw, 0, 1) * 10)
    if np.nanmin(raw) >= 0.0 and np.nanmax(raw) <= 1.0:
        orig = raw * (data_max - data_min) + data_min
    else:
        orig = raw
    return ((orig - data_min) / (data_max - data_min) * 10).clip(0,10)

# ---- 1) Check if CE exists and get predictions
ce_available = 'cross_encoder_predict' in globals()
if ce_available:
    ce_preds = []
    for i in range(len(val_df)):
        try:
            ce_preds.append(cross_encoder_predict(
                str(val_df.iloc[i]['reference_answer']),
                str(val_df.iloc[i]['student_answer'])
            ))
        except:
            ce_preds.append(np.nan)
    ce_preds = np.array(ce_preds, dtype=float)
    print("Cross-encoder preds summary (raw):",
          np.nanmin(ce_preds), np.nanmean(ce_preds), np.nanmax(ce_preds))
    guessed_normalized = (np.nanmin(ce_preds) >= -0.01 and np.nanmax(ce_preds) <= 1.01)
    print("Guessed CE label scale normalized [0,1]? ->", guessed_normalized)
else:
    print(" Cross-encoder not available, using ensemble only.")

# ---- 2) Compute validation metrics for ensemble
try:
    ensemble_val_preds = (normalize_arr(val_pred_reg, data_min, data_max) * best[0] +
                          normalize_arr(val_calib_score, data_min, data_max) * best[1])
    ensemble_val_preds = ensemble_val_preds * (data_max - data_min) + data_min
    mse_ensemble = mean_squared_error(val_df['score'].values, ensemble_val_preds)
    qwk_ensemble = qwk_numpy(val_df['score'].values, ensemble_val_preds,
                             min_rating=int(data_min), max_rating=int(data_max))
    print(f"Ensemble val -> MSE: {mse_ensemble:.4f}, QWK: {qwk_ensemble:.4f}")
except Exception as e:
    print("Could not compute ensemble val metrics:", e)
    mse_ensemble, qwk_ensemble = np.inf, -np.inf

# ---- 3) Compute CE metrics if available
if ce_available:
    if guessed_normalized:
        ce_preds_orig = ce_preds * (data_max - data_min) + data_min
    else:
        ce_preds_orig = ce_preds.copy()
    ce_preds_orig = np.nan_to_num(ce_preds_orig, nan=np.nanmedian(ce_preds_orig))
    mse_ce = mean_squared_error(val_df['score'].values, ce_preds_orig)
    qwk_ce = qwk_numpy(val_df['score'].values, ce_preds_orig,
                       min_rating=int(data_min), max_rating=int(data_max))
    print(f"Cross-Encoder val -> MSE: {mse_ce:.4f}, QWK: {qwk_ce:.4f}")
else:
    mse_ce, qwk_ce = np.inf, -np.inf

# ---- 4) Choose final model
final_model = 'cross_encoder' if qwk_ce > qwk_ensemble else 'ensemble'
print(f" Final model selected: {final_model} (better QWK)")

# ---- 5) Pretty-print results
pairs = [
    ("Water boils at 100 degrees Celsius under standard atmospheric pressure.", "abcd."),
    ("Water boils at 100 degrees Celsius under standard atmospheric pressure.", "Water boils at 100 degrees Celsius under standard atmospheric pressure."),
    ("The capital of France is Paris.", "Paris is the capital of France."),
    ("Photosynthesis converts light energy into chemical energy.", "Photosynthesis is when plants make food using sunlight.")
]

print("\n==============================")
print(" FINAL GRADING EXAMPLES (OUT OF 10)")
print("==============================")

for i, (ref, stu) in enumerate(pairs, start=1):
    ens_raw, _, _ = compute_final_score(ref, stu)
    ens_10 = to_0_10_from_raw(np.array([ens_raw]), data_min, data_max)[0]

    ce_10 = None
    if ce_available:
        raw_ce = cross_encoder_predict(ref, stu)
        raw_ce = raw_ce * (data_max - data_min) + data_min if guessed_normalized else raw_ce
        ce_10 = to_0_10_from_raw(np.array([raw_ce]), data_min, data_max)[0]

    # pick final score
    final_score_10 = ce_10 if (final_model == 'cross_encoder' and ce_10 is not None) else ens_10

    print(f"\n Example {i}")
    print(" Teacher Statement :", ref)
    print(" Student Statement :", stu)
    print(f" FINAL SCORE (out of 10): {final_score_10:.2f}")

print("\n Done.")


Cross-encoder preds summary (raw): 0.0 4.356911283958692 5.0
Guessed CE label scale normalized [0,1]? -> False
Ensemble val -> MSE: 0.8586, QWK: 0.3231
Cross-Encoder val -> MSE: 0.2865, QWK: 0.8003
 Final model selected: cross_encoder (better QWK)

 FINAL GRADING EXAMPLES (OUT OF 10)

 Example 1
 Teacher Statement : Water boils at 100 degrees Celsius under standard atmospheric pressure.
 Student Statement : abcd.
 FINAL SCORE (out of 10): 2.14

 Example 2
 Teacher Statement : Water boils at 100 degrees Celsius under standard atmospheric pressure.
 Student Statement : Water boils at 100 degrees Celsius under standard atmospheric pressure.
 FINAL SCORE (out of 10): 10.00

 Example 3
 Teacher Statement : The capital of France is Paris.
 Student Statement : Paris is the capital of France.
 FINAL SCORE (out of 10): 9.33

 Example 4
 Teacher Statement : Photosynthesis converts light energy into chemical energy.
 Student Statement : Photosynthesis is when plants make food using sunlight.
 FIN