In [None]:
!pip install openai langchain_openai azure-identity dotenv coloredlogs datasets dotenv

In [None]:
# Configure experiment name, eval file locations, and related env vars
import os
from pathlib import Path
from dotenv import load_dotenv

# LOAD CONFIGURATION
load_dotenv("config.env")

# Experiment identity; every artifact path below is derived from these two values.
experiment_name = "surfing"
experiment_dir = f"dataset/{experiment_name}-files"

# Eval questions (generated by 1_gen)
dataset_path_hf_eval = f"{experiment_dir}/{experiment_name}-hf.eval.jsonl"

# Raw RAFT answers: model output before formatting
dataset_path_hf_eval_answer_baseline = f"{experiment_dir}/{experiment_name}-hf.eval.answer.baseline.jsonl"
dataset_path_hf_eval_answer_student = f"{experiment_dir}/{experiment_name}-hf.eval.answer.student.jsonl"

# Formatted answers: the input to scoring
dataset_path_eval_answer_baseline = f"{experiment_dir}/{experiment_name}-eval.answer.baseline.jsonl"
dataset_path_eval_answer_student = f"{experiment_dir}/{experiment_name}-eval.answer.student.jsonl"

# Row-level score files
dataset_path_eval_answer_score_baseline = f"{experiment_dir}/{experiment_name}-eval.answer.score.baseline.jsonl"
dataset_path_eval_answer_score_student = f"{experiment_dir}/{experiment_name}-eval.answer.score.student.jsonl"

# Aggregated metrics files
dataset_path_eval_answer_score_metrics_baseline = f"{experiment_dir}/{experiment_name}-eval.answer.score.metrics.baseline.json"
dataset_path_eval_answer_score_metrics_student = f"{experiment_dir}/{experiment_name}-eval.answer.score.metrics.student.json"

# Export the paths that the shell cells read back as $DATASET_PATH_* variables.
os.environ.update(
    {
        "DATASET_PATH_HF_EVAL": dataset_path_hf_eval,
        "DATASET_PATH_HF_EVAL_ANSWER_BASELINE": dataset_path_hf_eval_answer_baseline,
        "DATASET_PATH_HF_EVAL_ANSWER_STUDENT": dataset_path_hf_eval_answer_student,
        "DATASET_PATH_EVAL_ANSWER_BASELINE": dataset_path_eval_answer_baseline,
        "DATASET_PATH_EVAL_ANSWER_STUDENT": dataset_path_eval_answer_student,
    }
)

print(
    f"""
Evaluation File Configuration
-----------------------------
Eval questions file                  : {dataset_path_hf_eval}

Baseline raw answers (model output)  : {dataset_path_hf_eval_answer_baseline}
Student  raw answers (model output)  : {dataset_path_hf_eval_answer_student}

Baseline formatted answers (for eval): {dataset_path_eval_answer_baseline}
Student  formatted answers (for eval): {dataset_path_eval_answer_student}

Baseline row scores JSONL            : {dataset_path_eval_answer_score_baseline}
Student  row scores JSONL            : {dataset_path_eval_answer_score_student}

Baseline aggregate metrics JSON      : {dataset_path_eval_answer_score_metrics_baseline}
Student  aggregate metrics JSON      : {dataset_path_eval_answer_score_metrics_student}
"""
)

# Fail fast: the cells below all start from the eval questions file.
if not os.path.isfile(dataset_path_hf_eval):
    raise FileNotFoundError(f"Eval file not found: {dataset_path_hf_eval}")


In [None]:
# Configure baseline, student, and judge model endpoints (via router)
import os

def _require_env(name: str) -> str:
    """Return the value of environment variable `name`.

    Raises:
        RuntimeError: if the variable is unset or empty. Without this check,
            assigning the missing value into `os.environ` fails with an
            unhelpful "str expected, not NoneType" TypeError.
    """
    value = os.getenv(name)
    if not value:
        raise RuntimeError(
            f"Required environment variable {name!r} is not set. "
            "Define it in the environment or in the dotenv file loaded by this notebook."
        )
    return value


OPENAI_BASE_URL = _require_env("OPENAI_BASE_URL")
OPENAI_API_KEY = _require_env("OPENAI_API_KEY")

BASELINE_MODEL_NAME = "openai.gpt-oss-120b-1:0"
# The student deployment is taken from STUDENT_MODEL_ID; the literal is only a
# fallback. Using one variable keeps the printed summary in sync with what is
# actually exported to the eval script.
STUDENT_MODEL_NAME = os.getenv("STUDENT_MODEL_ID") or "granite-4.0-micro"
JUDGE_MODEL_NAME = "qwen.qwen3-32b-v1:0"

# Baseline model env (used by .gorilla/raft/eval.py with --env-prefix BASELINE)
os.environ["BASELINE_OPENAI_BASE_URL"] = OPENAI_BASE_URL
os.environ["BASELINE_OPENAI_API_KEY"] = OPENAI_API_KEY
os.environ["BASELINE_OPENAI_DEPLOYMENT"] = BASELINE_MODEL_NAME
os.environ["BASELINE_MODEL_API"] = "chat"

# Student model env (used by .gorilla/raft/eval.py with --env-prefix STUDENT)
os.environ["STUDENT_OPENAI_BASE_URL"] = _require_env("STUDENT_OPENAI_BASE_URL")
os.environ["STUDENT_OPENAI_API_KEY"] = OPENAI_API_KEY
os.environ["STUDENT_DEPLOYMENT_NAME"] = STUDENT_MODEL_NAME
os.environ["STUDENT_MODEL_API"] = "chat"

# Judge model env (used by judge client via the router)
os.environ["JUDGE_OPENAI_BASE_URL"] = OPENAI_BASE_URL
os.environ["JUDGE_OPENAI_API_KEY"] = OPENAI_API_KEY
os.environ["JUDGE_OPENAI_DEPLOYMENT"] = JUDGE_MODEL_NAME

# Only endpoints and model names are printed; API keys are deliberately left out.
print(
    f"""
Model Endpoint Configuration
----------------------------
Baseline endpoint : {os.environ['BASELINE_OPENAI_BASE_URL']}
Baseline model    : {BASELINE_MODEL_NAME}

Student endpoint  : {os.environ['STUDENT_OPENAI_BASE_URL']}
Student model     : {STUDENT_MODEL_NAME}

Judge endpoint    : {os.environ['JUDGE_OPENAI_BASE_URL']}
Judge model       : {JUDGE_MODEL_NAME}
"""
)


In [None]:
%%bash
# Produce baseline answers for the eval split unless the answers file is already on disk.
if [ -f "$DATASET_PATH_HF_EVAL_ANSWER_BASELINE" ]; then
  echo "Baseline answers file already exists, skipping."
else
  echo "Running baseline model over eval split..."
  eval_args=(
    --question-file "$DATASET_PATH_HF_EVAL"
    --answer-file "$DATASET_PATH_HF_EVAL_ANSWER_BASELINE"
    --model "$BASELINE_OPENAI_DEPLOYMENT"
    --env-prefix BASELINE
    --mode "$BASELINE_MODEL_API"
  )
  python .gorilla/raft/eval.py "${eval_args[@]}"
fi


In [None]:
# Format baseline raw answers into RAFT eval format.
# Runs .gorilla/raft/format.py on the raw baseline answers (JSONL input) and writes
# the "eval"-format output. Both paths come from the DATASET_PATH_* environment
# variables exported by the configuration cell.
!python .gorilla/raft/format.py \
    --input "$DATASET_PATH_HF_EVAL_ANSWER_BASELINE" \
    --input-type jsonl \
    --output "$DATASET_PATH_EVAL_ANSWER_BASELINE" \
    --output-format eval

In [None]:
# Utilities to pretty-print an eval row as Markdown for manual inspection
import pandas as pd

def row_to_markdown(df, idx):
    """Render one DataFrame row as Markdown, one `### <column>` section per column.

    Args:
        df: DataFrame holding the rows to inspect.
        idx: Positional index of the row (passed to `df.iloc`).

    Returns:
        A Markdown string. For every column it contains a `### <column name>`
        heading followed by the cell value, with RAFT markers wrapped in
        backticks so they are shown literally instead of being interpreted as
        HTML tags by the Markdown renderer.

    Notes:
        Cell values are converted with `str()` first, so numeric or missing
        cells (rendered as e.g. "3" or "nan") no longer raise AttributeError.
    """
    # Markers that must be shown verbatim in the rendered output.
    markers = (
        "<DOCUMENT>",
        "</DOCUMENT>",
        "<ANSWER>",
        "</ANSWER>",
        "##begin_quote##",
        "##end_quote##",
    )

    sample = df.iloc[idx]
    sections = []
    for name in df.columns.values:
        value = str(sample[name])
        for marker in markers:
            value = value.replace(marker, f"`{marker}`")
        sections.append(f"### {name}\n{value}\n")
    return "".join(sections)

def pretty_print_row(df, idx):
    """Display row `idx` of `df` in the notebook as rendered Markdown."""
    from IPython.display import Markdown, display

    rendered = Markdown(row_to_markdown(df, idx))
    display(rendered)

# Load the formatted baseline answers (JSON Lines) and render the first row (index 0).
print("Baseline (formatted) answer example:")
pretty_print_row(pd.read_json(dataset_path_eval_answer_baseline, lines=True), 0)


In [None]:
%%bash
# Produce student answers for the eval split unless the answers file is already on disk.
if [ -f "$DATASET_PATH_HF_EVAL_ANSWER_STUDENT" ]; then
  echo "Student answers file already exists, skipping."
else
  echo "Running student model over eval split..."
  eval_args=(
    --question-file "$DATASET_PATH_HF_EVAL"
    --answer-file "$DATASET_PATH_HF_EVAL_ANSWER_STUDENT"
    --model "$STUDENT_DEPLOYMENT_NAME"
    --env-prefix STUDENT
    --mode "$STUDENT_MODEL_API"
  )
  python .gorilla/raft/eval.py "${eval_args[@]}"
fi


In [None]:
# Check the first few student raw answers
import pandas as pd

# Show the first two rows of the student's raw answers file (JSON Lines).
pd.read_json(dataset_path_hf_eval_answer_student, lines=True).iloc[:2]

In [None]:
# Format student raw answers into RAFT eval format.
# Runs .gorilla/raft/format.py on the raw student answers (JSONL input) and writes
# the "eval"-format output. Both paths come from the DATASET_PATH_* environment
# variables exported by the configuration cell.
! python .gorilla/raft/format.py \
    --input "$DATASET_PATH_HF_EVAL_ANSWER_STUDENT" \
    --input-type jsonl \
    --output "$DATASET_PATH_EVAL_ANSWER_STUDENT" \
    --output-format eval

In [None]:
# Pretty-print one student formatted answer for inspection
import pandas as pd

# Load the formatted student answers (JSON Lines) and render the first row (index 0).
print("Student (formatted) answer example:")
pretty_print_row(pd.read_json(dataset_path_eval_answer_student, lines=True), 0)

In [None]:
# Configure judge client (Qwen via router) using OpenAI-compatible API
from openai import OpenAI

# Reuse the values defined in the endpoint-configuration cell so the judge URL,
# key and model id each have a single source of truth (JUDGE_MODEL_NAME is also
# what gets exported as JUDGE_OPENAI_DEPLOYMENT there).
JUDGE_BASE_URL = OPENAI_BASE_URL
JUDGE_API_KEY = OPENAI_API_KEY
JUDGE_MODEL = JUDGE_MODEL_NAME

print("Judge base URL:", JUDGE_BASE_URL)
print("Judge model:   ", JUDGE_MODEL)

# OpenAI-compatible client pointed at the router; used by judge_single().
judge_client = OpenAI(
    base_url=JUDGE_BASE_URL,
    api_key=JUDGE_API_KEY,
)

In [None]:
# Helper: extract the first JSON object from a string (if judge wraps it)
def extract_json_block(text: str) -> str:
    """
    Return the first complete JSON object embedded in `text`.

    Useful if the judge wraps JSON in extra text or backticks. Each "{" is
    tried in turn as the start of an object and the first one that parses is
    returned, so stray braces before the object, trailing commentary
    containing "}", or a second object later in the text do not corrupt the
    result.

    Args:
        text: Raw model output that should contain a JSON object.

    Returns:
        The exact substring of `text` that forms the first valid JSON object.

    Raises:
        ValueError: if no parseable JSON object is found.
    """
    import json  # local import so this cell does not depend on a later cell

    decoder = json.JSONDecoder()
    start = text.find("{")
    while start != -1:
        try:
            # raw_decode parses one JSON value beginning at `start` and
            # reports where it ended, ignoring whatever follows.
            _, end = decoder.raw_decode(text, start)
        except json.JSONDecodeError:
            start = text.find("{", start + 1)
            continue
        return text[start:end]
    raise ValueError(f"Could not find JSON object in: {text!r}")

In [None]:
# Call the judge model once to score a single QA pair
def judge_single(question: str, context: str, gold: str, answer: str) -> dict:
    """
    Call the judge model (Qwen via the router) to score one answer.

    Args:
        question: The question that was asked.
        context:  The context the answer should be based on.
        gold:     The reference (ground-truth) answer.
        answer:   The model answer being evaluated.

    Returns a dict with keys like:
      - "coherence.gpt_coherence"
      - "relevance.gpt_relevance"
      - "groundedness.gpt_groundedness"
      - "fluency.gpt_fluency"
      - "similarity.gpt_similarity"
    each mapped to a float.

    Raises:
        ValueError: if the judge returns an empty reply, or no JSON object can
            be recovered from it.
        KeyError: if the judge's JSON lacks one of the five metrics.
    """
    import json  # local import so this cell does not depend on a later cell

    system_msg = (
        "You are an impartial evaluation assistant. "
        "Given a question, context, reference answer, and model answer, "
        "you will score the model answer on several dimensions from 1 to 5.\n\n"
        "Return ONLY a valid JSON object, no commentary, in this format:\n"
        "{\n"
        '  "coherence":   <number 1-5>,\n'
        '  "relevance":   <number 1-5>,\n'
        '  "groundedness":<number 1-5>,\n'
        '  "fluency":     <number 1-5>,\n'
        '  "similarity":  <number 1-5>\n'
        "}\n"
        "Use whole or decimal numbers (e.g., 3 or 4.5)."
    )

    user_msg = f"""
Question:
{question}

Context (information to base the answer on):
{context}

Reference answer (ground truth):
{gold}

Model answer to evaluate:
{answer}
"""

    # NOTE(review): 256 tokens may truncate the reply if the judge emits
    # reasoning text before the JSON -- confirm against the deployed model.
    resp = judge_client.chat.completions.create(
        model=JUDGE_MODEL,
        messages=[
            # The grading instructions belong in the system role; the material
            # to grade is the user turn.
            {"role": "system", "content": system_msg},
            {"role": "user", "content": user_msg},
        ],
        temperature=0,
        max_tokens=256,
    )

    # `content` can be None (e.g. filtered or empty completion); fail clearly.
    content = resp.choices[0].message.content
    text = content.strip() if content else ""
    if not text:
        raise ValueError("Judge model returned an empty response; cannot parse scores")

    try:
        data = json.loads(text)
    except json.JSONDecodeError:
        # The judge wrapped the JSON in extra text; pull the object out.
        data = json.loads(extract_json_block(text))

    # Map to the same naming style as the original RAFT/Azure notebook
    metric_names = ("coherence", "relevance", "groundedness", "fluency", "similarity")
    return {f"{metric}.gpt_{metric}": float(data[metric]) for metric in metric_names}

In [None]:
# Score a full formatted dataset file and write row-level + aggregate metrics
def score_dataset_file(
    formatted_answers_path: str,
    row_scores_output_path: str,
    metrics_output_path: str,
) -> dict:
    """
    Score every row of a formatted answers file with the judge model.

    formatted_answers_path: JSONL from .gorilla/raft/format.py (eval format)
    row_scores_output_path: JSONL where we write per-row inputs & outputs
    metrics_output_path:    JSON file with aggregated mean scores

    Returns:
        Dict mapping each metric name to its mean over all scored rows. An
        input with no rows yields `{}` (and empty output files) instead of
        crashing.
    """
    import json  # local import so this cell does not depend on a later cell

    df = pd.read_json(formatted_answers_path, lines=True)
    print(f"Loaded {len(df)} examples from {formatted_answers_path}")

    rows_out = []
    metric_sums = {}

    for _, row in tqdm(df.iterrows(), total=len(df)):
        question = row["question"]
        context = row.get("context", "")
        # When only some rows carry a context, pandas fills the gaps with NaN;
        # treat that like an absent context so "nan" is not sent to the judge
        # or written out as a non-standard JSON NaN.
        if context is None or (isinstance(context, float) and context != context):
            context = ""
        gold = row["gold_final_answer"]
        answer = row["final_answer"]

        scores = judge_single(question, context, gold, answer)

        for metric, value in scores.items():
            metric_sums[metric] = metric_sums.get(metric, 0.0) + value

        rows_out.append(
            {
                "inputs": {
                    "question":          question,
                    "context":           context,
                    "final_answer":      answer,
                    "gold_final_answer": gold,
                },
                "outputs": scores,
            }
        )

    # Write row-level scores
    with open(row_scores_output_path, "w", encoding="utf-8") as f:
        for scored_row in rows_out:
            f.write(json.dumps(scored_row) + "\n")
    print("Wrote row scores to:", row_scores_output_path)

    # Compute aggregate metrics. With zero rows metric_sums is empty, so no
    # division happens and the result is {}.
    n = len(rows_out)
    metrics = {metric: total / n for metric, total in metric_sums.items()}
    with open(metrics_output_path, "w", encoding="utf-8") as f:
        json.dump(metrics, f, indent=2)
    print("Wrote aggregate metrics to:", metrics_output_path)

    return metrics


In [None]:
# Run judge over baseline formatted answers and compute metrics
from tqdm import tqdm
import json

# Judge the formatted baseline answers; row scores and mean metrics are written to disk.
baseline_metrics = score_dataset_file(
    dataset_path_eval_answer_baseline,
    dataset_path_eval_answer_score_baseline,
    dataset_path_eval_answer_score_metrics_baseline,
)

print("\nBaseline metrics (judge model):")
for metric_name, mean_score in baseline_metrics.items():
    print(f"{metric_name}: {mean_score:.3f}")

In [None]:
# Run judge over student formatted answers and compute metrics
# Judge the formatted student answers; row scores and mean metrics are written to disk.
student_metrics = score_dataset_file(
    dataset_path_eval_answer_student,
    dataset_path_eval_answer_score_student,
    dataset_path_eval_answer_score_metrics_student,
)

print("\nStudent metrics (judge model):")
for metric_name, mean_score in student_metrics.items():
    print(f"{metric_name}: {mean_score:.3f}")

In [None]:
# Compare baseline vs student metrics and compute relative improvement
import pandas as pd

# One row per metric, with baseline/student columns plus the student's
# relative change versus the baseline: (student - baseline) / baseline.
metrics_df = pd.DataFrame(
    {"baseline": baseline_metrics, "student": student_metrics}
).assign(
    improvement=lambda scores: (scores["student"] - scores["baseline"]) / scores["baseline"]
)

metrics_df
