In [None]:
!pip install transformers

In [4]:
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import torch

def load_model_qwen(model_name):
  """
  Load a chat-style causal LM (e.g. Qwen-2.5) together with its tokenizer.

  Args:
    model_name: checkpoint identifier forwarded to both `from_pretrained` calls.

  Returns:
    A `(model, tokenizer)` tuple.
  """
  # Dtype selection and device placement are both delegated to the library.
  load_kwargs = {"torch_dtype": "auto", "device_map": "auto"}
  causal_lm = AutoModelForCausalLM.from_pretrained(model_name, **load_kwargs)
  return causal_lm, AutoTokenizer.from_pretrained(model_name)

def load_model_llama(model_name):
  """
  Create the text-generation pipeline used for LLaMA.

  Args:
    model_name: checkpoint identifier passed as the pipeline's `model`.

  Returns:
    The object returned by `pipeline("text-generation", ...)`.
  """
  pipeline_options = {
    "model": model_name,
    "torch_dtype": torch.bfloat16,  # weights are requested in bfloat16
    "device_map": "auto",
  }
  return pipeline("text-generation", **pipeline_options)

In [5]:
# Step-1 instruction used when the answer key is a letter (see get_question_prompt).
# The JSON skeleton must use plain ASCII double quotes: the model tends to copy
# the skeleton verbatim, and typographic quotes make the reply unparseable.
instruction_format_letters = """
**Instruction**: You will be given a question and a number of choices for the correct answer. Your task is:
1. Choose the best answer from the choices.
2. Provide a short explanation for why each choice is correct or incorrect.
3. Output your final answer strictly in this JSON format:
{
  "answer": "<Enter the label of the correct answer (A, B, C, or D)>",
  "explanations": {
    "A": "",
    "B": "",
    "C": "",
    "D": ""
  }
}
**Important**:
- Begin the output with `{` and end with `}`.
- Do not include any additional characters or text outside of the JSON.
"""
# Question section appended after an instruction block; filled with str.format.
question_format = "The question is: {question}\nThe choices (possible answers) are:\n{choices}"

# Step-1 instruction used when the answer key is numeric (see get_question_prompt).
# Same task as the letter variant, but the JSON skeleton is keyed by "1".."4".
instruction_format_numbers = """
**Instruction**: You will be given a question and a number of choices for the correct answer. Your task is:
1. Choose the best answer from the choices.
2. Provide a short explanation for why each choice is correct or incorrect.
3. Output your final answer strictly in this JSON format:
{
  "answer": "<Enter the label of the correct answer (1, 2, 3, or 4)>",
  "explanations": {
    "1": "",
    "2": "",
    "3": "",
    "4": ""
  }
}
**Important**:
- Begin the output with `{` and end with `}`.
- Do not include any additional characters or text outside of the JSON.
"""

# Step-3 instruction (see get_full_ctx_prompt): the model sees the question plus
# both step-1 answers and must reply with an answer-only JSON object.
# Only ASCII quotes/backticks are used so the model is not nudged towards
# typographic quotes that would break parsing of its reply.
full_ctx_instruction = """
You will be given a multiple‑choice question and, below it, the previous
answers and explanations of two models. Your task:

1. Choose the single best answer.
2. Output strictly in this JSON format **without any extra text**:
{
  "answer": "<Enter the label for the answer you think is correct>"
}

Important:
- The JSON value must be wrapped in double quotes, in this format: {"answer": "<label>"}
- Begin with `{` and end with `}`.
- Do NOT include explanations or any other keys.
"""

# Step-2 instruction (see get_two_choices_prompt): the model picks between the
# two disputed answers, given an argument for and against each.
two_choices_instruction_format = """
You will be given a question and two choices for the correct answer. For each choice, you will be given an argument for and an argument against the choice. Based on the arguments given, select the most accurate answer. Organize your answer in the following format:
{
  "answer": "<Enter the label for the answer you think is correct>"
}
Important:
- Do not add any additional characters or output other than the dictionary described - your output should start with the character for beginning a dictionary - '{' and end with '}'.
- The JSON value must be wrapped in double quotes, in this format: {"answer": "<label>"}
"""

# System prompts, one per model.
# NOTE(review): eval_three_stage receives the Llama prompt as `sys_llama` but
# only ever adds the Qwen prompt to a conversation - confirm whether the Llama
# prompt is meant to be sent as well.
llama_system_prompt = "You are Llama, created by Meta. You are a helpful assistant. The token |im_start| indicates the beginning of a message, followed by the role of the speaker. The messages where role of the speaker is the user are the user prompts"
qwen_system_prompt = "You are Qwen, created by Alibaba Cloud. You are a helpful assistant. The token |im_start| indicates the beginning of a message, followed by the role of the speaker. The messages where role of the speaker is the user are the user prompts"


In [6]:
import ast
import json

def get_model_answer(model_response):
  """
  Parse the model's JSON-style answer; return None on failure.

  The whole (stripped) response must be a single dict literal. Strict JSON is
  tried first so that `null`/`true`/`false` are accepted; `ast.literal_eval`
  is the fallback for Python-style dicts (e.g. single-quoted keys).

  Args:
    model_response: raw text produced by a model.

  Returns:
    The parsed dict when it contains an "answer" key, otherwise None. Anything
    that is not a string, does not parse, parses to a non-dict, or lacks
    "answer" counts as a failure, so callers can index `["answer"]` safely.
  """
  if not isinstance(model_response, str):
    return None

  model_response_clean = model_response.strip()
  for parse in (json.loads, ast.literal_eval):
    try:
      parsed = parse(model_response_clean)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
      # Only parsing failures are swallowed; try the next parser.
      continue
    if isinstance(parsed, dict) and "answer" in parsed:
      return parsed
  return None

In [7]:
def add_context(messages, role, content):
  """
  Record one chat turn by appending it to `messages` in place.

  Args:
    messages: list holding the conversation so far (mutated).
    role: speaker role stored under the "role" key.
    content: message text stored under the "content" key.
  """
  turn = dict(role=role, content=content)
  messages.append(turn)

In [8]:
def get_question_prompt(question_idx, dataset, question_label):
    """Build the Individual Answers with Explanations prompt.

    Args:
        question_idx: index into ``dataset["test"]``.
        dataset: mapping with a "test" split of question records.
        question_label: record key that holds the question text.

    Returns:
        ``(prompt, answer_key)`` for the selected question.
    """
    record = dataset["test"][question_idx]
    answer_key = record["answerKey"]

    # Numeric answer keys get the 1-4 template, anything else the A-D one.
    if str(answer_key).isdigit():
        instruction = instruction_format_numbers
    else:
        instruction = instruction_format_letters

    # One "<label>) <text>" line per choice.
    option_labels = record["choices"]["label"]
    option_texts = record["choices"]["text"]
    option_lines = [f"{label}) {text}" for label, text in zip(option_labels, option_texts)]

    question_block = question_format.format(
        question=record[question_label],
        choices="\n".join(option_lines),
    )
    return instruction + question_block, answer_key

In [9]:
def get_full_ctx_prompt(
        idx: int,
        ds,
        question_field: str,
        resp1: dict | None,
        resp2: dict | None
) -> str:
    """Build the Explanation-Based Answer Selection prompt: question + both prior answers/explanations."""
    q = ds["test"][idx]
    # ----- extract question & choices ---------------------------------------
    question_txt = q[question_field]                      # << just this changed
    labels, texts = q["choices"]["label"], q["choices"]["text"]
    choices_str = "\n".join(f"{l}) {t}" for l, t in zip(labels, texts))

    # ----- build prompt ------------------------------------------------------
    prompt = (
        full_ctx_instruction +
        f"\nThe question is: {question_txt}\n"
        f"The choices (possible answers) are:\n{choices_str}\n\n"
        "Here are the previous answers:\n\n"
    )

    def fmt(resp, tag):
        if resp is None:
            return f"{tag} model produced an invalid answer.\n"
        out = [f"The {tag} model answered: {resp['answer']}", "His explanations were:"]
        out += [f"{k}: {v}" for k, v in resp["explanations"].items()]
        return "\n".join(out)

    prompt += fmt(resp1, "first") + "\n\n" + fmt(resp2, "second") + "\n"
    return prompt

In [10]:
def get_two_choices_prompt(idx: int,
                           dataset,
                           question_label,
                           resp1: dict,
                           resp2: dict):
    """
    Build the Disagreement Resolution prompt with two candidate answers.

    Args:
        idx: index into ``dataset["test"]``.
        dataset: mapping with a "test" split of question records.
        question_label: record key that holds the question text.
        resp1: parsed response of the first model (or None).
        resp2: parsed response of the second model (or None).

    Returns:
        ``(prompt, answer_key)``. The prompt is the empty string when either
        response is None or both responses picked the same answer.
    """
    record = dataset["test"][idx]
    gold = record["answerKey"]

    # Nothing to debate when a response is unusable ...
    if resp1 is None or resp2 is None:
        return "", gold
    # ... or when the two models already agree.
    first, second = resp1["answer"], resp2["answer"]
    if first == second:
        return "", gold

    text_by_label = dict(zip(record["choices"]["label"], record["choices"]["text"]))
    choices_block = "\n".join(f"{label}) {text_by_label[label]}" for label in (first, second))

    parts = [
        two_choices_instruction_format
        + question_format.format(question=record[question_label], choices=choices_block)
    ]

    # Each model argues for its own pick; the other model's explanation of that
    # same label serves as the counter-argument. Model names are not revealed.
    parts.append(f"\nThe argument for answer '{first}' is:\n{resp1['explanations'][first]}")
    parts.append(f"\nThe argument against answer '{first}' is:\n{resp2['explanations'][first]}")
    parts.append(f"\nThe argument for answer '{second}' is:\n{resp2['explanations'][second]}")
    parts.append(f"\nThe argument against answer '{second}' is:\n{resp1['explanations'][second]}")

    return "".join(parts), gold

In [12]:
def generate_response_qwen(model, tokenizer, messages):
  """
  Sample one chat reply from Qwen and record it in the conversation.

  Args:
    model: object providing `generate` and a `device` attribute.
    tokenizer: matching tokenizer (chat template, encoding, decoding).
    messages: list of role/content dicts; the reply is appended to it as an
      "assistant" turn.

  Returns:
    The decoded reply text.
  """
  # Render the conversation to text, ending with the generation prompt.
  prompt_text = tokenizer.apply_chat_template(
      messages, tokenize=False, add_generation_prompt=True
  )
  encoded = tokenizer([prompt_text], return_tensors="pt").to(model.device)

  sampling_options = {"temperature": 0.7, "do_sample": True, "max_new_tokens": 512}
  sequences = model.generate(**encoded, **sampling_options)

  # Each returned sequence starts with its prompt tokens; keep only what follows.
  completions = []
  for prompt_ids, sequence in zip(encoded.input_ids, sequences):
    completions.append(sequence[len(prompt_ids):])

  reply = tokenizer.batch_decode(completions, skip_special_tokens=True)[0]
  add_context(messages, "assistant", reply)
  return reply

def generate_response_llama(pipe, messages):
  """
  Run the LLaMA text-generation pipeline on a conversation.

  Args:
    pipe: callable pipeline; invoked with the messages and `max_new_tokens=512`.
    messages: list of role/content dicts forming the conversation.

  Returns:
    The "content" of the last entry in the first result's "generated_text".
  """
  results = pipe(messages, max_new_tokens=512)
  conversation = results[0]["generated_text"]
  last_turn = conversation[-1]
  return last_turn["content"]

In [13]:
import numpy as np

def _valid_response(resp):
    """Return ``resp`` if it is a dict carrying an "answer" key, else None."""
    if isinstance(resp, dict) and "answer" in resp:
        return resp
    return None


def _ensure_explanations(resp):
    """Give a valid response an empty "explanations" dict when it has none."""
    if resp is None or isinstance(resp.get("explanations"), dict):
        return resp
    return {**resp, "explanations": {}}


def _score(resp, gold, invalid):
    """Score a response: 1.0 if correct, 0.0 if wrong, ``invalid`` if unusable."""
    if resp is None:
        return invalid
    return 1.0 if resp["answer"] == gold else 0.0


def eval_three_stage(qwen_model, qwen_tok,
                     llama_pipe,
                     sys_qwen, sys_llama,
                     ds,                         # dataset to read from
                     question_field,
                     start_q=0, end_q=500):
    """
    Three-step evaluation for the three methods, run per question:
        1. each model answers the MCQ;
        2. if they disagree, both re-answer with a two-choice debate prompt;
        3. both re-answer again after seeing each other's full step-1 answer
           and explanations.

    Args:
        qwen_model, qwen_tok: passed to generate_response_qwen.
        llama_pipe: passed to generate_response_llama.
        sys_qwen: system prompt placed first in every Qwen conversation.
        sys_llama: accepted but not used; Llama conversations contain only the
            user prompt. NOTE(review): confirm whether this is intentional.
        ds: dataset whose "test" split is evaluated.
        question_field: record key that holds the question text.
        start_q, end_q: half-open index range ``[start_q, end_q)``.

    Returns:
        A dict of six per-question score lists (one per model and step).
        Scores are 1.0 (correct) or 0.0 (wrong). An unusable reply scores NaN
        in steps 1 and 3 and falls back to the step-1 score in step 2.

    A parsed reply that is not a dict with an "answer" key is treated as
    unusable instead of raising, so one malformed reply cannot abort the run.
    """
    s1q, s1l, s2q, s2l, s3q, s3l = [], [], [], [], [], []

    def ask_both(prompt):
        """Send ``prompt`` to Qwen, then Llama; return both parsed replies."""
        mq, ml = [], []
        add_context(mq, "system", sys_qwen)
        add_context(mq, "user", prompt)
        add_context(ml, "user", prompt)
        rq = get_model_answer(generate_response_qwen(qwen_model, qwen_tok, mq))
        rl = get_model_answer(generate_response_llama(llama_pipe, ml))
        return _valid_response(rq), _valid_response(rl)

    for i in range(start_q, end_q):
        base_prompt, gold = get_question_prompt(i, ds, question_field)

        # ---- step-1 : individual answers with explanations ----
        r1q, r1l = ask_both(base_prompt)
        # Later prompts read the explanations, so guarantee the key exists.
        r1q, r1l = _ensure_explanations(r1q), _ensure_explanations(r1l)
        s1q.append(_score(r1q, gold, np.nan))
        s1l.append(_score(r1l, gold, np.nan))

        # ---- step-2 : 2-choice debate (only on disagreement) ----
        two_prompt = ""
        if r1q is not None and r1l is not None and r1q["answer"] != r1l["answer"]:
            try:
                two_prompt, _ = get_two_choices_prompt(i, ds, question_field, r1q, r1l)
            except Exception:
                # Deliberate best-effort: the prompt cannot be built when an
                # answer label or its explanation is missing; keep step-1 scores.
                two_prompt = ""

        if two_prompt:
            r2q, r2l = ask_both(two_prompt)
            s2q.append(_score(r2q, gold, s1q[-1]))
            s2l.append(_score(r2l, gold, s1l[-1]))
        else:
            s2q.append(s1q[-1])
            s2l.append(s1l[-1])

        # ---- step-3 : full-context prompt ----
        full_prompt = get_full_ctx_prompt(i, ds, question_field, r1q, r1l)
        r3q, r3l = ask_both(full_prompt)
        s3q.append(_score(r3q, gold, np.nan))
        s3l.append(_score(r3l, gold, np.nan))

    return dict(individual_answers_with_explanations_qwen=s1q, individual_answers_with_explanations_llama=s1l,
                disagreement_resolution_qwen=s2q, disagreement_resolution_llama=s2l,
                Explanation_Based_Answer_selection_qwen=s3q, Explanation_Based_Answer_selection_llama=s3l)

In [None]:
!pip install datasets
from datasets import load_dataset
openbook_ds = load_dataset("allenai/openbookqa", "additional")
ai2_arc_ds = load_dataset("allenai/ai2_arc", "ARC-Challenge")
ai2_arc_easy = load_dataset("allenai/ai2_arc", "ARC-Easy")

In [None]:
# Log in to the Hugging Face Hub through its CLI (shell escape).
# NOTE(review): presumably needed because the Llama checkpoint is gated - confirm.
!huggingface-cli login
# Checkpoints under evaluation.
llama_model_name = "meta-llama/Llama-3.2-3B-Instruct"
qwen_model_name = "Qwen/Qwen2.5-3B-Instruct"
# Qwen is loaded as a (model, tokenizer) pair, Llama as a text-generation pipeline.
qwen_model, qwen_tokenizer = load_model_qwen(qwen_model_name)
llama_pipe = load_model_llama(llama_model_name)

In [None]:
# ≡≡ Run evaluation: OpenBookQA ≡≡
# Evaluates test questions with indices 0..499 (end_q is exclusive).
scores_openbook = eval_three_stage(
    qwen_model=qwen_model,
    qwen_tok=qwen_tokenizer,
    llama_pipe=llama_pipe,
    sys_qwen=qwen_system_prompt,
    sys_llama=llama_system_prompt,
    ds=openbook_ds,
    question_field="question_stem",  # record key holding the question text
    start_q=0,
    end_q=500,
)

# Quick summary: mean score per model and step, skipping NaN entries.
import numpy as np
for metric_name, per_question in scores_openbook.items():
    print(f"{metric_name:12s}: {np.nanmean(per_question):.3f}")

In [None]:
# ≡≡ Run evaluation: ARC-Easy ≡≡
# Evaluates test questions with indices 0..499 (end_q is exclusive).
scores_arceasy = eval_three_stage(
    qwen_model=qwen_model,
    qwen_tok=qwen_tokenizer,
    llama_pipe=llama_pipe,
    sys_qwen=qwen_system_prompt,
    sys_llama=llama_system_prompt,
    ds=ai2_arc_easy,
    question_field="question",  # record key holding the question text
    start_q=0,
    end_q=500,
)

# Quick summary: mean score per model and step, skipping NaN entries.
import numpy as np
for metric_name, per_question in scores_arceasy.items():
    print(f"{metric_name:12s}: {np.nanmean(per_question):.3f}")

In [None]:
# ≡≡ Run evaluation: ARC-Challenge ≡≡
# Evaluates test questions with indices 0..499 (end_q is exclusive).
scores_arcchallenge = eval_three_stage(
    qwen_model=qwen_model,
    qwen_tok=qwen_tokenizer,
    llama_pipe=llama_pipe,
    sys_qwen=qwen_system_prompt,
    sys_llama=llama_system_prompt,
    ds=ai2_arc_ds,
    question_field="question",  # record key holding the question text
    start_q=0,
    end_q=500,
)

# Quick summary: mean score per model and step, skipping NaN entries.
import numpy as np
for metric_name, per_question in scores_arcchallenge.items():
    print(f"{metric_name:12s}: {np.nanmean(per_question):.3f}")