In [1]:
!pip install transformers



In [2]:
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import torch

def load_model_qwen(model_name):
  model = AutoModelForCausalLM.from_pretrained(
      model_name,
      torch_dtype="auto",
      device_map="auto"
  )
  tokenizer = AutoTokenizer.from_pretrained(model_name)
  return model, tokenizer

def load_model_llama(model_name):
  pipe = pipeline(
    "text-generation",
    model=model_name,
    torch_dtype=torch.bfloat16,
    device_map="auto",
  )
  return pipe

In [3]:
instruction_format_letters = """
**Instruction**: You will be given a question and a number of choices for the correct answer. Your task is:
1. Choose the best answer from the choices.
2. Provide a short explanation for why each choice is correct or incorrect.
3. Output your final answer strictly in this JSON format:
{
  “answer”: “<Enter the label of the correct answer (A, B, C, or D)>”,
  “explanations”: {
  “A”: “”,
  “B”: “”,
  “C”: “”,
  “D”: “”
  }
}
**Important**:
- Begin the output with `{` and end with `}`.
- Do not include any additional characters or text outside of the JSON.
"""
question_format = "The question is: {question}\nThe choices (possible answers) are:\n{choices}"

instruction_format_numbers = """
**Instruction**: You will be given a question and a number of choices for the correct answer. Your task is:
1. Choose the best answer from the choices.
2. Provide a short explanation for why each choice is correct or incorrect.
3. Output your final answer strictly in this JSON format:
{
  "answer": "<Enter the label of the correct answer (1, 2, 3, or 4)>",
  "explanations": {
    "1": "",
    "2": "",
    "3": "",
    "4": ""
  }
}
**Important**:
- Begin the output with `{` and end with `}`.
- Do not include any additional characters or text outside of the JSON.
"""

full_ctx_instruction = """
You will be given a multiple‑choice question and, below it, the previous
answers and explanations of two models. Your task:

1. Choose the single best answer.
2. Output strictly in this JSON format **without any extra text**:
{
  "answer": "<Enter the label for the answer you think is correct>"
}

Important:
- The in the in the json value must be be wrapped in double quotes this format: {"answer": "<label>"}
- Begin with “{” and end with “}”.
- Do NOT include explanations or any other keys.
"""

two_choices_instruction_format = """
You will be given a question and two choices for the correct answer. For each choice, you will be given an argument for and an argument against the choice. Based on the arguments given, select the most accurate answer. Organize your answer in the following format:
{
  "answer": "<Enter the label for the answer you think is correct>"
}
Important:
- Do not add any additional characters or output other than the dictionary described - your outpout should start with the character for beginning a dictionary - '{' and end with '}'.\n
- The in the in the json value must be be wrapped in double quotes this format: {"answer": "<label>"}
"""

llama_system_prompt =" You are Llamma, created by Meta. You are a helpful assistant. The token |im_start| indicates the begining of a message, followed by the role of the speaker. The messages where role of the speaker is the user are the user prompts"
qwen_system_prompt = "You are Qwen, created by Alibaba Cloud. You are a helpful assistant. The token |im_start| indicates the begining of a message, followed by the role of the speaker. The messages where role of the speaker is the user are the user prompts"


In [4]:
import ast
def get_model_answer(model_response):
  model_response_clean = model_response.strip()
  try:
     return ast.literal_eval(model_response_clean)
  except:
    print(model_response_clean)
    return None

In [5]:
def add_context(messages, role, content):
  messages.append({"role": role, "content": content})

In [6]:
def get_question_prompt(question_idx, dataset, question_label):
    """Return a formatted prompt + gold answer, picking numeric/letter instructions automatically."""
    q = dataset["test"][question_idx]
    correct = q["answerKey"]

    # choose the right instruction block
    instr = instruction_format_numbers if str(correct).isdigit() else instruction_format_letters

    # build choices string
    labels, texts = q["choices"]["label"], q["choices"]["text"]
    choices_str = "\n".join(f"{l}) {t}" for l, t in zip(labels, texts))

    prompt = instr + question_format.format(question=q[question_label], choices=choices_str)
    return prompt, correct

In [7]:
def get_full_ctx_prompt(
        idx: int,
        ds,
        question_field: str,
        resp1: dict | None,
        resp2: dict | None
) -> str:
    """Stage‑3 prompt (answer‑only) that works with any dataset + field name."""
    q = ds["test"][idx]
    # ----- extract question & choices ---------------------------------------
    question_txt = q[question_field]                      # << just this changed
    labels, texts = q["choices"]["label"], q["choices"]["text"]
    choices_str = "\n".join(f"{l}) {t}" for l, t in zip(labels, texts))

    # ----- build prompt ------------------------------------------------------
    prompt = (
        full_ctx_instruction +
        f"\nThe question is: {question_txt}\n"
        f"The choices (possible answers) are:\n{choices_str}\n\n"
        "Here are the previous answers:\n\n"
    )

    def fmt(resp, tag):
        if resp is None:
            return f"{tag} model produced an invalid answer.\n"
        out = [f"The {tag} model answered: {resp['answer']}", "His explanations were:"]
        out += [f"{k}: {v}" for k, v in resp["explanations"].items()]
        return "\n".join(out)

    prompt += fmt(resp1, "first") + "\n\n" + fmt(resp2, "second") + "\n"
    return prompt

In [8]:
def get_two_choices_prompt(idx: int,
                           dataset,
                           question_label,
                           resp1: dict,
                           resp2: dict):
    """
    Build the debate prompt.
    """
    q = dataset["test"][idx]
    gold = q["answerKey"]

    # no debate if same answer / bad JSON
    if resp1 is None or resp2 is None or resp1["answer"] == resp2["answer"]:
        return "", gold

    a1, a2 = resp1["answer"], resp2["answer"]
    label2txt = {l: t for l, t in zip(q["choices"]["label"], q["choices"]["text"])}

    choices_block = f"{a1}) {label2txt[a1]}\n{a2}) {label2txt[a2]}"

    prompt = two_choices_instruction_format + question_format.format(
        question=q[question_label], choices=choices_block
    )

    #  ── arguments (no model names) ──
    prompt += f"\nThe argument for answer '{a1}' is:\n{resp1['explanations'][a1]}"
    prompt += f"\nThe argument against answer '{a1}' is:\n{resp2['explanations'][a1]}"
    prompt += f"\nThe argument for answer '{a2}' is:\n{resp2['explanations'][a2]}"
    prompt += f"\nThe argument against answer '{a2}' is:\n{resp1['explanations'][a2]}"

    return prompt, gold

In [9]:
def generate_response_qwen(model, tokenizer, messages):
  text = tokenizer.apply_chat_template(
      messages,
      tokenize=False,
      add_generation_prompt=True
  )
  model_inputs = tokenizer([text], return_tensors="pt").to(model.device)

  generated_ids = model.generate(
      **model_inputs,
       temperature = 0.7,
       do_sample = True,
       max_new_tokens = 512
  )
  generated_ids = [
      output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
  ]
  response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
  # print(response)
  add_context(messages, "assistant", response)
  return response

def generate_response_llama(pipe, messages):
  outputs = pipe(
    messages,
    max_new_tokens=512,
  )
  response = outputs[0]["generated_text"][-1]['content']
  # print(response)
  return response

In [10]:
import numpy as np

def eval_three_stage(qwen_model, qwen_tok,
                     llama_pipe,
                     sys_qwen, sys_llama,
                     ds,                         # dataset to read from
                     question_field,
                     start_q=0, end_q=500):
    """Returns acc arrays for 3 steps."""
    s1q, s1l, s2q, s2l, s3q, s3l = [], [], [], [], [], []

    for i in range(start_q, end_q):
        base_prompt, gold = get_question_prompt(i,ds, question_field)
        print("base prompt: " , base_prompt)
        # ---- step‑1 ----
        mq, ml = [], []
        add_context(mq, "system", sys_qwen);  add_context(mq, "user", base_prompt)
        add_context(ml, "user", base_prompt)
        # print("base prompt:" , base_prompt)
        r1q = get_model_answer(generate_response_qwen(qwen_model, qwen_tok, mq))
        r1l = get_model_answer(generate_response_llama(llama_pipe, ml))
        for arr, r in ((s1q, r1q), (s1l, r1l)):
            arr.append(np.nan if r is None else 1.0 if r["answer"] == gold else 0.0)

        # ---- step‑2 : 2‑choice debate (only on disagreement) ----
        if r1q is None or r1l is None or r1q["answer"] == r1l["answer"]:
            s2q.append(s1q[-1]); s2l.append(s1l[-1])
        else:
            try:
                two_prompt, _ = get_two_choices_prompt(i, ds, question_field, r1q, r1l)
            except Exception as e:
                # fall back to step‑1 scores
                s2q.append(s1q[-1]); s2l.append(s1l[-1])
            else:
                # print("two choices prompt:" , two_prompt)
                mq2, ml2 = [], []
                add_context(mq2, "system", sys_qwen); add_context(mq2, "user", two_prompt)
                add_context(ml2, "user", two_prompt)

                r2q = get_model_answer(generate_response_qwen(qwen_model, qwen_tok, mq2))
                r2l = get_model_answer(generate_response_llama(llama_pipe, ml2))

                s2q.append(s1q[-1] if r2q is None else 1.0 if r2q["answer"] == gold else 0.0)
                s2l.append(s1l[-1] if r2l is None else 1.0 if r2l["answer"] == gold else 0.0)

        # ---- step‑3 : full‑context prompt ----
        full_prompt = get_full_ctx_prompt(i,ds, question_field, r1q, r1l)
        mq3, ml3 = [], []
        add_context(mq3, "system", sys_qwen);  add_context(mq3, "user", full_prompt)
        add_context(ml3, "user", full_prompt)
        # print("full prompt:",full_prompt)
        r3q = get_model_answer(generate_response_qwen(qwen_model, qwen_tok, mq3))
        r3l = get_model_answer(generate_response_llama(llama_pipe, ml3))
        for arr, r in ((s3q, r3q), (s3l, r3l)):
            arr.append(np.nan if r is None else 1.0 if r["answer"] == gold else 0.0)

    return dict(step1_qwen=s1q, step1_llama=s1l,
                step2_qwen=s2q, step2_llama=s2l,
                step3_qwen=s3q, step3_llama=s3l)

In [11]:
!pip install datasets
from datasets import load_dataset
openbook_ds = load_dataset("allenai/openbookqa", "additional")
ai2_arc_ds = load_dataset("allenai/ai2_arc", "ARC-Challenge")
ai2_arc_easy = load_dataset("allenai/ai2_arc", "ARC-Easy")



README.md:   0%|          | 0.00/9.00k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


train-00000-of-00001.parquet:   0%|          | 0.00/190k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


test-00000-of-00001.parquet:   0%|          | 0.00/204k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


validation-00000-of-00001.parquet:   0%|          | 0.00/55.7k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1119 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1172 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/299 [00:00<?, ? examples/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


train-00000-of-00001.parquet:   0%|          | 0.00/331k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


test-00000-of-00001.parquet:   0%|          | 0.00/346k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


validation-00000-of-00001.parquet:   0%|          | 0.00/86.1k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/2251 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/2376 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/570 [00:00<?, ? examples/s]

In [12]:
!huggingface-cli login
llama_model_name = "meta-llama/Llama-3.2-3B-Instruct"
qwen_model_name = "Qwen/Qwen2.5-3B-Instruct"
qwen_model, qwen_tokenizer = load_model_qwen(qwen_model_name)
llama_pipe = load_model_llama(llama_model_name)


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    A token is already saved on your machine. Run `huggingface-cli whoami` to get more information or `huggingface-cli logout` if you want to log out.
    Setting a new token will erase the existing one.
    To log in, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): 
Add token as git credential? (Y/n) n
Token is valid (permission: read)

Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Device set to use cuda:0


In [13]:
# ≡≡ Run evaluation (takes time) ≡≡
scores_openbook = eval_three_stage(
    qwen_model,           # Qwen model obj
    qwen_tokenizer,       # Qwen tokenizer
    llama_pipe,           # LLaMA generation pipeline
    qwen_system_prompt,   # system prompt for Qwen
    llama_system_prompt,  # system prompt for LLaMA
    openbook_ds,          # dataset
    "question_stem",           # question field name
    start_q=0,            # first question index
    end_q=5   # run on the whole test split
)

# quick summary
import numpy as np
for k, v in scores_openbook.items():
    print(f"{k:12s}: {np.nanmean(v):.3f}")

base prompt:  
**Instruction**: You will be given a question and a number of choices for the correct answer. Your task is:
1. Choose the best answer from the choices.
2. Provide a short explanation for why each choice is correct or incorrect.
3. Output your final answer strictly in this JSON format:
{
  “answer”: “<Enter the label of the correct answer (A, B, C, or D)>”,
  “explanations”: {
  “A”: “”,
  “B”: “”,
  “C”: “”,
  “D”: “”
  }
}
**Important**:
- Begin the output with `{` and end with `}`.
- Do not include any additional characters or text outside of the JSON.
The question is: A person wants to start saving money so that they can afford a nice vacation at the end of the year. After looking over their budget and expenses, they decide the best way to save money is to
The choices (possible answers) are:
A) make more phone calls
B) quit eating lunch out
C) buy less with monopoly money
D) have lunch with friends


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


base prompt:  
**Instruction**: You will be given a question and a number of choices for the correct answer. Your task is:
1. Choose the best answer from the choices.
2. Provide a short explanation for why each choice is correct or incorrect.
3. Output your final answer strictly in this JSON format:
{
  “answer”: “<Enter the label of the correct answer (A, B, C, or D)>”,
  “explanations”: {
  “A”: “”,
  “B”: “”,
  “C”: “”,
  “D”: “”
  }
}
**Important**:
- Begin the output with `{` and end with `}`.
- Do not include any additional characters or text outside of the JSON.
The question is: There is most likely going to be fog around:
The choices (possible answers) are:
A) a marsh
B) a tundra
C) the plains
D) a desert


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


base prompt:  
**Instruction**: You will be given a question and a number of choices for the correct answer. Your task is:
1. Choose the best answer from the choices.
2. Provide a short explanation for why each choice is correct or incorrect.
3. Output your final answer strictly in this JSON format:
{
  “answer”: “<Enter the label of the correct answer (A, B, C, or D)>”,
  “explanations”: {
  “A”: “”,
  “B”: “”,
  “C”: “”,
  “D”: “”
  }
}
**Important**:
- Begin the output with `{` and end with `}`.
- Do not include any additional characters or text outside of the JSON.
The question is: Predators eat
The choices (possible answers) are:
A) lions
B) humans
C) bunnies
D) grass


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


base prompt:  
**Instruction**: You will be given a question and a number of choices for the correct answer. Your task is:
1. Choose the best answer from the choices.
2. Provide a short explanation for why each choice is correct or incorrect.
3. Output your final answer strictly in this JSON format:
{
  “answer”: “<Enter the label of the correct answer (A, B, C, or D)>”,
  “explanations”: {
  “A”: “”,
  “B”: “”,
  “C”: “”,
  “D”: “”
  }
}
**Important**:
- Begin the output with `{` and end with `}`.
- Do not include any additional characters or text outside of the JSON.
The question is: Oak tree seeds are planted and a sidewalk is paved right next to that spot, until eventually, the tree is tall and the roots must extend past the sidewalk, which means
The choices (possible answers) are:
A) roots may be split
B) roots may begin to die
C) parts may break the concrete
D) roots may fall apart


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


base prompt:  
**Instruction**: You will be given a question and a number of choices for the correct answer. Your task is:
1. Choose the best answer from the choices.
2. Provide a short explanation for why each choice is correct or incorrect.
3. Output your final answer strictly in this JSON format:
{
  “answer”: “<Enter the label of the correct answer (A, B, C, or D)>”,
  “explanations”: {
  “A”: “”,
  “B”: “”,
  “C”: “”,
  “D”: “”
  }
}
**Important**:
- Begin the output with `{` and end with `}`.
- Do not include any additional characters or text outside of the JSON.
The question is: An electric car runs on electricity via
The choices (possible answers) are:
A) gasoline
B) a power station
C) electrical conductors
D) fuel


You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


step1_qwen  : 0.800
step1_llama : 0.400
step2_qwen  : 0.800
step2_llama : 0.600
step3_qwen  : 0.600
step3_llama : 0.600


In [14]:
# ≡≡ Run evaluation (takes time) ≡≡
scores_arceasy = eval_three_stage(
    qwen_model,           # Qwen model obj
    qwen_tokenizer,       # Qwen tokenizer
    llama_pipe,           # LLaMA generation pipeline
    qwen_system_prompt,   # system prompt for Qwen
    llama_system_prompt,  # system prompt for LLaMA
    ai2_arc_easy,          # dataset
    "question",           # question field name
    start_q=0,            # first question index
    end_q=5   # run on the whole test split
)

# quick summary
import numpy as np
for k, v in scores_arceasy.items():
    print(f"{k:12s}: {np.nanmean(v):.3f}")

base prompt:  
**Instruction**: You will be given a question and a number of choices for the correct answer. Your task is:
1. Choose the best answer from the choices.
2. Provide a short explanation for why each choice is correct or incorrect.
3. Output your final answer strictly in this JSON format:
{
  “answer”: “<Enter the label of the correct answer (A, B, C, or D)>”,
  “explanations”: {
  “A”: “”,
  “B”: “”,
  “C”: “”,
  “D”: “”
  }
}
**Important**:
- Begin the output with `{` and end with `}`.
- Do not include any additional characters or text outside of the JSON.
The question is: Which statement best explains why photosynthesis is the foundation of most food webs?
The choices (possible answers) are:
A) Sunlight is the source of energy for nearly all ecosystems.
B) Most ecosystems are found on land instead of in water.
C) Carbon dioxide is more available than other gases.
D) The producers in all ecosystems are plants.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


base prompt:  
**Instruction**: You will be given a question and a number of choices for the correct answer. Your task is:
1. Choose the best answer from the choices.
2. Provide a short explanation for why each choice is correct or incorrect.
3. Output your final answer strictly in this JSON format:
{
  “answer”: “<Enter the label of the correct answer (A, B, C, or D)>”,
  “explanations”: {
  “A”: “”,
  “B”: “”,
  “C”: “”,
  “D”: “”
  }
}
**Important**:
- Begin the output with `{` and end with `}`.
- Do not include any additional characters or text outside of the JSON.
The question is: Which piece of safety equipment is used to keep mold spores from entering the respiratory system?
The choices (possible answers) are:
A) safety goggles
B) breathing mask
C) rubber gloves
D) lead apron


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


base prompt:  
**Instruction**: You will be given a question and a number of choices for the correct answer. Your task is:
1. Choose the best answer from the choices.
2. Provide a short explanation for why each choice is correct or incorrect.
3. Output your final answer strictly in this JSON format:
{
  “answer”: “<Enter the label of the correct answer (A, B, C, or D)>”,
  “explanations”: {
  “A”: “”,
  “B”: “”,
  “C”: “”,
  “D”: “”
  }
}
**Important**:
- Begin the output with `{` and end with `}`.
- Do not include any additional characters or text outside of the JSON.
The question is: Meiosis is a type of cell division in which germ cells divide to produce haploid cells. Where does meiosis occur?
The choices (possible answers) are:
A) brain cells
B) bone cells
C) muscle cells
D) ovary cells


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


base prompt:  
**Instruction**: You will be given a question and a number of choices for the correct answer. Your task is:
1. Choose the best answer from the choices.
2. Provide a short explanation for why each choice is correct or incorrect.
3. Output your final answer strictly in this JSON format:
{
  “answer”: “<Enter the label of the correct answer (A, B, C, or D)>”,
  “explanations”: {
  “A”: “”,
  “B”: “”,
  “C”: “”,
  “D”: “”
  }
}
**Important**:
- Begin the output with `{` and end with `}`.
- Do not include any additional characters or text outside of the JSON.
The question is: Which characteristic describes the texture of a kitten's fur?
The choices (possible answers) are:
A) gray
B) warm
C) long
D) soft


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


base prompt:  
**Instruction**: You will be given a question and a number of choices for the correct answer. Your task is:
1. Choose the best answer from the choices.
2. Provide a short explanation for why each choice is correct or incorrect.
3. Output your final answer strictly in this JSON format:
{
  “answer”: “<Enter the label of the correct answer (A, B, C, or D)>”,
  “explanations”: {
  “A”: “”,
  “B”: “”,
  “C”: “”,
  “D”: “”
  }
}
**Important**:
- Begin the output with `{` and end with `}`.
- Do not include any additional characters or text outside of the JSON.
The question is: Which best describes the structure of an atom?
The choices (possible answers) are:
A) a lightweight core surrounded by neutral particles
B) a massive core surrounded by negatively-charged particles
C) a network of interacting positive and negative particles
D) overlapping layers of neutral, positive, and negative particles


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


step1_qwen  : 0.800
step1_llama : 0.800
step2_qwen  : 0.800
step2_llama : 0.800
step3_qwen  : 0.800
step3_llama : 0.800


In [15]:
# ≡≡ Run evaluation (takes time) ≡≡
scores_arcchallenge = eval_three_stage(
    qwen_model,           # Qwen model obj
    qwen_tokenizer,       # Qwen tokenizer
    llama_pipe,           # LLaMA generation pipeline
    qwen_system_prompt,   # system prompt for Qwen
    llama_system_prompt,  # system prompt for LLaMA
    ai2_arc_ds,          # dataset
    "question",           # question field name
    start_q=0,            # first question index
    end_q=5   # run on the whole test split
)

# quick summary
import numpy as np
for k, v in scores_arcchallenge.items():
    print(f"{k:12s}: {np.nanmean(v):.3f}")

base prompt:  
**Instruction**: You will be given a question and a number of choices for the correct answer. Your task is:
1. Choose the best answer from the choices.
2. Provide a short explanation for why each choice is correct or incorrect.
3. Output your final answer strictly in this JSON format:
{
  “answer”: “<Enter the label of the correct answer (A, B, C, or D)>”,
  “explanations”: {
  “A”: “”,
  “B”: “”,
  “C”: “”,
  “D”: “”
  }
}
**Important**:
- Begin the output with `{` and end with `}`.
- Do not include any additional characters or text outside of the JSON.
The question is: An astronomer observes that a planet rotates faster after a meteorite impact. Which is the most likely effect of this increase in rotation?
The choices (possible answers) are:
A) Planetary density will decrease.
B) Planetary years will become longer.
C) Planetary days will become shorter.
D) Planetary gravity will become stronger.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


base prompt:  
**Instruction**: You will be given a question and a number of choices for the correct answer. Your task is:
1. Choose the best answer from the choices.
2. Provide a short explanation for why each choice is correct or incorrect.
3. Output your final answer strictly in this JSON format:
{
  “answer”: “<Enter the label of the correct answer (A, B, C, or D)>”,
  “explanations”: {
  “A”: “”,
  “B”: “”,
  “C”: “”,
  “D”: “”
  }
}
**Important**:
- Begin the output with `{` and end with `}`.
- Do not include any additional characters or text outside of the JSON.
The question is: A group of engineers wanted to know how different building designs would respond during an earthquake. They made several models of buildings and tested each for its ability to withstand earthquake conditions. Which will most likely result from testing different building designs?
The choices (possible answers) are:
A) buildings will be built faster
B) buildings will be made safer
C) building designs will 

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


base prompt:  
**Instruction**: You will be given a question and a number of choices for the correct answer. Your task is:
1. Choose the best answer from the choices.
2. Provide a short explanation for why each choice is correct or incorrect.
3. Output your final answer strictly in this JSON format:
{
  “answer”: “<Enter the label of the correct answer (A, B, C, or D)>”,
  “explanations”: {
  “A”: “”,
  “B”: “”,
  “C”: “”,
  “D”: “”
  }
}
**Important**:
- Begin the output with `{` and end with `}`.
- Do not include any additional characters or text outside of the JSON.
The question is: The end result in the process of photosynthesis is the production of sugar and oxygen. Which step signals the beginning of photosynthesis?
The choices (possible answers) are:
A) Chemical energy is absorbed through the roots.
B) Light energy is converted to chemical energy.
C) Chlorophyll in the leaf captures light energy.
D) Sunlight is converted into chlorophyll.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


base prompt:  
**Instruction**: You will be given a question and a number of choices for the correct answer. Your task is:
1. Choose the best answer from the choices.
2. Provide a short explanation for why each choice is correct or incorrect.
3. Output your final answer strictly in this JSON format:
{
  “answer”: “<Enter the label of the correct answer (A, B, C, or D)>”,
  “explanations”: {
  “A”: “”,
  “B”: “”,
  “C”: “”,
  “D”: “”
  }
}
**Important**:
- Begin the output with `{` and end with `}`.
- Do not include any additional characters or text outside of the JSON.
The question is: A physicist wants to determine the speed a car must reach to jump over a ramp. The physicist conducts three trials. In trials two and three, the speed of the car is increased by 20 miles per hour. What is the physicist investigating when he changes the speed?
The choices (possible answers) are:
A) the control
B) the hypothesis statement
C) the dependent (responding) variable
D) the independent (manipulat

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


base prompt:  
**Instruction**: You will be given a question and a number of choices for the correct answer. Your task is:
1. Choose the best answer from the choices.
2. Provide a short explanation for why each choice is correct or incorrect.
3. Output your final answer strictly in this JSON format:
{
  “answer”: “<Enter the label of the correct answer (A, B, C, or D)>”,
  “explanations”: {
  “A”: “”,
  “B”: “”,
  “C”: “”,
  “D”: “”
  }
}
**Important**:
- Begin the output with `{` and end with `}`.
- Do not include any additional characters or text outside of the JSON.
The question is: An astronaut drops a 1.0 kg object and a 5.0 kg object on the Moon. Both objects fall a total distance of 2.0 m vertically. Which of the following best describes the objects after they have fallen a distance of 1.0 m?
The choices (possible answers) are:
A) They have each lost kinetic energy.
B) They have each gained the same amount of potential energy.
C) They have each lost the same amount of potential 

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


step1_qwen  : 1.000
step1_llama : 0.800
step2_qwen  : 0.800
step2_llama : 0.800
step3_qwen  : 0.800
step3_llama : 0.800
