In [1]:
import random
import torch
import torch.nn.functional as F
from torch import optim
from transformers import AutoTokenizer, AutoModelForCausalLM
from datasets import load_dataset
from tqdm import tqdm
import re

TEMP = 0.7
MAX_NEW_TOKENS = 200
BATCH_SIZE = 4          # ← pick a batch size that fits in your GPU memory
device = "cuda" if torch.cuda.is_available() else "cpu"

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
NAME = "Qwen2.5-1.5B-Instruct"

#Qwen/Qwen3-1.7B
MODEL_NAME = f"Qwen/{NAME}"

# NAME = "Llama-3.2-1B"
# MODEL_NAME = f"meta-llama/{NAME}"

# tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-3B-Instruct")
# model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-3.2-3B-Instruct")

# tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# model = AutoModelForCausalLM.from_pretrained(MODEL_NAME).to(device)

# # tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# # model = AutoModelForCausalLM.from_pretrained(MODEL_NAME).to(device)
# if tokenizer.pad_token is None:
#     tokenizer.pad_token = tokenizer.eos_token
#     model.config.pad_token_id = model.config.eos_token_id
# tokenizer.padding_side = "left"


from transformers import AutoTokenizer, AutoModelForCausalLM

NAME = "Llama-3.2-3B-Instruct"
MODEL_NAME = f"meta-llama/{NAME}"


# Set your Hugging Face token
hf_token = "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"

# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_auth_token=hf_token)
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, use_auth_token=hf_token)

device = "cuda" if torch.cuda.is_available() else "cpu"
model = model.to(device)

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.09it/s]


In [3]:
class CommonsenseQAParser:
    def __init__(self, tokenizer):
        self.tokenizer = tokenizer
        self.system_prompt = """You are an expert at applying commonsense reasoning to answer multiple-choice questions. You will be given a question with multiple answer choices, and you will be tasked with providing a brief rationale for your answer, followed by the correct answer choice. For example:
        
Q: What do people use to absorb extra ink from a fountain pen?
Answer Choices:
(a) shirt pocket
(b) calligrapher's hand
(c) inkwell
(d) desk drawer
(e) blotter
A: The answer must be used to absorb extra ink. Blotters are designed to absorb liquids. Therefore, the answer is blotter (e).

Q: What home entertainment equipment requires cable?
Answer Choices:
(a) radio shack
(b) substation
(c) television
(d) cabinet
(e) desk
A: The answer must require cable. Cable is used to provide satellite channels to televisions. Therefore, the answer is television (c).

Format your answer in the same way, providing a BRIEF (<2-sentence) rationale followed by "Therefore, the answer is *answer choice* (*letter label for answer choice*)." Do not use any other format. If you are unsure, choose the most likely answer based on your reasoning.
        """

        # self.system_prompt = """You are an expert at applying commonsense reasoning to answer multiple-choice questions. You will be given a question with multiple answer choices, and you will be tasked with providing a brief rationale for your answer, followed by the correct answer choice. """

    def format_question(self, question_data):
        q = question_data['question']
        choices = "".join(f"({lbl.lower()}) {txt}\n"
                          for lbl, txt in zip(
                              question_data['choices']['label'],
                              question_data['choices']['text']
                          ))
        return f"Q: {q}\nAnswer Choices:\n{choices.strip()}\nA: "

    def format_prompt(self, question_data):
        messages = [
            {"role": "system",  "content": self.system_prompt},
            {"role": "user",    "content": self.format_question(question_data)}
        ]
        # `apply_chat_template` returns the tokenized prompt string + the raw question text.
        return tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=False,
            enable_thinking=False
        ), messages[-1]['content']

    def parse_llm_output(self, generated_text):
        rationale = generated_text.removeprefix("</think>").strip()
        matches = re.findall(r"\(([a-e])\)", generated_text, re.IGNORECASE)
        return rationale, (matches[-1].lower() if matches else None)

In [4]:
# dataset = load_dataset("commonsense_qa", split="validation")
import json
import os 

VAL_SIZE=150
dataset = load_dataset("commonsense_qa", split=f"validation[:{VAL_SIZE}]")

parser = CommonsenseQAParser(tokenizer)

correct = 0
total = 0
logging = []

for example in tqdm(dataset):
    # Format the prompt for this question
    prompt, _ = parser.format_prompt(example)

    # Tokenize input
    inputs = tokenizer(prompt, return_tensors="pt", return_attention_mask=True, truncation=True).to(device)

    # Generate response
    output_ids = model.generate(
        **inputs,
        do_sample=True,
        temperature=TEMP,
        max_new_tokens=MAX_NEW_TOKENS,
        pad_token_id=tokenizer.pad_token_id
    )

    generated_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)

    # Parse predicted answer
    rationale, predicted_choice = parser.parse_llm_output(generated_text)
    correct_answer = example["answerKey"].lower()
    if predicted_choice == correct_answer:
        score = 1
    else:
        score = 0
    log = {
        "prompt": prompt,
        "score": score,
        "rationale": rationale,
        "predicted_choice": predicted_choice,
        "correct_answer": correct_answer,
        "generated_text": generated_text,
    }
    logging.append(log)
    with open(f"{NAME}_baseline_commonsenseqa.jsonl", "a") as f:
        json.dump(log, f, indent=2)
        f.write("\n")
    # filename = "Qwen3_1_7B_baseline_commonsenseqa.json"
    # with open(filename, "a") as f:
    #     json.dump(log, f, indent=2)
    #     f.write(",\n")  # separate entries with commas


    # Compare prediction to gold
    if predicted_choice == correct_answer:
        correct += 1
    total += 1

# Final accuracy
accuracy = correct / total
print(f"Accuracy: {accuracy:.2%}")


  0%|          | 0/150 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
  1%|          | 1/150 [00:01<03:41,  1.49s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
  1%|▏         | 2/150 [00:02<02:19,  1.06it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
  2%|▏         | 3/150 [00:02<01:44,  1.40it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
  3%|▎         | 4/150 [00:03<01:38,  1.48it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
  3%|▎         | 5/150 [00:03<01:37,  1.49it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
  4%|▍         | 6/150 [00:04<01:34,  1.52it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
  5%|▍         | 7/150 [00:05<01:45,  1.36it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
  5%|▌         | 8/150 [00:06<01:46,  1.33it/s]S

Accuracy: 66.67%



