In [29]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, GenerationConfig
from datasets import load_dataset
import random
import re
from tqdm import tqdm
import json

In [16]:
# Load the tokenizer and the fp16 causal-LM weights from the same checkpoint id.
MODEL_NAME = "Qwen/Qwen3-1.7B"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    torch_dtype=torch.float16,
    device_map="auto",
    trust_remote_code=True,
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [18]:
# Sanity check: show which device the model was placed on by device_map="auto".
model.device

device(type='cuda', index=0)

In [19]:
# Single-turn ChatML prompt: one user message followed by the opening of the
# assistant turn, so generation continues as the assistant.
# The user content is closed by <|im_end|> directly (no newline in between),
# which is the layout Qwen's own chat template is documented to produce; a
# stray "\n" before <|im_end|> would feed the model a prompt it was not
# trained on.
QWEN_CHAT_TEMPLATE = "<|im_start|>user\n{prompt}<|im_end|>\n<|im_start|>assistant\n"

In [23]:
def format_prompt(prompt, apply=True):
    """Wrap ``prompt`` in the Qwen chat template.

    Args:
        prompt: Raw user text.
        apply: When falsy, the prompt is returned untouched instead of being
            inserted into ``QWEN_CHAT_TEMPLATE``.

    Returns:
        The templated prompt string, or ``prompt`` itself when ``apply`` is falsy.
    """
    if not apply:
        return prompt
    return QWEN_CHAT_TEMPLATE.format(prompt=prompt)

def tokenize_input(text):
    """Run the notebook-level ``tokenizer`` on ``text``.

    The tokenizer is called with PyTorch tensor output (``return_tensors="pt"``)
    and with padding and truncation enabled. The result is returned as-is; it is
    expected to carry ``input_ids`` and ``attention_mask``.
    """
    return tokenizer(
        text,
        return_tensors="pt",
        padding=True,
        truncation=True,
    )

def generate_response(formatted_prompt, max_new_tokens=1000):
    """Generate a completion for an already-formatted prompt.

    Args:
        formatted_prompt: Prompt text, typically the output of ``format_prompt``.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        The decoded text of the newly generated tokens only (special tokens
        stripped). The prompt is not echoed back, so downstream answer
        extraction cannot pick up numbers that belong to the question.
    """
    inputs = tokenize_input(formatted_prompt).to(model.device)
    # Remember how many tokens belong to the prompt so they can be cut off below.
    prompt_length = inputs["input_ids"].shape[1]

    with torch.no_grad():
        output_ids = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            eos_token_id=tokenizer.eos_token_id,
        )

    # For a causal LM, generate() returns prompt tokens followed by the
    # continuation; keep only the continuation.
    new_token_ids = output_ids[0][prompt_length:]
    return tokenizer.decode(new_token_ids, skip_special_tokens=True)

In [42]:
# A non-negative number: either comma-grouped thousands ("1,234,567") or a plain
# digit run, optionally followed by a decimal part. The (?!\d) guard stops
# "1,2345" from being read as "1,234".
_NUMBER_PATTERN = r'(?:\d{1,3}(?:,\d{3})+(?!\d)|\d+)(?:\.\d+)?'


def _number_to_int(number_text):
    """Convert a matched number string to int (commas dropped, decimals truncated)."""
    return int(float(number_text.replace(',', '')))


def extract_final_answer(text):
    """Pull the final numeric answer out of a model response.

    Strategies are tried in order, and the first one that matches wins:
      1. ``Answer: <number>`` (case-insensitive, optional leading ``$``),
      2. a LaTeX ``\\boxed{<number>}`` expression,
      3. the last number appearing anywhere in the text.

    Thousands separators are understood, so ``"1,000"`` is read as 1000
    rather than 1 (strict match) or 0 (last-number fallback).

    Args:
        text: Model output to scan.

    Returns:
        The answer as an ``int`` (decimal values are truncated), or ``None``
        when the text contains no number at all.
    """
    # Try strict match first: 'Answer: <number>'
    match = re.search(r'Answer:\s*\$?(' + _NUMBER_PATTERN + r')', text, re.IGNORECASE)
    if match:
        return _number_to_int(match.group(1))

    # Try to extract from LaTeX-style boxed answer
    match = re.search(r'\\boxed\{?\$?(' + _NUMBER_PATTERN + r')', text)
    if match:
        return _number_to_int(match.group(1))

    # As fallback, look for last number in text
    all_nums = re.findall(_NUMBER_PATTERN, text)
    if all_nums:
        return _number_to_int(all_nums[-1])

    return None

In [25]:
# Load the "main" configuration of GSM8K; later cells read the 'train' split's
# 'question' and 'answer' columns.
dataset = load_dataset("gsm8k", "main")

In [47]:
# Draw a fixed, reproducible sample of training problems.
n = 30
random.seed(0)
indices = random.sample(range(len(dataset['train'])), n)
qa_pairs = dataset['train'][indices]
questions = qa_pairs['question']
answers = qa_pairs['answer']

# Gold answers. GSM8K solutions are expected to end with "#### <final answer>",
# and the final answer may contain thousands separators ("#### 1,000"), so read
# the number after the marker and strip commas. If the marker is missing, fall
# back to the last digit run in the solution text ('' when there is none).
final_answers = []
for answer in answers:
    marker = re.search(r'####\s*(\d[\d,]*)', answer)
    if marker:
        final_answers.append(int(marker.group(1).replace(',', '')))
    else:
        numbers = re.findall(r'\d+', answer)
        final_answers.append(int(numbers[-1]) if numbers else '')

In [48]:
# instruction = (
#     "Give your reasoning step by step in detail. "
#     "Then, on the **last line**, write the **final numeric answer** "
#     "**only once**, prefixed exactly by `Answer:` with no symbols, no LaTeX, no dollar signs, "
#     "**just the number**, and **nothing after it**. Do not use \\boxed or any formatting.\n\n"
#     "Example:\nAnswer:\n"
# )

def instructions(thinking=True):
    """Return the suffix appended to a question to select Qwen3's thinking mode.

    Args:
        thinking: When true, nothing but a single space is appended, leaving
            the model in its default mode. When false, the ``/no_think`` soft
            switch is appended so the model skips its reasoning block.

    Returns:
        The suffix string, always starting with a space so it is separated
        from the question text.
    """
    # Qwen3 documents the soft switches as "/think" and "/no_think" (with the
    # underscore); other spellings are not the documented switch.
    nothink_token = "/no_think"
    if thinking:
        return " "
    return " " + nothink_token

In [None]:
# Run every sampled question through the model twice (thinking first, then
# non-thinking), record both raw responses and the parsed answers, and dump
# everything to JSON once the loop is done.
results = []
max_new_tokens = 10000
for idx, question in enumerate(tqdm(questions)):
    responses = {}
    for mode, use_thinking in (("thinking", True), ("nothinking", False)):
        formatted = format_prompt(question + instructions(thinking=use_thinking))
        responses[mode] = generate_response(formatted, max_new_tokens=max_new_tokens)

    # Key order matters for the layout of the JSON file, so build it explicitly.
    results.append({
        "question": question,
        "response_thinking": responses["thinking"],
        "response_nothinking": responses["nothinking"],
        "predicted_thinking": extract_final_answer(responses["thinking"]),
        "predicted_nothinking": extract_final_answer(responses["nothinking"]),
        "answer": final_answers[idx],
    })

with open("qwen3_thinking_vs_nothinking.json", "w") as out_file:
    json.dump(results, out_file, indent=2)

100%|██████████| 20/20 [12:17<00:00, 36.86s/it]
