In [10]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from datasets import load_dataset
import random
import re
from tqdm import tqdm
import json

In [11]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# Set custom cache directory
cache_dir = "/mnt/SSD4/kartik/hf_cache"

# Load tokenizer from custom cache directory
tokenizer = AutoTokenizer.from_pretrained(
    "Qwen/Qwen3-4B",
    cache_dir=cache_dir
)

# Load model from custom cache directory
model = AutoModelForCausalLM.from_pretrained(
    "Qwen/Qwen3-4B",
    cache_dir=cache_dir,
    torch_dtype=torch.float16,
    trust_remote_code=True,
    device_map="auto"
)

print(f"Model and tokenizer loaded using cache at: {cache_dir}")


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/726 [00:00<?, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 3 files:   0%|          | 0/3 [00:00<?, ?it/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/3.99G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/99.6M [00:00<?, ?B/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/3.96G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Model and tokenizer loaded using cache at: /mnt/SSD4/kartik/hf_cache


In [12]:
model.device

device(type='cuda', index=0)

In [13]:
QWEN_CHAT_TEMPLATE = "<|im_start|>user\n{prompt}\n<|im_end|>\n<|im_start|>assistant\n"

In [19]:
QWEN_CHAT_TEMPLATE_NO = "<|im_start|>user\n{prompt}\n<|im_end|>\n<|im_start|>assistant\n<think>\n\n</think>\n\n"

In [14]:
def format_prompt(prompt):
    return QWEN_CHAT_TEMPLATE.format(prompt=prompt)

def tokenize_input(text):
    """Tokenize the input text and return input_ids and attention_mask."""
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    return inputs

def generate_response(formatted_prompt, max_new_tokens=1000):
    inputs = tokenize_input(formatted_prompt).to(model.device)
    
    with torch.no_grad():
        output_ids = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            eos_token_id=tokenizer.eos_token_id,
        )
    response = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    return response

In [15]:
def extract_final_answer(text):
    # Try strict match first: 'Answer: <number>'
    match = re.search(r'Answer:\s*\$?(\d+(?:\.\d+)?)', text, re.IGNORECASE)
    if match:
        return int(float(match.group(1)))

    # Try to extract from LaTeX-style boxed answer
    match = re.search(r'\\boxed{?\$?(\d+(?:\.\d+)?)', text)
    if match:
        return int(float(match.group(1)))

    # As fallback, look for last number in text
    all_nums = re.findall(r'\d+(?:\.\d+)?', text)
    if all_nums:
        return int(float(all_nums[-1]))

    return None

In [16]:
dataset = load_dataset("gsm8k", "main")

In [17]:
n = 30
random.seed(0)
indices = random.sample(range(len(dataset['train'])), n)
qa_pairs = dataset['train'][indices]
questions = qa_pairs['question']
answers = qa_pairs['answer']

import re
final_answers = []
for answer in answers:
    numbers = re.findall(r'\d+', answer)
    final_answers.append(int(numbers[-1]) if numbers else '')

In [26]:
instruction = (
    "Give ONLY the final numeric answer. Do NOT explain or show your work. "
    "Write nothing except the answer.\n\n"
    "Answer:"
)

def instructions(thinking=True):
    nothink_token =  "/nothink"
    if thinking:
        return instruction
    else:
        return instruction + nothink_token

In [27]:
results = []
max_new_tokens = 500
for i, question in enumerate(tqdm(questions[:1])):
    result = {
        "question": question,
    }

    # Thinking mode
    prompt_think = question + instructions(thinking=True)
    formatted_think = format_prompt(prompt_think)
    response_think = generate_response(formatted_think, max_new_tokens=max_new_tokens)
    result["response_thinking"] = response_think
    
    # Nothinking mode
    prompt_nothink = question + instructions(thinking=False)
    formatted_nothink = format_prompt(prompt_nothink)
    response_nothink = generate_response(formatted_nothink, max_new_tokens=max_new_tokens)
    result["response_nothinking"] = response_nothink

    result["predicted_thinking"] = extract_final_answer(response_think)
    result["predicted_nothinking"] = extract_final_answer(response_nothink)
    result['answer'] = final_answers[i]

    results.append(result)

    print(result)

100%|██████████| 1/1 [00:14<00:00, 14.34s/it]

{'question': 'The state of Virginia had 3.79 inches of rain in March, 4.5 inches of rain in April, 3.95 inches of rain in May, 3.09 inches of rain in June and 4.67 inches in July.  What is the average rainfall amount, in inches, in Virginia?', 'response_thinking': "user\nThe state of Virginia had 3.79 inches of rain in March, 4.5 inches of rain in April, 3.95 inches of rain in May, 3.09 inches of rain in June and 4.67 inches in July.  What is the average rainfall amount, in inches, in Virginia?Give ONLY the final numeric answer. Do NOT explain or show your work. Write nothing except the answer.\n\nAnswer:\n\nassistant\n<think>\nOkay, let me try to figure out the average rainfall for Virginia. So, they gave me the rainfall amounts for each month from March to July. Let me list them out again to make sure I have them all:\n\nMarch: 3.79 inches  \nApril: 4.5 inches  \nMay: 3.95 inches  \nJune: 3.09 inches  \nJuly: 4.67 inches  \n\nFirst, I need to find the total rainfall for these five mo




In [3]:
results = json.load(open("qwen3_concise.json"))

In [7]:
think = 0
nothink = 0
for i in range(len(results)):
    if results[i]['predicted_thinking'] == results[i]['answer']:
        think += 1
    if results[i]['predicted_nothinking'] == results[i]['answer']:
        nothink += 1

print(f"Thinking: {think}/{len(results)}, Nothinking: {nothink}/{len(results)}")
print(f"Thinking accuracy: {think / len(results)}")
print(f"Nothinking accuracy: {nothink / len(results)}")

Thinking: 19/20, Nothinking: 14/20
Thinking accuracy: 0.95
Nothinking accuracy: 0.7


In [2]:
import json

In [4]:
force_no = json.load(open("qwen3_concise_no.json"))
correct = 0
for i in range(len(force_no)):
    if force_no[i]['predicted_nothinking'] == force_no[i]['answer']:
        correct += 1
print(f"Correct: {correct}/{len(force_no)}")
print(f"Accuracy: {correct / len(force_no)}")

Correct: 14/20
Accuracy: 0.7


In [9]:
import json

forced = json.load(open('qwen3_forced_answer.json'))[:20]
think = 0
nothink = 0

def is_correct(pred, target):
    if pred is None:
        return False
    try:
        pred_val = float(pred)
        return pred_val == int(target)
    except:
        return False

for i in range(len(forced)):
    if is_correct(forced[i]['predicted_thinking'], forced[i]['answer']):
        think += 1
    if is_correct(forced[i]['predicted_nothinking'], forced[i]['answer']):
        nothink += 1

# print(f"Think: {think}, Nothink: {nothink}")
tot = len(forced)
print(f"Think: {think}/ {tot}, Nothink: {nothink}/ {tot}")

Think: 16/ 20, Nothink: 11/ 20
