In [78]:
filename = "/home/rishabhtiwari/repos/01_META_REASONING_MOE/RSD/external/qwen25_math_evaluation/outputs/fsx-project/rishabhtiwari/hf_cache/Qwen--Qwen3-30B-A3B/math_eval/math500/test_qwen25-math-cot_-1_seed0_t0.0_top_k8_enable_thinkingFalse_s0_e-1.jsonl"

import json
from transformers import AutoTokenizer

def load_jsonl(filename):
    """Read a JSON Lines file into a list of parsed records.

    Args:
        filename: Path of the file to read; every non-blank line must hold one JSON value.

    Returns:
        list: One parsed object per non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Decode explicitly as UTF-8 instead of relying on the platform's default encoding.
    with open(filename, 'r', encoding='utf-8') as f:
        # Blank / whitespace-only lines carry no record and would make json.loads raise,
        # so they are skipped rather than aborting the whole load.
        return [json.loads(line) for line in f if line.strip()]
    
jsonl_data = load_jsonl(filename)

# Pieces of the chat-markup prompt. `input_template` is filled with `.format(input=...)`;
# the doubled braces in `\\boxed{{}}` come out of `.format` as a literal `{}`.
input_template = (
    "<|im_start|>system\nPlease reason step by step, and put your final answer within \\boxed{{}}.<|im_end|>\n"
    "<|im_start|>user\n{input}<|im_end|>\n"
    "<|im_start|>assistant\n"
)
# Placeholder for the model's answer, filled with `.format(output=...)`.
output_template = "{output}"
# Third element of the template triple (a blank-line separator).
splitter = "\n\n"

# Same (input, output, splitter) triple, kept under its original name.
prompt_temp = (input_template, output_template, splitter)
# Tokenizer loaded from the same local checkpoint path that the vLLM cell uses; here it is
# only used to count how many tokens the question-only prompt occupies.
tokenizer = AutoTokenizer.from_pretrained("/tmp/hf_cache/Qwen--Qwen3-30B-A3B")

prompts = []               # question-only prompt text, one per example
prompts_with_preds = []    # the same prompt followed by the stored prediction
lengths_of_questions = []  # token count of each question-only prompt

for i, example in enumerate(jsonl_data):
    question = example["question"].strip()
    # presumably `code[0]` is the model's generated answer for this example -- TODO confirm
    prediction = example["code"][0].strip()

    # The assistant turn is opened with an empty <think> block before the answer.
    full_prompt = (
        input_template.format(input=question) + "<think>\n\n</think>\n\n"
    ).strip(" ")
    full_prompt_with_pred = (
        full_prompt + output_template.format(output=prediction)
    ).strip(" ")

    prompts.append(full_prompt)
    prompts_with_preds.append(full_prompt_with_pred)
    # NOTE(review): this count is later used as a slice offset into the log-probs of the
    # prompt+prediction text; that assumes tokenizing the prefix alone yields the same tokens
    # as the prefix of the full text -- confirm.
    lengths_of_questions.append(len(tokenizer.encode(full_prompt)))

print(lengths_of_questions)


[79, 142, 79, 46, 371, 73, 51, 98, 66, 181, 57, 91, 103, 207, 145, 210, 55, 243, 182, 79, 44, 61, 159, 60, 135, 111, 84, 113, 67, 110, 55, 42, 83, 80, 65, 73, 61, 66, 74, 59, 143, 82, 71, 117, 96, 69, 76, 83, 51, 58, 132, 53, 62, 59, 45, 85, 58, 86, 48, 81, 82, 101, 137, 121, 97, 52, 70, 82, 98, 88, 53, 59, 66, 103, 59, 70, 71, 51, 101, 54, 78, 81, 119, 55, 88, 93, 70, 55, 119, 57, 75, 75, 72, 75, 380, 111, 140, 77, 50, 128, 233, 223, 47, 138, 112, 347, 82, 64, 117, 173, 149, 84, 108, 62, 107, 50, 57, 51, 57, 185, 85, 53, 103, 281, 64, 61, 105, 46, 56, 87, 71, 68, 79, 72, 148, 102, 133, 53, 113, 82, 64, 92, 92, 111, 81, 90, 50, 80, 63, 90, 256, 206, 113, 119, 176, 86, 226, 78, 55, 83, 55, 40, 83, 103, 54, 107, 236, 109, 699, 58, 75, 81, 62, 87, 64, 74, 54, 283, 56, 133, 52, 64, 65, 125, 86, 49, 66, 64, 255, 492, 62, 74, 71, 86, 68, 81, 55, 97, 61, 62, 65, 106, 110, 89, 119, 180, 122, 63, 54, 70, 235, 60, 97, 100, 123, 64, 85, 51, 66, 732, 113, 61, 169, 65, 94, 68, 59, 93, 147, 112, 114

In [79]:
def calculate_nlls(responses, lengths_of_questions):
    """Compute the mean per-token negative log-likelihood of each response's continuation.

    Args:
        responses: Sequence of objects exposing ``prompt_logprobs``: a list with one entry
            per prompt position, each entry either ``None`` or a mapping whose values
            carry a ``.logprob`` attribute.
        lengths_of_questions: ``lengths_of_questions[i]`` is the number of leading prompt
            positions of ``responses[i]`` to leave out of the average.

    Returns:
        list[float]: One value per response: the negated mean of the selected log-probs.
        ``float("nan")`` is returned for a response with nothing left to score (offset at
        or past the end of the prompt), instead of raising ``ZeroDivisionError``.

    Raises:
        IndexError: If ``lengths_of_questions`` is shorter than ``responses``.
    """
    nlls = []
    for i, response in enumerate(responses):
        continuation = response.prompt_logprobs[lengths_of_questions[i]:]
        # Entries can be None (position 0 has no preceding context to be scored against);
        # skip those and empty mappings instead of crashing on them.
        # NOTE(review): the first value of each mapping is taken to be the log-prob of the
        # actual prompt token -- confirm if prompt_logprobs > 0 is ever requested.
        valid_logprobs = [next(iter(lp.values())).logprob for lp in continuation if lp]
        if valid_logprobs:
            nll = -sum(valid_logprobs) / len(valid_logprobs)
        else:
            # Nothing to average: the NLL is undefined for this response.
            nll = float("nan")
        nlls.append(nll)
    return nlls

In [80]:
import gc
import os

# Environment must be configured before vLLM is imported and before any engine process is
# forked: pin the visible GPU, and set TOKENIZERS_PARALLELISM so the HF tokenizers
# "process just got forked" warning is not triggered (setdefault keeps a user-chosen value).
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
os.environ.setdefault("TOKENIZERS_PARALLELISM", "false")

import numpy as np
import torch
from vllm import LLM, SamplingParams

# Loop-invariant scoring configuration: greedy, a single sample, one generated token, and
# prompt_logprobs=0 so each prompt token's own log-prob is returned.
# `n` is the SamplingParams field for the number of samples (`n_sampling` is not one).
sampling_params = SamplingParams(
    temperature=0.0,
    top_p=1,
    n=1,
    max_tokens=1,
    prompt_logprobs=0,
)

# Drop any engine left over from an earlier run of this cell.
llm = None
for num_experts in [1, 2, 4, 6, 8, 10, 12, 14, 16, 20, 22, 24]:
    # Release the previous engine BEFORE building the next one. Every engine asks for 95% of
    # the GPU, and a previous run of this cell failed with "EngineCore failed to start" --
    # most likely because the prior engine still held that memory (root cause is truncated
    # in the captured log, so treat this as the probable explanation).
    llm = None
    gc.collect()
    torch.cuda.empty_cache()

    llm = LLM(
        model="/tmp/hf_cache/Qwen--Qwen3-30B-A3B",
        tensor_parallel_size=1,
        pipeline_parallel_size=1,
        trust_remote_code=True,
        # Number of experts routed per token for this run of the sweep.
        hf_overrides={"num_experts_per_tok": num_experts},
        max_model_len=4096,
        gpu_memory_utilization=0.95,
        dtype="bfloat16",
    )
    responses = llm.generate(prompts_with_preds, sampling_params)
    nlls = calculate_nlls(responses, lengths_of_questions)
    print(num_experts, np.mean(nlls))

# The engine of the last iteration is intentionally left bound to `llm` for later cells.



INFO 06-06 23:16:30 [config.py:520] Overriding HF config with {'num_experts_per_tok': 4}
INFO 06-06 23:16:30 [config.py:793] This model supports multiple tasks: {'generate', 'embed', 'reward', 'classify', 'score'}. Defaulting to 'generate'.
INFO 06-06 23:16:30 [config.py:2118] Chunked prefill is enabled with max_num_batched_tokens=16384.
INFO 06-06 23:16:31 [core.py:438] Waiting for init message from front-end.
INFO 06-06 23:16:31 [core.py:65] Initializing a V1 LLM engine (v0.9.0.1) with config: model='/tmp/hf_cache/Qwen--Qwen3-30B-A3B', speculative_config=None, tokenizer='/tmp/hf_cache/Qwen--Qwen3-30B-A3B', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config={}, tokenizer_revision=None, trust_remote_code=True, dtype=torch.bfloat16, max_seq_len=4096, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto,  device_config=cuda, dec

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
2025-06-06 23:16:32,554 - INFO - flashinfer.jit: Prebuilt kernels not found, using JIT backend


INFO 06-06 23:16:35 [parallel_state.py:1064] rank 0 in world size 1 is assigned as DP rank 0, PP rank 0, TP rank 0, EP rank 0
INFO 06-06 23:16:35 [topk_topp_sampler.py:48] Using FlashInfer for top-p & top-k sampling.
INFO 06-06 23:16:36 [gpu_model_runner.py:1531] Starting to load model /tmp/hf_cache/Qwen--Qwen3-30B-A3B...
INFO 06-06 23:16:36 [cuda.py:217] Using Flash Attention backend on V1 engine.
ERROR 06-06 23:16:37 [core.py:500] EngineCore failed to start.
ERROR 06-06 23:16:37 [core.py:500] Traceback (most recent call last):
ERROR 06-06 23:16:37 [core.py:500]   File "/home/rishabhtiwari/miniconda3/envs/reasoning/lib/python3.10/site-packages/vllm/v1/engine/core.py", line 491, in run_engine_core
ERROR 06-06 23:16:37 [core.py:500]     engine_core = EngineCoreProc(*args, **kwargs)
ERROR 06-06 23:16:37 [core.py:500]   File "/home/rishabhtiwari/miniconda3/envs/reasoning/lib/python3.10/site-packages/vllm/v1/engine/core.py", line 390, in __init__
ERROR 06-06 23:16:37 [core.py:500]     supe

Process EngineCore_0:
Traceback (most recent call last):
  File "/home/rishabhtiwari/miniconda3/envs/reasoning/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
  File "/home/rishabhtiwari/miniconda3/envs/reasoning/lib/python3.10/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/home/rishabhtiwari/miniconda3/envs/reasoning/lib/python3.10/site-packages/vllm/v1/engine/core.py", line 504, in run_engine_core
    raise e
  File "/home/rishabhtiwari/miniconda3/envs/reasoning/lib/python3.10/site-packages/vllm/v1/engine/core.py", line 491, in run_engine_core
    engine_core = EngineCoreProc(*args, **kwargs)
  File "/home/rishabhtiwari/miniconda3/envs/reasoning/lib/python3.10/site-packages/vllm/v1/engine/core.py", line 390, in __init__
    super().__init__(vllm_config, executor_class, log_stats,
  File "/home/rishabhtiwari/miniconda3/envs/reasoning/lib/python3.10/site-packages/vllm/v1/engine/core.py", line 71, i

RuntimeError: Engine core initialization failed. See root cause above. Failed core proc(s): {}

In [55]:
# NOTE(review): this edits the stored config of an already-constructed engine. Nothing in
# this notebook shows that the loaded model re-reads `hf_overrides`, so confirm the change
# actually takes effect (otherwise rebuild the engine with
# `hf_overrides={"num_experts_per_tok": 8}`).
llm.llm_engine.vllm_config.model_config.hf_overrides["num_experts_per_tok"]=8

In [56]:
from vllm import LLM, SamplingParams

# Scoring-only request settings: greedy decoding, a single generated token, and
# prompt_logprobs=0 (the output of this cell shows one single-entry log-prob dict per prompt
# token, with None for the first token).
sampling_params = SamplingParams(temperature=0.0, max_tokens=1, prompt_logprobs=0)

# Smoke test on one short prompt; the returned RequestOutput list is the cell's display value.
llm.generate(
    ["What is the capital of France?"],
    sampling_params,
)

Adding requests: 100%|██████████| 1/1 [00:00<00:00, 19.41it/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00,  2.35it/s, est. speed input: 16.45 toks/s, output: 2.35 toks/s]


[RequestOutput(request_id=32, prompt='What is the capital of France?', prompt_token_ids=[3838, 374, 279, 6722, 315, 9625, 30], encoder_prompt=None, encoder_prompt_token_ids=None, prompt_logprobs=[None, {374: Logprob(logprob=-0.7784348130226135, rank=1, decoded_token='Ġis')}, {279: Logprob(logprob=-0.22553013265132904, rank=1, decoded_token='Ġthe')}, {6722: Logprob(logprob=-9.312824249267578, rank=517, decoded_token='Ġcapital')}, {315: Logprob(logprob=-0.40606948733329773, rank=1, decoded_token='Ġof')}, {9625: Logprob(logprob=-3.830490827560425, rank=4, decoded_token='ĠFrance')}, {30: Logprob(logprob=-2.4689741134643555, rank=3, decoded_token='?')}], outputs=[CompletionOutput(index=0, text=' ', token_ids=[220], cumulative_logprob=None, logprobs=None, finish_reason=length, stop_reason=None)], finished=True, metrics=None, lora_request=None, num_cached_tokens=0, multi_modal_placeholders={})]

In [62]:
filename = "/home/rishabhtiwari/repos/01_META_REASONING_MOE/RSD/external/qwen25_math_evaluation/outputs/fsx-project/rishabhtiwari/hf_cache/Qwen--Qwen3-30B-A3B/math_eval/math500/test_qwen25-math-cot_-1_seed0_t0.0_top_k16_enable_thinkingFalse_s0_e-1.jsonl"

import json

def load_jsonl(filename):
    """Read a JSON Lines file into a list of parsed records.

    Args:
        filename: Path of the file to read; every non-blank line must hold one JSON value.

    Returns:
        list: One parsed object per non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Decode explicitly as UTF-8 instead of relying on the platform's default encoding.
    with open(filename, 'r', encoding='utf-8') as f:
        # Blank / whitespace-only lines carry no record and would make json.loads raise,
        # so they are skipped rather than aborting the whole load.
        return [json.loads(line) for line in f if line.strip()]
    
jsonl_data = load_jsonl(filename)

In [69]:
jsonl_data[0]

{'idx': 0,
 'question': 'Convert the point $(0,3)$ in rectangular coordinates to polar coordinates.  Enter your answer in the form $(r,\\theta),$ where $r > 0$ and $0 \\le \\theta < 2 \\pi.$',
 'gt_cot': 'We have that $r = \\sqrt{0^2 + 3^2} = 3.$  Also, if we draw the line connecting the origin and $(0,3),$ this line makes an angle of $\\frac{\\pi}{2}$ with the positive $x$-axis.\n\n[asy]\nunitsize(0.8 cm);\n\ndraw((-0.5,0)--(3.5,0));\ndraw((0,-0.5)--(0,3.5));\ndraw(arc((0,0),3,0,90),red,Arrow(6));\n\ndot((0,3), red);\nlabel("$(0,3)$", (0,3), W);\ndot((3,0), red);\n[/asy]\n\nTherefore, the polar coordinates are $\\boxed{\\left( 3, \\frac{\\pi}{2} \\right)}.$',
 'gt': '(3,\\frac{\\pi}{2})',
 'level': 2,
 'solution': 'We have that $r = \\sqrt{0^2 + 3^2} = 3.$  Also, if we draw the line connecting the origin and $(0,3),$ this line makes an angle of $\\frac{\\pi}{2}$ with the positive $x$-axis.\n\n[asy]\nunitsize(0.8 cm);\n\ndraw((-0.5,0)--(3.5,0));\ndraw((0,-0.5)--(0,3.5));\ndraw(arc((0,0

In [63]:
# .strip()

# Pieces of the chat-markup prompt. `input_template` is filled with `.format(input=...)`;
# the doubled braces in `\\boxed{{}}` come out of `.format` as a literal `{}`.
input_template = (
    "<|im_start|>system\nPlease reason step by step, and put your final answer within \\boxed{{}}.<|im_end|>\n"
    "<|im_start|>user\n{input}<|im_end|>\n"
    "<|im_start|>assistant\n"
)
# Placeholder for the model's answer, filled with `.format(output=...)`.
output_template = "{output}"
# Third element of the template triple (a blank-line separator).
splitter = "\n\n"

# Same (input, output, splitter) triple, kept under its original name.
prompt_temp = (input_template, output_template, splitter)

In [64]:
nlls, prompts, prompts_with_preds = [], [], []

# Build prompts for the first 10 examples only (quick experiment on a small subset).
for i in range(10):
    example = jsonl_data[i]
    question, prediction = example["question"].strip(), example["code"][0].strip()

    # Question-only prompt: chat template plus an empty <think> block opening the answer.
    full_prompt = (
        input_template.format(input=question) + "<think>\n\n</think>\n\n"
    ).strip(" ")

    prompts.append(full_prompt)
    # Same prompt with the stored prediction appended after it.
    prompts_with_preds.append(full_prompt + output_template.format(output=prediction))

In [65]:
# NOTE(review): this scores `prompts` (the question-only text), not `prompts_with_preds`, so
# the `prompt_logprobs` in `responses` contain no prediction tokens -- confirm this is intended.
responses = llm.generate(prompts, sampling_params)

Adding requests: 100%|██████████| 10/10 [00:00<00:00, 1156.79it/s]
Processed prompts: 100%|██████████| 10/10 [00:00<00:00, 138.11it/s, est. speed input: 16404.59 toks/s, output: 138.30 toks/s]


In [68]:
responses

[RequestOutput(request_id=43, prompt='<|im_start|>system\nPlease reason step by step, and put your final answer within \\boxed{}.<|im_end|>\n<|im_start|>user\nConvert the point $(0,3)$ in rectangular coordinates to polar coordinates.  Enter your answer in the form $(r,\\theta),$ where $r > 0$ and $0 \\le \\theta < 2 \\pi.$<|im_end|>\n<|im_start|>assistant\n<think>\n\n</think>\n\n', prompt_token_ids=[151644, 8948, 198, 5501, 2874, 3019, 553, 3019, 11, 323, 2182, 697, 1590, 4226, 2878, 1124, 79075, 46391, 151645, 198, 151644, 872, 198, 12012, 279, 1459, 4930, 15, 11, 18, 15087, 304, 51424, 13934, 311, 24660, 13934, 13, 220, 11252, 697, 4226, 304, 279, 1352, 4930, 81, 26266, 15976, 98406, 1380, 400, 81, 861, 220, 15, 3, 323, 400, 15, 1124, 273, 1124, 15976, 366, 220, 17, 1124, 2493, 2418, 151645, 198, 151644, 77091, 198, 151667, 271, 151668, 271], encoder_prompt=None, encoder_prompt_token_ids=None, prompt_logprobs=[None, {8948: Logprob(logprob=-12.201223373413086, rank=60439, decoded_toke

In [40]:
# Mean NLL over all scored prompt tokens of the first (up to) 10 responses, computed with the
# shared `calculate_nlls` helper instead of a copy of its loop. An offset of 1 drops only
# position 0, whose `prompt_logprobs` entry is None. Slicing with [:10] also avoids an
# IndexError when fewer than 10 responses are available.
nlls = calculate_nlls(responses[:10], [1] * 10)

print(nlls)

[3.1316235016456098, 1.8916385968822467, 3.2829483139338507, 4.123201052745258, 1.3783771476111137, 2.756261204043476, 3.9732473313034276, 3.6084302408710442, 3.1483019647784256, 2.305611817952519]


In [70]:
# Mean NLL over all scored prompt tokens of the first (up to) 10 responses, computed with the
# shared `calculate_nlls` helper instead of a copy of its loop. An offset of 1 drops only
# position 0, whose `prompt_logprobs` entry is None. Slicing with [:10] also avoids an
# IndexError when fewer than 10 responses are available.
nlls = calculate_nlls(responses[:10], [1] * 10)

print(nlls)

[3.1316235016456098, 1.8916385968822467, 3.2829483139338507, 4.123201052745258, 1.3783771476111137, 2.756261204043476, 3.9732473313034276, 3.6084302408710442, 3.1483019647784256, 2.305611817952519]


In [32]:
# NOTE(review): `logprobs` is not assigned in any visible cell (the generate() results are
# bound to `responses`), so this only works against leftover kernel state -- confirm the
# intended variable.
# Shows the log-prob stored for prompt position 1 of the first result.
list(logprobs[0].prompt_logprobs[1].values())[0].logprob

-12.201223373413086