In [None]:
import pandas as pd
import os
import sys
import numpy as np
import plotly.express as px
import re

In [None]:
from vllm import LLM, SamplingParams


# Judge model, loaded once through vLLM; the judging cells below call `llm.chat` on it.
llm = LLM(
    model="Qwen/Qwen3-32B-FP8",
    tensor_parallel_size=2,
    gpu_memory_utilization=0.8,
    enforce_eager=True,
    # Context window: 4.0 (YaRN factor) x 32768 (original positions) = 131072 tokens.
    max_model_len=131072,
    rope_scaling={
        "rope_type": "yarn",
        "factor": 4.0,
        "original_max_position_embeddings": 32768,
    },
)

In [None]:
# Judge prompt: asks whether a text followed the encoding-format instruction embedded at the
# end of the prompt, with the verdict wrapped in <answer>Yes</answer> / <answer>No</answer>.
# The <text> and <reference_text> sections are appended to this string when prompts are built.
# NOTE(review): the embedded instruction is "Write your output in English.", while the parquet
# files read in this notebook live under .../qwen25_3b_letter_to_word_with_dot/ -- confirm this
# is the instruction the reference text was actually generated with.
# NOTE(review): "if it is" on the second line has no clear antecedent ("if it did" would match
# the question); left as-is because editing the prompt changes what the judge sees.
followed_encoding_style_judge = """
Determine if the text followed the instructions for encoding format. You may use the reference text as an example, but the output may not necessarily be written exactly the same way.
Respond <answer>Yes</answer> if it is and <answer>No</answer> if not. Ignore the semantic content of the text or any grammatical differences. If only the final answer is not formatted correctly, still answer Yes.
You should consider LaTeX as valid English.
Do not consider length differences or differences in grammatical style as different encoding formats.
The reference text was generated following these instructions:
<instruction>
Write your output in English.
</instruction>
"""

In [None]:
# Encoding-style adherence: ask the judge whether each sampled CoT follows the encoding
# instruction, using the last SFT message of the same row as the reference text.
df_generated_cot = pd.read_parquet("/home/ubuntu/sky_workdir/encoding-schemes/data/raw/coherence_adherence_judge/qwen25_3b_letter_to_word_with_dot/prompted_cot.parquet")
df_sft = pd.read_parquet("/home/ubuntu/sky_workdir/encoding-schemes/data/raw/coherence_adherence_judge/qwen25_3b_letter_to_word_with_dot/sft.parquet")

# Build the prompts: every 5th row (by position), first CoT of that row only.
# Rows of the two frames are paired by position; zip stops at the shorter frame.
l_judge_prompts = []
l_prompts_per_row = []  # number of prompts emitted per sampled row, used to re-align outputs
for i, (cots, sft_messages) in enumerate(zip(df_generated_cot['model_cot'], df_sft['messages'])):
    # `i` is the row position (enumerate), not the DataFrame index label, so the subsample
    # is the same no matter how the parquet index is labelled.
    if i % 5 != 0:
        continue

    sft_reference = sft_messages[-1]['content']

    row_cots = cots[:1]
    for cot in row_cots:
        l_judge_prompts.append([{"role": "system", "content": "/no_think"}, {"role": "user", "content": followed_encoding_style_judge + f"\n<text>{cot}</text>\n<reference_text>{sft_reference}</reference_text>"}])
    l_prompts_per_row.append(len(row_cots))

judge_sampling_params = SamplingParams(max_tokens=1024)
outputs = llm.chat(l_judge_prompts, sampling_params=judge_sampling_params, use_tqdm=True)

# The slicing below relies on exactly one output per prompt, in prompt order.
assert len(outputs) == len(l_judge_prompts), "judge returned a different number of outputs than prompts"

# Turn each judge reply into 1.0 (Yes) or 0.0 (No or unparseable), grouped per sampled row.
# Grouping is driven by `l_prompts_per_row`, i.e. by the prompts that were actually sent,
# so outputs cannot drift out of step with the rows they belong to.
outputs_idx = 0
l_judge_scores = []

for n_prompts in l_prompts_per_row:
    l_instance_scores = []
    for output in outputs[outputs_idx:outputs_idx + n_prompts]:
        text = output.outputs[0].text

        # DOTALL lets the answer span a line break; strip/lower tolerate
        # "<answer> yes </answer>"-style replies instead of scoring them as No.
        search_result = re.search(r"<answer>(.*?)</answer>", text, flags=re.DOTALL)
        answered_yes = search_result is not None and search_result.group(1).strip().lower() == "yes"
        l_instance_scores.append(1.0 if answered_yes else 0.0)

    outputs_idx += n_prompts
    l_judge_scores.append(l_instance_scores)

# Mean over all judged CoTs; flattened so a row that contributed no CoT does not break np.mean.
np.mean([score for scores in l_judge_scores for score in scores])

In [None]:
# Spot-check: display the third entry of l_judge_prompts (system + user message pair).
l_judge_prompts[2]

In [None]:
# Spot-check: per-row score lists for the first five sampled rows (1.0 = Yes, 0.0 otherwise).
l_judge_scores[:5]

In [None]:
# Display the first CoT of one randomly drawn row; no random_state is passed, so the row
# shown differs between runs.
df_generated_cot['model_cot'].sample(n=1).iloc[0][0]

In [None]:
# Judge prompt: asks whether a text is valid English (LaTeX, mathematical expressions and
# minor grammatical errors count as valid), with the verdict wrapped in
# <answer>Yes</answer> / <answer>No</answer>. The <text> section is appended when prompts are built.
# NOTE(review): "as long as it would still intelligible" is missing "be"; left as-is because
# editing the prompt changes what the judge sees.
coherent_english_judge = """
Determine whether the following text is valid English. Return <answer>Yes</answer> if it is and <answer>No</answer> otherwise.
You should consider LaTeX as valid English. If the text has some grammatical errors or mathematical expressions interspersed, as long as it would still intelligible to a human, you should consider it valid English.
Mathematical lingo should be considered valid English.
"""

In [None]:
# Re-read both parquet inputs so this half of the notebook starts from fresh frames:
# prompted_cot.parquet (the 'model_cot' / 'decoded_cot' columns are used here) and
# sft.parquet (the 'messages' column is used here).
df_generated_cot, df_sft = (
    pd.read_parquet(parquet_path)
    for parquet_path in (
        "/home/ubuntu/sky_workdir/encoding-schemes/data/raw/coherence_adherence_judge/qwen25_3b_letter_to_word_with_dot/prompted_cot.parquet",
        "/home/ubuntu/sky_workdir/encoding-schemes/data/raw/coherence_adherence_judge/qwen25_3b_letter_to_word_with_dot/sft.parquet",
    )
)

In [None]:
# Coherence: ask the judge whether each sampled *decoded* CoT reads as valid English.
# Build the prompts: every 5th row (by position), first decoded CoT of that row only.
# To judge the raw model output instead, iterate the 'model_cot' column here.
l_judge_prompts = []
l_prompts_per_row = []  # number of prompts emitted per sampled row, used to re-align outputs
for i, decoded_cots in enumerate(df_generated_cot['decoded_cot']):
    # `i` is the row position (enumerate), not the DataFrame index label, so the subsample
    # is the same no matter how the parquet index is labelled.
    if i % 5 != 0:
        continue

    row_cots = decoded_cots[:1]
    for cot in row_cots:
        l_judge_prompts.append([{"role": "system", "content": "/no_think"}, {"role": "user", "content": coherent_english_judge + f"\n<text>{cot}</text>"}])
    l_prompts_per_row.append(len(row_cots))

judge_sampling_params = SamplingParams(max_tokens=1024)
outputs = llm.chat(l_judge_prompts, sampling_params=judge_sampling_params, use_tqdm=True)

# The slicing below relies on exactly one output per prompt, in prompt order.
assert len(outputs) == len(l_judge_prompts), "judge returned a different number of outputs than prompts"

# Turn each judge reply into 1.0 (Yes) or 0.0 (No or unparseable), grouped per sampled row.
# Grouping follows the prompts that were actually sent (built from 'decoded_cot'), rather
# than re-counting from a different column, so outputs stay aligned with their rows.
outputs_idx = 0
l_judge_scores = []

for n_prompts in l_prompts_per_row:
    l_instance_scores = []
    for output in outputs[outputs_idx:outputs_idx + n_prompts]:
        text = output.outputs[0].text

        # DOTALL lets the answer span a line break; strip/lower tolerate
        # "<answer> yes </answer>"-style replies instead of scoring them as No.
        search_result = re.search(r"<answer>(.*?)</answer>", text, flags=re.DOTALL)
        answered_yes = search_result is not None and search_result.group(1).strip().lower() == "yes"
        l_instance_scores.append(1.0 if answered_yes else 0.0)

    outputs_idx += n_prompts
    l_judge_scores.append(l_instance_scores)

# Mean over all judged CoTs; flattened so a row that contributed no CoT does not break np.mean.
np.mean([score for scores in l_judge_scores for score in scores])

In [None]:
# Spot-check: per-row score lists for the first five sampled rows (1.0 = Yes, 0.0 otherwise).
l_judge_scores[:5]

In [None]:
# Spot-check: display the first entry of l_judge_prompts (system + user message pair).
l_judge_prompts[0]