In [1]:
# Report the visible GPU(s), driver/CUDA versions and current memory use.
!nvidia-smi

Tue Apr 22 11:17:16 2025       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.154.05             Driver Version: 535.154.05   CUDA Version: 12.3     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA A100-SXM4-80GB          On  | 00000000:51:00.0 Off |                    0 |
| N/A   31C    P0              77W / 400W |      0MiB / 81920MiB |      0%      Default |
|                                         |                      |             Disabled |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [2]:
import os
import ast
import json
import shutil
import numpy as np
import pandas as pd
from tqdm import tqdm
from pathlib import Path

import torch
import torch.nn.functional as F

from transformers import AutoModelForCausalLM, AutoTokenizer

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Turn off the HuggingFace tokenizers library's own parallelism for this kernel.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Checkpoint identifier used to load both the tokenizer and the model.
MODEL_NAME = "Qwen/QwQ-32B-AWQ"

# Input directory with the benchmark TSV, and output root for the per-question artifacts.
DATA_PATH, SAVE_DATA_PATH = Path("data"), Path("QWQ_labeling")

In [4]:
# Tab-separated question file, read from DATA_PATH.
data_name = "mmlu_pro_stem.tsv"
data_path = os.path.join(DATA_PATH, data_name)

df = pd.read_csv(data_path, sep="\t").assign(
    # "options" holds the text of a Python literal; parse it back into an object
    # (presumably a list of answer strings — confirm against the TSV).
    options=lambda frame: frame["options"].map(ast.literal_eval),
    # Shift the answer index by one and keep it as a string, in line with the
    # "1) ...", "2) ..." numbering used when the options are rendered.
    answer_index=lambda frame: frame["answer_index"].map(
        lambda position: str(position + 1)
    ),
)

def enumerate_question_and_options(line):
    """Render a question followed by its answer options, numbered from 1.

    Args:
        line: Mapping-like row exposing ``"question"`` and an iterable ``"options"``.

    Returns:
        The question, a blank line, then one ``"<n>) <option>"`` line per option.
    """
    numbered_options = []
    for number, option in enumerate(line["options"], start=1):
        numbered_options.append(f"{number}) {option}")
    options_block = "\n".join(numbered_options)
    return f"{line['question']}\n\n{options_block}"

# axis=1 hands each row to the helper; the returned text is stored per question.
df["question_with_variants"] = df.apply(enumerate_question_and_options, axis=1)

In [5]:
# Spot-check the rendered question text for one row.
idx = 0
print(df["question_with_variants"].iloc[idx])

Which of the following criticisms of Llewellyn's distinction between the grand and formal styles of legal reasoning is the most compelling?

1) There is no distinction between the two forms of legal reasoning.
2) Judges are appointed to interpret the law, not to make it.
3) It is misleading to pigeon-hole judges in this way.
4) Judicial reasoning is always formal.


### Answer generation

In [6]:
# Tokenizer and weights are loaded from the same checkpoint name.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# NOTE(review): transformers logs "We suggest you to set `torch_dtype=torch.float16`
# for better efficiency with AWQ" for this call. The dtype is left on "auto" here;
# confirm whether forcing float16 is acceptable for the saved probabilities/embeddings.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    device_map="auto",
    torch_dtype="auto",
)

We suggest you to set `torch_dtype=torch.float16` for better efficiency with AWQ.
Loading checkpoint shards: 100%|██████████| 5/5 [00:12<00:00,  2.48s/it]


In [7]:
def get_embedding(text):
    """Embed ``text`` with the loaded model and pool its last hidden layer.

    The text is tokenized (truncation enabled), moved to the model's device and run
    through the module-level ``model`` without gradient tracking. The final entry of
    ``hidden_states`` is then reduced along dim 1 in three ways.

    Args:
        text: String to embed.

    Returns:
        Dict with keys ``"min_pooling"``, ``"max_pooling"`` and ``"mean_pooling"``;
        each value is a NumPy array taken from the first item of the batch.
    """
    encoded = tokenizer(text, return_tensors="pt", truncation=True).to(model.device)
    with torch.no_grad():
        model_out = model(**encoded, output_hidden_states=True, return_dict=True)

    # assumes hidden states are laid out (batch, tokens, hidden), so dim=1 is the
    # token axis — TODO confirm for this model class
    final_layer = model_out.hidden_states[-1]
    reduced_by_name = {
        "min_pooling": final_layer.min(dim=1).values,
        "max_pooling": final_layer.max(dim=1).values,
        "mean_pooling": final_layer.mean(dim=1),
    }
    # Only one text is embedded per call, so keep batch item 0.
    return {
        name: reduced[0].cpu().numpy() for name, reduced in reduced_by_name.items()
    }

In [None]:
# Tag that closes the model's reasoning section; the final answer follows it.
THINK_END_TAG = "</think>"
# Upper bound on generated tokens per question (reasoning + answer).
MAX_NEW_TOKENS = 32768


def _token_distribution(step_scores):
    """List every token that has non-zero probability at one generation step.

    Args:
        step_scores: 1-D tensor of vocabulary scores for a single step and a single
            sequence, i.e. ``output.scores[step][0]``.

    Returns:
        A list of ``{"token_idx": int, "token_prob": float}`` dicts, one per token
        whose softmax probability is not exactly zero.
    """
    probs = F.softmax(step_scores, dim=-1)
    # torch.nonzero on a 1-D tensor returns shape (k, 1). Squeeze the trailing axis
    # (dim=1, not dim=0) so the indices are a flat (k,) vector for every k.
    nonzero_indices = torch.nonzero(probs != 0).squeeze(dim=1)
    return [
        {"token_idx": token_idx, "token_prob": token_prob}
        for token_idx, token_prob in zip(
            nonzero_indices.tolist(), probs[nonzero_indices].tolist()
        )
    ]


# The id of the closing tag does not depend on the sample, so look it up once.
think_token_idx = tokenizer.convert_tokens_to_ids(THINK_END_TAG)

for sample_idx in tqdm(range(df.shape[0])):
    # Every sample starts from an empty folder so stale files never mix with new ones.
    sample_dir = os.path.join(SAVE_DATA_PATH, str(sample_idx))
    if os.path.exists(sample_dir):
        shutil.rmtree(sample_dir)
    # makedirs (unlike mkdir) also creates SAVE_DATA_PATH itself on the first run.
    os.makedirs(sample_dir, exist_ok=True)

    row = df.iloc[sample_idx]

    prompt = f"""You are an expert in the field of {row["category"]}. Answer the questions.
Choose one of the answers. Write down ONLY the NUMBER of the correct answer and nothing else.
---
{row["question_with_variants"]}"""

    messages = [{"role": "user", "content": prompt}]
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )
    model_inputs = tokenizer([text], return_tensors="pt").to(model.device)
    output = model.generate(
        **model_inputs,
        max_new_tokens=MAX_NEW_TOKENS,
        output_scores=True,
        return_dict_in_generate=True,
    )

    # Batch size is 1: drop the prompt tokens and keep only the continuation.
    prompt_length = model_inputs.input_ids.shape[1]
    generated_ids = output.sequences[0, prompt_length:]
    generated_token_ids = generated_ids.tolist()
    response = tokenizer.decode(generated_ids, skip_special_tokens=True)

    # partition never raises: if the closing tag is missing (e.g. generation hit
    # MAX_NEW_TOKENS while still reasoning) the whole response counts as reasoning
    # and the answer is empty; if the tag repeats, the split is at its first use.
    think_text, _, answer_text = response.partition(THINK_END_TAG)
    think_text, answer_text = think_text.strip(), answer_text.strip()

    generation_probabilistic_info = {}
    every_token_info = []
    for gen_token_position, (generated_token, step_scores) in enumerate(
        zip(generated_token_ids, output.scores)
    ):
        every_token_info.append(_token_distribution(step_scores[0]))

        # Record the position of the first closing tag, matching the text split above.
        if (
            generated_token == think_token_idx
            and "think_token_idx" not in generation_probabilistic_info
        ):
            generation_probabilistic_info["think_token_idx"] = gen_token_position

    generation_probabilistic_info["every_token_info"] = every_token_info
    # len() is also defined when nothing was generated (the loop variable is not).
    generation_probabilistic_info["generated_token_num"] = len(generated_token_ids)

    with open(
        os.path.join(sample_dir, "generation_probabilistic_info.json"),
        "w",
        encoding="utf-8",
    ) as f:
        json.dump(generation_probabilistic_info, f, indent=2)

    # The per-step score tensors are no longer needed; release them before the
    # extra forward passes below instead of holding them until the next generate().
    del output

    segments = {"input": prompt, "think": think_text, "answer": answer_text}
    for segment_name, segment_text in segments.items():
        if not segment_text:
            # An empty string presumably tokenizes to zero tokens, leaving nothing
            # to pool over; skip the file and report it rather than abort the run.
            tqdm.write(
                f"sample {sample_idx}: empty {segment_name} text, "
                f"{segment_name}_embeddings.npz not written"
            )
            continue
        np.savez(
            os.path.join(sample_dir, f"{segment_name}_embeddings.npz"),
            **get_embedding(segment_text),
        )

  0%|          | 0/12032 [00:00<?, ?it/s]From v4.47 onwards, when a model cache is to be returned, `generate` will return a `Cache` instance instead by default (as opposed to the legacy tuple of tuples format). If you want to keep returning the legacy format, please set `return_legacy_cache=True`.
  0%|          | 11/12032 [1:41:19<1342:36:15, 402.08s/it]