In [None]:
import os
import sys
import torch
from datasets import load_dataset, load_from_disk, Dataset
from vllm import LLM, SamplingParams
from transformers import AutoTokenizer
from tqdm import tqdm


In [None]:
# Configuration
# Locations used throughout the notebook. Edit these to point at your own
# model checkpoint and dataset directories.
MODEL_PATH = "/home/rishabhtiwari/hf_cache/Qwen--Qwen3-30B-A3B"  # checkpoint handed to vLLM
DATASET_NAME = "open-thoughts/OpenThoughts-3"  # Hub identifier tried first
LOCAL_DATASET_PATH = "/home/rishabhtiwari/datasets/openthoughts3_small"  # on-disk fallback
OUTPUT_PATH = "/home/rishabhtiwari/datasets/openthoughts3_reannotated"  # where the result is saved

# Ensure the directory that will contain OUTPUT_PATH exists.
# os.path.dirname() returns "" for a bare name such as "out_ds", and
# os.makedirs("") raises, so only create the parent when there is one.
output_parent_dir = os.path.dirname(OUTPUT_PATH)
if output_parent_dir:
    os.makedirs(output_parent_dir, exist_ok=True)

In [None]:
# Initialize vLLM
print(f"Loading model from {MODEL_PATH}...")

# Shard the model across every CUDA device torch can see. device_count()
# returns 0 when no GPU is visible, and a tensor-parallel size below 1 is not
# a valid setting, so clamp to 1 and say so instead of passing 0 through.
# NOTE(review): tensor parallelism generally needs the model's attention heads
# to split evenly across the workers; an unusual GPU count (3, 5, ...) may
# still be rejected at load time -- confirm for this checkpoint.
visible_gpu_count = torch.cuda.device_count()
if visible_gpu_count < 1:
    print("No CUDA devices detected; falling back to tensor_parallel_size=1.")
tensor_parallel_size = max(1, visible_gpu_count)
print(f"Using {tensor_parallel_size} GPUs for tensor parallelism.")

llm = LLM(
    model=MODEL_PATH,
    tensor_parallel_size=tensor_parallel_size,
    trust_remote_code=True,
    dtype="bfloat16",
    gpu_memory_utilization=0.90
)

# Sampling settings shared by every prompt: nucleus sampling at temperature
# 0.7 / top_p 0.9, with each completion capped at 2048 new tokens.
sampling_params = SamplingParams(
    temperature=0.7,
    top_p=0.9,
    max_tokens=2048
)

# Get tokenizer from vLLM to ensure consistency
tokenizer = llm.get_tokenizer()

In [None]:
# Load Dataset
# Sources are tried in order: the Hugging Face Hub, then a copy saved on disk,
# and finally a two-row in-memory sample so the rest of the notebook can still
# be exercised end to end.
try:
    print(f"Attempting to load dataset: {DATASET_NAME}")
    dataset = load_dataset(DATASET_NAME, split="train")
except Exception as e:
    print(f"Could not load from Hugging Face: {e}")
    if not os.path.exists(LOCAL_DATASET_PATH):
        print("Dataset not found. Creating dummy dataset for testing.")
        # Each placeholder row is one user question plus a stale assistant
        # reply that the generation step will replace.
        placeholder_questions = ("Solve 2x + 5 = 15", "Explain quantum entanglement")
        dataset = Dataset.from_list([
            {
                "conversations": [
                    {"role": "user", "content": question},
                    {"role": "assistant", "content": "Old answer"},
                ]
            }
            for question in placeholder_questions
        ])
    else:
        print(f"Loading from local path: {LOCAL_DATASET_PATH}")
        dataset = load_from_disk(LOCAL_DATASET_PATH)

print(f"Loaded dataset with {len(dataset)} examples")

In [None]:
# Prepare prompts
# Conversation turns are accepted in two layouts: {"role", "content"} and the
# ShareGPT-style {"from", "value"} whose speakers are "human" / "gpt".
# NOTE(review): the OpenThoughts releases on the Hub appear to ship the
# ShareGPT layout -- confirm against the split that was actually loaded.
SHAREGPT_ROLE_ALIASES = {"human": "user", "gpt": "assistant"}


def to_role_content(message):
    """Return ``message`` as a ``{"role": ..., "content": ...}`` dict.

    A message that already has a "role" key is returned unchanged. Otherwise
    its "from" / "value" pair is translated, mapping the ShareGPT speaker
    names onto chat-template roles and leaving any other speaker name as is.
    """
    if "role" in message:
        return message
    speaker = message.get("from")
    return {
        "role": SHAREGPT_ROLE_ALIASES.get(speaker, speaker),
        "content": message.get("value"),
    }


prompts = []
indices_to_process = []   # dataset row index of each prompt, same order as `prompts`
original_examples = []    # the example each prompt was built from, same order as `prompts`

print("Preparing prompts...")
for idx, example in enumerate(tqdm(dataset)):
    instruction = None
    if "conversations" in example:
        # `or []` tolerates a row whose conversation list is null.
        turns = [to_role_content(msg) for msg in (example["conversations"] or [])]
        # The first user turn is the instruction; later turns are ignored.
        instruction = next(
            (turn["content"] for turn in turns if turn["role"] == "user"),
            None,
        )
        # Keep the role/content form on a copy of the row so everything that
        # reads `original_examples` sees a single message layout.
        example = {**example, "conversations": turns}
    elif "instruction" in example:
        instruction = example["instruction"]

    # Rows with no (or an empty) instruction are skipped here.
    if instruction:
        messages = [
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": instruction}
        ]
        # Render to text (not token ids) and append the assistant header so
        # the model continues with its reply.
        prompt = tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True
        )
        prompts.append(prompt)
        indices_to_process.append(idx)
        original_examples.append(example)

print(f"Prepared {len(prompts)} prompts for generation.")

In [None]:
# Generate
print("Starting generation with vLLM...")
outputs = llm.generate(prompts, sampling_params)

# Results are paired with examples by position, so a length mismatch would
# silently attach text to the wrong rows. Fail loudly instead.
if len(outputs) != len(original_examples):
    raise RuntimeError(
        f"Expected {len(original_examples)} generations but received {len(outputs)}."
    )

# Process results: write each generation into its example, keyed by the
# example's row index in `dataset`.
regenerated_by_index = {}
unwritten_generations = 0

for idx, example, output in zip(indices_to_process, original_examples, outputs):
    generated_text = output.outputs[0].text

    if "conversations" in example:
        # Rebuild the conversation as the first user turn followed by the
        # freshly generated assistant reply; all other turns are dropped.
        new_convs = []
        for msg in example["conversations"]:
            if msg["role"] == "user":
                new_convs.append(msg)
                new_convs.append({"role": "assistant", "content": generated_text})
                break
        example["conversations"] = new_convs
    elif "output" in example:
        example["output"] = generated_text
    else:
        # Nowhere to store the text for this row layout; count it so the
        # loss is visible rather than silent.
        unwritten_generations += 1

    regenerated_by_index[idx] = example

if unwritten_generations:
    print(
        f"Warning: {unwritten_generations} generations were discarded because "
        "their examples have neither a 'conversations' nor an 'output' field."
    )

# Assemble the final rows in the dataset's original order. Rows that never
# produced a prompt (no instruction found) are carried over unchanged.
processed_indices = set(indices_to_process)
new_data = [
    regenerated_by_index[idx] if idx in processed_indices else example
    for idx, example in enumerate(dataset)
]

In [None]:
# Create new dataset and save
# Turn the list of row dicts back into a Dataset, write it to OUTPUT_PATH,
# then report how many rows were written.
final_dataset = Dataset.from_list(new_data)
final_dataset.save_to_disk(OUTPUT_PATH)
saved_count = len(final_dataset)
print(f"Finished! Saved {saved_count} examples to {OUTPUT_PATH}")