In [1]:
# !pip install openpipe-art==0.5.0 langchain-core tenacity datasets vllm faiss-cpu chromadb requests lxml numpy transformers torch gql==3.4.1 peft 
# !pip install langchain-core tenacity datasets vllm

In [2]:
import os
from secretsConfig import oaiKey, wandbKey, openRouterKey  # Add openRouterKey

# Required for RULER judge model
os.environ["OPENAI_API_KEY"] = oaiKey

# Required for Weights & Biases
os.environ["WANDB_API_KEY"] = wandbKey

# Required for OpenRouter (Gemini judge)
os.environ["OPENROUTER_API_KEY"] = openRouterKey  # ADD THIS LINE

if not os.environ.get("OPENAI_API_KEY"):
    raise ValueError("OPENAI_API_KEY is required for RULER functionality.")

if not os.environ.get("WANDB_API_KEY"):
    raise ValueError("WANDB_API_KEY is required for W&B.")

if not os.environ.get("OPENROUTER_API_KEY"):
    raise ValueError("OPENROUTER_API_KEY is required for Gemini judge.")

In [3]:
from IBM_Z_Datathon_RAG.semantic_search import FAISSSemanticSearch
from IBM_Z_Datathon_RAG.KeywordSearch import keyword_search
from IBM_Z_Datathon_RAG.ReadDocumentPart import read_document_part

# Wrap tools with error handling that tracks mistakes
class ToolError:
    """Marker for tool execution errors"""
    def __init__(self, message: str):
        self.message = message

_original_keyword_search = keyword_search
_original_read_document_part = read_document_part

def keyword_search(query: str, num: int = 5) -> str:
    """Safe keyword search wrapper"""
    try:
        return _original_keyword_search(query, num)
    except Exception as e:
        error_msg = f"[TOOL ERROR] keyword_search failed: {str(e)}"
        print(error_msg)
        return error_msg

def read_document_part(part_id: str) -> str:
    """Safe read_document_part wrapper - validates part_id format"""
    try:
        # Validate part_id format (should be like "doc:section:pX", not a search query)
        if " " in part_id or len(part_id) > 100:
            error_msg = f"[INVALID PART_ID] '{part_id[:50]}...' is not a valid part_id format. Part IDs should be like 'doc:section:pX'. Use search tools first to find valid part IDs."
            print(f"[ERROR] Model tried to read invalid part_id: {part_id[:50]}...")
            return error_msg
        
        return _original_read_document_part(part_id)
        
    except FileNotFoundError as e:
        error_msg = f"[PART NOT FOUND] Part ID '{part_id}' does not exist. Use search_keyword or search_semantic first to find valid part IDs."
        print(f"[ERROR] {error_msg}")
        return error_msg
        
    except Exception as e:
        error_msg = f"[TOOL ERROR] Failed to read document part: {str(e)}"
        print(f"[ERROR] {error_msg}")
        return error_msg

print("✅ Tools wrapped with error handling and validation")

✅ Tools wrapped with error handling and validation


In [4]:
from dotenv import load_dotenv
import random

import art
from art.serverless.backend import ServerlessBackend

load_dotenv()

random.seed(42)

# Declare the model - CHANGED TO QWEN3-14B
model = art.TrainableModel(
    name="legal-agent-001",
    project="legal-rag",
    base_model="Qwen/Qwen2.5-14B-Instruct",  # Changed from Qwen2.5-14B-Instruct
)

# Initialize the server
# Training and inference will run on Weights & Biases servers
backend = ServerlessBackend()

# Register the model with the Serverless Backend (sets up logging, inference, and training)
await model.register(backend)

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
from textwrap import dedent
from pydantic import BaseModel, Field
from openai import AsyncOpenAI
from langchain_core.utils.function_calling import convert_to_openai_tool
import art

MAX_TURNS = 4


class FinalAnswer(BaseModel):
    answer: str
    source_ids: list[str]


class LegalScenario(BaseModel):
    id: str
    question: str
    gold_answer: str | None = None
    gold_part_ids: list[str] | None = None


class LegalScenarioStep(BaseModel):
    step: int
    scenario: LegalScenario


async def rollout(model: art.Model, legal_scenario_step: LegalScenarioStep) -> art.Trajectory:
    """Execute one trajectory rollout"""
    scenario = legal_scenario_step.scenario
    
    traj = art.Trajectory(
        reward=0.0,
        messages_and_choices=[],
        metadata={"scenario_id": scenario.id, "step": legal_scenario_step.step},
    )

    # YOUR CUSTOM PROMPT HERE
    system_prompt = dedent(
        f"""
        You are a legal research assistant that can search legal documents to answer questions.

        You have access to the following tools:

        - search_keyword(query: str, num: int) -> str: Search using keyword/BM25 search for exact term matches.
        - search_semantic(query: str, num: int) -> str: Search using semantic/vector search for conceptual similarity.
        - read_document_part(part_id: str) -> str: Read a document part by ID. Part IDs use hierarchical format (e.g., A:B:C). To access parent parts, remove the last segment (e.g., A:B:C → parent is A:B).

        You may call one tool per turn, for up to {MAX_TURNS} turns, before giving your final answer.

        In each turn, you should analyze what information you need and respond with EITHER a tool call OR your final answer.

        For tool calls, use this format:
        <think>
        [your reasoning for what to search for and why]
        </think>
        <tool>
        {{"name": "tool_name", "args": {{"query": "search query"}}}}
        </tool>

        When you have enough information, give your final answer in this format:

        <think>
        [your reasoning for the answer]
        </think>
        <answer>
        [your comprehensive answer citing the evidence you found or "I don't know" if you didn't get enough information]

        <sources>
        <source>doc_id_1</source>
        </sources>
        </answer>
        """
    )

    traj.messages_and_choices = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": scenario.question},
    ]

    # Define tools
    def search_keyword_tool(query: str, num: int = 5) -> str:
        return keyword_search(query, num)

    def search_semantic_tool(query: str, num: int = 5) -> str:
        searcher = FAISSSemanticSearch()
        return searcher.search(query, num)

    def read_document_part_tool(part_id: str) -> str:
        return read_document_part(part_id)

    def return_final_answer(answer: str, source_ids: list[str]) -> FinalAnswer:
        return FinalAnswer(answer=answer, source_ids=source_ids)

    tools = [search_keyword_tool, search_semantic_tool, read_document_part_tool, return_final_answer]
    tools_by_name = {t.__name__: t for t in tools}
    traj.tools = [convert_to_openai_tool(t) for t in tools]

    client = AsyncOpenAI(
        base_url=model.inference_base_url,
        api_key=model.inference_api_key,
    )

    for _ in range(MAX_TURNS):
        response = await client.chat.completions.create(
            model=model.get_inference_name(),
            temperature=1,
            messages=traj.messages(),
            tools=traj.tools,
        )

        response_message = response.choices[0].message
        traj.messages_and_choices.append(response.choices[0])

        if not response_message.tool_calls:
            return traj

        try:
            for tool_call in response_message.tool_calls:
                tool_name = tool_call.function.name
                if tool_name in tools_by_name:
                    tool_args = json.loads(tool_call.function.arguments)
                    result = tools_by_name[tool_name](**tool_args)
                    traj.messages_and_choices.append({
                        "role": "tool",
                        "tool_call_id": tool_call.id,
                        "name": tool_name,
                        "content": str(result),
                    })

                    if tool_name == "return_final_answer":
                        return traj
        except Exception as e:
            print(f"Error: {e}")
            return traj

    return traj


print("✅ Rollout function defined!")

✅ Rollout function defined!


In [6]:
import json
import os
from litellm import acompletion

# Load your training data
DATA_FILE = "./snippet_data.json"

print(f"Loading data from {DATA_FILE}...")
with open(DATA_FILE, 'r') as f:
    data = json.load(f)

# Convert to LegalScenario objects
training_scenarios = []
for item in data.get("items", []):
    for row in item.get("rows", []):
        sources = row.get("sources", [])
        gold_part_ids = sources if sources else []
        
        training_scenarios.append(
            LegalScenario(
                id=str(row["row_index"]),
                question=row["question"],
                gold_answer=row.get("model_answer", ""),
                gold_part_ids=gold_part_ids
            )
        )

print(f"✅ Loaded {len(training_scenarios)} scenarios")


# Custom RULER function using OpenRouter
async def gemini_ruler_score_group(group: art.TrajectoryGroup) -> art.TrajectoryGroup:
    """Score trajectories using Gemini 2.5 Flash via OpenRouter"""
    
    trajectories = group.trajectories
    if len(trajectories) <= 1:
        for traj in trajectories:
            traj.reward = 0.0
        return group
    
    # Check for tool errors and assign harsh penalties immediately
    for traj in trajectories:
        messages = traj.messages()
        full_conversation = "\n".join([str(m.get("content", "")) for m in messages])
        
        # Harsh penalty for invalid tool usage
        if "[INVALID PART_ID]" in full_conversation or "[TOOL ERROR]" in full_conversation:
            traj.reward = -2.0  # Format/usage error band
            print(f"  ⚠️  Trajectory penalized: -2.0 (invalid tool usage)")
            continue
        
        # Also check if no answer was provided
        if not any("<answer>" in str(m.get("content", "")) for m in messages):
            traj.reward = -1.0  # Wrong answer band
            print(f"  ⚠️  Trajectory penalized: -1.0 (no answer provided)")
            continue
    
    # For trajectories without errors, use Gemini to compare
    valid_trajectories = [t for t in trajectories if t.reward == 0.0]
    
    if len(valid_trajectories) <= 1:
        return group
    
    # Extract responses from valid trajectories
    responses = []
    for traj in valid_trajectories:
        messages = traj.messages()
        if messages:
            last_msg = messages[-1].get("content", "")
            responses.append(last_msg[:500])
        else:
            responses.append("")
    
    # Build comparison prompt
    comparison_text = "\n\n".join([
        f"**Response {i+1}:**\n{resp}"
        for i, resp in enumerate(responses)
    ])
    
    judge_prompt = f"""Compare these {len(responses)} legal research responses and rank them.

Criteria:
1. Correctness and accuracy (most important)
2. Proper citation of sources with part_ids
3. Completeness of answer

Responses:
{comparison_text}

Return ONLY a JSON array of scores from 0.0 to 2.0, one score per response in order.
Higher scores = better responses.
Example: [2.0, 0.5, 1.5]

Your scores:"""
    
    try:
        # Call Gemini via OpenRouter
        response = await acompletion(
            model="openrouter/google/gemini-2.5-flash",
            messages=[{"role": "user", "content": judge_prompt}],
            api_key=os.environ["OPENROUTER_API_KEY"],
            max_tokens=100,
        )
        
        # Parse scores
        result_text = response.choices[0].message.content.strip()
        
        # Extract JSON array
        import re
        json_match = re.search(r'\[[\d\.,\s]+\]', result_text)
        if json_match:
            scores = json.loads(json_match.group())
        else:
            scores = json.loads(result_text)
        
        # Assign scores to valid trajectories
        for traj, score in zip(valid_trajectories, scores):
            traj.reward = float(score)
        
        print(f"  Scores: {scores}")
        
    except Exception as e:
        print(f"  Error in judge: {e}")
        # Fallback: random variation for valid trajectories only
        import random
        for traj in valid_trajectories:
            traj.reward = random.uniform(0.5, 1.5)
    
    return group


# Test the judge
print("\n🧪 Testing Gemini judge via OpenRouter...")

test_scenario = training_scenarios[0]
base_messages = [
    {"role": "system", "content": "You are a legal research agent."},
    {"role": "user", "content": test_scenario.question},
]

good_traj = art.Trajectory(
    messages_and_choices=[
        *base_messages,
        {"role": "assistant", "content": test_scenario.gold_answer},
    ],
    reward=0,
)

bad_traj = art.Trajectory(
    messages_and_choices=[
        *base_messages,
        {"role": "assistant", "content": "I don't know anything about this legal question."},
    ],
    reward=0,
)

test_group = art.TrajectoryGroup(trajectories=[good_traj, bad_traj])

# Score using custom function
judged_group = await gemini_ruler_score_group(test_group)

# Display results
sorted_trajs = sorted(judged_group.trajectories, key=lambda t: t.reward, reverse=True)
for rank, traj in enumerate(sorted_trajs, 1):
    msgs = traj.messages()
    print(f"\nRank {rank}: Score {traj.reward:.3f}")
    print(f"  Response: {msgs[-1]['content'][:80]}...")

print("\n✅ Gemini judge working!")

Loading data from ./snippet_data.json...
✅ Loaded 100 scenarios

🧪 Testing Gemini judge via OpenRouter...
  ⚠️  Trajectory penalized: -1.0 (no answer provided)
  ⚠️  Trajectory penalized: -1.0 (no answer provided)

Rank 1: Score -1.000
  Response: The Marshall Court reasoned that a land grant from a state constitutes a binding...

Rank 2: Score -1.000
  Response: I don't know anything about this legal question....

✅ Gemini judge working!


In [None]:
from art.utils import iterate_dataset
import wandb
from datetime import datetime

# Initialize W&B with auto-generated run name
wandb.login(key=os.environ["WANDB_API_KEY"])
run = wandb.init(
    project="IBM-Datathon-Z-2025",
    config={
        "model": "Qwen/Qwen2.5-14B-Instruct",
        "groups_per_step": 2,
        "num_epochs": 3,
        "rollouts_per_group": 6,
        "learning_rate": 1e-5,
        "max_steps": 50,
        "max_turns": MAX_TURNS,
    },
    # Remove the name parameter to get auto-generated names with numbers
    # name="legal-rag-rl-training"  # REMOVED THIS LINE
)

# Training config
training_config = run.config

# Create training iterator starting from step 0 (fresh run)
training_iterator = iterate_dataset(
    training_scenarios,
    groups_per_step=training_config["groups_per_step"],
    num_epochs=training_config["num_epochs"],
    initial_step=0,  # CHANGED: Always start from 0 for fresh runs
)

print("🚀 Starting training loop...\n")
print(f"💻 Model inference running on: W&B Serverless (not your A100s)")
print(f"📊 W&B Dashboard: {run.url}")
print(f"🏷️  Run name: {run.name}\n")

step_count = 0

for batch in training_iterator:
    print(f"=== Step {batch.step} | Epoch {batch.epoch} | Epoch Step {batch.epoch_step} ===")
    print(f"Batch: {len(batch.items)} scenarios")
    
    # Create trajectory groups
    groups = []
    for scenario in batch.items:
        groups.append(
            art.TrajectoryGroup(
                (
                    rollout(model, LegalScenarioStep(step=batch.step, scenario=scenario))
                    for _ in range(training_config["rollouts_per_group"])
                )
            )
        )
    
    # Gather trajectories
    finished_groups = await art.gather_trajectory_groups(
        groups,
        pbar_desc="Gathering trajectories",
        max_exceptions=training_config["rollouts_per_group"] * len(batch.items),
    )
    
    # Judge with custom Gemini function
    judged_groups = []
    for group in finished_groups:
        judged_group = await gemini_ruler_score_group(group)
        judged_groups.append(judged_group)
    
    # Calculate metrics before training
    all_rewards = [t.reward for g in judged_groups for t in g.trajectories]
    avg_reward = sum(all_rewards) / len(all_rewards)
    max_reward = max(all_rewards)
    min_reward = min(all_rewards)
    
    # Count trajectories by reward band
    correct_count = sum(1 for r in all_rewards if r >= 1.0)
    idk_count = sum(1 for r in all_rewards if 0.0 <= r < 1.0)
    wrong_count = sum(1 for r in all_rewards if -1.0 <= r < 0.0)
    format_error_count = sum(1 for r in all_rewards if r < -1.0)
    
    # Log to W&B
    wandb.log({
        "step": step_count,  # Use step_count instead of batch.step
        "epoch": batch.epoch,
        "avg_reward": avg_reward,
        "max_reward": max_reward,
        "min_reward": min_reward,
        "correct_count": correct_count,
        "idk_count": idk_count,
        "wrong_count": wrong_count,
        "format_error_count": format_error_count,
        "total_trajectories": len(all_rewards),
    })
    
    print(f"  Rewards: avg={avg_reward:.3f}, max={max_reward:.3f}, min={min_reward:.3f}")
    print(f"  Correct: {correct_count}, IDK: {idk_count}, Wrong: {wrong_count}, Format Errors: {format_error_count}")
    
    # Train on judged trajectories
    await model.delete_checkpoints()
    await model.train(
        judged_groups,
        config=art.TrainConfig(learning_rate=training_config["learning_rate"]),
    )
    
    print(f"✅ Step {step_count} complete\n")
    
    step_count += 1
    
    # Stop after max_steps
    if step_count >= training_config["max_steps"]:
        break

run.finish()
print("🎉 Training complete!")
print(f"📊 View results: {run.url}")

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mshng2025[0m ([33mImperial-College-London-SPQR[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


[34m[1mwandb[0m: Detected [litellm, openai] in use.
[34m[1mwandb[0m: Use W&B Weave for improved LLM call tracing. Weave is installed but not imported. Add `import weave` to the top of your script.
[34m[1mwandb[0m: For more information, check out the docs at: https://weave-docs.wandb.ai/


🚀 Starting training loop...

💻 Model inference running on: W&B Serverless (not your A100s)
📊 W&B Dashboard: https://wandb.ai/Imperial-College-London-SPQR/IBM-Datathon-Z-2025/runs/pmt5yxjw
🏷️  Run name: dashing-firebrand-6



Iterating dataset:   0%|          | 0/150 [00:00<?, ?batch/s]

=== Step 0 | Epoch 0 | Epoch Step 0 ===
Batch: 2 scenarios


Gathering trajectories: 100%|██████████| 12/12 [00:01<00:00,  6.23it/s, reward=0, completion_tokens=76.9]


  Scores: [1.5, 1.0, 1.5, 1.0, 1.5, 1.5]
  Scores: [0.5, 0.5, 0.5, 1.0, 1.5, 0.5]
  Rewards: avg=1.042, max=1.500, min=0.500
  Correct: 8, IDK: 4, Wrong: 0, Format Errors: 0


train: 100%|██████████| 2/2 [00:16<00:00,  8.12s/it, entropy=0.284, grad_norm=0.397, loss=0.64, policy_loss=0.64]
Iterating dataset:   1%|          | 1/150 [00:23<59:26, 23.93s/batch]

✅ Step 0 complete

=== Step 1 | Epoch 0 | Epoch Step 1 ===
Batch: 2 scenarios


Gathering trajectories:  83%|████████▎ | 10/12 [00:03<00:00,  2.68it/s, exceptions=2, reward=0, completion_tokens=89]


  Scores: [1.5, 0.5, 1.0, 1.0, 2.0]
  Scores: [1.0, 1.5, 1.0, 2.0, 1.5]
  Rewards: avg=1.300, max=2.000, min=0.500
  Correct: 9, IDK: 1, Wrong: 0, Format Errors: 0


train: 100%|██████████| 2/2 [00:16<00:00,  8.37s/it, entropy=0.292, grad_norm=0.444, loss=0.272, policy_loss=0.272]
Iterating dataset:   1%|▏         | 2/150 [00:50<1:03:31, 25.75s/batch]

✅ Step 1 complete

=== Step 2 | Epoch 0 | Epoch Step 2 ===
Batch: 2 scenarios


Gathering trajectories:  83%|████████▎ | 10/12 [00:04<00:00,  2.42it/s, exceptions=2, reward=0, completion_tokens=89.1]


  Scores: [1.0, 1.5, 0.5, 2.0, 1.0]
  Scores: [1.8, 1.0, 1.9, 1.8, 1.7]
  Rewards: avg=1.420, max=2.000, min=0.500
  Correct: 9, IDK: 1, Wrong: 0, Format Errors: 0


train: 100%|██████████| 2/2 [00:15<00:00,  7.72s/it, entropy=0.511, grad_norm=0.35, loss=-0.606, policy_loss=-0.606]
Iterating dataset:   2%|▏         | 3/150 [01:15<1:02:11, 25.39s/batch]

✅ Step 2 complete

=== Step 3 | Epoch 0 | Epoch Step 3 ===
Batch: 2 scenarios


Gathering trajectories:  83%|████████▎ | 10/12 [00:03<00:00,  2.93it/s, exceptions=2, reward=0, completion_tokens=85.6]


  Scores: [1.5, 1.5, 1.5, 2.0]
  Scores: [1.0, 1.0, 1.2, 1.2, 1.2, 1.8]
  Rewards: avg=1.390, max=2.000, min=1.000
  Correct: 10, IDK: 0, Wrong: 0, Format Errors: 0


train: 100%|██████████| 2/2 [00:16<00:00,  8.34s/it, entropy=0.384, grad_norm=0.294, loss=0.0259, policy_loss=0.0259]
Iterating dataset:   3%|▎         | 4/150 [01:41<1:02:05, 25.52s/batch]

✅ Step 3 complete

=== Step 4 | Epoch 0 | Epoch Step 4 ===
Batch: 2 scenarios


Gathering trajectories:  83%|████████▎ | 10/12 [00:03<00:00,  2.66it/s, exceptions=2, reward=0, completion_tokens=93.5]


  Scores: [1.8, 1.2, 1.7, 1.3, 1.5]
  Scores: [1.5, 1.8, 1.8, 1.5, 1.9]
  Rewards: avg=1.600, max=1.900, min=1.200
  Correct: 10, IDK: 0, Wrong: 0, Format Errors: 0


train: 100%|██████████| 2/2 [00:16<00:00,  8.34s/it, entropy=0.375, grad_norm=0.688, loss=0.747, policy_loss=0.747]
Iterating dataset:   3%|▎         | 5/150 [02:07<1:01:56, 25.63s/batch]

✅ Step 4 complete

=== Step 5 | Epoch 0 | Epoch Step 5 ===
Batch: 2 scenarios


Gathering trajectories:  83%|████████▎ | 10/12 [00:03<00:00,  2.85it/s, exceptions=2, reward=0, completion_tokens=97.9]


  Scores: [1.0, 1.0, 1.0, 1.5, 1.0]
  Scores: [1.5, 0.5, 0.5, 1.5, 0.0]
  Rewards: avg=0.950, max=1.500, min=0.000
  Correct: 7, IDK: 3, Wrong: 0, Format Errors: 0


train: 100%|██████████| 2/2 [00:15<00:00,  7.73s/it, entropy=0.535, grad_norm=0.293, loss=-0.283, policy_loss=-0.283]
Iterating dataset:   4%|▍         | 6/150 [02:31<1:00:28, 25.20s/batch]

✅ Step 5 complete

=== Step 6 | Epoch 0 | Epoch Step 6 ===
Batch: 2 scenarios


Gathering trajectories:  83%|████████▎ | 10/12 [00:04<00:00,  2.39it/s, exceptions=2, reward=0, completion_tokens=95.5]


  Scores: [1.0, 1.0, 1.0, 1.0]
  Scores: [1.5, 1.7, 0.5, 1.7, 1.7, 0.8]
  Rewards: avg=1.190, max=1.700, min=0.500
  Correct: 8, IDK: 2, Wrong: 0, Format Errors: 0


train: 100%|██████████| 2/2 [00:16<00:00,  8.04s/it, entropy=0.192, grad_norm=0.191, loss=0.177, policy_loss=0.177]
Iterating dataset:   5%|▍         | 7/150 [02:57<1:00:41, 25.46s/batch]

✅ Step 6 complete

=== Step 7 | Epoch 0 | Epoch Step 7 ===
Batch: 2 scenarios


Gathering trajectories:  92%|█████████▏| 11/12 [00:03<00:00,  3.08it/s, exceptions=1, reward=0, completion_tokens=88.8]


  Scores: [1.6, 1.8, 1.7, 1.9, 1.7]
  Scores: [1.7, 1.8, 1.0, 1.5, 1.9, 1.6]
  Rewards: avg=1.655, max=1.900, min=1.000
  Correct: 11, IDK: 0, Wrong: 0, Format Errors: 0


train: 100%|██████████| 2/2 [00:16<00:00,  8.07s/it, entropy=0.264, grad_norm=0.638, loss=0.444, policy_loss=0.444]
Iterating dataset:   5%|▌         | 8/150 [03:22<1:00:01, 25.36s/batch]

✅ Step 7 complete

=== Step 8 | Epoch 0 | Epoch Step 8 ===
Batch: 2 scenarios


Gathering trajectories:  83%|████████▎ | 10/12 [00:03<00:00,  2.67it/s, exceptions=2, reward=0, completion_tokens=94.3]


  Scores: [1.0, 1.0, 1.0, 1.0]
  Scores: [1.8, 1.0, 1.7, 0.8, 1.9, 2.0]
  Rewards: avg=1.320, max=2.000, min=0.800
  Correct: 9, IDK: 1, Wrong: 0, Format Errors: 0


train: 100%|██████████| 2/2 [00:16<00:00,  8.03s/it, entropy=0.449, grad_norm=0.466, loss=-0.673, policy_loss=-0.674]
Iterating dataset:   6%|▌         | 9/150 [03:48<59:47, 25.45s/batch]  

✅ Step 8 complete

=== Step 9 | Epoch 0 | Epoch Step 9 ===
Batch: 2 scenarios


Gathering trajectories:  83%|████████▎ | 10/12 [00:03<00:00,  2.72it/s, exceptions=2, reward=0, completion_tokens=95.2]


  Scores: [1.8, 1.9, 1.8, 1.7, 1.5]
  Scores: [1.6, 1.8, 1.5, 1.7, 1.9]
  Rewards: avg=1.720, max=1.900, min=1.500
  Correct: 10, IDK: 0, Wrong: 0, Format Errors: 0


train: 100%|██████████| 2/2 [00:16<00:00,  8.02s/it, entropy=0.334, grad_norm=0.277, loss=0.0094, policy_loss=0.00945]
Iterating dataset:   7%|▋         | 10/150 [04:13<59:14, 25.39s/batch]

✅ Step 9 complete

=== Step 10 | Epoch 0 | Epoch Step 10 ===
Batch: 2 scenarios


Gathering trajectories:  83%|████████▎ | 10/12 [00:03<00:00,  2.69it/s, exceptions=2, reward=0, completion_tokens=71.1]


  Scores: [0.5, 0.5, 1.5, 1.5]
  Scores: [1.0, 1.0, 1.0, 1.5, 0.0, 0.0]
  Rewards: avg=0.850, max=1.500, min=0.000
  Correct: 6, IDK: 4, Wrong: 0, Format Errors: 0


train: 100%|██████████| 2/2 [00:15<00:00,  7.73s/it, entropy=0.264, grad_norm=0.878, loss=0.926, policy_loss=0.926]
Iterating dataset:   7%|▋         | 11/150 [04:38<58:23, 25.21s/batch]

✅ Step 10 complete

=== Step 11 | Epoch 0 | Epoch Step 11 ===
Batch: 2 scenarios


Gathering trajectories:  92%|█████████▏| 11/12 [00:03<00:00,  3.45it/s, exceptions=1, reward=0, completion_tokens=64.5]


  Scores: [0.5, 1.0, 0.5, 1.0, 1.5, 1.5]
  Scores: [1.3, 1.8, 1.0, 1.9, 1.7]
  Rewards: avg=1.245, max=1.900, min=0.500
  Correct: 9, IDK: 2, Wrong: 0, Format Errors: 0


train: 100%|██████████| 1/1 [00:16<00:00, 16.07s/it, entropy=0.318, grad_norm=0.435, loss=0.0427, policy_loss=0.0427]
Iterating dataset:   8%|▊         | 12/150 [05:03<57:48, 25.13s/batch]

✅ Step 11 complete

=== Step 12 | Epoch 0 | Epoch Step 12 ===
Batch: 2 scenarios


Gathering trajectories:  92%|█████████▏| 11/12 [00:03<00:00,  3.19it/s, exceptions=1, reward=0, completion_tokens=78.2]


  Scores: [0.5, 0.5, 1.0, 1.0, 2.0]
  Scores: [1.5, 1.5, 1.7, 1.6, 1.8, 1.0]
  Rewards: avg=1.282, max=2.000, min=0.500
  Correct: 9, IDK: 2, Wrong: 0, Format Errors: 0


train: 100%|██████████| 2/2 [00:16<00:00,  8.34s/it, entropy=0.366, grad_norm=0.921, loss=0.986, policy_loss=0.986]
Iterating dataset:   9%|▊         | 13/150 [05:29<57:48, 25.32s/batch]

✅ Step 12 complete

=== Step 13 | Epoch 0 | Epoch Step 13 ===
Batch: 2 scenarios


Gathering trajectories:  83%|████████▎ | 10/12 [00:03<00:00,  2.96it/s, exceptions=2, reward=0, completion_tokens=68]


  Scores: [1.5, 1.0, 1.0, 1.0, 1.0]
  Scores: [1.5, 1.5, 1.0, 1.5, 1.5]
  Rewards: avg=1.250, max=1.500, min=1.000
  Correct: 10, IDK: 0, Wrong: 0, Format Errors: 0


train: 100%|██████████| 1/1 [00:14<00:00, 14.84s/it, entropy=0.351, grad_norm=0.416, loss=0.0435, policy_loss=0.0435]
Iterating dataset:   9%|▉         | 14/150 [05:53<56:29, 24.92s/batch]

✅ Step 13 complete

=== Step 14 | Epoch 0 | Epoch Step 14 ===
Batch: 2 scenarios


Gathering trajectories:  83%|████████▎ | 10/12 [00:03<00:00,  2.66it/s, exceptions=2, reward=0, completion_tokens=81.8]


  Scores: [1.5, 1.7, 1.0, 1.0, 2.0]
  Scores: [0.5, 1.0, 1.0, 1.5, 1.7]
  Rewards: avg=1.290, max=2.000, min=0.500
  Correct: 9, IDK: 1, Wrong: 0, Format Errors: 0


train: 100%|██████████| 2/2 [00:16<00:00,  8.36s/it, entropy=0.36, grad_norm=3.39, loss=0.586, policy_loss=0.586]
Iterating dataset:  10%|█         | 15/150 [06:19<57:08, 25.39s/batch]

✅ Step 14 complete

=== Step 15 | Epoch 0 | Epoch Step 15 ===
Batch: 2 scenarios


Gathering trajectories:  83%|████████▎ | 10/12 [00:03<00:00,  2.78it/s, exceptions=2, reward=0, completion_tokens=88.5]


  Scores: [1.0, 1.5, 1.0, 1.0, 2.0]
  Scores: [1.8, 1.9, 1.7, 2.0, 1.6]
  Rewards: avg=1.550, max=2.000, min=1.000
  Correct: 10, IDK: 0, Wrong: 0, Format Errors: 0


train: 100%|██████████| 2/2 [00:15<00:00,  7.75s/it, entropy=0.449, grad_norm=3.69, loss=-0.261, policy_loss=-0.261]
Iterating dataset:  11%|█         | 16/150 [06:44<56:28, 25.28s/batch]

✅ Step 15 complete

=== Step 16 | Epoch 0 | Epoch Step 16 ===
Batch: 2 scenarios


Gathering trajectories:  83%|████████▎ | 10/12 [00:03<00:00,  2.65it/s, exceptions=2, reward=0, completion_tokens=91]


  Scores: [1.0, 1.5, 1.5, 1.0, 1.5]
  Scores: [2.0, 1.5, 1.0, 1.5, 1.5]
  Rewards: avg=1.400, max=2.000, min=1.000
  Correct: 10, IDK: 0, Wrong: 0, Format Errors: 0


train: 100%|██████████| 2/2 [00:16<00:00,  8.04s/it, entropy=0.499, grad_norm=0.953, loss=0.0712, policy_loss=0.0712]
Iterating dataset:  11%|█▏        | 17/150 [07:12<57:35, 25.98s/batch]

✅ Step 16 complete

=== Step 17 | Epoch 0 | Epoch Step 17 ===
Batch: 2 scenarios


Gathering trajectories:  83%|████████▎ | 10/12 [00:03<00:00,  2.75it/s, exceptions=2, reward=0, completion_tokens=91.9]


  Scores: [1.0, 1.5, 1.0, 1.0, 1.0]
  Scores: [1.5, 0.5, 1.5, 1.0, 1.0]
  Rewards: avg=1.100, max=1.500, min=0.500
  Correct: 9, IDK: 1, Wrong: 0, Format Errors: 0


train: 100%|██████████| 2/2 [00:16<00:00,  8.04s/it, entropy=0.429, grad_norm=0.922, loss=0.937, policy_loss=0.937]
Iterating dataset:  12%|█▏        | 18/150 [07:37<56:25, 25.65s/batch]

✅ Step 17 complete

=== Step 18 | Epoch 0 | Epoch Step 18 ===
Batch: 2 scenarios


Gathering trajectories:  83%|████████▎ | 10/12 [00:03<00:00,  2.81it/s, exceptions=2, reward=0, completion_tokens=81.3]


  Scores: [1.4, 1.4, 1.4, 1.6, 1.4, 1.8]
  Scores: [1.2, 1.8, 1.0, 1.5]
  Rewards: avg=1.450, max=1.800, min=1.000
  Correct: 10, IDK: 0, Wrong: 0, Format Errors: 0


train: 100%|██████████| 2/2 [00:16<00:00,  8.35s/it, entropy=0.343, grad_norm=0.55, loss=0.42, policy_loss=0.42]
Iterating dataset:  13%|█▎        | 19/150 [08:03<56:19, 25.80s/batch]

✅ Step 18 complete

=== Step 19 | Epoch 0 | Epoch Step 19 ===
Batch: 2 scenarios


Gathering trajectories:  83%|████████▎ | 10/12 [00:03<00:00,  2.89it/s, exceptions=2, reward=0, completion_tokens=80.3]


  Scores: [1.5, 1.0, 2.0, 1.0]
  Scores: [1.0, 1.5, 1.0, 1.5, 2.0, 1.0]
  Rewards: avg=1.350, max=2.000, min=1.000
  Correct: 10, IDK: 0, Wrong: 0, Format Errors: 0


train: 100%|██████████| 2/2 [00:16<00:00,  8.05s/it, entropy=0.397, grad_norm=0.668, loss=0.322, policy_loss=0.322]
Iterating dataset:  13%|█▎        | 20/150 [08:28<55:26, 25.59s/batch]

✅ Step 19 complete

=== Step 20 | Epoch 0 | Epoch Step 20 ===
Batch: 2 scenarios


Gathering trajectories:  83%|████████▎ | 10/12 [00:03<00:00,  2.89it/s, exceptions=2, reward=0, completion_tokens=88.1]


  Error in judge: Expecting value: line 1 column 1 (char 0)
  Scores: [1.8, 1.2, 0.8, 1.5, 1.7]
  Rewards: avg=1.160, max=1.800, min=0.675
  Correct: 5, IDK: 5, Wrong: 0, Format Errors: 0


train: 100%|██████████| 2/2 [00:16<00:00,  8.35s/it, entropy=0.443, grad_norm=0.526, loss=-0.544, policy_loss=-0.544]
Iterating dataset:  14%|█▍        | 21/150 [08:54<55:27, 25.80s/batch]

✅ Step 20 complete

=== Step 21 | Epoch 0 | Epoch Step 21 ===
Batch: 2 scenarios


Gathering trajectories:  83%|████████▎ | 10/12 [00:03<00:00,  2.75it/s, exceptions=2, reward=0, completion_tokens=49.9]


  Scores: [0.5, 0.5, 1.0, 0.5, 1.5, 1.0]
  Scores: [0.5, 0.5, 0.5, 1.5]
  Rewards: avg=0.800, max=1.500, min=0.500
  Correct: 4, IDK: 6, Wrong: 0, Format Errors: 0


train: 100%|██████████| 1/1 [00:15<00:00, 15.46s/it, entropy=0.215, grad_norm=0.299, loss=0.0723, policy_loss=0.0723]
Iterating dataset:  15%|█▍        | 22/150 [09:19<54:17, 25.45s/batch]

✅ Step 21 complete

=== Step 22 | Epoch 0 | Epoch Step 22 ===
Batch: 2 scenarios


Gathering trajectories:  83%|████████▎ | 10/12 [00:03<00:00,  2.82it/s, exceptions=2, reward=0, completion_tokens=93.2]


  Scores: [1.5, 1.5, 1.8, 1.9, 1.0, 1.0]
  Scores: [1.1, 1.2, 1.4, 1.8]
  Rewards: avg=1.420, max=1.900, min=1.000
  Correct: 10, IDK: 0, Wrong: 0, Format Errors: 0


train: 100%|██████████| 2/2 [00:16<00:00,  8.33s/it, entropy=0.427, grad_norm=1.5, loss=0.263, policy_loss=0.263]
Iterating dataset:  15%|█▌        | 23/150 [09:45<54:03, 25.54s/batch]

✅ Step 22 complete

=== Step 23 | Epoch 0 | Epoch Step 23 ===
Batch: 2 scenarios


Gathering trajectories:  83%|████████▎ | 10/12 [00:03<00:00,  2.93it/s, exceptions=2, reward=0, completion_tokens=79.8]


  Scores: [1.5, 1.0, 0.5, 0.5, 0.5, 1.0]
  Scores: [1.7, 1.8, 1.9, 2.0]
  Rewards: avg=1.240, max=2.000, min=0.500
  Correct: 7, IDK: 3, Wrong: 0, Format Errors: 0


train: 100%|██████████| 2/2 [00:16<00:00,  8.04s/it, entropy=0.314, grad_norm=0.961, loss=-1.15, policy_loss=-1.15]
Iterating dataset:  16%|█▌        | 24/150 [10:10<53:15, 25.36s/batch]

✅ Step 23 complete

=== Step 24 | Epoch 0 | Epoch Step 24 ===
Batch: 2 scenarios


Gathering trajectories:  83%|████████▎ | 10/12 [00:03<00:00,  2.66it/s, exceptions=2, reward=0, completion_tokens=93.3]


  Scores: [1.5, 1.8, 1.8, 1.9, 1.9]
  Scores: [1.8, 0.5, 1.5, 1.5, 1.8]
  Rewards: avg=1.600, max=1.900, min=0.500
  Correct: 9, IDK: 1, Wrong: 0, Format Errors: 0


