In [None]:
!pip install openpipe-art==0.5.0 langchain-core tenacity datasets vllm faiss-cpu chromadb requests lxml numpy transformers torch gql==3.4.1 peft

Collecting openpipe-art==0.5.0
  Downloading openpipe_art-0.5.0-py3-none-any.whl.metadata (14 kB)
Collecting langchain-core
  Downloading langchain_core-0.3.79-py3-none-any.whl.metadata (3.2 kB)
Collecting tenacity
  Downloading tenacity-9.1.2-py3-none-any.whl.metadata (1.2 kB)
Collecting datasets
  Downloading datasets-4.2.0-py3-none-any.whl.metadata (18 kB)
Collecting vllm
  Downloading vllm-0.11.0-cp38-abi3-manylinux1_x86_64.whl.metadata (17 kB)
Collecting faiss-cpu
  Downloading faiss_cpu-1.12.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (5.1 kB)
Collecting chromadb
  Downloading chromadb-1.1.1-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.2 kB)
Collecting lxml
  Downloading lxml-6.0.2-cp312-cp312-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl.metadata (3.6 kB)
Collecting gql==3.4.1
  Downloading gql-3.4.1-py2.py3-none-any.whl.metadata (9.2 kB)
Collecting litellm==1.74.1 (from openpipe-art==0.5.0)
  Downloading litellm-1.74.1-py3-no

In [None]:
import os

# Prefer the local secretsConfig module when it is present; otherwise fall back
# to values that are already exported in the process environment so the cell
# does not die with ModuleNotFoundError on machines without that file.
try:
    from secretsConfig import oaiKey, wandbKey  # Import the variables
except ModuleNotFoundError:
    oaiKey = os.environ.get("OPENAI_API_KEY")
    wandbKey = os.environ.get("WANDB_API_KEY")

# Validate BEFORE touching os.environ: assigning None to os.environ raises a
# TypeError, which would hide the descriptive errors below.
if not oaiKey:
    raise ValueError(
        "OPENAI_API_KEY is required for RULER functionality when using openai/o4-mini."
    )

if not wandbKey:
    raise ValueError("WANDB_API_KEY is required for inference, training, and logging to Weights & Biases.")

# Required for RULER judge model
os.environ["OPENAI_API_KEY"] = oaiKey

# Required for Weights & Biases
os.environ["WANDB_API_KEY"] = wandbKey

ModuleNotFoundError: No module named 'secretsConfig'

In [None]:
from IBM_Z_Datathon_RAG.semantic_search import FAISSSemanticSearch
from IBM_Z_Datathon_RAG.KeywordSearch import keyword_search
from IBM_Z_Datathon_RAG.ReadDocumentPart import read_document_part



In [None]:
from dotenv import load_dotenv
import random

import art
from art.serverless.backend import ServerlessBackend

load_dotenv()

random.seed(42)

# Declare the model - CHANGED TO QWEN3-14B
model = art.TrainableModel(
    name="legal-agent-001",
    project="legal-rag",
    base_model="Qwen/Qwen2.5-14B-Instruct",  # Changed from Qwen2.5-14B-Instruct
)

# Initialize the server
# Training and inference will run on Weights & Biases servers
backend = ServerlessBackend()

# Register the model with the Serverless Backend (sets up logging, inference, and training)
await model.register(backend)

In [None]:
import os
import json
import random
from textwrap import dedent
from dotenv import load_dotenv
from pydantic import BaseModel, Field
from openai import AsyncOpenAI
from litellm import acompletion
from langchain_core.utils.function_calling import convert_to_openai_tool
from tenacity import retry, stop_after_attempt
import wandb
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import LoraConfig, get_peft_model, TaskType
import torch

# Your tool imports
from IBM_Z_Datathon_RAG.semantic_search import FAISSSemanticSearch
from IBM_Z_Datathon_RAG.KeywordSearch import keyword_search
from IBM_Z_Datathon_RAG.ReadDocumentPart import read_document_part

def _require_env(name: str) -> str:
    """Return the environment variable `name`, raising ValueError if unset or empty."""
    value = os.environ.get(name)
    if not value:
        raise ValueError(f"Please set {name} in your .env file")
    return value


# Populate the environment from .env before any key is looked up.
load_dotenv()

# Credentials: OpenRouter is used by the judge call, W&B by the logging calls.
OPENROUTER_API_KEY = _require_env("OPENROUTER_API_KEY")
WANDB_API_KEY = _require_env("WANDB_API_KEY")

# Authenticate this session with Weights & Biases.
wandb.login(key=WANDB_API_KEY)

# Upper bound on agent turns per rollout (also interpolated into the system prompt).
MAX_TURNS = 4


# Answer payload produced by the agent's `return_final_answer` tool: the answer
# text plus the part IDs it cites. Documented with comments rather than a class
# docstring so the model's generated schema is left exactly as it was.
class FinalAnswer(BaseModel):
    # Free-text answer; forwarded to the judge as the agent's answer.
    answer: str = Field(description="The final answer to the legal question")
    # Part IDs cited by the agent; forwarded to the judge as the cited sources.
    source_ids: list[str] = Field(description="List of part IDs used as sources")


# One training/evaluation example: a question plus optional gold labels.
# Documented with comments rather than a class docstring so the model's
# generated schema is left exactly as it was.
class LegalScenario(BaseModel):
    # Scenario identifier; recorded in the trajectory metadata.
    id: str
    # Question text sent to the agent as the user message.
    question: str
    # Reference answer shown to the judge (may be absent).
    gold_answer: str | None = None
    # Gold document IDs; compared against the prefix before ':' of each part ID read.
    gold_doc_ids: list[str] | None = None
    # Gold part IDs; compared against part IDs read and shown to the judge.
    gold_part_ids: list[str] | None = None


# Structured verdict expected back from the judge model. It is passed as the
# `response_format` of the judge call and used to validate the returned JSON,
# so it is documented with comments only to keep the generated schema unchanged.
class CorrectnessJudgeResponse(BaseModel):
    # Judge's free-text justification.
    reasoning: str = Field(description="Explanation of the reasoning process.")
    # Plain string; the allowed values are described in the text but not enforced by the type.
    verdict: str = Field(description="correct, incorrect, or idk")
    # Scalar reward copied onto the trajectory by the rollout.
    reward: float = Field(description="Numeric reward value")


@retry(stop=stop_after_attempt(3))
async def judge_correctness(
    scenario: LegalScenario,
    answer: str,
    sources: list[str],
    trajectory_info: dict
) -> CorrectnessJudgeResponse:
    """Judge using Gemini 2.5 Flash via OpenRouter.

    Sends the question, reference answer, agent answer, cited sources, gold
    part IDs and trajectory statistics to the judge model and parses its JSON
    reply into a ``CorrectnessJudgeResponse``.

    Args:
        scenario: Scenario supplying the question, ``gold_answer`` and
            ``gold_part_ids``.
        answer: The agent's final answer text.
        sources: Part IDs the agent cited.
        trajectory_info: Dict read with ``.get`` for ``num_turns``,
            ``num_searches``, ``found_right_doc`` and ``read_right_part``;
            missing keys fall back to ``0`` / ``False``.

    Returns:
        The parsed judge response with ``reward`` limited to the -2.0..2.3
        scale given in the system prompt. If the reply cannot be parsed, or the
        reward is NaN/infinite, a response with verdict ``"incorrect"`` and
        reward ``-2.0`` is returned instead.

    Raises:
        Whatever tenacity raises once the call has failed on all 3 attempts.
        Only exceptions escaping this function trigger a retry; parse failures
        are handled locally and do not.
    """
    import math  # local import: only needed for the reward sanity check below

    # Bounds of the reward scale that the system prompt gives the judge.
    reward_floor, reward_ceiling = -2.0, 2.3

    system_prompt = dedent(
        """
        You are evaluating a legal research agent's answer. Grade based on:
        
        **Correctness (primary):**
        - Does the answer correctly address the legal question?
        - Does it match the reference answer's key legal points?
        
        **Sources (grounding):**
        - Are the cited source part_ids relevant and correct?
        - Did the agent read the right documents?
        
        **Efficiency:**
        - Did the agent find the answer efficiently (fewer turns/searches = better)?
        
        Return verdict as: "correct", "incorrect", or "idk" (if agent correctly said "I don't know")
        
        Reward scale:
        - Correct answer: +1.0 to +2.3
        - I don't know (when appropriate): 0.0 to +0.8
        - Wrong answer: -1.0 to 0.0
        - Formatting errors: -2.0 to -1.0
        """
    )

    messages = [
        {"role": "system", "content": system_prompt},
        {
            "role": "user",
            "content": (
                f"Question: {scenario.question}\n\n"
                f"Reference answer: {scenario.gold_answer}\n\n"
                f"Agent's answer: {answer}\n\n"
                f"Agent's cited sources: {sources}\n\n"
                f"Gold part IDs: {scenario.gold_part_ids}\n\n"
                f"Trajectory info:\n"
                f"- Number of turns: {trajectory_info.get('num_turns', 0)}\n"
                f"- Number of searches: {trajectory_info.get('num_searches', 0)}\n"
                f"- Found right doc: {trajectory_info.get('found_right_doc', False)}\n"
                f"- Read right part: {trajectory_info.get('read_right_part', False)}\n\n"
                "Provide your evaluation with reasoning, verdict, and numeric reward."
            ),
        },
    ]

    # NOTE(review): litellm usually selects the provider from a prefix in
    # `model`; confirm this model string is routed to OpenRouter as intended
    # when combined with the explicit api_base.
    response = await acompletion(
        model="google/gemini-2.5-flash",
        messages=messages,
        response_format=CorrectnessJudgeResponse,
        api_base="https://openrouter.ai/api/v1",
        api_key=OPENROUTER_API_KEY,
    )

    first_choice = response.choices[0]
    # An empty reply becomes "{}", which fails validation and takes the
    # parse-error path below.
    raw_content = first_choice.message.content or "{}"

    try:
        parsed = CorrectnessJudgeResponse.model_validate_json(raw_content)
    except Exception as e:
        return CorrectnessJudgeResponse(
            reasoning=f"Parse error: {e}",
            verdict="incorrect",
            reward=reward_floor
        )

    # A NaN/inf reward is unusable as a training signal; handle it like a
    # malformed reply.
    if not math.isfinite(parsed.reward):
        return CorrectnessJudgeResponse(
            reasoning=f"Non-finite reward from judge: {parsed.reward}",
            verdict="incorrect",
            reward=reward_floor
        )

    # The judge is only asked to stay on the scale, not forced to; clamp so
    # that one out-of-range number cannot dominate the rewards.
    parsed.reward = min(reward_ceiling, max(reward_floor, parsed.reward))
    return parsed


class Trajectory:
    """Mutable record of a single agent rollout.

    Attributes (all set to neutral defaults on construction):
        messages: list of chat messages, initially empty.
        final_answer: the agent's FinalAnswer, or None while there is none.
        reward: float reward, initially 0.0.
        num_turns / num_searches: integer counters, initially 0.
        found_right_doc / read_right_part: boolean flags, initially False.
        metadata: dict of extra information, initially empty.
    """

    def __init__(self):
        # Fresh containers per instance, so trajectories never share state.
        self.messages = list()
        self.final_answer = None
        self.reward = 0.0
        # Counters start at zero.
        self.num_turns = self.num_searches = 0
        # Flags start cleared.
        self.found_right_doc = self.read_right_part = False
        self.metadata = dict()


async def rollout(
    model: AutoModelForCausalLM,
    tokenizer: AutoTokenizer,
    scenario: LegalScenario,
    step: int
) -> Trajectory:
    """Execute one trajectory rollout.

    Builds the system/user messages, defines the tool functions, runs the
    (currently placeholder) agent loop for up to ``MAX_TURNS`` turns, then
    scores the result and logs it to W&B.

    Args:
        model: Policy model. Not used yet; generation is still a TODO.
        tokenizer: Tokenizer for ``model``. Not used yet.
        scenario: Question and gold labels for this rollout.
        step: Caller-supplied step index, stored in the trajectory metadata.

    Returns:
        The populated ``Trajectory``. When a final answer exists the reward is
        the judge's reward; otherwise it is ``-1.0``. Until the generation TODO
        is implemented no final answer is ever set, so the reward is always
        ``-1.0``.
    """

    traj = Trajectory()
    traj.metadata = {
        "scenario_id": scenario.id,
        "step": step,
    }

    system_prompt = dedent(
        f"""
        You are a legal research agent with access to tools for searching legal documents.
        
        Available tools:
        - search_keyword(query, num): Exact term matching (BM25)
        - search_semantic(query, num): Semantic/conceptual search
        - read_document_part(part_id): Read full text of a document part
        - return_final_answer(answer, source_ids): Provide final answer with citations
        
        Rules:
        1. You have up to {MAX_TURNS} turns to find the answer
        2. Use tools strategically - start broad, then narrow down
        3. ALWAYS cite your sources using part_ids
        4. If you cannot find sufficient information, say "I don't know" rather than guessing
        
        Format your tool calls as:
        <tool>{{"name": "tool_name", "args": {{"param": "value"}}}}</tool>
        
        Format your final answer as:
        <answer>Your answer here</answer>
        <sources><source>part_id_1</source><source>part_id_2</source></sources>
        """
    )

    traj.messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": scenario.question},
    ]

    # Built on first use and then reused for the rest of this rollout instead
    # of constructing a new searcher for every semantic query.
    # NOTE(review): presumably construction loads an index and is costly — confirm.
    semantic_searcher = None

    # Define tools
    def search_keyword_tool(query: str, num: int = 5) -> str:
        """Search using keyword/BM25 for exact term matches."""
        traj.num_searches += 1
        return keyword_search(query, num)

    def search_semantic_tool(query: str, num: int = 5) -> str:
        """Search using semantic/vector search."""
        nonlocal semantic_searcher
        traj.num_searches += 1
        if semantic_searcher is None:
            semantic_searcher = FAISSSemanticSearch()
        return semantic_searcher.search(query, num)

    def read_document_part_tool(part_id: str) -> str:
        """Read a document part by ID."""
        result = read_document_part(part_id)

        # Check if we read the right part
        if scenario.gold_part_ids and part_id in scenario.gold_part_ids:
            traj.read_right_part = True

        # Check if we found the right document
        # assumes part IDs look like "<doc_id>:<rest>" — TODO confirm
        if scenario.gold_doc_ids:
            doc_id = part_id.split(':')[0]
            if doc_id in scenario.gold_doc_ids:
                traj.found_right_doc = True

        return result

    def return_final_answer(answer: str, source_ids: list[str]) -> FinalAnswer:
        """Return final answer with sources."""
        return FinalAnswer(answer=answer, source_ids=source_ids)

    # Key the dispatch table by the tool names advertised in the system prompt.
    # Keying by __name__ would register "search_keyword_tool" etc., so a call
    # using the advertised name "search_keyword" could never be resolved.
    tools_by_name = {
        "search_keyword": search_keyword_tool,
        "search_semantic": search_semantic_tool,
        "read_document_part": read_document_part_tool,
        "return_final_answer": return_final_answer,
    }
    tools = list(tools_by_name.values())

    # Agent loop (simplified - you'll implement full generation)
    for turn in range(MAX_TURNS):
        traj.num_turns = turn + 1

        # TODO: Implement proper tool-calling generation here
        # For now this is a placeholder

        if traj.final_answer:
            break

    # Calculate reward using Gemini judge
    if traj.final_answer:
        trajectory_info = {
            "num_turns": traj.num_turns,
            "num_searches": traj.num_searches,
            "found_right_doc": traj.found_right_doc,
            "read_right_part": traj.read_right_part,
        }

        judge_response = await judge_correctness(
            scenario,
            traj.final_answer.answer,
            traj.final_answer.source_ids,
            trajectory_info
        )

        traj.reward = judge_response.reward

        # Log to W&B
        wandb.log({
            "reward": traj.reward,
            "verdict": judge_response.verdict,
            "num_turns": traj.num_turns,
            "num_searches": traj.num_searches,
            "found_right_doc": traj.found_right_doc,
            "read_right_part": traj.read_right_part,
        })
    else:
        # No answer was produced within MAX_TURNS: fixed penalty.
        traj.reward = -1.0
        wandb.log({"reward": traj.reward, "status": "no_answer"})

    return traj


# Main training function
async def train_grpo():
    """Main GRPO training loop with W&B logging.

    Starts a W&B run, loads the base model and tokenizer, wraps the model with
    a LoRA adapter, runs one rollout per scenario for each epoch, logs the mean
    reward per epoch, and finally saves the adapter to
    ``./legal-rag-lora-final``.

    Scenario loading and the GRPO parameter update are still TODOs, so no
    weights are updated yet. With no scenarios loaded, epochs are skipped with
    a message rather than failing with ``ZeroDivisionError``.

    The W&B run is always closed: normally on success, and with a non-zero
    exit code (before re-raising) if anything fails.
    """

    # Initialize W&B run
    config = {
        "model": "Qwen/Qwen2.5-14B-Instruct",
        "lora_r": 32,
        "lora_alpha": 32,
        "max_turns": MAX_TURNS,
        "num_epochs": 3,
        "batch_size": 8,
        "learning_rate": 1e-5,
    }

    run = wandb.init(
        project="legal-rag-grpo",
        config=config,
        name="qwen2.5-14b-lora-rl"
    )

    try:
        # Load model
        print("Loading model...")
        model_name = config["model"]
        # NOTE(review): trust_remote_code=True lets code shipped with the
        # checkpoint execute locally — confirm it is needed for this model.
        tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            torch_dtype=torch.bfloat16,
            device_map="auto",
            trust_remote_code=True
        )

        # Apply LoRA
        print("Applying LoRA...")
        lora_config = LoraConfig(
            r=config["lora_r"],
            lora_alpha=config["lora_alpha"],
            target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
            lora_dropout=0.05,
            bias="none",
            task_type=TaskType.CAUSAL_LM
        )

        model = get_peft_model(model, lora_config)
        model.print_trainable_parameters()

        # Load scenarios
        print("Loading scenarios...")
        scenarios = []
        # TODO: Load your JSON data here

        print(f"✅ Setup complete! Training on {len(scenarios)} scenarios")

        # Training loop
        for epoch in range(config["num_epochs"]):
            print(f"\n=== Epoch {epoch + 1}/{config['num_epochs']} ===")

            epoch_rewards = []

            for i, scenario in enumerate(scenarios):
                # Run trajectory
                # NOTE(review): `step` restarts at 0 every epoch — confirm
                # that a per-epoch index (not a global step) is intended.
                traj = await rollout(model, tokenizer, scenario, step=i)
                epoch_rewards.append(traj.reward)

                # TODO: Update model with GRPO based on trajectory

                if (i + 1) % 10 == 0:
                    avg_reward = sum(epoch_rewards) / len(epoch_rewards)
                    print(f"Batch {i + 1}/{len(scenarios)} | Avg Reward: {avg_reward:.3f}")

            # Without rewards there is nothing to average; skip the epoch
            # metrics instead of dividing by zero.
            if not epoch_rewards:
                print(f"Epoch {epoch + 1} | No scenarios were run; skipping reward logging")
                continue

            # Log epoch metrics
            avg_epoch_reward = sum(epoch_rewards) / len(epoch_rewards)
            wandb.log({
                "epoch": epoch + 1,
                "epoch_avg_reward": avg_epoch_reward,
            })

            print(f"Epoch {epoch + 1} | Avg Reward: {avg_epoch_reward:.3f}")

        # Save final model
        model.save_pretrained("./legal-rag-lora-final")
        wandb.save("./legal-rag-lora-final/*")
    except BaseException:
        # Close the run as failed so it is not left open, then propagate.
        run.finish(exit_code=1)
        raise
    else:
        run.finish()

    print("✅ Training complete!")

ModuleNotFoundError: No module named 'peft'

In [None]:
if __name__ == "__main__":
    import asyncio

    try:
        running_loop = asyncio.get_running_loop()
    except RuntimeError:
        # No event loop is running (plain `python` script): drive the
        # coroutine to completion with a fresh loop.
        asyncio.run(train_grpo())
    else:
        # A loop is already running (e.g. inside a Jupyter kernel), where
        # asyncio.run() raises RuntimeError. Schedule training on that loop
        # instead; `await training_task` in a later cell to wait for it and
        # surface any exception. The module-level name keeps the task alive.
        training_task = running_loop.create_task(train_grpo())