In [None]:
import asyncio
import re
import json

In [None]:
import os
import torch
import openai
from openai import OpenAI
import abc
from transformers import AutoTokenizer, AutoModelForCausalLM
import math
from concurrent.futures import ThreadPoolExecutor
import requests
#from mistralai import Mistral
from itertools import islice
import time
from typing import List
#from anthropic import Client as AnthropicClient
#import anthropic
import threading
from collections import deque
from concurrent.futures import ThreadPoolExecutor

def chunkify(lst, n):
    """
    Yield consecutive slices of `lst` holding at most `n` items each.

    The last slice is shorter when len(lst) is not a multiple of `n`; an
    empty `lst` yields nothing.

    Raises:
      ValueError: if n < 1. Because this is a generator, the error surfaces
                  when iteration starts, not when chunkify() is called.
    """
    if n < 1:
        # A negative step makes range() empty, which would silently drop
        # every item instead of reporting the bad chunk size.
        raise ValueError(f"chunk size must be a positive integer, got {n!r}")
    for i in range(0, len(lst), n):
        yield lst[i : i + n]

# Abstract interface shared by every model pipeline.
class ModelPipelineBase(abc.ABC):
    """Contract for pipelines: one prompt in, one response out, plus a threaded batch helper."""

    @abc.abstractmethod
    def call_model(self, prompt: str) -> str:
        """Send a prompt to the model and return its response."""
        pass

    def call_model_batch(self, prompts: list[str], batch_size: int = 16) -> list[str]:
        """
        Run `call_model` over every prompt and return the responses in input order.

        Prompts are handled in consecutive groups of at most `batch_size`.
        Each group gets its own thread pool with one worker per prompt, and
        the next group starts only once the current pool has shut down.
        """
        responses: list[str] = []
        for group in chunkify(prompts, batch_size):
            with ThreadPoolExecutor(max_workers=len(group)) as pool:
                # map() yields results in submission order and re-raises a
                # worker's exception when its result is reached.
                responses.extend(pool.map(self.call_model, group))
        return responses




from openai import OpenAI, APIError

class LLMAPIPipeline(ModelPipelineBase):
    """
    OpenAI-compatible pipeline for https://api.llmapi.com.

    `call_model_batch` throttles itself to MAX_REQUESTS_PER_WINDOW requests per
    RATE_WINDOW_S seconds (200/min). Multiple completions (n > 1) count as
    multiple requests. `call_model` on its own is not throttled.
    """

    # Total tries per prompt (first call included) before APIError propagates.
    MAX_ATTEMPTS = 10
    # Pause between tries, in seconds.
    RETRY_DELAY_S = 1
    # Sliding-window request budget enforced by call_model_batch.
    MAX_REQUESTS_PER_WINDOW = 200
    RATE_WINDOW_S = 60

    def __init__(
        self,
        model: str = "llama3.1-70b",
        temperature: float = 0.0,
        max_tokens: int = 256,
        top_p: float | None = None,
    ):
        """
        Create the API client from the LLMAPI_TOKEN environment variable.

        Parameters:
          model:       model name sent with every request.
          temperature: sampling temperature.
          max_tokens:  completion length cap.
          top_p:       nucleus-sampling value; None is passed through as-is.

        Raises:
          ValueError: if LLMAPI_TOKEN is unset or empty.
        """
        # Validate the token BEFORE building the client: with api_key=None the
        # OpenAI client either raises its own error or falls back to
        # OPENAI_API_KEY, so a check placed after construction can never fire
        # for the case it is meant to catch.
        api_key = os.getenv("LLMAPI_TOKEN")
        if not api_key:
            raise ValueError("LLMAPI_TOKEN environment variable is required.")
        self.client = OpenAI(
            api_key=api_key,
            base_url=LLMAPI_BASE_URL,  # module-level constant; not defined in this section
        )
        self.model        = model
        self.temperature  = temperature
        self.max_tokens   = max_tokens
        self.top_p        = top_p

    def _request_once(self, prompt: str, n: int) -> list[str]:
        """Issue exactly one chat-completion request and return its `n` completion texts."""
        completion = self.client.chat.completions.create(
            model       = self.model,
            messages    = [{"role": "user", "content": prompt}],
            temperature = self.temperature,
            max_tokens  = self.max_tokens,
            n           = n,
            top_p       = self.top_p,
        )
        return [c.message.content for c in completion.choices]

    # ------------------------------------------------------------------
    # Single prompt → single completion
    # ------------------------------------------------------------------
    def call_model(self, prompt: str, n: int = 1) -> str | list[str]:
        """
        Send one prompt, retrying on APIError.

        Returns the completion text when n == 1, otherwise the list of `n`
        texts.

        Raises:
          APIError: re-raised after MAX_ATTEMPTS consecutive failures.
        """
        for attempt in range(1, self.MAX_ATTEMPTS + 1):
            try:
                texts = self._request_once(prompt, n)
            except APIError:
                if attempt == self.MAX_ATTEMPTS:
                    raise
                # back off and retry
                time.sleep(self.RETRY_DELAY_S)
            else:
                return texts[0] if n == 1 else texts

    def call_model_batch(self, prompts: list[str], batch_size: int = 16, n: int = 1) -> list[str]:
        """
        Send all prompts in parallel groups of `batch_size`, under the rate limit.

        Every try of every prompt is charged against the rate budget, and each
        prompt gets at most MAX_ATTEMPTS tries. (Retries are done here rather
        than by delegating to `call_model`, whose own retry loop would
        multiply the attempts and issue requests the limiter never sees.)

        Returns:
          A flat list in prompt order; with n > 1 each prompt contributes `n`
          consecutive entries.

        Raises:
          APIError: if a prompt still fails after MAX_ATTEMPTS tries.
        """
        timestamps = deque()       # send times of requests inside the current window
        lock = threading.Lock()
        limit = self.MAX_REQUESTS_PER_WINDOW
        window = self.RATE_WINDOW_S

        def rate_limit(count: int = 1):
            # The lock is held while sleeping on purpose: no other worker may
            # send until this one has obtained its budget.
            with lock:
                while True:
                    now = time.time()
                    while timestamps and now - timestamps[0] > window:
                        timestamps.popleft()
                    # An empty window always admits the request, even when
                    # count alone exceeds the limit (it could never fit).
                    if not timestamps or len(timestamps) + count <= limit:
                        break
                    # Wait until the oldest entry expires, then re-check.
                    time.sleep(max(0.0, window - (now - timestamps[0])) + 0.05)
                timestamps.extend([now] * count)

        def safe_call(prompt: str, n: int):
            for attempt in range(1, self.MAX_ATTEMPTS + 1):
                rate_limit(count=n)
                try:
                    return self._request_once(prompt, n)
                except APIError:
                    print(
                        f"APIError on prompt: {prompt}. Retrying... "
                        f"(attempt {attempt}/{self.MAX_ATTEMPTS})"
                    )
                    if attempt == self.MAX_ATTEMPTS:
                        raise
                    time.sleep(self.RETRY_DELAY_S)

        results = []
        for chunk in chunkify(prompts, batch_size):
            with ThreadPoolExecutor(max_workers=len(chunk)) as ex:
                futures = [ex.submit(safe_call, p, n) for p in chunk]
                for f in futures:
                    results.extend(f.result())

        return results
# Factory that maps a source label to a concrete pipeline instance.
class ModelPipelineFactory:
    @staticmethod
    def create_pipeline(source: str, **kwargs) -> ModelPipelineBase:
        """
        Instantiate the pipeline selected by `source` (matched case-insensitively).

        Recognised sources and the keyword arguments each one reads:
          "huggingface" / "local" -> HuggingFaceDetailedPipeline
              - model_name (str): required.
              - cache_dir: optional; default None.
              - torch_dtype: optional; default torch.float16.
              - temperature (float): optional; default 0.0.
              - device (int): optional; default -1.
              - max_length (int): optional; default 150.
          "api" / "gpt" -> GPTAPIPipeline
              - model (str): optional; default "gpt-3.5-turbo".
              - temperature (float): optional; default 0.0.
              - max_tokens (int): optional; default 150.
          "mistral" -> MistralPipeline
              - api_key: optional; default None.
              - model (str): optional; default "mistral-7b".
              - temperature (float): optional; default 0.0.
          "anthropic" / "claude_sonnet" / "sonnet" -> ClaudeSonnetPipeline
              - model (str): optional; default "claude-3-7-sonnet-20250219".
              - temperature (float): optional; default 0.0.
              - max_tokens (int): optional; default 1024.
          "llmapi" / "llama_api" -> LLMAPIPipeline
              - model (str): optional; default "llama3.1-70b".
              - temperature (float): optional; default 0.0.
              - max_tokens (int): optional; default 256.
              - top_p: optional; default None.

        Keyword arguments a source does not read are ignored. The pipeline
        classes other than LLMAPIPipeline are not defined in this section and
        must exist when their branch is taken.

        :return: An instance of ModelPipelineBase.
        :raises ValueError: for an unknown source, or when model_name is
            missing for the HuggingFace pipeline.
        """
        source = source.lower()
        option = kwargs.get

        if source in {"huggingface", "local"}:
            if "model_name" not in kwargs:
                raise ValueError("model_name must be provided for HuggingFace pipeline.")
            hf_settings = dict(
                model_name=kwargs["model_name"],
                cache_dir=option("cache_dir"),
                torch_dtype=option("torch_dtype", torch.float16),
                temperature=option("temperature", 0.0),
                device=option("device", -1),
                max_length=option("max_length", 150),
            )
            return HuggingFaceDetailedPipeline(**hf_settings)

        if source in {"api", "gpt"}:
            gpt_settings = dict(
                model=option("model", "gpt-3.5-turbo"),
                temperature=option("temperature", 0.0),
                max_tokens=option("max_tokens", 150),
            )
            return GPTAPIPipeline(**gpt_settings)

        if source == "mistral":
            mistral_settings = dict(
                api_key=option("api_key"),
                model=option("model", "mistral-7b"),
                temperature=option("temperature", 0.0),
            )
            return MistralPipeline(**mistral_settings)

        if source in {"anthropic", "claude_sonnet", "sonnet"}:
            claude_settings = dict(
                model=option("model", "claude-3-7-sonnet-20250219"),
                temperature=option("temperature", 0.0),
                max_tokens=option("max_tokens", 1024),
            )
            return ClaudeSonnetPipeline(**claude_settings)

        if source in {"llmapi", "llama_api"}:
            llmapi_settings = dict(
                model=option("model", "llama3.1-70b"),
                temperature=option("temperature", 0.0),
                max_tokens=option("max_tokens", 256),
                top_p=option("top_p", None),
            )
            return LLMAPIPipeline(**llmapi_settings)

        raise ValueError(f"Unsupported source: {source}")

In [None]:
def assemble_initial_prompt(transcript_dict: dict, with_context: bool, agent: str) -> str:
    """
    Assemble the prompt that asks a debater for its INITIAL answer.

    A template is taken from DEBATER — "initial_answer_prompt_with_context"
    when `with_context` is True, otherwise
    "initial_answer_prompt_without_context" — and filled with str.format.
    Only QUESTION (and CONTEXT, in the with-context case) are supplied, so a
    template containing any other placeholder raises KeyError. A missing
    template key yields an empty prompt.

    Parameters:
      transcript_dict (dict): Should contain at least "question" and optionally "context".
      with_context (bool): If True, use the template including context.
      agent (str): The agent identifier (e.g., "A" or "B"). Currently unused;
                   kept so callers can pass it.

    Returns:
      A formatted prompt string.
    """
    question = transcript_dict.get("question", "")

    if with_context:
        # `or ""` also covers an explicit None/empty "context" value, which
        # str.format would otherwise render as the literal text "None".
        context = transcript_dict.get("context") or ""
        template = DEBATER.get("initial_answer_prompt_with_context", "")
        return template.format(QUESTION=question, CONTEXT=context)

    template = DEBATER.get("initial_answer_prompt_without_context", "")
    return template.format(QUESTION=question)


def assemble_transcript(
    transcript_dict: dict,
    num_rounds: int,
    with_context: bool,
    agent: str,
    history_mode: str,   # "full_history" or "last_round_history"
    debate_type: str,    # "self" or "both"
    debate_mode: str,    # "asymmetric" or "symmetric"
    role: str = None     # For asymmetric mode: "challenger" or "defender"
) -> str:
    """
    Assemble a prompt for a debater.

    Parameters:
      transcript_dict: contains "question", "context", initial answers
                       ("debater_A_initial"/"debater_B_initial") and round
                       responses ("debater_<agent>_round_<r>").
      num_rounds:      1-based number of the round being requested; 0 asks for
                       the initial answer. Rounds 1..num_rounds-1 are the
                       history shown to the agent.
      with_context:    whether to include the context block.
      agent:           "A" or "B".
      history_mode:    "full_history" (all earlier rounds) or
                       "last_round_history" (only round num_rounds-1).
      debate_type:     "self" (only own initial) or "both" (both agents' initials).
      debate_mode:     "asymmetric" or "symmetric".
      role:            for asymmetric: "challenger" or "defender"; ignored if symmetric.

    Returns:
      A fully formatted prompt string for the agent. Unknown mode, role,
      debate_type or history_mode values do not raise; an "Error: ..." line
      is embedded in the prompt instead.
    """
    lines = []
    mode = debate_mode.lower()

    # 1) Pick the right instruction block based on symmetric/asymmetric + role
    if mode == "asymmetric":
        if role is None:
            instruction = "Error: Asymmetric debate mode requires a role."
        elif role.lower() == "challenger":
            instruction = DEBATER["debater_ch_instru"]
        elif role.lower() == "defender":
            instruction = DEBATER["debater_de_instru"]
        else:
            instruction = "Error: Unknown role provided."
    elif mode == "symmetric":
        instruction = DEBATER["debater_sym_instru"]
    else:
        instruction = "Error: Unknown debate mode provided."

    # 1.a) Prepend context-reliability guidance
    context_trust = DEBATER.get("context_trust_instru", "")
    instruction = f"{context_trust}\n\n{instruction}"
    lines.append("Instruction:")
    lines.append(instruction)
    lines.append("────────────────────────────")

    # 2) Question and optional context
    question = transcript_dict.get("question", "")
    lines.append(f"Question: {question}")

    if with_context:
        context = transcript_dict.get("context", "")
        if context:
            lines.append(f"Context: {context}")
    lines.append("")

    # 3) If no rounds yet, ask for an initial answer
    if num_rounds == 0:
        lines.append(f"Agent {agent}, please provide your INITIAL ANSWER based on the above information.")
        return "\n".join(lines)

    # 4) Show initial answers
    lines.append("Initial Answer(s):")
    if debate_type.lower() == "self":
        key = f"debater_{agent}_initial"
        lines.append(f"  Agent {agent}: {transcript_dict.get(key,'')}")
    elif debate_type.lower() == "both":
        lines.append(f"  Agent A: {transcript_dict.get('debater_A_initial','')}")
        lines.append(f"  Agent B: {transcript_dict.get('debater_B_initial','')}")
    else:
        lines.append("Error: Unknown debate_type provided.")
    lines.append("")

    # 5) Show previous round(s)
    if history_mode.lower() == "full_history":
        lines.append("Previous Round Responses (Full History):")
        for r in range(1, num_rounds):
            lines.append(f"  Round {r}:")
            lines.append(f"    Agent A: {transcript_dict.get(f'debater_A_round_{r}','')}")
            lines.append(f"    Agent B: {transcript_dict.get(f'debater_B_round_{r}','')}")
    elif history_mode.lower() == "last_round_history":
        lines.append("Previous Round Responses (Last Round):")
        # The last *previous* round is num_rounds - 1, matching the range used
        # by full_history. Reading round `num_rounds` itself would look up the
        # responses this very prompt is asking for, which do not exist yet.
        r = num_rounds - 1
        if r >= 1:
            lines.append(f"  Round {r}:")
            lines.append(f"    Agent A: {transcript_dict.get(f'debater_A_round_{r}','')}")
            lines.append(f"    Agent B: {transcript_dict.get(f'debater_B_round_{r}','')}")
    else:
        lines.append("Error: Unknown history_mode provided.")
    lines.append("")

    # 6) End-of-round thinking prompt
    if num_rounds == 1:
        end_instr = DEBATER["first_round_thinking"]
    elif num_rounds == 2:
        end_instr = DEBATER["second_round_thinking"]
    else:
        end_instr = DEBATER["nth_round_thinking"]

    lines.append("End Instruction:")
    lines.append(end_instr)
    lines.append("")
    lines.append(f"Agent {agent}, please provide your NEXT RESPONSE based on the above conversation.")

    return "\n".join(lines)

def assemble_judge_prompt(
    transcript_dict: dict,
    num_rounds: int,
    with_context: bool,
    judge_type: str = "symmetric"
) -> str:
    """
    Assemble a prompt for the judge agent based on the debate transcript dictionary.

    The prompt includes:
      - A judge-specific instruction from JUDGE (selected based on judge_type),
      - The debate transcript: the question, (optionally) context, both initial
        answers, and the responses of rounds 1..num_rounds.
      - The "final_decision_prompt" entry of JUDGE as the closing instruction.

    Parameters:
      transcript_dict (dict): Contains "question", "context", initial answers, and round responses.
      num_rounds (int): Number of completed debate rounds; all of them are shown.
      with_context (bool): Whether to include context.
      judge_type (str): "symmetric" or "asymmetric" to select the appropriate judge instruction.
                        Any other value embeds an "Error: ..." line instead of raising.

    Returns:
      A formatted prompt string for the judge.
    """
    lines = []

    # Select judge instruction.
    if judge_type.lower() == "symmetric":
        judge_instruction = JUDGE.get("judge_sym_instru", "")
    elif judge_type.lower() == "asymmetric":
        judge_instruction = JUDGE.get("judge_asym_instru", "")
    else:
        judge_instruction = "Error: Unknown judge type provided."

    lines.append("Judge Instruction:")
    lines.append(judge_instruction)
    lines.append("────────────────────────────")

    # Add the question.
    question = transcript_dict.get("question", "")
    lines.append("Question: {QUESTION}".format(QUESTION=question))

    # Optionally, add context.
    if with_context:
        context = transcript_dict.get("context", "")
        if context:
            lines.append("Context: {CONTEXT}".format(CONTEXT=context))
    lines.append("")

    # Add initial answers.
    lines.append("Initial Answers:")
    lines.append("  Agent A: " + transcript_dict.get("debater_A_initial", ""))
    lines.append("  Agent B: " + transcript_dict.get("debater_B_initial", ""))
    lines.append("")

    # Add round responses (full history is preferred for the judge).
    if num_rounds > 0:
        lines.append("Round Responses:")
        # Inclusive upper bound: range(1, num_rounds) would leave out the most
        # recent completed round (and show nothing at all for num_rounds == 1).
        for r in range(1, num_rounds + 1):
            lines.append("  Round {r}:".format(r=r))
            lines.append("    Agent A: " + transcript_dict.get(f"debater_A_round_{r}", ""))
            lines.append("    Agent B: " + transcript_dict.get(f"debater_B_round_{r}", ""))
    else:
        lines.append("No debate rounds have been completed yet.")
    lines.append("")

    # Use the final decision prompt from JUDGE.
    final_instr = JUDGE.get("final_decision_prompt", "")
    lines.append(final_instr)

    return "\n".join(lines)

In [None]:
import re
from typing import Optional

class Debater:
    """A debating agent: holds its configuration and queries its pipeline for answers."""

    def __init__(
        self,
        name: str,
        pipeline: "ModelPipelineBase",  # quoted: the class can be defined before the pipeline cell runs
        has_context: bool = False,
        context: Optional[str] = None,
        min_words: int = 5,
        max_words: int = 50,
        debate_mode: str = "symmetric",
        debate_type: str = "both",
        history_mode: str = "full_history",
        role: Optional[str] = None,
    ):
        """Store the agent configuration; no model call is made here."""
        self.name          = name
        self.pipeline      = pipeline
        self.has_context   = has_context
        self.context       = context
        self.min_words     = min_words         # stored but not read by any method of this class
        self.max_words     = max_words         # word cap applied by truncate()
        self.debate_mode   = debate_mode       # "symmetric" or "asymmetric"
        self.debate_type   = debate_type       # "self" or "both"
        self.history_mode  = history_mode      # "full_history" or "last_round_history"
        self.role          = role              # only used if asym

    def get_initial_answer(self, transcript_dict: dict) -> str:
        """Build the initial-answer prompt and return the model's raw response."""
        prompt = assemble_initial_prompt(transcript_dict,
                                         with_context=self.has_context,
                                         agent=self.name)
        # The response is returned verbatim; no "answer:" extraction is applied.
        return self.pipeline.call_model(prompt)

    def debate_response(self, transcript_dict: dict, round_num: int) -> str:
        """Build the round prompt for `round_num` and return the model's raw response."""
        prompt = assemble_transcript(
            transcript_dict,
            num_rounds=round_num,
            with_context=self.has_context,
            agent=self.name,
            history_mode=self.history_mode,
            debate_type=self.debate_type,   # "self" or "both"
            debate_mode=self.debate_mode,   # "symmetric" or "asymmetric"
            role=self.role,
        )
        return self.pipeline.call_model(prompt)

    def extract_argument(self, response: str) -> str:
        """
        Extract the text within the first <argument>...</argument> pair.

        Falls back to the whole response (stripped) when no such pair exists.
        """
        # DOTALL lets "." cross newlines; without it an argument spanning
        # several lines never matches and the tags leak into the result.
        match = re.search(r"<argument>(.*?)</argument>", response, re.DOTALL)
        if match:
            return match.group(1).strip()
        return response.strip()

    def truncate(self, argument: str) -> str:
        """
        Cap the argument at self.max_words whitespace-separated words.

        A longer argument is cut to the first max_words words (joined by single
        spaces) with "... <TRUNCATED>" appended; otherwise it is returned
        unchanged.
        """
        words = argument.split()
        if len(words) > self.max_words:
            return " ".join(words[:self.max_words]) + "... <TRUNCATED>"
        return argument


class Judge:
    """Judge agent: asks its pipeline for a verdict on a debate transcript."""

    def __init__(self, name: str, pipeline: "ModelPipelineBase"):
        """
        Initialize the Judge.

        The pipeline annotation is quoted so the class can be defined before
        the pipeline cell has run.
        """
        self.name = name
        self.pipeline = pipeline

    def decide(self, transcript: str) -> str:
        """
        Generate a final decision based on the debate transcript.

        Always uses the "judge_sym_instru" entry of JUDGE, filled via
        str.format with `name` and `transcript`.
        NOTE(review): this only inserts the transcript if that template has
        {name}/{transcript} placeholders — confirm against the JUDGE definition.

        Returns:
          "[<name>] Final Decision: <decision>", where <decision> is the
          <argument> content of the response, or the whole response if untagged.
        """
        prompt = JUDGE.get("judge_sym_instru", "").format(name=self.name, transcript=transcript)
        response = self.pipeline.call_model(prompt)
        decision = self.extract_clean_argument(response)
        return f"[{self.name}] Final Decision: {decision}"

    def extract_clean_argument(self, response: str) -> str:
        """
        Extract the text within the first <argument>...</argument> pair of the
        judge's response, or the stripped response when no pair is present.
        """
        # DOTALL lets "." cross newlines so multi-line arguments are matched.
        match = re.search(r"<argument>(.*?)</argument>", response, re.DOTALL)
        if match:
            return match.group(1).strip()
        return response.strip()

### Basic Debate

#### llama

In [None]:
# 1) Load the 600-item mixed dataset (300 standard + 300 perturbed)
# Explicit UTF-8: without it open() uses the platform's default encoding,
# which can mis-decode non-ASCII text in the JSON file.
with open("sc_mad_mixed_600.json", encoding="utf-8") as f:
    tasks = json.load(f)

In [None]:
import re
import json

# 1) Instantiate pipelines & agents once
batch_size = 150

pipelineA = ModelPipelineFactory.create_pipeline(
    source="llmapi",
    model="llama3.3-70b",           # any model name LLM API hosts
    temperature=0.0,
    max_tokens=200,
)

pipelineB = ModelPipelineFactory.create_pipeline(
    source="llmapi",
    model="llama3.3-70b",
    temperature=0.0,
    max_tokens=200,
)

pipelineJ = ModelPipelineFactory.create_pipeline(
    source="llmapi",
    model="llama3.3-70b",
    temperature=0.0,
    max_tokens=150,
)

# Alternative configuration (smaller model), kept for reference:
# pipelineA = ModelPipelineFactory.create_pipeline(
#     source="llmapi",
#     model="llama3.1-8b",
#     temperature=0.0,
#     max_tokens=200,
# )

# pipelineB = ModelPipelineFactory.create_pipeline(
#     source="llmapi",
#     model="llama3.1-8b",
#     temperature=0.0,
#     max_tokens=200,
# )

# pipelineJ = ModelPipelineFactory.create_pipeline(
#     source="llmapi",
#     model="llama3.1-8b",
#     temperature=0.0,
#     max_tokens=50,
# )

# Agent A is given the context, Agent B is not.
debaterA = Debater("A", pipelineA, has_context=True, debate_mode="symmetric", debate_type="both", history_mode="full_history")
debaterB = Debater("B", pipelineB, has_context=False, debate_mode="symmetric", debate_type="both", history_mode="full_history")
judge     = Judge(  "Judge", pipelineJ)


# Earlier task-construction code, kept for reference (tasks now come from the JSON file):
# tasks = []
# for item in qa_dataset:
#     base_q   = item["Question"]
#     std_ctx  = item.get("Context", None)            # original context
#     std_ans  = item["StandardAnswer"]               # ground-truth

#     # 1) standard ICL task (no perturbation)
#     tasks.append({
#         "question":         base_q,
#         "context":          std_ctx,
#         "standard_answer":  std_ans,
#         "offset":           "0",
#         "perturbed_answer": std_ans,                 # same as standard
#                # you can check this downstream
#     })

# 3) Stage 1: batch-call initial answers for both agents
prompts_A0 = [assemble_initial_prompt(t, debaterA.has_context,  debaterA.name) for t in tasks]
prompts_B0 = [assemble_initial_prompt(t, debaterB.has_context,  debaterB.name) for t in tasks]

answers_A0 = pipelineA.call_model_batch(prompts_A0, batch_size)
answers_B0 = pipelineB.call_model_batch(prompts_B0, batch_size)

for i,t in enumerate(tasks):
    t["debater_A_initial"] = answers_A0[i]
    t["debater_B_initial"] = answers_B0[i]

# 4) Stage 2: for each round, batch both agents' responses
rounds = 5
# Start at round 1: this cell begins from freshly computed initial answers, so
# starting at round 5 would run a single round whose history (rounds 1-4) is
# entirely empty.
for r in range(1, rounds+1):
    # Agent A
    prompts_Ar = [
        assemble_transcript(t, num_rounds=r, with_context=debaterA.has_context,
                            agent=debaterA.name, history_mode=debaterA.history_mode,
                            debate_mode=debaterA.debate_mode, debate_type=debaterA.debate_type,
                            role=None)
        for t in tasks
    ]
    resp_Ar = pipelineA.call_model_batch(prompts_Ar, batch_size)
    for i,t in enumerate(tasks):
        t[f"debater_A_round_{r}"] = resp_Ar[i]

    # Agent B (prompts are built after Agent A's round-r responses are stored)
    prompts_Br = [
        assemble_transcript(t, num_rounds=r, with_context=debaterB.has_context,
                            agent=debaterB.name, history_mode=debaterB.history_mode,
                            debate_mode=debaterB.debate_mode, debate_type=debaterB.debate_type,
                            role=None)
        for t in tasks
    ]
    resp_Br = pipelineB.call_model_batch(prompts_Br, batch_size)
    for i,t in enumerate(tasks):
        t[f"debater_B_round_{r}"] = resp_Br[i]

    # 5) Stage 3: batch-call judge decisions after every round
    prompts_J = [
        assemble_judge_prompt(t, num_rounds=r, with_context=False, judge_type="symmetric")
        for t in tasks
    ]
    outs_J = pipelineJ.call_model_batch(prompts_J, batch_size)
    for i,t in enumerate(tasks):
        t[f"raw_judge_output_{r}"] = outs_J[i]
        # 4-digit year following "final answer:" (case-insensitive), else None.
        m = re.search(r"final answer:\s*(\d{4})", outs_J[i], re.IGNORECASE)
        t[f"final_year_{r}"] = m.group(1) if m else None


In [None]:
# Re-run cell for a single debate round. With rounds = 5 and a range starting
# at 5, only round 5 is executed. It relies on `tasks`, `debaterA`/`debaterB`,
# the three pipelines and `batch_size` already existing in the notebook, and —
# for a non-empty history — on rounds 1-4 already being stored in `tasks`.
rounds = 5
for r in range(5, rounds+1):
    # Agent A: one prompt per task, then store the responses under debater_A_round_<r>.
    prompts_Ar = [
        assemble_transcript(t, num_rounds=r, with_context=debaterA.has_context,
                            agent=debaterA.name, history_mode=debaterA.history_mode,
                            debate_mode=debaterA.debate_mode, debate_type=debaterA.debate_type,
                            role=None)
        for t in tasks
    ]
    resp_Ar = pipelineA.call_model_batch(prompts_Ar, batch_size)
    for i,t in enumerate(tasks):
        t[f"debater_A_round_{r}"] = resp_Ar[i]

    # Agent B: prompts are assembled after Agent A's round-r responses were
    # written into `tasks` above.
    prompts_Br = [
        assemble_transcript(t, num_rounds=r, with_context=debaterB.has_context,
                            agent=debaterB.name, history_mode=debaterB.history_mode,
                            debate_mode=debaterB.debate_mode, debate_type=debaterB.debate_type,
                            role=None)
        for t in tasks
    ]
    resp_Br = pipelineB.call_model_batch(prompts_Br, batch_size)
    for i,t in enumerate(tasks):
        t[f"debater_B_round_{r}"] = resp_Br[i]

    # Judge: prompts are built without context; the raw output and the parsed
    # year are stored per round.
    prompts_J = [
        assemble_judge_prompt(t, num_rounds=r, with_context=False, judge_type="symmetric")
        for t in tasks
    ]
    outs_J = pipelineJ.call_model_batch(prompts_J, batch_size)
    for i,t in enumerate(tasks):
        t[f"raw_judge_output_{r}"] = outs_J[i]
        # final_year_<r> is the 4-digit string after "final answer:"
        # (case-insensitive), or None when the pattern is absent.
        m = re.search(r"final answer:\s*(\d{4})", outs_J[i], re.IGNORECASE)
        t[f"final_year_{r}"] = m.group(1) if m else None


In [None]:
# Re-judge cell: rebinds `pipelineJ` to a llama3.1-70b pipeline (max_tokens=250)
# and overwrites the stored judge outputs for rounds 1..rounds-1 (i.e. 1-4;
# round 5 is not touched because range(1, rounds) excludes it).
# Relies on `tasks` already holding the debaters' responses.
batch_size = 150
pipelineJ = ModelPipelineFactory.create_pipeline(
    source="llmapi",
    model="llama3.1-70b",
    temperature=0.0,
    max_tokens=250,
)

rounds = 5
for r in range(1, rounds):
    # One judge prompt per task, built without context.
    prompts_J = [
        assemble_judge_prompt(t, num_rounds=r, with_context=False, judge_type="symmetric")
        for t in tasks
    ]
    outs_J = pipelineJ.call_model_batch(prompts_J, batch_size)
    for i,t in enumerate(tasks):
        t[f"raw_judge_output_{r}"] = outs_J[i]
        # 4-digit year after "final answer:" (case-insensitive), else None.
        m = re.search(r"final answer:\s*(\d{4})", outs_J[i], re.IGNORECASE)
        t[f"final_year_{r}"] = m.group(1) if m else None

### Comparison Methods

#### llama

In [None]:
# Smoke test: build an llmapi pipeline with the factory defaults
# (model "llama3.1-70b", temperature 0.0, max_tokens 256) and send one prompt.
# `pipe` is also the pipeline used by the RCI cell further down.
pipe = ModelPipelineFactory.create_pipeline("llmapi")
print(pipe.call_model("What year was the first iPhone released? Answer only with a 4-digit year."))

2007


In [None]:
import os
import random
import json
import re
import math
#from pipeline import ModelPipelineFactory

# --------------------------------------------------
# 0.  Load the 600-item mixed dataset
# --------------------------------------------------
with open("sc_mad_mixed_600.json", encoding="utf-8") as f:
    tasks = json.load(f)

# --------------------------------------------------
# 1.  Build a few-shot header *with context*
# --------------------------------------------------
# NOTE(review): the examples are drawn from the same `tasks` that are scored
# below, and the sample is unseeded, so runs are not reproducible and the
# sampled items appear in their own prompt — confirm this is intended.
FEWSHOT_EX_N = 4
fewshot_ex = random.sample([t for t in tasks if t["context"]], FEWSHOT_EX_N)

example_blocks = [
    f"Context: {ex['context']}\n"
    f"Question: {ex['question']}\n"
    f"Answer: {ex['standard_answer']}"
    for ex in fewshot_ex
]

FEWSHOT_HEADER = (
    "You are a QA bot. Answer ONLY with the 4-digit year. "
    "If unsure, give your best guess.\n\n"
    + "\n\n".join(example_blocks)
    + "\n\n"
)

# --------------------------------------------------
# 2.  Build one prompt per item  (context + question)
# --------------------------------------------------
# `or ''` keeps a missing (None) context from being rendered as the literal
# text "None"; the example filter above shows contexts can be empty.
prompts = [
    FEWSHOT_HEADER
    + f"Context: {t['context'] or ''}\n"
    + f"Question: {t['question']}\n"
    + "Answer:"
    for t in tasks
]

# --------------------------------------------------
# 3.  Run the model in batches via the LLM API pipeline
# --------------------------------------------------
pipelineFS = ModelPipelineFactory.create_pipeline(
    source="llmapi",
    model="llama3.3-70b",
    temperature=0.0,
    max_tokens=50,                     # a 4-digit year needs only a few tokens
)
BATCH_SIZE = 150
responses = pipelineFS.call_model_batch(prompts, batch_size=BATCH_SIZE)

# --------------------------------------------------
# 4.  Parse answers & score
# --------------------------------------------------
YEAR_RE = re.compile(r"\b(\d{4})\b")

for t, resp in zip(tasks, responses):
    m = YEAR_RE.search(resp)           # first standalone 4-digit number
    yr = m.group(1) if m else None
    t["fewshot_ctx_raw"]     = resp.strip()
    t["fewshot_ctx_year"]    = yr
    # Exact string match. NOTE(review): assumes "standard_answer" is stored as
    # a string — confirm against the dataset.
    t["fewshot_ctx_correct"] = (yr == t["standard_answer"])

# --------------------------------------------------
# 5.  Quick metrics
# --------------------------------------------------
# Items with offset "0" are the standard split; everything else is perturbed.
std_ok = sum(t["fewshot_ctx_correct"] for t in tasks if t["offset"] == "0")
per_ok = sum(t["fewshot_ctx_correct"] for t in tasks if t["offset"] != "0")
std_tot = sum(t["offset"] == "0" for t in tasks)
per_tot = sum(t["offset"] != "0" for t in tasks)

print(f"Few-shot w/ context  –  Standard:  {std_ok/std_tot:.3%}  "
      f"({std_ok}/{std_tot})")
print(f"Few-shot w/ context  –  Perturbed: {per_ok/per_tot:.3%}  "
      f"({per_ok}/{per_tot})")
print(f"Few-shot w/ context  –  Overall:   "
      f"{(std_ok+per_ok)/len(tasks):.3%}  ({std_ok+per_ok}/{len(tasks)})")


Few-shot w/ context  –  Standard:  100.000%  (300/300)
Few-shot w/ context  –  Perturbed: 10.000%  (30/300)
Few-shot w/ context  –  Overall:   55.000%  (330/600)


In [None]:
"""### self-ask"""



# 2) Build the Self-Ask pipeline (LLM API, llama3.1-405b)
pipe_sa = ModelPipelineFactory.create_pipeline(
    source="llmapi",
    model="llama3.1-405b",
    temperature=0.0,
    max_tokens=120,
)

# 3) Build prompts (a missing context is passed as an empty string)
prompts_sa = [
    SELF_ASK_PROMPT.format(
        CONTEXT=t["context"] or "",
        QUESTION=t["question"]
    )
    for t in tasks
]

# 4) Batch-call the model
batch_size = 150
self_ask_outs = pipe_sa.call_model_batch(prompts_sa, batch_size)

# 5) Parse out "Final answer: YYYY" and compare to ground truth
# Note: this rebinds the notebook-wide name YEAR_RE to a stricter pattern.
YEAR_RE = re.compile(r"Final answer:\s*(\d{4})", re.IGNORECASE)

for t, out in zip(tasks, self_ask_outs):
    m = YEAR_RE.search(out)
    year = m.group(1) if m else None
    t["selfask_pred"]    = year
    t["selfask_raw"]     = out.strip()
    t["selfask_correct"] = (year == t["standard_answer"])

# 6) Compute accuracy splits (offset "0" = standard, anything else = perturbed)
std = [x for x in tasks if x["offset"] == "0"]
prb = [x for x in tasks if x["offset"] != "0"]

# Label the report with the model that actually ran; the previous hard-coded
# "Mistral 7B" label did not match the pipeline configured above.
print(f"Self-Ask EM accuracy ({pipe_sa.model}):")
print(f" • Standard : {sum(x['selfask_correct'] for x in std)/len(std):.1%}")
print(f" • Perturbed: {sum(x['selfask_correct'] for x in prb)/len(prb):.1%}")
print(f" • Overall  : {sum(x['selfask_correct'] for x in tasks)/len(tasks):.1%}")


In [None]:
# %% Cell 5 – RCI Step 1: Initial Answers
prompts_init = [
    DEBATER["rci_initial_prompt"].format(
        CONTEXT=t["context"] or "",
        QUESTION=t["question"]
    )
    for t in tasks
]
outs_init = pipe.call_model_batch(prompts_init, batch_size=BATCH_SIZE)

# store drafts
for t, draft in zip(tasks, outs_init):
    # extract 4-digit if present
    m = re.search(r"\b(\d{4})\b", draft)
    t["rci_draft"] = draft.strip()
    t["rci_draft_year"] = m.group(1) if m else None

# %% Cell 6 – RCI Step 2: Critique
prompts_crit = [
    DEBATER["rci_critique_prompt"].format(DRAFT=t["rci_draft"], CONTEXT=t["context"], QUESTION=t["question"])
    for t in tasks
]
outs_crit = pipe.call_model_batch(prompts_crit, batch_size=BATCH_SIZE)

for t, crit in zip(tasks, outs_crit):
    t["rci_critique"] = crit.strip()

# %% Cell 7 – RCI Step 3: Revision
prompts_rev = [
    DEBATER["rci_revise_prompt"].format(
        DRAFT=t["rci_draft"],
        CRITIQUE=t["rci_critique"],
        CONTEXT=t["context"],
        QUESTION=t["question"],
    )
    for t in tasks
]
outs_rev = pipe.call_model_batch(prompts_rev, batch_size=BATCH_SIZE)

for t, final in zip(tasks, outs_rev):
    m = re.search(r"\b(\d{4})\b", final)
    t["rci_final"] = final.strip()
    t["rci_final_year"] = m.group(1) if m else None
    # exact-match correctness
    t["rci_correct"] = (t["rci_final_year"] == t["standard_answer"])

# %% Cell 8 – Accuracy Summary
std  = [t for t in tasks if t["offset"] == "0"]
pert = [t for t in tasks if t["offset"] != "0"]

def acc(key, subset):
    """Return the percentage (0–100) of items in `subset` whose `key` is truthy.

    Args:
        key: Dictionary key to look up on every item.
        subset: Sized collection of dict-like items.

    Returns:
        The percentage as a float, or NaN when `subset` is empty so that a
        summary for an empty split can still be printed instead of raising
        ZeroDivisionError.
    """
    if not subset:
        return float("nan")
    return 100 * sum(1 for item in subset if item[key]) / len(subset)

# Print the share of `rci_correct` items for each split and overall.
print("RCI Exact-Match Accuracy:")
for label, subset in (("Standard ", std), ("Perturbed", pert), ("Overall  ", tasks)):
    print(f" • {label}: {acc('rci_correct', subset):.1f}% ({len(subset)} items)")

# 14 min

### Naive

In [None]:
# ── 2.  Prompt assembly helpers ─────────────────────────────────────────────
def make_initial_prompt(task: dict, agent_idx: int) -> str:
    """Fill INITIAL_PROMPT for one agent's independent first answer.

    Args:
        task: Task record; its "question" and "context" entries are read.
        agent_idx: Index of the agent, substituted for AGENT_ID.

    Returns:
        The formatted prompt string.
    """
    fields = {
        "AGENT_ID": agent_idx,
        "QUESTION": task["question"],
        "CONTEXT": task["context"],
    }
    return INITIAL_PROMPT.format(**fields)

def make_round_prompt(task: dict,
                      agent_idx: int,
                      round_idx: int,
                      num_agents: int) -> str:
    """Fill CONSENSUS_PROMPT for one agent in a follow-up round.

    The prompt shows the agent its own answer from round `round_idx - 1`
    and the answers every other agent gave in that same round.

    Args:
        task: Task record holding "question", "context" and the
            "agent_{i}_{round}" answers of the previous round.
        agent_idx: Index of the agent the prompt is built for.
        round_idx: Round being prompted; answers are read from the round
            before it.
        num_agents: Total number of agents in the debate.

    Returns:
        The formatted prompt string.

    Raises:
        KeyError: If a previous-round answer is missing from `task`.
    """
    prev_round = round_idx - 1

    def _answer_key(agent: int) -> str:
        return f"agent_{agent}_{prev_round}"

    # One "Agent j: <answer>" line per peer, in agent order, skipping self.
    peer_lines = [
        f"Agent {other}: {task[_answer_key(other)]}"
        for other in range(num_agents)
        if other != agent_idx
    ]
    own_previous = task[_answer_key(agent_idx)]

    return CONSENSUS_PROMPT.format(
        AGENT_ID=agent_idx,
        PEER_ANSWERS="\n".join(peer_lines),
        OWN_ANSWER=own_previous,
        QUESTION=task["question"],
        CONTEXT=task["context"],
    )
# ── 4.  Tiny helper to grab ▢ 1859 ▢ → "1859" ───────────────────────────────
def extract_boxed(text: str) -> str:
    """Return the trimmed content of the first ▢ … ▢ pair in `text`.

    When no complete pair is present, the whole text is returned with
    surrounding whitespace removed.
    """
    found = re.search(r"▢\s*([\s\S]*?)\s*▢", text)
    if found is None:
        return text.strip()
    return found.group(1).strip()

In [None]:
# ------------------------------------------------------------
# 1.  Simple helper that hides pipeline.call_model_batch
# ------------------------------------------------------------
def run_llmapi_batch(prompts: list[str],
                     pipeline: ModelPipelineBase,
                     batch_size: int = 150) -> list[str]:
    """
    Forward `prompts` to `pipeline.call_model_batch` and return its result.

    Exists so the calling code keeps the shape it had with the old
    `run_anthropic_chunks` helper.  The result is whatever the pipeline
    returns; the base-class implementation yields a flat list with one
    response per prompt, in prompt order.
    """
    responses = pipeline.call_model_batch(prompts, batch_size=batch_size)
    return responses

# ------------------------------------------------------------
# 2.  The Naive debate driver (LLMAPI edition)
# ------------------------------------------------------------
def run_naive_debate_llmapi(tasks: list[dict],
                            pipeline: ModelPipelineBase,
                            num_agents: int  = 3,
                            max_rounds: int  = 3,
                            batch_size: int  = 150,
                            unanimity_break: bool = True) -> None:
    """
    Run the naive multi-agent debate over `tasks`.

    Mutates each element of `tasks` in-place, adding:
        • agent_{i}_{r}  – every agent’s answer per round
        • group_answer   – final answer after convergence / majority vote

    Args:
        tasks: Task records; each must provide what `make_initial_prompt`
            and `make_round_prompt` read ("question", "context").
        pipeline: Pipeline whose `call_model_batch` answers the prompts.
        num_agents: Number of agents answering every task.
        max_rounds: Number of consensus rounds after the independent
            round 0.  With a value below 1 the vote is taken on round 0.
        batch_size: Forwarded to `run_llmapi_batch`.
        unanimity_break: When true, a task whose agents all give the same
            boxed answer after a consensus round is settled immediately
            and is not prompted again in later rounds.

    Returns:
        None; results are written into the task dictionaries.
    """
    # Local import: Counter is not among the file's top-level imports.
    from collections import Counter

    def _collect_answers(active: list[dict], round_no: int) -> None:
        """Prompt every agent for every active task and store the replies."""
        prompts, owners = [], []          # owners → (task, agent_idx)
        for task in active:
            for i in range(num_agents):
                if round_no == 0:
                    prompts.append(make_initial_prompt(task, i))
                else:
                    prompts.append(
                        make_round_prompt(task, i, round_no, num_agents)
                    )
                owners.append((task, i))

        responses = run_llmapi_batch(prompts, pipeline, batch_size)
        for (task, i), txt in zip(owners, responses):
            task[f"agent_{i}_{round_no}"] = txt

    # `active` shrinks as tasks reach unanimity; the caller's list object is
    # never rebound or modified, only the dictionaries inside it.
    active = tasks

    # ---------- Round 0 : independent answers ----------
    _collect_answers(active, 0)
    last_round = 0   # last round whose answers exist for every active task

    # ---------- Subsequent consensus rounds ----------
    for r in range(1, max_rounds + 1):
        _collect_answers(active, r)
        last_round = r

        # ---------- Early stopping on unanimity ----------
        if unanimity_break:
            unresolved = []
            for task in active:
                answers = {
                    extract_boxed(task[f"agent_{i}_{r}"])
                    for i in range(num_agents)
                }
                if len(answers) == 1:
                    task["group_answer"] = answers.pop()
                else:
                    unresolved.append(task)
            if not unresolved:
                print(f"✓ All tasks unanimous after round {r}")
                break
            active = unresolved
        print(f"✓ Finished round {r}")

    # ---------- Majority vote for anything still unresolved ----------
    for task in active:
        if "group_answer" in task:
            continue
        votes = [
            extract_boxed(task[f"agent_{i}_{last_round}"])
            for i in range(num_agents)
        ]
        # most_common breaks ties by first occurrence, so the winner does not
        # depend on set iteration order and is reproducible across runs.
        task["group_answer"] = Counter(votes).most_common(1)[0][0]

In [None]:
# ============================================================
# 0.  Build *one* LLMAPI pipeline that every agent will share
# ============================================================
# Generation settings for the naive debate run.
NAIVE_MODEL = "llama3.3-70b"    # or any other model LLMAPI hosts
NAIVE_TEMP = 0.7                # matches the old Anthropic setting
NAIVE_MAXTOK = 200
NAIVE_BSIZE = 150               # forwarded as `batch_size` to the debate driver

llmapi_pipe = ModelPipelineFactory.create_pipeline(
    source="llmapi", model=NAIVE_MODEL,
    temperature=NAIVE_TEMP, max_tokens=NAIVE_MAXTOK,
)

# Experiment dimensions; they also end up in the output file name.
DATASET_SIZE = len(tasks)
NUM_AGENTS = 3
ROUNDS = 5

# Tag every task with the model and debate variant that produced its answers.
for t in tasks:
    t.update({
        "debate_model_id": NAIVE_MODEL,
        "debate class": "Naive-LLMAPI",
    })

# Run the debate; results are written into the task dicts in place.
run_naive_debate_llmapi(
    tasks,
    pipeline=llmapi_pipe,
    num_agents=NUM_AGENTS,
    max_rounds=ROUNDS,
    batch_size=NAIVE_BSIZE,
    unanimity_break=True,
)

# Save the results to a timestamped JSON file in the working directory.
ts = datetime.now().strftime("%Y%m%d_%H%M%S")
ofile = f"ND_R{ROUNDS}_A{NUM_AGENTS}_QA{DATASET_SIZE}_{NAIVE_MODEL}_{ts}.json"
with open(ofile, "w", encoding="utf-8") as f:
    json.dump(tasks, f, indent=2, ensure_ascii=False)

print("All done →", ofile)

### Rea

In [None]:
import re
import json
import math
# 1) Instantiate pipelines & agents once
# Shared run settings: prompts per batch call and number of follow-up rounds.
batch_size = 150
SC_THRESHOLD = 0.9   # NOTE(review): not referenced in the visible part of this cell
rounds = 5

# pipelineD answers the defender prompts, pipelineC the "decider" (critic)
# prompts and pipelineJ the judge prompts; the judge gets a smaller token
# budget than the two debaters.
pipelineD = ModelPipelineFactory.create_pipeline(
    source="llmapi",
    model="llama3.3-70b",
    temperature=0.0,
    max_tokens=250,
)
pipelineC = ModelPipelineFactory.create_pipeline(
    source="llmapi",
    model="llama3.3-70b",
    temperature=0.0,
    max_tokens=250,
)
pipelineJ = ModelPipelineFactory.create_pipeline(
    source="llmapi",
    model="llama3.3-70b",
    temperature=0.0,
    max_tokens=50,
)



# 2) Opening statements (round 0): one defender and one critic prompt per task.
def_prompts = []
for t in tasks:
    def_prompts.append(
        R_DEBATER["defender_initial"].format(
            CONTEXT=t["context"],
            QUESTION=t["question"],
        )
    )
def_resps = pipelineD.call_model_batch(def_prompts, batch_size)

crit_prompts = []
for t in tasks:
    crit_prompts.append(
        R_DEBATER["decider_initial"].format(
            CONTEXT=t["context"],
            QUESTION=t["question"],
        )
    )
crit_resps = pipelineC.call_model_batch(crit_prompts, batch_size)

# Keep only the text inside <argument>…</argument>; when the tag is missing
# the whole response is used.  Either way the stored text is trimmed.
arg_re = re.compile(r"<argument>(.*?)</argument>", re.DOTALL)
for t, d_resp, c_resp in zip(tasks, def_resps, crit_resps):
    d_match = arg_re.search(d_resp)
    c_match = arg_re.search(c_resp)
    t["defender_argument_0"] = (d_match.group(1) if d_match else d_resp).strip()
    t["critic_argument_0"] = (c_match.group(1) if c_match else c_resp).strip()

def _render_history(task: dict, last_round: int) -> str:
    """Render the debate transcript of `task` for rounds 0…last_round inclusive."""
    rounds_text = "".join(
        R_Transcript["single_round_template"].format(
            round_number=i,
            defender_argument=task[f"defender_argument_{i}"],
            critic_argument=task[f"critic_argument_{i}"],
        )
        for i in range(last_round + 1)
    )
    return R_Transcript["debate_transcript_template"].format(rounds=rounds_text)


def _extract_argument(response: str) -> str:
    """Return the trimmed <argument>…</argument> body, or the trimmed response if the tag is absent."""
    found = arg_re.search(response)
    return (found.group(1) if found else response).strip()


# 3) Follow-up rounds: both debaters respond to the transcript so far, then
#    the judge rules on the transcript including the new round.
for r in range(1, rounds + 1):
    # 3.a) Build each task's transcript of rounds 0…r-1 once; the defender and
    #      the critic are shown the same history.
    histories = [_render_history(t, r - 1) for t in tasks]
    defender_prompts = [
        R_DEBATER["defender_followup_concise"].format(
            full_history=history,
            name="Defender",
        )
        for history in histories
    ]
    critic_prompts = [
        R_DEBATER["decider_followup"].format(
            full_history=history,
            name="Decider",
        )
        for history in histories
    ]

    # 3.b) Batch-call the debater pipelines
    new_defs  = pipelineD.call_model_batch(defender_prompts, batch_size)
    new_crits = pipelineC.call_model_batch(critic_prompts,  batch_size)

    # 3.c) Extract and store this round's arguments
    for t, d_resp, c_resp in zip(tasks, new_defs, new_crits):
        t[f"defender_argument_{r}"] = _extract_argument(d_resp)
        t[f"critic_argument_{r}"]   = _extract_argument(c_resp)

    # 3.d) Judge the transcript of rounds 0…r (now including round r)
    judge_prompts = [
        R_JUDGE["judge_instruction"].format(full_history=_render_history(t, r))
        for t in tasks
    ]
    judge_outs = pipelineJ.call_model_batch(judge_prompts, batch_size)
    for t, jout in zip(tasks, judge_outs):
        t[f"raw_judge_output_round_{r}"] = jout
        # The verdict is stored upper-cased; None when no
        # "final decision: …" phrase is found in the judge output.
        m = re.search(r"final decision:\s*(REASONABLE|UNREASONABLE)", jout, re.IGNORECASE)
        t[f"context_reasonable_round_{r}"] = m.group(1).upper() if m else None

    print(f"Completed round {r}")

In [None]:
# Save the debate results to a timestamped JSON file.
# The round count in the name comes from `rounds`, the value this debate loop
# actually iterates over, rather than the naive-debate constant `ROUNDS`, which
# may be undefined or different when this section is run on its own.
# NOTE(review): "T300", "QA600" and the model tag are hard-coded in the name –
# confirm they still match the run before relying on the file name.
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
filename  = f"RD_R{rounds}_J_T300_QA600_llama3.3_70b_{timestamp}.json"

with open(filename, "w", encoding="utf-8") as f:
    json.dump(tasks, f, ensure_ascii=False, indent=2)

print(f"All done → {filename}")

All done → RD_R5_J_T300_QA600_llama3.3_70b_20250519_001026.json
