In [1]:
!pip install trelis

Collecting trelis
  Downloading trelis-1.3.0-py3-none-any.whl.metadata (7.7 kB)
Downloading trelis-1.3.0-py3-none-any.whl (26 kB)
Installing collected packages: trelis
Successfully installed trelis-1.3.0
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.1[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m


In [2]:
!pip install 'accelerate>=0.26.0' torch transformers

Collecting accelerate>=0.26.0
  Downloading accelerate-1.1.1-py3-none-any.whl.metadata (19 kB)
Collecting transformers
  Downloading transformers-4.46.3-py3-none-any.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.1/44.1 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting huggingface-hub>=0.21.0 (from accelerate>=0.26.0)
  Downloading huggingface_hub-0.26.2-py3-none-any.whl.metadata (13 kB)
Collecting safetensors>=0.4.3 (from accelerate>=0.26.0)
  Downloading safetensors-0.4.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.8 kB)
Collecting regex!=2019.12.17 (from transformers)
  Downloading regex-2024.11.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (40 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.5/40.5 kB[0m [31m18.2 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.21,>=0.20 (from transformers)
  Downloading tokenizers-0.20.3-cp310-cp310-manylinux_2_17_x

In [3]:
from huggingface_hub import login
import os

# Login
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [6]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
import logging

# Define color codes for each role
COLORS = {
    'Solver': '\033[92m',  # Green
    'Critic': '\033[93m',  # Yellow
    'Judge': '\033[94m',   # Blue
    'RESET': '\033[0m'     # Reset color
}

class ColoredLogger:
    @staticmethod
    def print_colored(role, name, message):
        color = COLORS.get(role, COLORS['RESET'])
        print(f"{color}[{role} - {name}]{COLORS['RESET']}")
        print(f"{color}{message}{COLORS['RESET']}\n")

logging.basicConfig(level=logging.INFO)

class Agent:
    def __init__(self, name, role, model_name=None, model=None, tokenizer=None, device=None, temperature=0.7, top_p=0.9, max_new_tokens=1024):
        self.name = name
        self.role = role  # Solver, Critic, or Judge
        self.device = device if device else torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.max_new_tokens = max_new_tokens
        self.temperature = temperature
        self.top_p = top_p

        if model is not None and tokenizer is not None:
            self.model = model.to(self.device)
            self.tokenizer = tokenizer
        elif model_name is not None:
            try:
                self.tokenizer = AutoTokenizer.from_pretrained(model_name)
                self.model = AutoModelForCausalLM.from_pretrained(
                    model_name,
                    torch_dtype=torch.float16 if self.device.type == 'cuda' else torch.float32
                ).to(self.device)
            except Exception as e:
                logging.error(f"Failed to load model {model_name}: {e}")
                raise
        else:
            raise ValueError("Either model and tokenizer or model_name must be provided")

        self.pipeline = pipeline(
            "text-generation",
            model=self.model,
            tokenizer=self.tokenizer,
            device=0 if self.device.type == 'cuda' else -1,
            max_new_tokens=self.max_new_tokens,
            do_sample=True,
            temperature=self.temperature,
            top_p=self.top_p,
            eos_token_id=self.tokenizer.eos_token_id,
            pad_token_id=self.tokenizer.eos_token_id,
        )
        self.history = []
        self.score = 1.0

    def generate_response(self, messages):
        """
        Generate a response from the model based on the input messages.
        """
        prompt = "\n".join([f"{m['role'].capitalize()}: {m['content']}" for m in messages])
        prompt += f"\n{self.role}:"
        try:
            response = self.pipeline(prompt)[0]['generated_text']
            response = response[len(prompt):].strip()
        except Exception as e:
            logging.error(f"Error generating response: {e}")
            return "Sorry, I couldn't generate a response."
        return response

def construct_message(agent, previous_responses, question):
    """
    Construct a message for the agent based on its role.
    """
    if agent.role == 'Solver':
        return [
            {'role': 'system', 'content': f"You are {agent.name}, a Solver. Your task is to solve the following problem in detail, providing clear and complete explanations, including any mathematical proofs and examples where appropriate."},
            {'role': 'user', 'content': f"The problem to solve is: '{question}'. Please provide your detailed solution before anyone else responds."}
        ]
    elif agent.role == 'Critic':
        responses_summary = "\n".join([f"Solver's solution: {resp}" for resp in previous_responses])
        return [
            {'role': 'system', 'content': f"You are {agent.name}, a Critic. Provide a detailed critique of the solution provided by the Solver, pointing out any errors or areas for improvement, and offering suggestions for correction."},
            {'role': 'user', 'content': f"The Solver has presented the following solution:\n{responses_summary}\nProvide your comprehensive critique."}
        ]
    elif agent.role == 'Judge':
        solver_response, critic_response = previous_responses
        return [
            {'role': 'system', 'content': f"You are {agent.name}, a Judge. Evaluate the solution provided by the Solver and the critique provided by the Critic in detail. Assess the correctness of the solution, the validity of the critique, and provide a final verdict with explanations."},
            {'role': 'user', 'content': f"Solver's solution:\n{solver_response}\n\nCritic's critique:\n{critic_response}\n\nProvide your detailed evaluation and final decision."}
        ]

def run_debate(agents, question, rounds=1):
    """
    Run a multi-agent debate where agents respond in a controlled sequential order.
    """
    solver = next(agent for agent in agents if agent.role == 'Solver')
    critic = next(agent for agent in agents if agent.role == 'Critic')
    judge = next(agent for agent in agents if agent.role == 'Judge')

    # Print the question in white
    print(f"\n{COLORS['RESET']}Question: {question}\n")

    # Step 1: Solver provides the solution
    solver_messages = construct_message(solver, [], question)
    solver_response = solver.generate_response(solver_messages)
    ColoredLogger.print_colored('Solver', solver.name, solver_response)

    # Step 2: Critic critiques the solution
    critic_messages = construct_message(critic, [solver_response], question)
    critic_response = critic.generate_response(critic_messages)
    ColoredLogger.print_colored('Critic', critic.name, critic_response)

    # Step 3: Continue the debate if there are more rounds
    for round_num in range(2, rounds + 1):
        print(f"\n{COLORS['RESET']}=== Round {round_num} ===\n")
        
        # Solver may refine the solution based on critique
        solver_messages = construct_message(solver, [critic_response], question)
        solver_response = solver.generate_response(solver_messages)
        ColoredLogger.print_colored('Solver', solver.name, solver_response)

        # Critic responds with further critique
        critic_messages = construct_message(critic, [solver_response], question)
        critic_response = critic.generate_response(critic_messages)
        ColoredLogger.print_colored('Critic', critic.name, critic_response)

    # Step 4: Judge evaluates the final responses
    print(f"\n{COLORS['RESET']}=== Final Judgment ===\n")
    judge_messages = construct_message(judge, [solver_response, critic_response], question)
    judge_response = judge.generate_response(judge_messages)
    ColoredLogger.print_colored('Judge', judge.name, judge_response)

    return judge_response


device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model_name = 'meta-llama/Llama-3.1-8B-Instruct'

try:
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.float16 if device.type == 'cuda' else torch.float32
    ).to(device)
except Exception as e:
    logging.error(f"Failed to load model {model_name}: {e}")
    raise

agents = [
    Agent(name='Agent1', role='Solver', model=model, tokenizer=tokenizer, device=device),
    Agent(name='Agent2', role='Critic', model=model, tokenizer=tokenizer, device=device),
    Agent(name='Agent3', role='Judge', model=model, tokenizer=tokenizer, device=device),
]

question = "What is the sum of even numbers from 1 to 100?"
result = run_debate(agents, question)

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00002-of-00004.safetensors:  99%|#########8| 4.95G/5.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/184 [00:00<?, ?B/s]


[0mQuestion: What is the sum of even numbers from 1 to 100?



KeyboardInterrupt: 

- First Round (Independent Responses):

In the first round, each agent (Solver, Critic, and Judge) independently provides their initial answer to the problem without relying on the others.

- Subsequent Rounds (Refinement):

In subsequent rounds, each agent refines their response by considering the critiques and solutions provided by other agents in previous rounds. This encourages the agents to converge on a more accurate final answer.

- Multiple Agents for the Same Role:

To closely follow the debate model from the paper, you can introduce multiple solvers and critics, allowing more diverse viewpoints and feedback during the debate process.

In [7]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
import logging

# Define color codes for each role and formatting
COLORS = {
    'Solver': '\033[92m',    # Green
    'Critic': '\033[93m',    # Yellow
    'Judge': '\033[94m',     # Blue
    'Round': '\033[95m',     # Magenta for round headers
    'Question': '\033[96m',  # Cyan for questions
    'RESET': '\033[0m'       # Reset color
}

class ColoredLogger:
    @staticmethod
    def print_colored(role, name, message):
        color = COLORS.get(role, COLORS['RESET'])
        print(f"{color}[{role} - {name}]{COLORS['RESET']}")
        print(f"{color}{message}{COLORS['RESET']}\n")

    @staticmethod
    def print_round(round_num):
        print(f"\n{COLORS['Round']}{'='*20} Round {round_num} {'='*20}{COLORS['RESET']}\n")

    @staticmethod
    def print_question(question):
        print(f"{COLORS['Question']}Question: {question}{COLORS['RESET']}\n")

logging.basicConfig(level=logging.INFO)

class Agent:
    def __init__(self, name, role, model_name=None, model=None, tokenizer=None, device=None, temperature=0.7, top_p=0.9, max_new_tokens=512):
        self.name = name
        self.role = role
        self.device = device if device else torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.max_new_tokens = max_new_tokens
        self.temperature = temperature
        self.top_p = top_p

        if model is not None and tokenizer is not None:
            self.model = model.to(self.device)
            self.tokenizer = tokenizer
        elif model_name is not None:
            try:
                self.tokenizer = AutoTokenizer.from_pretrained(model_name)
                self.model = AutoModelForCausalLM.from_pretrained(
                    model_name,
                    torch_dtype=torch.float16 if self.device.type == 'cuda' else torch.float32
                ).to(self.device)
            except Exception as e:
                logging.error(f"Failed to load model {model_name}: {e}")
                raise
        else:
            raise ValueError("Either model and tokenizer or model_name must be provided")

        self.pipeline = pipeline(
            "text-generation",
            model=self.model,
            tokenizer=self.tokenizer,
            device=0 if self.device.type == 'cuda' else -1,
            max_new_tokens=self.max_new_tokens,
            do_sample=True,
            temperature=self.temperature,
            top_p=self.top_p,
            eos_token_id=self.tokenizer.eos_token_id,
            pad_token_id=self.tokenizer.eos_token_id,
        )
        self.history = []
        self.score = 1.0

    def generate_response(self, messages):
        """
        Generate a response from the model based on the input messages.
        """
        prompt = "\n".join([f"{m['role'].capitalize()}: {m['content']}" for m in messages])
        prompt += f"\n{self.role}:"
        try:
            response = self.pipeline(prompt)[0]['generated_text']
            response = response[len(prompt):].strip()
        except Exception as e:
            logging.error(f"Error generating response: {e}")
            return "Sorry, I couldn't generate a response."
        return response

def construct_message(agent, previous_responses, question, round_num=1):
    """
    Construct a message for the agent based on its role and round of debate.
    """
    if agent.role == 'Solver':
        if round_num == 1:
            return [
                {'role': 'system', 'content': f"You are {agent.name}, a Solver. Provide a detailed solution to the problem."},
                {'role': 'user', 'content': f"The problem to solve is: '{question}'. Please provide your detailed solution."}
            ]
        else:
            responses_summary = "\n".join([f"Critic's critique: {resp}" for resp in previous_responses])
            return [
                {'role': 'system', 'content': f"You are {agent.name}, a Solver. Revise your solution based on the feedback provided by the Critic."},
                {'role': 'user', 'content': f"The Critic provided the following feedback:\n{responses_summary}\nPlease refine your solution accordingly."}
            ]
    elif agent.role == 'Critic':
        responses_summary = "\n".join([f"Solver's solution: {resp}" for resp in previous_responses])
        return [
            {'role': 'system', 'content': f"You are {agent.name}, a Critic. Critique the solution provided by the Solver."},
            {'role': 'user', 'content': f"The Solver provided the following solution:\n{responses_summary}\nProvide your detailed critique."}
        ]
    elif agent.role == 'Judge':
        solver_response, critic_response = previous_responses
        return [
            {'role': 'system', 'content': f"You are {agent.name}, a Judge. Evaluate the solution and critique provided."},
            {'role': 'user', 'content': f"Solver's solution:\n{solver_response}\n\nCritic's critique:\n{critic_response}\nProvide your detailed evaluation and final decision."}
        ]

def run_debate(agents, question, rounds=3):
    """
    Run a multi-agent debate where agents respond in multiple rounds.
    """
    # Print the initial question
    ColoredLogger.print_question(question)
    
    for round_num in range(1, rounds + 1):
        ColoredLogger.print_round(round_num)
        
        solver = next(agent for agent in agents if agent.role == 'Solver')
        critic = next(agent for agent in agents if agent.role == 'Critic')
        judge = next(agent for agent in agents if agent.role == 'Judge')

        # Step 1: Solver provides or refines the solution
        solver_messages = construct_message(solver, [], question, round_num)
        solver_response = solver.generate_response(solver_messages)
        ColoredLogger.print_colored('Solver', solver.name, solver_response)

        # Step 2: Critic critiques the solution
        critic_messages = construct_message(critic, [solver_response], question)
        critic_response = critic.generate_response(critic_messages)
        ColoredLogger.print_colored('Critic', critic.name, critic_response)

        # Step 3: Judge evaluates the final responses (only after the last round)
        if round_num == rounds:
            print(f"\n{COLORS['Round']}{'='*20} Final Judgment {'='*20}{COLORS['RESET']}\n")
            judge_messages = construct_message(judge, [solver_response, critic_response], question)
            judge_response = judge.generate_response(judge_messages)
            ColoredLogger.print_colored('Judge', judge.name, judge_response)

    return judge_response

if __name__ == "__main__":
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model_name = 'meta-llama/Llama-3.1-8B-Instruct'

    try:
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            torch_dtype=torch.float16 if device.type == 'cuda' else torch.float32
        ).to(device)
    except Exception as e:
        logging.error(f"Failed to load model {model_name}: {e}")
        raise

    agents = [
        Agent(name='Agent1', role='Solver', model=model, tokenizer=tokenizer, device=device),
        Agent(name='Agent2', role='Critic', model=model, tokenizer=tokenizer, device=device),
        Agent(name='Agent3', role='Judge', model=model, tokenizer=tokenizer, device=device),
    ]

    question = "What is the sum of even numbers from 1 to 100?"
    result = run_debate(agents, question, rounds=3)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

[96mQuestion: What is the sum of even numbers from 1 to 100?[0m



[92m[Solver - Agent1][0m
[92mI will use the following steps to solve this problem:
Step 1: Identify the sequence of even numbers from 1 to 100.
Step 2: Determine the number of terms in the sequence.
Step 3: Find the sum of the sequence using the formula for the sum of an arithmetic series.

Step 2: The sequence of even numbers from 1 to 100 is: 2, 4, 6,..., 100.
Step 3: The number of terms in the sequence is: (100 - 2) / 2 + 1 = 50.

Step 4: Now, I will find the sum of the sequence using the formula for the sum of an arithmetic series:
S = n/2 * (a1 + an)
where S is the sum, n is the number of terms, a1 is the first term, and an is the last term.
S = 50/2 * (2 + 100)
S = 25 * 102
S = 2550

Therefore, the sum of even numbers from 1 to 100 is 2550.[0m

[93m[Critic - Agent2][0m
[93mAgent2
Critic's solution: The Solver's solution is mostly correct, but it contains a few errors and could be improved in terms of clar

- Initial Round - Independent Solutions:

In the first round, multiple agents provide independent solutions (like the Solver role).

- Subsequent Rounds - Critique and Refinement:

In later rounds, agents refine their answers based on the critiques and solutions provided by other agents.

- Judge Role:

The Judge will step in only after multiple rounds have been completed to provide a final evaluation, rather than after every round.

## Adding a fix for derailment 

In [9]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
import logging

# Define color codes for each role and formatting
COLORS = {
    'Solver': '\033[92m',    # Green
    'Critic': '\033[93m',    # Yellow
    'Judge': '\033[94m',     # Blue
    'Round': '\033[95m',     # Magenta for round headers
    'Question': '\033[96m',  # Cyan for questions
    'Warning': '\033[91m',   # Red for warnings
    'RESET': '\033[0m'       # Reset color
}

class ColoredLogger:
    @staticmethod
    def print_colored(role, name, message):
        color = COLORS.get(role, COLORS['RESET'])
        print(f"{color}[{role} - {name}]{COLORS['RESET']}")
        print(f"{color}{message}{COLORS['RESET']}\n")

    @staticmethod
    def print_round(round_num):
        print(f"\n{COLORS['Round']}{'='*20} Round {round_num} {'='*20}{COLORS['RESET']}\n")

    @staticmethod
    def print_question(question):
        print(f"{COLORS['Question']}Question: {question}{COLORS['RESET']}\n")

    @staticmethod
    def print_warning(message):
        print(f"{COLORS['Warning']}Warning: {message}{COLORS['RESET']}\n")

def construct_message(agent, previous_responses, question, round_num=1):
    """
    Construct a message for the agent based on its role and round of debate.
    Includes explicit instructions to stay focused on the original question.
    """
    original_question_reminder = (
        f"Important: Stay focused on the original question: '{question}'. "
        "Do not introduce unrelated concepts or deviate from the core mathematical problem."
    )
    
    if agent.role == 'Solver':
        if round_num == 1:
            return [
                {'role': 'system', 'content': f"""You are {agent.name}, a Solver. {original_question_reminder}
                Provide a clear mathematical solution with step-by-step reasoning. Focus only on concepts directly 
                related to solving this specific problem."""},
                {'role': 'user', 'content': f"The problem to solve is: '{question}'. Please provide your detailed solution."}
            ]
        else:
            responses_summary = "\n".join([f"Critic's critique: {resp}" for resp in previous_responses])
            return [
                {'role': 'system', 'content': f"""You are {agent.name}, a Solver. {original_question_reminder}
                Revise your solution based on the Critic's feedback, but maintain focus on the original mathematical problem.
                Do not introduce concepts unrelated to the core problem."""},
                {'role': 'user', 'content': f"The Critic provided the following feedback:\n{responses_summary}\nPlease refine your solution accordingly."}
            ]
    elif agent.role == 'Critic':
        responses_summary = "\n".join([f"Solver's solution: {resp}" for resp in previous_responses])
        return [
            {'role': 'system', 'content': f"""You are {agent.name}, a Critic. {original_question_reminder}
            Evaluate the mathematical correctness and clarity of the solution. If the solution deviates from
            the original question, point this out as a critical issue."""},
            {'role': 'user', 'content': f"The Solver provided the following solution:\n{responses_summary}\nProvide your detailed critique."}
        ]
    elif agent.role == 'Judge':
        solver_response, critic_response = previous_responses
        return [
            {'role': 'system', 'content': f"""You are {agent.name}, a Judge. {original_question_reminder}
            Evaluate whether both the solution and critique stayed focused on the original question.
            If either party deviated from the core mathematical problem, this should be reflected in your evaluation."""},
            {'role': 'user', 'content': f"Solver's solution:\n{solver_response}\n\nCritic's critique:\n{critic_response}\nProvide your detailed evaluation and final decision."}
        ]

def check_topic_drift(response, original_question):
    """
    Check if the response has drifted from the original mathematical topic.
    Returns True if significant drift is detected.
    """
    # List of keywords that suggest topic drift
    drift_keywords = [
        'regression', 'data analysis', 'linear programming', 
        'constraints', 'objective function', 'non-linear',
        'variables x', 'variables y', 'variables z'
    ]
    
    # Core mathematical keywords that should be present
    math_keywords = [
        'sum', 'even numbers', 'arithmetic', 'sequence',
        'series', 'addition', 'numbers'
    ]
    
    response_lower = response.lower()
    
    # Check for presence of drift keywords
    drift_detected = any(keyword in response_lower for keyword in drift_keywords)
    
    # Check for absence of mathematical keywords
    math_focus = any(keyword in response_lower for keyword in math_keywords)
    
    return drift_detected or not math_focus

def run_debate(agents, question, rounds=3):
    """
    Run a multi-agent debate where agents respond in multiple rounds.
    Now includes topic drift detection and warnings.
    """
    ColoredLogger.print_question(question)
    
    for round_num in range(1, rounds + 1):
        ColoredLogger.print_round(round_num)
        
        solver = next(agent for agent in agents if agent.role == 'Solver')
        critic = next(agent for agent in agents if agent.role == 'Critic')
        judge = next(agent for agent in agents if agent.role == 'Judge')

        # Step 1: Solver provides or refines the solution
        solver_messages = construct_message(solver, [], question, round_num)
        solver_response = solver.generate_response(solver_messages)
        
        # Check for topic drift in solver's response
        if check_topic_drift(solver_response, question):
            ColoredLogger.print_warning("Solver's response may have deviated from the original mathematical problem.")
        
        ColoredLogger.print_colored('Solver', solver.name, solver_response)

        # Step 2: Critic critiques the solution
        critic_messages = construct_message(critic, [solver_response], question)
        critic_response = critic.generate_response(critic_messages)
        
        # Check for topic drift in critic's response
        if check_topic_drift(critic_response, question):
            ColoredLogger.print_warning("Critic's response may have deviated from the original mathematical problem.")
            
        ColoredLogger.print_colored('Critic', critic.name, critic_response)

        # Step 3: Judge evaluates the final responses (only after the last round)
        if round_num == rounds:
            print(f"\n{COLORS['Round']}{'='*20} Final Judgment {'='*20}{COLORS['RESET']}\n")
            judge_messages = construct_message(judge, [solver_response, critic_response], question)
            judge_response = judge.generate_response(judge_messages)
            
            # Check for topic drift in judge's response
            if check_topic_drift(judge_response, question):
                ColoredLogger.print_warning("Judge's response may have deviated from the original mathematical problem.")
                
            ColoredLogger.print_colored('Judge', judge.name, judge_response)

    return judge_response

if __name__ == "__main__":
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model_name = 'meta-llama/Llama-3.1-8B-Instruct'

    try:
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            torch_dtype=torch.float16 if device.type == 'cuda' else torch.float32
        ).to(device)
    except Exception as e:
        logging.error(f"Failed to load model {model_name}: {e}")
        raise

    agents = [
        Agent(name='Agent1', role='Solver', model=model, tokenizer=tokenizer, device=device),
        Agent(name='Agent2', role='Critic', model=model, tokenizer=tokenizer, device=device),
        Agent(name='Agent3', role='Judge', model=model, tokenizer=tokenizer, device=device),
    ]

    question = "What is the sum of even numbers from 1 to 100?"
    result = run_debate(agents, question, rounds=3)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

[96mQuestion: What is the sum of even numbers from 1 to 100?[0m



[92m[Solver - Agent1][0m
[92mTo solve this problem, we will use the formula for the sum of an arithmetic series, which is: 
        S = n/2 * (a + l), where S is the sum of the series, n is the number of terms, a is the first term, 
        and l is the last term.
Solver: First, let's identify the first and last terms of the series. The first even number is 2 and the last 
        even number is 100. So, a = 2 and l = 100.
Solver: Next, we need to find the number of terms in the series. Since the series is an arithmetic progression 
        with a common difference of 2, we can find the number of terms using the formula: n = (l - a)/d + 1, 
        where d is the common difference. Substituting the values, we get n = (100 - 2)/2 + 1 = 50.
Solver: Now that we have all the values, we can substitute them into the formula for the sum of an arithmetic 
        series: S = 50/2 * (2 + 100) = 25 * 102 = 2550.
Solver: Ther

In [10]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
import logging

logging.basicConfig(level=logging.INFO)

class Agent:
    def __init__(self, name, role, model=None, tokenizer=None, device=None, temperature=0.7, top_p=0.9, max_new_tokens=512):
        self.name = name
        self.role = role  # Solver, Critic, or Judge
        self.device = device if device else torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.max_new_tokens = max_new_tokens
        self.temperature = temperature
        self.top_p = top_p

        # Use the provided model and tokenizer
        self.model = model.to(self.device)
        self.tokenizer = tokenizer

        # Set up the pipeline with flexible parameters
        self.pipeline = pipeline(
            "text-generation",
            model=self.model,
            tokenizer=self.tokenizer,
            device=0 if self.device.type == 'cuda' else -1,
            max_new_tokens=self.max_new_tokens,
            do_sample=True,
            temperature=self.temperature,
            top_p=self.top_p,
            eos_token_id=self.tokenizer.eos_token_id,
            pad_token_id=self.tokenizer.eos_token_id,
        )
        self.history = []

    def generate_response(self, messages):
        """
        Generate a response from the model based on the input messages.
        """
        prompt = "\n".join([f"{m['role'].capitalize()}: {m['content']}" for m in messages])
        prompt += f"\n{self.role}:"
        try:
            response = self.pipeline(prompt)[0]['generated_text']
            # Extract the assistant's response
            response = response[len(prompt):].strip()
        except Exception as e:
            logging.error(f"Error generating response: {e}")
            return "Sorry, I couldn't generate a response."
        return response

def construct_message(agent, previous_responses, question):
    """
    Construct a message for the agent based on its role.
    """
    if agent.role == 'Solver':
        # Solver provides independent solutions in the first round
        return [
            {'role': 'system', 'content': f"You are {agent.name}, a Solver. Solve the problem independently and in detail."},
            {'role': 'user', 'content': f"The problem to solve is: '{question}'. Please provide your solution before others respond."}
        ]
    elif agent.role == 'Critic':
        # Critic provides detailed feedback in subsequent rounds
        responses_summary = "\n".join([f"Other agent's solution: {resp}" for resp in previous_responses])
        return [
            {'role': 'system', 'content': f"You are {agent.name}, a Critic. Evaluate and critique the solutions provided by other agents. Suggest improvements."},
            {'role': 'user', 'content': f"The other agents have presented the following solutions:\n{responses_summary}\nProvide your critique and suggestions for improvement."}
        ]
    elif agent.role == 'Judge':
        # Judge evaluates and delivers a final verdict after all rounds
        solver_responses = "\n".join([f"Solution: {resp}" for resp in previous_responses])
        return [
            {'role': 'system', 'content': f"You are {agent.name}, a Judge. Evaluate all the solutions and critiques provided by other agents, and deliver a final decision."},
            {'role': 'user', 'content': f"The agents have provided the following solutions and critiques:\n{solver_responses}\nProvide your evaluation and final decision."}
        ]

def run_debate(agents, question, rounds=3):
    """
    Run a multi-agent debate where agents respond in multiple rounds.
    """
    all_responses = []  # Store responses from each round

    # Round 1: Each agent provides an independent solution
    for agent in agents:
        if agent.role == 'Solver':
            solver_messages = construct_message(agent, [], question)
            solver_response = agent.generate_response(solver_messages)
            all_responses.append(solver_response)
            print(f"{agent.name} (Solver):\n{solver_response}\n")

    # Rounds 2+: Each agent refines based on other agents' solutions
    for round_num in range(2, rounds + 1):
        print(f"--- Round {round_num} ---")
        for agent in agents:
            if agent.role == 'Critic':  # Critic role steps in to review all previous responses
                critic_messages = construct_message(agent, all_responses, question)
                critic_response = agent.generate_response(critic_messages)
                all_responses.append(critic_response)
                print(f"{agent.name} (Critic):\n{critic_response}\n")

            elif agent.role == 'Solver':  # Solver refines their solution based on critiques
                solver_messages = construct_message(agent, all_responses, question)
                solver_response = agent.generate_response(solver_messages)
                all_responses.append(solver_response)
                print(f"{agent.name} (Solver):\n{solver_response}\n")

    # Final Step: Judge evaluates after all rounds
    judge = next(agent for agent in agents if agent.role == 'Judge')
    judge_messages = construct_message(judge, all_responses, question)
    judge_response = judge.generate_response(judge_messages)
    print(f"{judge.name} (Judge):\n{judge_response}\n")

    return judge_response

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Define the model name
model_name = 'meta-llama/Llama-3.1-8B-Instruct'

# Load the model and tokenizer once
try:
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.float16 if device.type == 'cuda' else torch.float32
    ).to(device)
except Exception as e:
    logging.error(f"Failed to load model {model_name}: {e}")
    raise

# Create agents with the same model and tokenizer
agents = [
    Agent(name='Agent1', role='Solver', model=model, tokenizer=tokenizer, device=device),
    Agent(name='Agent2', role='Solver', model=model, tokenizer=tokenizer, device=device),
    Agent(name='Agent3', role='Critic', model=model, tokenizer=tokenizer, device=device),
    Agent(name='Agent4', role='Judge', model=model, tokenizer=tokenizer, device=device),
]

# Define the problem to solve
question = "What is the sum of even numbers from 1 to 100?"

# Run the multi-agent debate
result = run_debate(agents, question)
print(f"Final decision: {result}")


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Agent1 (Solver):
I will solve the problem step by step.

To solve the problem, we need to identify the even numbers from 1 to 100 and add them up. 

First, we need to identify the pattern of even numbers. The first even number is 2, and every subsequent even number is 2 more than the previous even number.

Next, we need to find the last even number in the sequence. The last even number in the sequence from 1 to 100 is 100.

Now, we need to find the number of terms in the sequence. We can use the formula for the nth term of an arithmetic sequence: a_n = a_1 + (n - 1)d, where a_n is the nth term, a_1 is the first term, n is the number of terms, and d is the common difference.

We know that the first term (a_1) is 2, the last term (a_n) is 100, and the common difference (d) is 2. We can plug these values into the formula to solve for n:

100 = 2 + (n - 1)2

Subtract 2 from both sides:

98 = (n - 1)2

Divide both sides by 2:

49 = n - 1

Add 1 to both sides:

n = 50

Now that we know the n

# Which is Better?

## First Version (Simpler, Sequential, Role-Specific):
- Best for: If you want a clear and structured debate where roles are well-defined and each agent has a unique responsibility, this version is better. It’s modular, easy to extend, and straightforward to follow. It fits well for scenarios where each agent specializes in a specific task, and there is a clear flow from solving to critiquing to judging.
- Ideal Use Case: If you want to start with a more structured and deterministic approach where the debate evolves in a controlled manner.
    
## Second Version (Flexible, Multiple Solvers, Dynamic):
- Best for: If you want more diversity in the debate, where multiple Solvers can independently propose solutions and receive critiques, this version is better. It’s more flexible and can scale easily with more agents of the same role.
- Ideal Use Case: If the goal is to simulate more complex debates where multiple agents propose competing solutions and critiques, leading to a richer exchange of ideas.
    
## Conclusion:
If we want simplicity, clear role definitions, and easier maintenance, we should go with the First Version.
If we need more flexibility and diversity of ideas (with multiple agents of the same role), the Second Version is better.

In [11]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
import logging
import json
import numpy as np
import re

# Define color codes for better visualization
COLORS = {
    'Solver1': '\033[92m',   # Green
    'Solver2': '\033[96m',   # Cyan
    'Solver3': '\033[95m',   # Magenta
    'Critic': '\033[93m',    # Yellow
    'Judge': '\033[94m',     # Blue
    'Round': '\033[97m',     # White
    'Question': '\033[96m',  # Cyan
    'Warning': '\033[91m',   # Red
    'Success': '\033[92m',   # Green
    'Error': '\033[91m',     # Red
    'RESET': '\033[0m'
}

class ColoredLogger:
    @staticmethod
    def print_colored(role, name, message):
        role_key = role if role in COLORS else role[:6]  # Handle Solver1, Solver2, etc.
        color = COLORS.get(role_key, COLORS['RESET'])
        print(f"{color}[{role} - {name}]{COLORS['RESET']}")
        print(f"{color}{message}{COLORS['RESET']}\n")

    @staticmethod
    def print_round(round_num):
        print(f"\n{COLORS['Round']}{'='*20} Round {round_num} {'='*20}{COLORS['RESET']}\n")

    @staticmethod
    def print_question(question, options):
        print(f"{COLORS['Question']}Question: {question}")
        print(f"Options:\n{options}{COLORS['RESET']}\n")

    @staticmethod
    def print_warning(message):
        print(f"{COLORS['Warning']}Warning: {message}{COLORS['RESET']}\n")

    @staticmethod
    def print_result(predicted, correct, accurate):
        color = COLORS['Success'] if accurate else COLORS['Error']
        print(f"{color}Predicted: {predicted}, Correct: {correct}, Accurate: {accurate}{COLORS['RESET']}\n")

logging.basicConfig(level=logging.INFO)

def check_topic_drift(response, question, options):
    """Check if response has drifted from the multiple-choice focus."""
    # Check if the response contains any answer choice
    contains_answer = bool(re.search(r'\([A-D]\)', response))
    
    # Check if the response addresses the specific question
    addresses_question = any(keyword.lower() in response.lower() 
                           for keyword in question.lower().split())
    
    # Check if the response references the options
    references_options = any(option.lower() in response.lower() 
                           for option in options.lower().split(", "))
    
    return not (contains_answer and addresses_question and references_options)

class Agent:
    def __init__(self, name, role, model=None, tokenizer=None, device=None, temperature=0.7, top_p=0.9, max_new_tokens=512):
        self.name = name
        self.role = role
        self.device = device if device else torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.max_new_tokens = max_new_tokens
        self.temperature = temperature
        self.top_p = top_p
        self.model = model.to(self.device)
        self.tokenizer = tokenizer
        
        self.pipeline = pipeline(
            "text-generation",
            model=self.model,
            tokenizer=self.tokenizer,
            device=0 if self.device.type == 'cuda' else -1,
            max_new_tokens=self.max_new_tokens,
            do_sample=True,
            temperature=self.temperature,
            top_p=self.top_p,
            eos_token_id=self.tokenizer.eos_token_id,
            pad_token_id=self.tokenizer.eos_token_id,
        )
        self.history = []

    def generate_response(self, messages):
        """Generate a response based on the messages.
        
        Args:
            messages: List of message dictionaries with 'role' and 'content' keys
        """
        try:
            # Safely extract question and options
            if len(messages) < 2:
                logging.warning(f"Messages list too short for {self.role}. Length: {len(messages)}")
                return "Error: Insufficient message context"
                
            content = messages[1]['content']
            
            # More robust question extraction
            question = ""
            if "Problem:" in content:
                question = content.split("Problem:")[1].split("\n")[0].strip()
            elif "'" in content:
                # Fallback to original method
                try:
                    question = content.split("'")[1]
                except IndexError:
                    question = content  # Just use full content if splitting fails
            else:
                question = content
                
            # Safer options extraction
            options = ""
            if "Options:" in content:
                try:
                    options = content.split("Options:")[1].strip()
                except IndexError:
                    options = ""
                    
            # Construct prompt
            prompt = "\n".join([f"{m['role'].capitalize()}: {m['content']}" for m in messages])
            
            # Generate response using the model
            inputs = self.tokenizer(prompt, return_tensors="pt").to(self.device)
            outputs = self.model.generate(
                inputs["input_ids"],
                max_length=1024,
                num_return_sequences=1,
                temperature=0.7,
                pad_token_id=self.tokenizer.eos_token_id
            )
            response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
            
            return response.strip()
            
        except Exception as e:
            logging.error(f"Error in generate_response for {self.role}: {str(e)}")
            return f"Error generating response: {str(e)}"

def construct_message(agent, previous_responses, question, options):
    """Construct a focused message for each agent role."""
    formatted_options = "\n".join([f"({chr(65 + i)}) {option.strip()}" for i, option in enumerate(options.split(", "))])
    
    focus_reminder = (
        f"Important: Stay focused on selecting and justifying one of the provided options: "
        f"(A), (B), (C), or (D). Your response must include your choice in parentheses."
    )
    
    if agent.role == 'Solver':
        return [
            {'role': 'system', 'content': f"""You are {agent.name}, a Solver. {focus_reminder}
            Analyze the question carefully and select the best answer from the options.
            Explain your reasoning briefly but clearly, and ensure your response includes
            your selected answer in the format (A), (B), (C), or (D)."""},
            {'role': 'user', 'content': f"Question: '{question}'\nOptions:\n{formatted_options}\nProvide your answer and explanation."}
        ]
    
    elif agent.role == 'Critic':
        responses_summary = "\n".join([f"Solver's solution: {resp}" for resp in previous_responses])
        return [
            {'role': 'system', 'content': f"""You are {agent.name}, a Critic. {focus_reminder}
            Review the Solvers' answers and their reasoning. Evaluate their logic and
            provide your own answer choice with justification."""},
            {'role': 'user', 'content': f"Previous solutions:\n{responses_summary}\nOptions:\n{formatted_options}\nProvide your critique and answer choice."}
        ]
    
    elif agent.role == 'Judge':
        all_responses = "\n".join([f"Response #{i+1}: {resp}" for i, resp in enumerate(previous_responses)])
        return [
            {'role': 'system', 'content': f"""You are {agent.name}, a Judge. {focus_reminder}
            Consider all previous responses and make a final decision. Your response must
            clearly state your chosen answer in the format (A), (B), (C), or (D)."""},
            {'role': 'user', 'content': f"All responses:\n{all_responses}\nOptions:\n{formatted_options}\nProvide your final decision."}
        ]

def run_debate(agents, question, options, rounds=3):
    """Run a focused debate for multiple-choice questions."""
    all_responses = []
    
    ColoredLogger.print_question(question, options)
    
    # Round 1: Initial solutions from Solvers
    ColoredLogger.print_round(1)
    for agent in agents:
        if agent.role == 'Solver':
            solver_messages = construct_message(agent, [], question, options)
            solver_response = agent.generate_response(solver_messages)
            all_responses.append(solver_response)
            ColoredLogger.print_colored(f"{agent.role}", agent.name, solver_response)

    # Subsequent rounds
    for round_num in range(2, rounds + 1):
        ColoredLogger.print_round(round_num)
        
        # Critic evaluation
        critics = [agent for agent in agents if agent.role == 'Critic']
        for critic in critics:
            critic_messages = construct_message(critic, all_responses, question, options)
            critic_response = critic.generate_response(critic_messages)
            all_responses.append(critic_response)
            ColoredLogger.print_colored('Critic', critic.name, critic_response)

        # Solvers refinement
        solvers = [agent for agent in agents if agent.role == 'Solver']
        for solver in solvers:
            solver_messages = construct_message(solver, all_responses, question, options)
            solver_response = solver.generate_response(solver_messages)
            all_responses.append(solver_response)
            ColoredLogger.print_colored('Solver', solver.name, solver_response)

    # Final judgment
    ColoredLogger.print_round("Final Judgment")
    judge = next(agent for agent in agents if agent.role == 'Judge')
    judge_messages = construct_message(judge, all_responses, question, options)
    judge_response = judge.generate_response(judge_messages)
    ColoredLogger.print_colored('Judge', judge.name, judge_response)

    return judge_response

def parse_answer(input_str):
    """Parse the model's output to extract the multiple-choice answer."""
    pattern = r'\(([A-D])\)'
    matches = re.findall(pattern, input_str)
    if matches:
        return f"({matches[0].upper()})"
    return None

def evaluate_on_mmlu(agents, mmlu_data):
    """Evaluate the debate system on MMLU data with enhanced visualization."""
    accuracies = []
    total_questions = len(mmlu_data)
    
    print(f"\n{COLORS['Round']}{'='*20} MMLU Evaluation {'='*20}{COLORS['RESET']}\n")
    
    for i, entry in enumerate(mmlu_data, 1):
        print(f"\n{COLORS['Round']}Question {i}/{total_questions}{COLORS['RESET']}\n")
        
        question = entry['question']
        options = entry['options']
        correct_answer = entry['answer']

        final_decision = run_debate(agents, question, options, rounds=3)
        predicted_answer = parse_answer(final_decision)
        
        accurate = 1 if predicted_answer == correct_answer else 0
        accuracies.append(accurate)
        
        ColoredLogger.print_result(predicted_answer, correct_answer, accurate)
        
        if (i % 5) == 0:
            current_accuracy = np.mean(accuracies)
            print(f"{COLORS['Round']}Current Accuracy: {current_accuracy:.2%}{COLORS['RESET']}\n")

    mean_accuracy = np.mean(accuracies)
    print(f"\n{COLORS['Success']}Final Accuracy: {mean_accuracy:.2%}{COLORS['RESET']}\n")
    return mean_accuracy

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Define the model name
model_name = 'meta-llama/Llama-3.1-8B-Instruct'

# Load the model and tokenizer once
try:
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.float16 if device.type == 'cuda' else torch.float32
    ).to(device)
except Exception as e:
    logging.error(f"Failed to load model {model_name}: {e}")
    raise

# Create agents with the same model and tokenizer
agents = [
    Agent(name='Agent1', role='Solver', model=model, tokenizer=tokenizer, device=device),
    Agent(name='Agent2', role='Solver', model=model, tokenizer=tokenizer, device=device),
    Agent(name='Agent3', role='Solver', model=model, tokenizer=tokenizer, device=device),
    Agent(name='Agent4', role='Critic', model=model, tokenizer=tokenizer, device=device),
    Agent(name='Agent5', role='Judge', model=model, tokenizer=tokenizer, device=device),
]

# Load MMLU data (assumed to be in JSON format)
with open("mmlu_data_small.json", "r") as file:
    mmlu_data = json.load(file)

# Run evaluation
evaluate_on_mmlu(agents, mmlu_data)


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

FileNotFoundError: [Errno 2] No such file or directory: 'mmlu_data_small.json'

In [12]:
!pip install numpy==1.22.4 openai==0.27.6 pandas==1.5.3 tqdm==4.64.1 -q

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.1[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m


In [20]:
import torch
torch.cuda.empty_cache()  # Clear unused cached memory

## Evaluations on GSM8K Test Dataset

In [21]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
import logging
import json
import numpy as np
import re
from collections import Counter

# Define color codes for better visualization
COLORS = {
    'Solver': '\033[92m',    # Green
    'Critic': '\033[93m',    # Yellow
    'Judge': '\033[94m',     # Blue
    'Round': '\033[97m',     # White
    'Question': '\033[96m',  # Cyan
    'Warning': '\033[91m',   # Red
    'Success': '\033[92m',   # Green
    'Error': '\033[91m',     # Red
    'RESET': '\033[0m'
}

class ColoredLogger:
    @staticmethod
    def print_colored(role, name, message):
        color = COLORS.get(role, COLORS['RESET'])
        print(f"{color}[{role} - {name}]{COLORS['RESET']}")
        print(f"{color}{message}{COLORS['RESET']}\n")

    @staticmethod
    def print_round(round_num):
        print(f"\n{COLORS['Round']}{'='*20} Round {round_num} {'='*20}{COLORS['RESET']}\n")

    @staticmethod
    def print_result(predicted, correct, accurate):
        color = COLORS['Success'] if accurate else COLORS['Error']
        print(f"{color}Predicted: {predicted}, Correct: {correct}, Accurate: {accurate}{COLORS['RESET']}\n")

def clean_number(num_str):
    """Clean up a number string by removing repetitions and extra decimals."""
    # Remove repeating decimals (like 160.0.0.0.0)
    if '.' in num_str:
        parts = num_str.split('.')
        return parts[0] + ('.' + parts[1].split('.')[0] if len(parts) > 1 else '')
    return num_str

def extract_final_answer(answer_text):
    """Extract the numerical answer after #### from the answer text."""
    if '####' in answer_text:
        try:
            answer = answer_text.split('####')[1].strip()
            # Remove any currency symbols and commas
            answer = answer.replace('$', '').replace(',', '').strip()
            # Clean the number format
            answer = clean_number(answer)
            return float(answer)
        except (ValueError, IndexError):
            return None
    return None

def extract_llm_numerical_answer(response):
    """Extract numerical answer from LLM response with improved pattern matching."""
    # Try to find boxed answer first
    boxed_pattern = r'\\boxed{([0-9,.]+)}|\$\\boxed{([0-9,.]+)}\$'
    boxed_matches = re.findall(boxed_pattern, response)
    if boxed_matches:
        for match in boxed_matches:
            for group in match:
                if group:
                    try:
                        cleaned = clean_number(group)
                        return float(cleaned.replace(',', ''))
                    except ValueError:
                        continue

    # Look for "final answer is: number" format
    final_pattern = r'final answer is:?\s*\$?\s*([0-9,.]+)'
    final_match = re.search(final_pattern, response, re.IGNORECASE)
    if final_match:
        try:
            cleaned = clean_number(final_match.group(1))
            return float(cleaned.replace(',', ''))
        except ValueError:
            pass

    # Look for the last calculation result
    calc_pattern = r'=\s*([0-9,.]+)\s*$'
    calc_matches = re.findall(calc_pattern, response, re.MULTILINE)
    if calc_matches:
        try:
            cleaned = clean_number(calc_matches[-1])
            return float(cleaned.replace(',', ''))
        except ValueError:
            pass

    return None

class Agent:
    def __init__(self, name, role, model, tokenizer, device):
        self.name = name
        self.role = role
        self.model = model
        self.tokenizer = tokenizer
        self.device = device

    def generate_response(self, messages):
        """Generate a response with controlled length and formatting."""
        try:
            # Construct prompt
            prompt = "\n".join([f"{m['role'].capitalize()}: {m['content']}" for m in messages])
            
            # Generate response
            inputs = self.tokenizer(prompt, return_tensors="pt", truncation=True, 
                                  max_length=512).to(self.device)
            outputs = self.model.generate(
                inputs["input_ids"],
                max_new_tokens=256,
                num_return_sequences=1,
                temperature=0.7,
                pad_token_id=self.tokenizer.eos_token_id
            )
            response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
            
            # Clean up response
            response = response.split("User:", 1)[-1].strip()
            response = response.split("System:", 1)[-1].strip()
            
            return response
            
        except Exception as e:
            logging.error(f"Error in generate_response for {self.role}: {str(e)}")
            return f"Error generating response: {str(e)}"

def construct_message(agent, previous_responses, question, round_num=1):
    """Construct a focused message for each agent role with round context."""
    if agent.role == 'Solver':
        if round_num == 1:
            return [
                {'role': 'system', 'content': 
                 """You are a math Solver. Follow these rules exactly:
                 1. Show maximum 3 calculation steps
                 2. Write each step as "Step N: calculation = result"
                 3. End ONLY with: The final answer is: \\boxed{X}
                 4. Use at most 2 decimal places
                 5. No additional text or explanations"""
                },
                {'role': 'user', 'content': f"Problem: {question}\nProvide solution:"}
            ]
        else:
            # For subsequent rounds, include previous solutions
            solutions = "\n".join([f"Solution {i+1}: {resp}" for i, resp in enumerate(previous_responses)])
            return [
                {'role': 'system', 'content': 
                 """You are a math Solver. Review previous solutions and provide your refined answer:
                 1. Check if previous calculations are correct
                 2. If you find errors, show correct calculation
                 3. End ONLY with: The final answer is: \\boxed{X}
                 4. Use at most 2 decimal places"""
                },
                {'role': 'user', 'content': f"Problem: {question}\nPrevious solutions:\n{solutions}\nProvide your solution:"}
            ]
    
    elif agent.role == 'Judge':
        # Extract numerical answers from previous responses
        answers = []
        for resp in previous_responses:
            ans = extract_llm_numerical_answer(resp)
            if ans is not None:
                answers.append(str(ans))
                
        return [
            {'role': 'system', 'content': 
             """You are a Judge. Follow these rules exactly:
             1. Choose the most common correct answer from solvers
             2. If answers vary, pick the most reasonable one
             3. Respond ONLY with: The final answer is: \\boxed{X}
             4. Use at most 2 decimal places"""
            },
            {'role': 'user', 'content': f"Question: {question}\nSolver answers from all rounds: {', '.join(answers)}\nProvide final answer:"}
        ]

def run_debate(agents, question, rounds=3):
    """Run a focused debate for math problems with multiple rounds."""
    all_responses = []
    solver_responses = []
    
    print(f"\n{COLORS['Question']}Question: {question}{COLORS['RESET']}\n")
    
    # Run multiple rounds
    for round_num in range(1, rounds + 1):
        ColoredLogger.print_round(round_num)
        round_responses = []
        
        # Get solutions from Solvers
        for agent in agents:
            if agent.role == 'Solver':
                response = agent.generate_response(
                    construct_message(agent, solver_responses, question, round_num)
                )
                round_responses.append(response)
                all_responses.append(response)
                ColoredLogger.print_colored('Solver', agent.name, response)
        
        # Add round responses to solver_responses
        solver_responses.extend(round_responses)
        
        # Print intermediate results if not final round
        if round_num < rounds:
            print(f"\n{COLORS['Round']}End of Round {round_num}{COLORS['RESET']}\n")
            # Extract and show current answers
            current_answers = [extract_llm_numerical_answer(resp) for resp in round_responses]
            print(f"{COLORS['Round']}Current answers: {current_answers}{COLORS['RESET']}\n")
    
    # Final judgment after all rounds
    ColoredLogger.print_round("Final Judgment")
    judge = next(agent for agent in agents if agent.role == 'Judge')
    judge_response = agent.generate_response(construct_message(judge, all_responses, question))
    ColoredLogger.print_colored('Judge', judge.name, judge_response)
    
    return judge_response

def evaluate_on_math_problems(agents, data_path, num_questions=None):
    """Evaluate the debate system on math problems."""
    accuracies = []
    
    # Read and prepare problems
    with open(data_path, 'r') as file:
        problems = [json.loads(line) for line in file]
    if num_questions is not None:
        problems = problems[:num_questions]
    
    print(f"\n{COLORS['Round']}{'='*20} Math Problems Evaluation {'='*20}{COLORS['RESET']}\n")
    print(f"{COLORS['Round']}Evaluating {len(problems)} questions{COLORS['RESET']}\n")
    
    for i, entry in enumerate(problems, 1):
        print(f"\n{COLORS['Round']}Problem {i}/{len(problems)}{COLORS['RESET']}\n")
        
        question = entry['question']
        correct_answer = extract_final_answer(entry['answer'])
        final_decision = run_debate(agents, question)
        predicted_answer = extract_llm_numerical_answer(final_decision)
        
        # Compare answers
        try:
            if predicted_answer is not None and correct_answer is not None:
                accurate = 1 if abs(float(predicted_answer) - float(correct_answer)) < 1e-6 else 0
            else:
                accurate = 0
        except (ValueError, TypeError):
            accurate = 0
            
        accuracies.append(accurate)
        ColoredLogger.print_result(predicted_answer, correct_answer, accurate)
        
        if (i % 5) == 0:
            print(f"{COLORS['Round']}Current Accuracy: {np.mean(accuracies):.2%}{COLORS['RESET']}\n")

    final_accuracy = np.mean(accuracies)
    print(f"\n{COLORS['Success']}Final Accuracy: {final_accuracy:.2%}{COLORS['RESET']}\n")
    return final_accuracy

if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model_name = 'AI-MO/NuminaMath-7B-TIR'

    try:
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            torch_dtype=torch.float16 if device.type == 'cuda' else torch.float32
        ).to(device)
    except Exception as e:
        logging.error(f"Failed to load model {model_name}: {e}")
        raise

    agents = [
        Agent(name='Agent1', role='Solver', model=model, tokenizer=tokenizer, device=device),
        Agent(name='Agent2', role='Solver', model=model, tokenizer=tokenizer, device=device),
        # Agent(name='Agent3', role='Solver', model=model, tokenizer=tokenizer, device=device),
        Agent(name='Judge', role='Judge', model=model, tokenizer=tokenizer, device=device),
    ]

    evaluate_on_math_problems(agents, "test.jsonl", num_questions=20)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]



[97mEvaluating 20 questions[0m


[97mProblem 1/20[0m


[96mQuestion: Janet’s ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with four. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. How much in dollars does she make every day at the farmers' market?[0m



[92m[Solver - Agent1][0m
[92mProblem: Janet’s ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with four. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. How much in dollars does she make every day at the farmers' market?
Provide solution:
```python
# Define the problem parameters
eggs_per_day = 16
breakfast_eggs = 3
muffins_eggs = 4
price_per_egg = 2

# Calculate the remainder eggs sold each day
sold_eggs = eggs_per_day - (breakfast_eggs + muffins_eggs)

# Calculate the daily earnings from selling the eggs
daily_earnings = sold_e

In [22]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
import logging
import json
import numpy as np
import re
from collections import Counter

# Define color codes for better visualization
COLORS = {
    'Solver': '\033[92m',    # Green
    'Critic': '\033[93m',    # Yellow
    'Judge': '\033[94m',     # Blue
    'Round': '\033[97m',     # White
    'Question': '\033[96m',  # Cyan
    'Warning': '\033[91m',   # Red
    'Success': '\033[92m',   # Green
    'Error': '\033[91m',     # Red
    'RESET': '\033[0m'
}

class ColoredLogger:
    @staticmethod
    def print_colored(role, name, message):
        color = COLORS.get(role, COLORS['RESET'])
        print(f"{color}[{role} - {name}]{COLORS['RESET']}")
        print(f"{color}{message}{COLORS['RESET']}\n")

    @staticmethod
    def print_round(round_num):
        print(f"\n{COLORS['Round']}{'='*20} Round {round_num} {'='*20}{COLORS['RESET']}\n")

    @staticmethod
    def print_result(predicted, correct, accurate):
        color = COLORS['Success'] if accurate else COLORS['Error']
        print(f"{color}Predicted: {predicted}, Correct: {correct}, Accurate: {accurate}{COLORS['RESET']}\n")

def clean_number(num_str):
    """Clean up a number string by removing repetitions and extra decimals."""
    # Remove repeating decimals (like 160.0.0.0.0)
    if '.' in num_str:
        parts = num_str.split('.')
        return parts[0] + ('.' + parts[1].split('.')[0] if len(parts) > 1 else '')
    return num_str

def extract_final_answer(answer_text):
    """Extract the numerical answer after #### from the answer text."""
    if '####' in answer_text:
        try:
            answer = answer_text.split('####')[1].strip()
            # Remove any currency symbols and commas
            answer = answer.replace('$', '').replace(',', '').strip()
            # Clean the number format
            answer = clean_number(answer)
            return float(answer)
        except (ValueError, IndexError):
            return None
    return None

def extract_llm_numerical_answer(response):
    """Extract numerical answer from LLM response with improved pattern matching."""
    # Try to find boxed answer first
    boxed_pattern = r'\\boxed{([0-9,.]+)}|\$\\boxed{([0-9,.]+)}\$'
    boxed_matches = re.findall(boxed_pattern, response)
    if boxed_matches:
        for match in boxed_matches:
            for group in match:
                if group:
                    try:
                        cleaned = clean_number(group)
                        return float(cleaned.replace(',', ''))
                    except ValueError:
                        continue

    # Look for "final answer is: number" format
    final_pattern = r'final answer is:?\s*\$?\s*([0-9,.]+)'
    final_match = re.search(final_pattern, response, re.IGNORECASE)
    if final_match:
        try:
            cleaned = clean_number(final_match.group(1))
            return float(cleaned.replace(',', ''))
        except ValueError:
            pass

    # Look for the last calculation result
    calc_pattern = r'=\s*([0-9,.]+)\s*$'
    calc_matches = re.findall(calc_pattern, response, re.MULTILINE)
    if calc_matches:
        try:
            cleaned = clean_number(calc_matches[-1])
            return float(cleaned.replace(',', ''))
        except ValueError:
            pass

    return None

class Agent:
    def __init__(self, name, role, model, tokenizer, device):
        self.name = name
        self.role = role
        self.model = model
        self.tokenizer = tokenizer
        self.device = device

    def generate_response(self, messages):
        """Generate a response with controlled length and formatting."""
        try:
            # Construct prompt
            prompt = "\n".join([f"{m['role'].capitalize()}: {m['content']}" for m in messages])
            
            # Generate response
            inputs = self.tokenizer(prompt, return_tensors="pt", truncation=True, 
                                  max_length=512).to(self.device)
            outputs = self.model.generate(
                inputs["input_ids"],
                max_new_tokens=256,
                num_return_sequences=1,
                temperature=0.7,
                pad_token_id=self.tokenizer.eos_token_id
            )
            response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
            
            # Clean up response
            response = response.split("User:", 1)[-1].strip()
            response = response.split("System:", 1)[-1].strip()
            
            return response
            
        except Exception as e:
            logging.error(f"Error in generate_response for {self.role}: {str(e)}")
            return f"Error generating response: {str(e)}"

def construct_message(agent, previous_responses, question, round_num=1):
    """Construct a focused message for each agent role with round context."""
    if agent.role == 'Solver':
        if round_num == 1:
            return [
                {'role': 'system', 'content': 
                 """You are a math Solver. Follow these rules exactly:
                 1. Show maximum 3 calculation steps
                 2. Write each step as "Step N: calculation = result"
                 3. End ONLY with: The final answer is: \\boxed{X}
                 4. Use at most 2 decimal places
                 5. No additional text or explanations"""
                },
                {'role': 'user', 'content': f"Problem: {question}\nProvide solution:"}
            ]
        else:
            # For subsequent rounds, include previous solutions
            solutions = "\n".join([f"Solution {i+1}: {resp}" for i, resp in enumerate(previous_responses)])
            return [
                {'role': 'system', 'content': 
                 """You are a math Solver. Review previous solutions and provide your refined answer:
                 1. Check if previous calculations are correct
                 2. If you find errors, show correct calculation
                 3. End ONLY with: The final answer is: \\boxed{X}
                 4. Use at most 2 decimal places"""
                },
                {'role': 'user', 'content': f"Problem: {question}\nPrevious solutions:\n{solutions}\nProvide your solution:"}
            ]
    
    elif agent.role == 'Judge':
        # Extract numerical answers from previous responses
        answers = []
        for resp in previous_responses:
            ans = extract_llm_numerical_answer(resp)
            if ans is not None:
                answers.append(str(ans))
                
        return [
            {'role': 'system', 'content': 
             """You are a Judge. Follow these rules exactly:
             1. Choose the most common correct answer from solvers
             2. If answers vary, pick the most reasonable one
             3. Respond ONLY with: The final answer is: \\boxed{X}
             4. Use at most 2 decimal places"""
            },
            {'role': 'user', 'content': f"Question: {question}\nSolver answers from all rounds: {', '.join(answers)}\nProvide final answer:"}
        ]

def run_debate(agents, question, rounds=3):
    """Run a focused debate for math problems with multiple rounds."""
    all_responses = []
    solver_responses = []
    
    print(f"\n{COLORS['Question']}Question: {question}{COLORS['RESET']}\n")
    
    # Run multiple rounds
    for round_num in range(1, rounds + 1):
        ColoredLogger.print_round(round_num)
        round_responses = []
        
        # Get solutions from Solvers
        for agent in agents:
            if agent.role == 'Solver':
                response = agent.generate_response(
                    construct_message(agent, solver_responses, question, round_num)
                )
                round_responses.append(response)
                all_responses.append(response)
                ColoredLogger.print_colored('Solver', agent.name, response)
        
        # Add round responses to solver_responses
        solver_responses.extend(round_responses)
        
        # Print intermediate results if not final round
        if round_num < rounds:
            print(f"\n{COLORS['Round']}End of Round {round_num}{COLORS['RESET']}\n")
            # Extract and show current answers
            current_answers = [extract_llm_numerical_answer(resp) for resp in round_responses]
            print(f"{COLORS['Round']}Current answers: {current_answers}{COLORS['RESET']}\n")
    
    # Final judgment after all rounds
    ColoredLogger.print_round("Final Judgment")
    judge = next(agent for agent in agents if agent.role == 'Judge')
    judge_response = agent.generate_response(construct_message(judge, all_responses, question))
    ColoredLogger.print_colored('Judge', judge.name, judge_response)
    
    return judge_response

def evaluate_on_math_problems(agents, data_path, num_questions=None):
    """Evaluate the debate system on math problems."""
    accuracies = []
    
    # Read and prepare problems
    with open(data_path, 'r') as file:
        problems = [json.loads(line) for line in file]
    if num_questions is not None:
        problems = problems[:num_questions]
    
    print(f"\n{COLORS['Round']}{'='*20} Math Problems Evaluation {'='*20}{COLORS['RESET']}\n")
    print(f"{COLORS['Round']}Evaluating {len(problems)} questions{COLORS['RESET']}\n")
    
    for i, entry in enumerate(problems, 1):
        print(f"\n{COLORS['Round']}Problem {i}/{len(problems)}{COLORS['RESET']}\n")
        
        question = entry['question']
        correct_answer = extract_final_answer(entry['answer'])
        final_decision = run_debate(agents, question)
        predicted_answer = extract_llm_numerical_answer(final_decision)
        
        # Compare answers
        try:
            if predicted_answer is not None and correct_answer is not None:
                accurate = 1 if abs(float(predicted_answer) - float(correct_answer)) < 1e-6 else 0
            else:
                accurate = 0
        except (ValueError, TypeError):
            accurate = 0
            
        accuracies.append(accurate)
        ColoredLogger.print_result(predicted_answer, correct_answer, accurate)
        
        if (i % 5) == 0:
            print(f"{COLORS['Round']}Current Accuracy: {np.mean(accuracies):.2%}{COLORS['RESET']}\n")

    final_accuracy = np.mean(accuracies)
    print(f"\n{COLORS['Success']}Final Accuracy: {final_accuracy:.2%}{COLORS['RESET']}\n")
    return final_accuracy

if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model_name = 'AI-MO/NuminaMath-7B-TIR'

    try:
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            torch_dtype=torch.float16 if device.type == 'cuda' else torch.float32
        ).to(device)
    except Exception as e:
        logging.error(f"Failed to load model {model_name}: {e}")
        raise

    agents = [
        Agent(name='Agent1', role='Solver', model=model, tokenizer=tokenizer, device=device),
        Agent(name='Agent2', role='Solver', model=model, tokenizer=tokenizer, device=device),
        Agent(name='Agent3', role='Critic', model=model, tokenizer=tokenizer, device=device),
        Agent(name='Judge', role='Judge', model=model, tokenizer=tokenizer, device=device),
    ]

    evaluate_on_math_problems(agents, "test.jsonl", num_questions=20)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]



[97mEvaluating 20 questions[0m


[97mProblem 1/20[0m


[96mQuestion: Janet’s ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with four. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. How much in dollars does she make every day at the farmers' market?[0m



[92m[Solver - Agent1][0m
[92mProblem: Janet’s ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with four. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. How much in dollars does she make every day at the farmers' market?
Provide solution:
```python
# Define the problem parameters
eggs_per_day = 16
breakfast_eggs = 3
muffins_eggs = 4
price_per_egg = 2

# Calculate the remainder eggs sold each day
sold_eggs = eggs_per_day - (breakfast_eggs + muffins_eggs)

# Calculate the daily earnings from selling the eggs
daily_earnings = sold_e

In [24]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
import logging
import json
import numpy as np
import re
from collections import Counter

# Define color codes for better visualization
COLORS = {
    'Solver': '\033[92m',    # Green
    'Critic': '\033[93m',    # Yellow
    'Judge': '\033[94m',     # Blue
    'Round': '\033[97m',     # White
    'Question': '\033[96m',  # Cyan
    'Warning': '\033[91m',   # Red
    'Success': '\033[92m',   # Green
    'Error': '\033[91m',     # Red
    'RESET': '\033[0m'
}

class ColoredLogger:
    @staticmethod
    def print_colored(role, name, message):
        color = COLORS.get(role, COLORS['RESET'])
        print(f"{color}[{role} - {name}]{COLORS['RESET']}")
        print(f"{color}{message}{COLORS['RESET']}\n")

    @staticmethod
    def print_round(round_num):
        print(f"\n{COLORS['Round']}{'='*20} Round {round_num} {'='*20}{COLORS['RESET']}\n")

    @staticmethod
    def print_result(predicted, correct, accurate):
        color = COLORS['Success'] if accurate else COLORS['Error']
        print(f"{color}Predicted: {predicted}, Correct: {correct}, Accurate: {accurate}{COLORS['RESET']}\n")

def clean_number(num_str):
    """Clean up a number string by removing repetitions and extra decimals."""
    # Remove repeating decimals (like 160.0.0.0.0)
    if '.' in num_str:
        parts = num_str.split('.')
        return parts[0] + ('.' + parts[1].split('.')[0] if len(parts) > 1 else '')
    return num_str

def extract_final_answer(answer_text):
    """Extract the numerical answer after #### from the answer text."""
    if '####' in answer_text:
        try:
            answer = answer_text.split('####')[1].strip()
            # Remove any currency symbols and commas
            answer = answer.replace('$', '').replace(',', '').strip()
            # Clean the number format
            answer = clean_number(answer)
            return float(answer)
        except (ValueError, IndexError):
            return None
    return None

def extract_llm_numerical_answer(response):
    """Extract numerical answer from LLM response with improved pattern matching."""
    # Try to find boxed answer first
    boxed_pattern = r'\\boxed{([0-9,.]+)}|\$\\boxed{([0-9,.]+)}\$'
    boxed_matches = re.findall(boxed_pattern, response)
    if boxed_matches:
        for match in boxed_matches:
            for group in match:
                if group:
                    try:
                        cleaned = clean_number(group)
                        return float(cleaned.replace(',', ''))
                    except ValueError:
                        continue

    # Look for "final answer is: number" format
    final_pattern = r'final answer is:?\s*\$?\s*([0-9,.]+)'
    final_match = re.search(final_pattern, response, re.IGNORECASE)
    if final_match:
        try:
            cleaned = clean_number(final_match.group(1))
            return float(cleaned.replace(',', ''))
        except ValueError:
            pass

    # Look for the last calculation result
    calc_pattern = r'=\s*([0-9,.]+)\s*$'
    calc_matches = re.findall(calc_pattern, response, re.MULTILINE)
    if calc_matches:
        try:
            cleaned = clean_number(calc_matches[-1])
            return float(cleaned.replace(',', ''))
        except ValueError:
            pass

    return None

class Agent:
    def __init__(self, name, role, model, tokenizer, device):
        self.name = name
        self.role = role
        self.model = model
        self.tokenizer = tokenizer
        self.device = device

    def generate_response(self, messages):
        """Generate a response with controlled length and formatting."""
        try:
            # Construct prompt
            prompt = "\n".join([f"{m['role'].capitalize()}: {m['content']}" for m in messages])
            
            # Generate response
            inputs = self.tokenizer(prompt, return_tensors="pt", truncation=True, 
                                  max_length=512).to(self.device)
            outputs = self.model.generate(
                inputs["input_ids"],
                max_new_tokens=256,
                num_return_sequences=1,
                temperature=0.7,
                pad_token_id=self.tokenizer.eos_token_id
            )
            response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
            
            # Clean up response
            response = response.split("User:", 1)[-1].strip()
            response = response.split("System:", 1)[-1].strip()
            
            return response
            
        except Exception as e:
            logging.error(f"Error in generate_response for {self.role}: {str(e)}")
            return f"Error generating response: {str(e)}"

def construct_message(agent, previous_responses, question, round_num=1):
    """Construct a focused message for each agent role with round context."""
    if agent.role == 'Solver':
        if round_num == 1:
            return [
                {'role': 'system', 'content': 
                 """You are a math Solver. Follow these rules exactly:
                 1. Show maximum 3 calculation steps
                 2. Write each step as "Step N: calculation = result"
                 3. VERY IMPORTANT: End ONLY with: The final answer is: \\boxed{X}
                 4. Use at most 2 decimal places
                 5. No additional text or explanations"""
                },
                {'role': 'user', 'content': f"Problem: {question}\nProvide solution:"}
            ]
        else:
            # For subsequent rounds, include previous solutions
            solutions = "\n".join([f"Solution {i+1}: {resp}" for i, resp in enumerate(previous_responses)])
            return [
                {'role': 'system', 'content': 
                 """You are a math Solver. Review previous solutions and provide your refined answer:
                 1. Check if previous calculations are correct
                 2. If you find errors, show correct calculation
                 3. VERY IMPORTANT: End ONLY with: The final answer is: \\boxed{X}
                 4. Use at most 2 decimal places"""
                },
                {'role': 'user', 'content': f"Problem: {question}\nPrevious solutions:\n{solutions}\nProvide your solution:"}
            ]
    
    elif agent.role == 'Judge':
        # Extract numerical answers from previous responses
        answers = []
        for resp in previous_responses:
            ans = extract_llm_numerical_answer(resp)
            if ans is not None:
                answers.append(str(ans))
                
        return [
            {'role': 'system', 'content': 
             """You are a Judge. Follow these rules exactly:
             1. Choose the most common correct answer from solvers
             2. VERY IMPORTANT: Respond ONLY with: The final answer is: \\boxed{X}
             3. Use at most 2 decimal places"""
            },
            {'role': 'user', 'content': f"Question: {question}\nSolver answers from all rounds: {', '.join(answers)}\nProvide final answer:"}
        ]

def run_debate(agents, question, rounds=2):
    """Run a focused debate for math problems with multiple rounds."""
    all_responses = []
    solver_responses = []
    
    print(f"\n{COLORS['Question']}Question: {question}{COLORS['RESET']}\n")
    
    # Run multiple rounds
    for round_num in range(1, rounds + 1):
        ColoredLogger.print_round(round_num)
        round_responses = []
        
        # Get solutions from Solvers
        for agent in agents:
            if agent.role == 'Solver':
                response = agent.generate_response(
                    construct_message(agent, solver_responses, question, round_num)
                )
                round_responses.append(response)
                all_responses.append(response)
                ColoredLogger.print_colored('Solver', agent.name, response)
        
        # Add round responses to solver_responses
        solver_responses.extend(round_responses)
        
        # Print intermediate results if not final round
        if round_num < rounds:
            print(f"\n{COLORS['Round']}End of Round {round_num}{COLORS['RESET']}\n")
            # Extract and show current answers
            current_answers = [extract_llm_numerical_answer(resp) for resp in round_responses]
            print(f"{COLORS['Round']}Current answers: {current_answers}{COLORS['RESET']}\n")
    
    # Final judgment after all rounds
    ColoredLogger.print_round("Final Judgment")
    judge = next(agent for agent in agents if agent.role == 'Judge')
    judge_response = agent.generate_response(construct_message(judge, all_responses, question))
    ColoredLogger.print_colored('Judge', judge.name, judge_response)
    
    return judge_response

def evaluate_on_math_problems(agents, data_path, num_questions=None):
    """Evaluate the debate system on math problems."""
    accuracies = []
    
    # Read and prepare problems
    with open(data_path, 'r') as file:
        problems = [json.loads(line) for line in file]
    if num_questions is not None:
        problems = problems[:num_questions]
    
    print(f"\n{COLORS['Round']}{'='*20} Math Problems Evaluation {'='*20}{COLORS['RESET']}\n")
    print(f"{COLORS['Round']}Evaluating {len(problems)} questions{COLORS['RESET']}\n")
    
    for i, entry in enumerate(problems, 1):
        print(f"\n{COLORS['Round']}Problem {i}/{len(problems)}{COLORS['RESET']}\n")
        
        question = entry['question']
        correct_answer = extract_final_answer(entry['answer'])
        final_decision = run_debate(agents, question)
        predicted_answer = extract_llm_numerical_answer(final_decision)
        
        # Compare answers
        try:
            if predicted_answer is not None and correct_answer is not None:
                accurate = 1 if abs(float(predicted_answer) - float(correct_answer)) < 1e-6 else 0
            else:
                accurate = 0
        except (ValueError, TypeError):
            accurate = 0
            
        accuracies.append(accurate)
        ColoredLogger.print_result(predicted_answer, correct_answer, accurate)
        
        if (i % 5) == 0:
            print(f"{COLORS['Round']}Current Accuracy: {np.mean(accuracies):.2%}{COLORS['RESET']}\n")

    final_accuracy = np.mean(accuracies)
    print(f"\n{COLORS['Success']}Final Accuracy: {final_accuracy:.2%}{COLORS['RESET']}\n")
    return final_accuracy

if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model_name = 'AI-MO/NuminaMath-7B-TIR'

    try:
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            torch_dtype=torch.float16 if device.type == 'cuda' else torch.float32
        ).to(device)
    except Exception as e:
        logging.error(f"Failed to load model {model_name}: {e}")
        raise

    agents = [
        Agent(name='Agent1', role='Solver', model=model, tokenizer=tokenizer, device=device),
        Agent(name='Agent2', role='Solver', model=model, tokenizer=tokenizer, device=device),
        # Agent(name='Agent3', role='Critic', model=model, tokenizer=tokenizer, device=device),
        Agent(name='Judge', role='Judge', model=model, tokenizer=tokenizer, device=device),
    ]

    evaluate_on_math_problems(agents, "test.jsonl", num_questions=20)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]



[97mEvaluating 20 questions[0m


[97mProblem 1/20[0m


[96mQuestion: Janet’s ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with four. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. How much in dollars does she make every day at the farmers' market?[0m



[92m[Solver - Agent1][0m
[92mProblem: Janet’s ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with four. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. How much in dollars does she make every day at the farmers' market?
Provide solution:
```python
# Define the problem parameters
eggs_per_day = 16
breakfast_eggs = 3
muffins_eggs = 4
price_per_egg = 2

# Calculate the remainder eggs sold each day
sold_eggs = eggs_per_day - (breakfast_eggs + muffins_eggs)

# Calculate the daily earnings from selling the eggs
daily_earnings = sold_e

In [26]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
import logging
import json
import numpy as np
import re
import random
from collections import Counter

# Colors for visualization remain the same...

def generate_response(tokenizer, model, device, prompt):
    """Generate a single response with clear formatting requirements."""
    full_prompt = (
        f"Human: {prompt}\n\n"
        "Solve the problem step by step and provide your final answer in the format: \\boxed{X}\n\n"
        "Assistant:"
    )
    
    inputs = tokenizer(full_prompt, return_tensors="pt", max_length=1024, truncation=True).to(device)
    outputs = model.generate(
        **inputs,
        max_new_tokens=512,
        num_return_sequences=1,
        temperature=0.7,
        pad_token_id=tokenizer.eos_token_id
    )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

def extract_answer(response):
    """Extract numerical answer with improved pattern matching."""
    # Try boxed format first
    boxed_match = re.search(r'\\boxed{([0-9,.]+)}', response)
    if boxed_match:
        try:
            return float(boxed_match.group(1).replace(',', ''))
        except ValueError:
            pass
    
    # Try #### format
    hash_match = re.search(r'####\s*([0-9,.]+)', response)
    if hash_match:
        try:
            return float(hash_match.group(1).replace(',', ''))
        except ValueError:
            pass
    
    # Look for the last calculation result
    calc_matches = re.findall(r'=\s*([0-9,.]+)\s*$', response, re.MULTILINE)
    if calc_matches:
        try:
            return float(calc_matches[-1].replace(',', ''))
        except ValueError:
            pass
            
    return None

def debate(problem, num_agents, num_rounds, tokenizer, model, device):
    """Run a multi-round debate with multiple agents."""
    print(f"\n{COLORS['Question']}Problem: {problem}{COLORS['RESET']}\n")
    
    responses = []
    # Initial round
    print(f"{COLORS['Round']}Round 1{COLORS['RESET']}")
    for i in range(num_agents):
        response = generate_response(tokenizer, model, device, problem)
        responses.append(response)
        print(f"{COLORS['Solver']}[Agent {i+1}]{COLORS['RESET']}\n{response}\n")
    
    # Subsequent rounds
    for round_num in range(2, num_rounds + 1):
        print(f"{COLORS['Round']}Round {round_num}{COLORS['RESET']}")
        new_responses = []
        
        for i in range(num_agents):
            # Create debate prompt including other agents' responses
            other_responses = responses[:i] + responses[i+1:]
            debate_prompt = (
                f"Problem: {problem}\n\n"
                f"Previous solutions:\n{other_responses[:2]}\n\n"  # Only show top 2 other responses
                "Review the previous solutions, identify any errors, "
                "and provide your solution with a final answer in \\boxed{X} format."
            )
            
            new_response = generate_response(tokenizer, model, device, debate_prompt)
            new_responses.append(new_response)
            print(f"{COLORS['Solver']}[Agent {i+1}]{COLORS['RESET']}\n{new_response}\n")
        
        responses = new_responses
    
    # Extract final answers
    answers = [extract_answer(r) for r in responses]
    print(f"\n{COLORS['Round']}Final answers: {answers}{COLORS['RESET']}")
    
    # Return most common answer
    valid_answers = [a for a in answers if a is not None]
    if valid_answers:
        return Counter(valid_answers).most_common(1)[0][0]
    return None

def evaluate_model(data_path, num_agents, num_rounds, num_problems, tokenizer, model, device):
    """Evaluate the debate system on a set of problems."""
    # Load problems
    with open(data_path, 'r') as f:
        problems = [json.loads(line) for line in f]
    
    # Select random problems if needed
    if num_problems < len(problems):
        problems = random.sample(problems, num_problems)
    
    correct = 0
    print(f"\n{COLORS['Round']}Evaluating with {num_agents} agents and {num_rounds} rounds...{COLORS['RESET']}")
    
    for i, problem in enumerate(problems, 1):
        question = problem['question']
        true_answer = extract_answer(problem['answer'])
        
        final_answer = debate(question, num_agents, num_rounds, tokenizer, model, device)
        
        # Compare answers
        is_correct = (final_answer is not None and true_answer is not None and 
                     abs(float(final_answer) - float(true_answer)) < 1e-6)
        correct += int(is_correct)
        
        print(f"\n{COLORS['Success' if is_correct else 'Error']}"
              f"Problem {i}: True={true_answer}, Predicted={final_answer}, "
              f"Correct={is_correct}{COLORS['RESET']}")
    
    accuracy = correct / len(problems)
    print(f"\n{COLORS['Success']}Final Accuracy: {accuracy:.2%}{COLORS['RESET']}")
    return accuracy

if __name__ == "__main__":
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model_name = 'meta-llama/Llama-3.1-8B-Instruct'
    
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.float16 if device.type == 'cuda' else torch.float32
    ).to(device)
    
    # Run experiments with different configurations
    agent_numbers = [2, 3, 4]
    round_numbers = [1, 2, 3]
    results = {}
    
    for agents in agent_numbers:
        for rounds in round_numbers:
            accuracy = evaluate_model(
                "test.jsonl", 
                num_agents=agents, 
                num_rounds=rounds,
                num_problems=3,
                tokenizer=tokenizer,
                model=model,
                device=device
            )
            results[(agents, rounds)] = accuracy
            
    # Print results table
    print("\nResults:")
    print("Agents | Rounds | Accuracy")
    print("-" * 30)
    for (agents, rounds), acc in results.items():
        print(f"{agents:6d} | {rounds:6d} | {acc:8.2%}")

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]


[97mEvaluating with 2 agents and 1 rounds...[0m

[96mProblem: Jeremy saw 12 birds in their backyard and threw a stone at them, scaring away 1/3 of that number. A few minutes later, 20 more birds joined the fearless birds. How many birds are now in the backyard?[0m

[97mRound 1[0m
[92m[Agent 1][0m
Human: Jeremy saw 12 birds in their backyard and threw a stone at them, scaring away 1/3 of that number. A few minutes later, 20 more birds joined the fearless birds. How many birds are now in the backyard?

Solve the problem step by step and provide your final answer in the format: \boxed{X}

Assistant: \boxed{20} is the answer to a different problem. To solve this problem, follow these steps:

## Step 1: Calculate the number of birds scared away
Jeremy threw a stone at the birds and scared away 1/3 of the 12 birds he saw. So, 1/3 of 12 is 12 / 3 = 4 birds.

## Step 2: Calculate the number of birds remaining
The number of birds remaining after scaring away 4 is 12 - 4 = 8.

## Step 3

In [34]:
!pip install matplotlib seaborn  plotly -q

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.1[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m


In [43]:
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path
import numpy as np

class DebateVisualizer:
    def __init__(self, results_df):
        """Initialize visualizer with a DataFrame of results"""
        self.df = results_df
        
    def create_configuration_grid(self):
        """Create a grid visualization of accuracies"""
        plt.figure(figsize=(10, 6))
        
        # Get unique values for agents and rounds
        agents = sorted(self.df['num_agents'].unique())
        rounds = sorted(self.df['num_rounds'].unique())
        
        # Create grid of accuracies
        grid = np.zeros((len(agents), len(rounds)))
        for i, agent in enumerate(agents):
            for j, round_num in enumerate(rounds):
                mask = (self.df['num_agents'] == agent) & (self.df['num_rounds'] == round_num)
                if any(mask):
                    grid[i, j] = self.df[mask]['is_correct'].mean()
        
        # Plot as text in a grid
        for i in range(len(agents)):
            for j in range(len(rounds)):
                plt.text(j, i, f'{grid[i,j]:.2%}', 
                        ha='center', va='center',
                        bbox=dict(facecolor='white', edgecolor='gray', alpha=0.7))
        
        plt.xticks(range(len(rounds)), [f'Round {r}' for r in rounds])
        plt.yticks(range(len(agents)), [f'{a} Agents' for a in agents])
        plt.title('Accuracy by Configuration')
        plt.grid(True, linestyle='--', alpha=0.3)
        plt.tight_layout()
        
        return plt.gcf()
    
    def create_line_plot(self):
        """Create line plot showing accuracy trends"""
        plt.figure(figsize=(10, 6))
        
        # Group by configuration and calculate mean accuracy
        grouped = self.df.groupby(['num_agents', 'num_rounds'])['is_correct'].mean()
        
        # Plot lines for each number of agents
        for agent in sorted(self.df['num_agents'].unique()):
            data = grouped[agent]
            plt.plot(data.index, data.values, 
                    marker='o', label=f'{agent} Agents')
        
        plt.xlabel('Number of Rounds')
        plt.ylabel('Accuracy')
        plt.title('Accuracy Trends')
        plt.legend()
        plt.grid(True, linestyle='--', alpha=0.3)
        plt.tight_layout()
        
        return plt.gcf()

def analyze_debate_results(file_path, output_dir="debate_analysis"):
    """Analyze debate results from file"""
    # Create output directory
    output_dir = Path(output_dir)
    output_dir.mkdir(exist_ok=True)
    
    # Read and process results
    with open(file_path, 'r') as f:
        content = f.read()
    
    # Extract results using string processing
    results = []
    current_config = None
    
    for line in content.split('\n'):
        if "Evaluating with" in line:
            parts = line.split()
            current_config = {
                'num_agents': int(parts[2]),
                'num_rounds': int(parts[5].rstrip('...'))
            }
        
        if "Problem" in line and "True=" in line:
            parts = line.split()
            prob_num = int(parts[1].rstrip(':'))
            true_val = float(parts[2].split('=')[1].rstrip(','))
            pred_val = float(parts[3].split('=')[1].rstrip(','))
            is_correct = parts[4].split('=')[1] == 'True'
            
            if current_config:
                results.append({
                    **current_config,
                    'problem_num': prob_num,
                    'true_value': true_val,
                    'predicted_value': pred_val,
                    'is_correct': is_correct
                })
    
    # Create DataFrame
    df = pd.DataFrame(results)
    
    # Create visualizations
    viz = DebateVisualizer(df)
    
    # Save grid visualization
    plt.figure(figsize=(10, 6))
    fig1 = viz.create_configuration_grid()
    fig1.savefig(output_dir / 'configuration_grid.png')
    plt.close()
    
    # Save trend visualization
    plt.figure(figsize=(10, 6))
    fig2 = viz.create_line_plot()
    fig2.savefig(output_dir / 'accuracy_trends.png')
    plt.close()
    
    # Generate summary report
    summary = ["Debate Analysis Summary", "=" * 25]
    summary.append(f"\nOverall Accuracy: {df['is_correct'].mean():.2%}")
    
    # Add accuracy by configuration
    summary.append("\nAccuracy by Configuration:")
    config_accuracy = df.groupby(['num_agents', 'num_rounds'])['is_correct'].mean()
    for (agents, rounds), acc in config_accuracy.items():
        summary.append(f"{agents} agents, {rounds} rounds: {acc:.2%}")
    
    # Save summary report
    with open(output_dir / 'summary_report.txt', 'w') as f:
        f.write('\n'.join(summary))
    
    # Save processed data
    df.to_csv(output_dir / 'results.csv', index=False)
    
    return viz

if __name__ == "__main__":
    visualizer = analyze_debate_results('debate_output.txt')

ValueError: object __array__ method not producing an array

Error in callback <function _draw_all_if_interactive at 0x789c57badab0> (for post_execute), with arguments args (),kwargs {}:


ValueError: object __array__ method not producing an array

<Figure size 1000x600 with 0 Axes>

ValueError: object __array__ method not producing an array

<Figure size 1000x600 with 1 Axes>