In [1]:
!pip install trelis

Collecting trelis
  Downloading trelis-1.3.0-py3-none-any.whl.metadata (7.7 kB)
Downloading trelis-1.3.0-py3-none-any.whl (26 kB)
Installing collected packages: trelis
Successfully installed trelis-1.3.0
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.1[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m


In [1]:
!pip install 'accelerate>=0.26.0' torch transformers

Collecting accelerate>=0.26.0
  Downloading accelerate-1.1.1-py3-none-any.whl.metadata (19 kB)
Collecting transformers
  Downloading transformers-4.46.2-py3-none-any.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.1/44.1 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting huggingface-hub>=0.21.0 (from accelerate>=0.26.0)
  Downloading huggingface_hub-0.26.2-py3-none-any.whl.metadata (13 kB)
Collecting safetensors>=0.4.3 (from accelerate>=0.26.0)
  Downloading safetensors-0.4.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.8 kB)
Collecting regex!=2019.12.17 (from transformers)
  Downloading regex-2024.11.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (40 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.5/40.5 kB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.21,>=0.20 (from transformers)
  Downloading tokenizers-0.20.3-cp310-cp310-manylinux_2_17_x8

In [2]:
from huggingface_hub import login
import os

# Login
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [4]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
import logging

# Define color codes for each role
COLORS = {
    'Solver': '\033[92m',  # Green
    'Critic': '\033[93m',  # Yellow
    'Judge': '\033[94m',   # Blue
    'RESET': '\033[0m'     # Reset color
}

class ColoredLogger:
    @staticmethod
    def print_colored(role, name, message):
        color = COLORS.get(role, COLORS['RESET'])
        print(f"{color}[{role} - {name}]{COLORS['RESET']}")
        print(f"{color}{message}{COLORS['RESET']}\n")

logging.basicConfig(level=logging.INFO)

class Agent:
    def __init__(self, name, role, model_name=None, model=None, tokenizer=None, device=None, temperature=0.7, top_p=0.9, max_new_tokens=512):
        self.name = name
        self.role = role  # Solver, Critic, or Judge
        self.device = device if device else torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.max_new_tokens = max_new_tokens
        self.temperature = temperature
        self.top_p = top_p

        if model is not None and tokenizer is not None:
            self.model = model.to(self.device)
            self.tokenizer = tokenizer
        elif model_name is not None:
            try:
                self.tokenizer = AutoTokenizer.from_pretrained(model_name)
                self.model = AutoModelForCausalLM.from_pretrained(
                    model_name,
                    torch_dtype=torch.float16 if self.device.type == 'cuda' else torch.float32
                ).to(self.device)
            except Exception as e:
                logging.error(f"Failed to load model {model_name}: {e}")
                raise
        else:
            raise ValueError("Either model and tokenizer or model_name must be provided")

        self.pipeline = pipeline(
            "text-generation",
            model=self.model,
            tokenizer=self.tokenizer,
            device=0 if self.device.type == 'cuda' else -1,
            max_new_tokens=self.max_new_tokens,
            do_sample=True,
            temperature=self.temperature,
            top_p=self.top_p,
            eos_token_id=self.tokenizer.eos_token_id,
            pad_token_id=self.tokenizer.eos_token_id,
        )
        self.history = []
        self.score = 1.0

    def generate_response(self, messages):
        """
        Generate a response from the model based on the input messages.
        """
        prompt = "\n".join([f"{m['role'].capitalize()}: {m['content']}" for m in messages])
        prompt += f"\n{self.role}:"
        try:
            response = self.pipeline(prompt)[0]['generated_text']
            response = response[len(prompt):].strip()
        except Exception as e:
            logging.error(f"Error generating response: {e}")
            return "Sorry, I couldn't generate a response."
        return response

def construct_message(agent, previous_responses, question):
    """
    Construct a message for the agent based on its role.
    """
    if agent.role == 'Solver':
        return [
            {'role': 'system', 'content': f"You are {agent.name}, a Solver. Your task is to solve the following problem in detail, providing clear and complete explanations, including any mathematical proofs and examples where appropriate."},
            {'role': 'user', 'content': f"The problem to solve is: '{question}'. Please provide your detailed solution before anyone else responds."}
        ]
    elif agent.role == 'Critic':
        responses_summary = "\n".join([f"Solver's solution: {resp}" for resp in previous_responses])
        return [
            {'role': 'system', 'content': f"You are {agent.name}, a Critic. Provide a detailed critique of the solution provided by the Solver, pointing out any errors or areas for improvement, and offering suggestions for correction."},
            {'role': 'user', 'content': f"The Solver has presented the following solution:\n{responses_summary}\nProvide your comprehensive critique."}
        ]
    elif agent.role == 'Judge':
        solver_response, critic_response = previous_responses
        return [
            {'role': 'system', 'content': f"You are {agent.name}, a Judge. Evaluate the solution provided by the Solver and the critique provided by the Critic in detail. Assess the correctness of the solution, the validity of the critique, and provide a final verdict with explanations."},
            {'role': 'user', 'content': f"Solver's solution:\n{solver_response}\n\nCritic's critique:\n{critic_response}\n\nProvide your detailed evaluation and final decision."}
        ]

def run_debate(agents, question, rounds=1):
    """
    Run a multi-agent debate where agents respond in a controlled sequential order.
    """
    solver = next(agent for agent in agents if agent.role == 'Solver')
    critic = next(agent for agent in agents if agent.role == 'Critic')
    judge = next(agent for agent in agents if agent.role == 'Judge')

    # Print the question in white
    print(f"\n{COLORS['RESET']}Question: {question}\n")

    # Step 1: Solver provides the solution
    solver_messages = construct_message(solver, [], question)
    solver_response = solver.generate_response(solver_messages)
    ColoredLogger.print_colored('Solver', solver.name, solver_response)

    # Step 2: Critic critiques the solution
    critic_messages = construct_message(critic, [solver_response], question)
    critic_response = critic.generate_response(critic_messages)
    ColoredLogger.print_colored('Critic', critic.name, critic_response)

    # Step 3: Continue the debate if there are more rounds
    for round_num in range(2, rounds + 1):
        print(f"\n{COLORS['RESET']}=== Round {round_num} ===\n")
        
        # Solver may refine the solution based on critique
        solver_messages = construct_message(solver, [critic_response], question)
        solver_response = solver.generate_response(solver_messages)
        ColoredLogger.print_colored('Solver', solver.name, solver_response)

        # Critic responds with further critique
        critic_messages = construct_message(critic, [solver_response], question)
        critic_response = critic.generate_response(critic_messages)
        ColoredLogger.print_colored('Critic', critic.name, critic_response)

    # Step 4: Judge evaluates the final responses
    print(f"\n{COLORS['RESET']}=== Final Judgment ===\n")
    judge_messages = construct_message(judge, [solver_response, critic_response], question)
    judge_response = judge.generate_response(judge_messages)
    ColoredLogger.print_colored('Judge', judge.name, judge_response)

    return judge_response


device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model_name = 'meta-llama/Llama-3.1-8B-Instruct'

try:
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.float16 if device.type == 'cuda' else torch.float32
    ).to(device)
except Exception as e:
    logging.error(f"Failed to load model {model_name}: {e}")
    raise

agents = [
    Agent(name='Agent1', role='Solver', model=model, tokenizer=tokenizer, device=device),
    Agent(name='Agent2', role='Critic', model=model, tokenizer=tokenizer, device=device),
    Agent(name='Agent3', role='Judge', model=model, tokenizer=tokenizer, device=device),
]

question = "What is the sum of even numbers from 1 to 100?"
result = run_debate(agents, question)

tokenizer_config.json:   0%|          | 0.00/55.4k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/855 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/184 [00:00<?, ?B/s]


[0mQuestion: What is the sum of even numbers from 1 to 100?

[92m[Solver - Agent1][0m
[92mTo solve this problem, we will use a mathematical formula to calculate the sum of even numbers from 1 to 100. The formula to calculate the sum of even numbers from 1 to n is: sum = (n/2) * (first even number + last even number).

## Step 1: Determine the first and last even numbers
The first even number is 2, and the last even number is 100.

## Step 2: Apply the formula
Using the formula sum = (n/2) * (first even number + last even number), we substitute n with 100 (since we are calculating the sum of even numbers from 1 to 100) and the first and last even numbers with 2 and 100 respectively.

## Step 3: Perform the calculation
sum = (100/2) * (2 + 100)
sum = 50 * 102
sum = 5100

The final answer is: $\boxed{5100}$[0m

[93m[Critic - Agent2][0m
[93mI have reviewed the solution provided by the Solver, and I have identified several areas of concern. Firstly, the formula used to calculate th

- First Round (Independent Responses):

In the first round, each agent (Solver, Critic, and Judge) independently provides their initial answer to the problem without relying on the others.

- Subsequent Rounds (Refinement):

In subsequent rounds, each agent refines their response by considering the critiques and solutions provided by other agents in previous rounds. This encourages the agents to converge on a more accurate final answer.

- Multiple Agents for the Same Role:

To closely follow the debate model from the paper, you can introduce multiple solvers and critics, allowing more diverse viewpoints and feedback during the debate process.

In [5]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
import logging

# Define color codes for each role and formatting
COLORS = {
    'Solver': '\033[92m',    # Green
    'Critic': '\033[93m',    # Yellow
    'Judge': '\033[94m',     # Blue
    'Round': '\033[95m',     # Magenta for round headers
    'Question': '\033[96m',  # Cyan for questions
    'RESET': '\033[0m'       # Reset color
}

class ColoredLogger:
    @staticmethod
    def print_colored(role, name, message):
        color = COLORS.get(role, COLORS['RESET'])
        print(f"{color}[{role} - {name}]{COLORS['RESET']}")
        print(f"{color}{message}{COLORS['RESET']}\n")

    @staticmethod
    def print_round(round_num):
        print(f"\n{COLORS['Round']}{'='*20} Round {round_num} {'='*20}{COLORS['RESET']}\n")

    @staticmethod
    def print_question(question):
        print(f"{COLORS['Question']}Question: {question}{COLORS['RESET']}\n")

logging.basicConfig(level=logging.INFO)

class Agent:
    def __init__(self, name, role, model_name=None, model=None, tokenizer=None, device=None, temperature=0.7, top_p=0.9, max_new_tokens=512):
        self.name = name
        self.role = role
        self.device = device if device else torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.max_new_tokens = max_new_tokens
        self.temperature = temperature
        self.top_p = top_p

        if model is not None and tokenizer is not None:
            self.model = model.to(self.device)
            self.tokenizer = tokenizer
        elif model_name is not None:
            try:
                self.tokenizer = AutoTokenizer.from_pretrained(model_name)
                self.model = AutoModelForCausalLM.from_pretrained(
                    model_name,
                    torch_dtype=torch.float16 if self.device.type == 'cuda' else torch.float32
                ).to(self.device)
            except Exception as e:
                logging.error(f"Failed to load model {model_name}: {e}")
                raise
        else:
            raise ValueError("Either model and tokenizer or model_name must be provided")

        self.pipeline = pipeline(
            "text-generation",
            model=self.model,
            tokenizer=self.tokenizer,
            device=0 if self.device.type == 'cuda' else -1,
            max_new_tokens=self.max_new_tokens,
            do_sample=True,
            temperature=self.temperature,
            top_p=self.top_p,
            eos_token_id=self.tokenizer.eos_token_id,
            pad_token_id=self.tokenizer.eos_token_id,
        )
        self.history = []
        self.score = 1.0

    def generate_response(self, messages):
        """
        Generate a response from the model based on the input messages.
        """
        prompt = "\n".join([f"{m['role'].capitalize()}: {m['content']}" for m in messages])
        prompt += f"\n{self.role}:"
        try:
            response = self.pipeline(prompt)[0]['generated_text']
            response = response[len(prompt):].strip()
        except Exception as e:
            logging.error(f"Error generating response: {e}")
            return "Sorry, I couldn't generate a response."
        return response

def construct_message(agent, previous_responses, question, round_num=1):
    """
    Construct a message for the agent based on its role and round of debate.
    """
    if agent.role == 'Solver':
        if round_num == 1:
            return [
                {'role': 'system', 'content': f"You are {agent.name}, a Solver. Provide a detailed solution to the problem."},
                {'role': 'user', 'content': f"The problem to solve is: '{question}'. Please provide your detailed solution."}
            ]
        else:
            responses_summary = "\n".join([f"Critic's critique: {resp}" for resp in previous_responses])
            return [
                {'role': 'system', 'content': f"You are {agent.name}, a Solver. Revise your solution based on the feedback provided by the Critic."},
                {'role': 'user', 'content': f"The Critic provided the following feedback:\n{responses_summary}\nPlease refine your solution accordingly."}
            ]
    elif agent.role == 'Critic':
        responses_summary = "\n".join([f"Solver's solution: {resp}" for resp in previous_responses])
        return [
            {'role': 'system', 'content': f"You are {agent.name}, a Critic. Critique the solution provided by the Solver."},
            {'role': 'user', 'content': f"The Solver provided the following solution:\n{responses_summary}\nProvide your detailed critique."}
        ]
    elif agent.role == 'Judge':
        solver_response, critic_response = previous_responses
        return [
            {'role': 'system', 'content': f"You are {agent.name}, a Judge. Evaluate the solution and critique provided."},
            {'role': 'user', 'content': f"Solver's solution:\n{solver_response}\n\nCritic's critique:\n{critic_response}\nProvide your detailed evaluation and final decision."}
        ]

def run_debate(agents, question, rounds=3):
    """
    Run a multi-agent debate where agents respond in multiple rounds.
    """
    # Print the initial question
    ColoredLogger.print_question(question)
    
    for round_num in range(1, rounds + 1):
        ColoredLogger.print_round(round_num)
        
        solver = next(agent for agent in agents if agent.role == 'Solver')
        critic = next(agent for agent in agents if agent.role == 'Critic')
        judge = next(agent for agent in agents if agent.role == 'Judge')

        # Step 1: Solver provides or refines the solution
        solver_messages = construct_message(solver, [], question, round_num)
        solver_response = solver.generate_response(solver_messages)
        ColoredLogger.print_colored('Solver', solver.name, solver_response)

        # Step 2: Critic critiques the solution
        critic_messages = construct_message(critic, [solver_response], question)
        critic_response = critic.generate_response(critic_messages)
        ColoredLogger.print_colored('Critic', critic.name, critic_response)

        # Step 3: Judge evaluates the final responses (only after the last round)
        if round_num == rounds:
            print(f"\n{COLORS['Round']}{'='*20} Final Judgment {'='*20}{COLORS['RESET']}\n")
            judge_messages = construct_message(judge, [solver_response, critic_response], question)
            judge_response = judge.generate_response(judge_messages)
            ColoredLogger.print_colored('Judge', judge.name, judge_response)

    return judge_response

if __name__ == "__main__":
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model_name = 'meta-llama/Llama-3.1-8B-Instruct'

    try:
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            torch_dtype=torch.float16 if device.type == 'cuda' else torch.float32
        ).to(device)
    except Exception as e:
        logging.error(f"Failed to load model {model_name}: {e}")
        raise

    agents = [
        Agent(name='Agent1', role='Solver', model=model, tokenizer=tokenizer, device=device),
        Agent(name='Agent2', role='Critic', model=model, tokenizer=tokenizer, device=device),
        Agent(name='Agent3', role='Judge', model=model, tokenizer=tokenizer, device=device),
    ]

    question = "What is the sum of even numbers from 1 to 100?"
    result = run_debate(agents, question, rounds=3)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

[96mQuestion: What is the sum of even numbers from 1 to 100?[0m



[92m[Solver - Agent1][0m
[92mI will provide a step-by-step solution to the problem.

Step 1: Identify the range of numbers to consider
The problem asks for the sum of even numbers from 1 to 100. Therefore, I will consider the range of numbers from 2 to 100 (since 1 is not an even number).

Step 2: Determine the sequence of even numbers
The sequence of even numbers in the given range is: 2, 4, 6, 8,..., 100.

Step 3: Identify the pattern of the sequence
The sequence of even numbers is an arithmetic sequence with a common difference of 2.

Step 4: Find the number of terms in the sequence
To find the number of terms in the sequence, I will use the formula: n = (last term - first term) / common difference + 1.
n = (100 - 2) / 2 + 1
n = 98 / 2 + 1
n = 49 + 1
n = 50

Step 5: Calculate the sum of the sequence
To calculate the sum of the sequence, I will use the formula: sum = (n/2)(first term + last term)
sum = (50/2)(2 +

- Initial Round - Independent Solutions:

In the first round, multiple agents provide independent solutions (like the Solver role).

- Subsequent Rounds - Critique and Refinement:

In later rounds, agents refine their answers based on the critiques and solutions provided by other agents.

- Judge Role:

The Judge will step in only after multiple rounds have been completed to provide a final evaluation, rather than after every round.

## Adding a fix for derailment 

In [6]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
import logging

# Define color codes for each role and formatting
COLORS = {
    'Solver': '\033[92m',    # Green
    'Critic': '\033[93m',    # Yellow
    'Judge': '\033[94m',     # Blue
    'Round': '\033[95m',     # Magenta for round headers
    'Question': '\033[96m',  # Cyan for questions
    'Warning': '\033[91m',   # Red for warnings
    'RESET': '\033[0m'       # Reset color
}

class ColoredLogger:
    @staticmethod
    def print_colored(role, name, message):
        color = COLORS.get(role, COLORS['RESET'])
        print(f"{color}[{role} - {name}]{COLORS['RESET']}")
        print(f"{color}{message}{COLORS['RESET']}\n")

    @staticmethod
    def print_round(round_num):
        print(f"\n{COLORS['Round']}{'='*20} Round {round_num} {'='*20}{COLORS['RESET']}\n")

    @staticmethod
    def print_question(question):
        print(f"{COLORS['Question']}Question: {question}{COLORS['RESET']}\n")

    @staticmethod
    def print_warning(message):
        print(f"{COLORS['Warning']}Warning: {message}{COLORS['RESET']}\n")

def construct_message(agent, previous_responses, question, round_num=1):
    """
    Construct a message for the agent based on its role and round of debate.
    Includes explicit instructions to stay focused on the original question.
    """
    original_question_reminder = (
        f"Important: Stay focused on the original question: '{question}'. "
        "Do not introduce unrelated concepts or deviate from the core mathematical problem."
    )
    
    if agent.role == 'Solver':
        if round_num == 1:
            return [
                {'role': 'system', 'content': f"""You are {agent.name}, a Solver. {original_question_reminder}
                Provide a clear mathematical solution with step-by-step reasoning. Focus only on concepts directly 
                related to solving this specific problem."""},
                {'role': 'user', 'content': f"The problem to solve is: '{question}'. Please provide your detailed solution."}
            ]
        else:
            responses_summary = "\n".join([f"Critic's critique: {resp}" for resp in previous_responses])
            return [
                {'role': 'system', 'content': f"""You are {agent.name}, a Solver. {original_question_reminder}
                Revise your solution based on the Critic's feedback, but maintain focus on the original mathematical problem.
                Do not introduce concepts unrelated to the core problem."""},
                {'role': 'user', 'content': f"The Critic provided the following feedback:\n{responses_summary}\nPlease refine your solution accordingly."}
            ]
    elif agent.role == 'Critic':
        responses_summary = "\n".join([f"Solver's solution: {resp}" for resp in previous_responses])
        return [
            {'role': 'system', 'content': f"""You are {agent.name}, a Critic. {original_question_reminder}
            Evaluate the mathematical correctness and clarity of the solution. If the solution deviates from
            the original question, point this out as a critical issue."""},
            {'role': 'user', 'content': f"The Solver provided the following solution:\n{responses_summary}\nProvide your detailed critique."}
        ]
    elif agent.role == 'Judge':
        solver_response, critic_response = previous_responses
        return [
            {'role': 'system', 'content': f"""You are {agent.name}, a Judge. {original_question_reminder}
            Evaluate whether both the solution and critique stayed focused on the original question.
            If either party deviated from the core mathematical problem, this should be reflected in your evaluation."""},
            {'role': 'user', 'content': f"Solver's solution:\n{solver_response}\n\nCritic's critique:\n{critic_response}\nProvide your detailed evaluation and final decision."}
        ]

def check_topic_drift(response, original_question):
    """
    Check if the response has drifted from the original mathematical topic.
    Returns True if significant drift is detected.
    """
    # List of keywords that suggest topic drift
    drift_keywords = [
        'regression', 'data analysis', 'linear programming', 
        'constraints', 'objective function', 'non-linear',
        'variables x', 'variables y', 'variables z'
    ]
    
    # Core mathematical keywords that should be present
    math_keywords = [
        'sum', 'even numbers', 'arithmetic', 'sequence',
        'series', 'addition', 'numbers'
    ]
    
    response_lower = response.lower()
    
    # Check for presence of drift keywords
    drift_detected = any(keyword in response_lower for keyword in drift_keywords)
    
    # Check for absence of mathematical keywords
    math_focus = any(keyword in response_lower for keyword in math_keywords)
    
    return drift_detected or not math_focus

def run_debate(agents, question, rounds=3):
    """
    Run a multi-agent debate where agents respond in multiple rounds.
    Now includes topic drift detection and warnings.
    """
    ColoredLogger.print_question(question)
    
    for round_num in range(1, rounds + 1):
        ColoredLogger.print_round(round_num)
        
        solver = next(agent for agent in agents if agent.role == 'Solver')
        critic = next(agent for agent in agents if agent.role == 'Critic')
        judge = next(agent for agent in agents if agent.role == 'Judge')

        # Step 1: Solver provides or refines the solution
        solver_messages = construct_message(solver, [], question, round_num)
        solver_response = solver.generate_response(solver_messages)
        
        # Check for topic drift in solver's response
        if check_topic_drift(solver_response, question):
            ColoredLogger.print_warning("Solver's response may have deviated from the original mathematical problem.")
        
        ColoredLogger.print_colored('Solver', solver.name, solver_response)

        # Step 2: Critic critiques the solution
        critic_messages = construct_message(critic, [solver_response], question)
        critic_response = critic.generate_response(critic_messages)
        
        # Check for topic drift in critic's response
        if check_topic_drift(critic_response, question):
            ColoredLogger.print_warning("Critic's response may have deviated from the original mathematical problem.")
            
        ColoredLogger.print_colored('Critic', critic.name, critic_response)

        # Step 3: Judge evaluates the final responses (only after the last round)
        if round_num == rounds:
            print(f"\n{COLORS['Round']}{'='*20} Final Judgment {'='*20}{COLORS['RESET']}\n")
            judge_messages = construct_message(judge, [solver_response, critic_response], question)
            judge_response = judge.generate_response(judge_messages)
            
            # Check for topic drift in judge's response
            if check_topic_drift(judge_response, question):
                ColoredLogger.print_warning("Judge's response may have deviated from the original mathematical problem.")
                
            ColoredLogger.print_colored('Judge', judge.name, judge_response)

    return judge_response

if __name__ == "__main__":
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model_name = 'meta-llama/Llama-3.1-8B-Instruct'

    try:
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            torch_dtype=torch.float16 if device.type == 'cuda' else torch.float32
        ).to(device)
    except Exception as e:
        logging.error(f"Failed to load model {model_name}: {e}")
        raise

    agents = [
        Agent(name='Agent1', role='Solver', model=model, tokenizer=tokenizer, device=device),
        Agent(name='Agent2', role='Critic', model=model, tokenizer=tokenizer, device=device),
        Agent(name='Agent3', role='Judge', model=model, tokenizer=tokenizer, device=device),
    ]

    question = "What is the sum of even numbers from 1 to 100?"
    result = run_debate(agents, question, rounds=3)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

[96mQuestion: What is the sum of even numbers from 1 to 100?[0m



[92m[Solver - Agent1][0m
[92mTo solve the problem, let's break it down into steps.

Step 1: Identify the sequence of even numbers from 1 to 100.
The sequence of even numbers from 1 to 100 is 2, 4, 6, 8,..., 100.

Step 2: Determine the number of terms in the sequence.
Since the sequence starts at 2 and ends at 100, with a common difference of 2, we can use the formula for the nth term of an arithmetic sequence to find the number of terms. The formula is: a_n = a_1 + (n-1)d, where a_n is the nth term, a_1 is the first term, n is the number of terms, and d is the common difference.

Rearrange the formula to solve for n: n = (a_n - a_1)/d + 1

Substitute the values: n = (100 - 2)/2 + 1
n = 98/2 + 1
n = 49 + 1
n = 50

There are 50 terms in the sequence.

Step 3: Find the sum of the sequence using the formula for the sum of an arithmetic series.
The formula for the sum of an arithmetic series is: S_n = (n/2)(a_1 + a_n), 

In [7]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
import logging

logging.basicConfig(level=logging.INFO)

class Agent:
    def __init__(self, name, role, model=None, tokenizer=None, device=None, temperature=0.7, top_p=0.9, max_new_tokens=512):
        self.name = name
        self.role = role  # Solver, Critic, or Judge
        self.device = device if device else torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.max_new_tokens = max_new_tokens
        self.temperature = temperature
        self.top_p = top_p

        # Use the provided model and tokenizer
        self.model = model.to(self.device)
        self.tokenizer = tokenizer

        # Set up the pipeline with flexible parameters
        self.pipeline = pipeline(
            "text-generation",
            model=self.model,
            tokenizer=self.tokenizer,
            device=0 if self.device.type == 'cuda' else -1,
            max_new_tokens=self.max_new_tokens,
            do_sample=True,
            temperature=self.temperature,
            top_p=self.top_p,
            eos_token_id=self.tokenizer.eos_token_id,
            pad_token_id=self.tokenizer.eos_token_id,
        )
        self.history = []

    def generate_response(self, messages):
        """
        Generate a response from the model based on the input messages.
        """
        prompt = "\n".join([f"{m['role'].capitalize()}: {m['content']}" for m in messages])
        prompt += f"\n{self.role}:"
        try:
            response = self.pipeline(prompt)[0]['generated_text']
            # Extract the assistant's response
            response = response[len(prompt):].strip()
        except Exception as e:
            logging.error(f"Error generating response: {e}")
            return "Sorry, I couldn't generate a response."
        return response

def construct_message(agent, previous_responses, question):
    """
    Construct a message for the agent based on its role.
    """
    if agent.role == 'Solver':
        # Solver provides independent solutions in the first round
        return [
            {'role': 'system', 'content': f"You are {agent.name}, a Solver. Solve the problem independently and in detail."},
            {'role': 'user', 'content': f"The problem to solve is: '{question}'. Please provide your solution before others respond."}
        ]
    elif agent.role == 'Critic':
        # Critic provides detailed feedback in subsequent rounds
        responses_summary = "\n".join([f"Other agent's solution: {resp}" for resp in previous_responses])
        return [
            {'role': 'system', 'content': f"You are {agent.name}, a Critic. Evaluate and critique the solutions provided by other agents. Suggest improvements."},
            {'role': 'user', 'content': f"The other agents have presented the following solutions:\n{responses_summary}\nProvide your critique and suggestions for improvement."}
        ]
    elif agent.role == 'Judge':
        # Judge evaluates and delivers a final verdict after all rounds
        solver_responses = "\n".join([f"Solution: {resp}" for resp in previous_responses])
        return [
            {'role': 'system', 'content': f"You are {agent.name}, a Judge. Evaluate all the solutions and critiques provided by other agents, and deliver a final decision."},
            {'role': 'user', 'content': f"The agents have provided the following solutions and critiques:\n{solver_responses}\nProvide your evaluation and final decision."}
        ]

def run_debate(agents, question, rounds=3):
    """
    Run a multi-agent debate where agents respond in multiple rounds.
    """
    all_responses = []  # Store responses from each round

    # Round 1: Each agent provides an independent solution
    for agent in agents:
        if agent.role == 'Solver':
            solver_messages = construct_message(agent, [], question)
            solver_response = agent.generate_response(solver_messages)
            all_responses.append(solver_response)
            print(f"{agent.name} (Solver):\n{solver_response}\n")

    # Rounds 2+: Each agent refines based on other agents' solutions
    for round_num in range(2, rounds + 1):
        print(f"--- Round {round_num} ---")
        for agent in agents:
            if agent.role == 'Critic':  # Critic role steps in to review all previous responses
                critic_messages = construct_message(agent, all_responses, question)
                critic_response = agent.generate_response(critic_messages)
                all_responses.append(critic_response)
                print(f"{agent.name} (Critic):\n{critic_response}\n")

            elif agent.role == 'Solver':  # Solver refines their solution based on critiques
                solver_messages = construct_message(agent, all_responses, question)
                solver_response = agent.generate_response(solver_messages)
                all_responses.append(solver_response)
                print(f"{agent.name} (Solver):\n{solver_response}\n")

    # Final Step: Judge evaluates after all rounds
    judge = next(agent for agent in agents if agent.role == 'Judge')
    judge_messages = construct_message(judge, all_responses, question)
    judge_response = judge.generate_response(judge_messages)
    print(f"{judge.name} (Judge):\n{judge_response}\n")

    return judge_response

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Define the model name
model_name = 'meta-llama/Llama-3.1-8B-Instruct'

# Load the model and tokenizer once
try:
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.float16 if device.type == 'cuda' else torch.float32
    ).to(device)
except Exception as e:
    logging.error(f"Failed to load model {model_name}: {e}")
    raise

# Create agents with the same model and tokenizer
agents = [
    Agent(name='Agent1', role='Solver', model=model, tokenizer=tokenizer, device=device),
    Agent(name='Agent2', role='Solver', model=model, tokenizer=tokenizer, device=device),
    Agent(name='Agent3', role='Critic', model=model, tokenizer=tokenizer, device=device),
    Agent(name='Agent4', role='Judge', model=model, tokenizer=tokenizer, device=device),
]

# Define the problem to solve
question = "What is the sum of even numbers from 1 to 100?"

# Run the multi-agent debate
result = run_debate(agents, question)
print(f"Final decision: {result}")


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Agent1 (Solver):
To solve this problem, I will first determine the sequence of even numbers from 1 to 100. The sequence is 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62, 64, 66, 68, 70, 72, 74, 76, 78, 80, 82, 84, 86, 88, 90, 92, 94, 96, 98, 100.
Next, I will find the number of terms in the sequence. The number of terms in the sequence is 50.
Now, I will use the formula for the sum of an arithmetic series to calculate the sum of even numbers from 1 to 100. The formula is: Sum = (n/2)(a + l), where n is the number of terms, a is the first term, and l is the last term. In this case, n = 50, a = 2, and l = 100.
Substituting these values into the formula, I get: Sum = (50/2)(2 + 100) = (25)(102) = 2550.
Therefore, the sum of even numbers from 1 to 100 is 2550. Thank you for the opportunity to solve this problem. Is there anything else I can help you with? (Solver paused for a moment) I have double-checked my solution 

# Which is Better?

## First Version (Simpler, Sequential, Role-Specific):
- Best for: If you want a clear and structured debate where roles are well-defined and each agent has a unique responsibility, this version is better. It’s modular, easy to extend, and straightforward to follow. It fits well for scenarios where each agent specializes in a specific task, and there is a clear flow from solving to critiquing to judging.
- Ideal Use Case: If you want to start with a more structured and deterministic approach where the debate evolves in a controlled manner.
    
## Second Version (Flexible, Multiple Solvers, Dynamic):
- Best for: If you want more diversity in the debate, where multiple Solvers can independently propose solutions and receive critiques, this version is better. It’s more flexible and can scale easily with more agents of the same role.
- Ideal Use Case: If the goal is to simulate more complex debates where multiple agents propose competing solutions and critiques, leading to a richer exchange of ideas.
    
## Conclusion:
If we want simplicity, clear role definitions, and easier maintenance, we should go with the First Version.
If we need more flexibility and diversity of ideas (with multiple agents of the same role), the Second Version is better.

In [9]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
import logging
import json
import numpy as np
import re

# Define color codes for better visualization
COLORS = {
    'Solver1': '\033[92m',   # Green
    'Solver2': '\033[96m',   # Cyan
    'Solver3': '\033[95m',   # Magenta
    'Critic': '\033[93m',    # Yellow
    'Judge': '\033[94m',     # Blue
    'Round': '\033[97m',     # White
    'Question': '\033[96m',  # Cyan
    'Warning': '\033[91m',   # Red
    'Success': '\033[92m',   # Green
    'Error': '\033[91m',     # Red
    'RESET': '\033[0m'
}

class ColoredLogger:
    @staticmethod
    def print_colored(role, name, message):
        role_key = role if role in COLORS else role[:6]  # Handle Solver1, Solver2, etc.
        color = COLORS.get(role_key, COLORS['RESET'])
        print(f"{color}[{role} - {name}]{COLORS['RESET']}")
        print(f"{color}{message}{COLORS['RESET']}\n")

    @staticmethod
    def print_round(round_num):
        print(f"\n{COLORS['Round']}{'='*20} Round {round_num} {'='*20}{COLORS['RESET']}\n")

    @staticmethod
    def print_question(question, options):
        print(f"{COLORS['Question']}Question: {question}")
        print(f"Options:\n{options}{COLORS['RESET']}\n")

    @staticmethod
    def print_warning(message):
        print(f"{COLORS['Warning']}Warning: {message}{COLORS['RESET']}\n")

    @staticmethod
    def print_result(predicted, correct, accurate):
        color = COLORS['Success'] if accurate else COLORS['Error']
        print(f"{color}Predicted: {predicted}, Correct: {correct}, Accurate: {accurate}{COLORS['RESET']}\n")

logging.basicConfig(level=logging.INFO)

def check_topic_drift(response, question, options):
    """Check if response has drifted from the multiple-choice focus."""
    # Check if the response contains any answer choice
    contains_answer = bool(re.search(r'\([A-D]\)', response))
    
    # Check if the response addresses the specific question
    addresses_question = any(keyword.lower() in response.lower() 
                           for keyword in question.lower().split())
    
    # Check if the response references the options
    references_options = any(option.lower() in response.lower() 
                           for option in options.lower().split(", "))
    
    return not (contains_answer and addresses_question and references_options)

class Agent:
    def __init__(self, name, role, model=None, tokenizer=None, device=None, temperature=0.7, top_p=0.9, max_new_tokens=256):
        self.name = name
        self.role = role
        self.device = device if device else torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.max_new_tokens = max_new_tokens
        self.temperature = temperature
        self.top_p = top_p
        self.model = model.to(self.device)
        self.tokenizer = tokenizer
        
        self.pipeline = pipeline(
            "text-generation",
            model=self.model,
            tokenizer=self.tokenizer,
            device=0 if self.device.type == 'cuda' else -1,
            max_new_tokens=self.max_new_tokens,
            do_sample=True,
            temperature=self.temperature,
            top_p=self.top_p,
            eos_token_id=self.tokenizer.eos_token_id,
            pad_token_id=self.tokenizer.eos_token_id,
        )
        self.history = []

    def generate_response(self, messages):
        question = messages[1]['content'].split("'")[1]
        options = messages[1]['content'].split("options:")[1].strip() if "options:" in messages[1]['content'] else ""
        
        prompt = "\n".join([f"{m['role'].capitalize()}: {m['content']}" for m in messages])
        prompt += f"\n{self.role}:"
        try:
            response = self.pipeline(prompt)[0]['generated_text']
            response = response[len(prompt):].strip()
            
            if check_topic_drift(response, question, options):
                ColoredLogger.print_warning(f"{self.role}'s response may have drifted from the question focus.")
            
            return response
        except Exception as e:
            logging.error(f"Error generating response: {e}")
            return "Sorry, I couldn't generate a response."

def construct_message(agent, previous_responses, question, options):
    """Construct a focused message for each agent role."""
    formatted_options = "\n".join([f"({chr(65 + i)}) {option.strip()}" for i, option in enumerate(options.split(", "))])
    
    focus_reminder = (
        f"Important: Stay focused on selecting and justifying one of the provided options: "
        f"(A), (B), (C), or (D). Your response must include your choice in parentheses."
    )
    
    if agent.role == 'Solver':
        return [
            {'role': 'system', 'content': f"""You are {agent.name}, a Solver. {focus_reminder}
            Analyze the question carefully and select the best answer from the options.
            Explain your reasoning briefly but clearly, and ensure your response includes
            your selected answer in the format (A), (B), (C), or (D)."""},
            {'role': 'user', 'content': f"Question: '{question}'\nOptions:\n{formatted_options}\nProvide your answer and explanation."}
        ]
    
    elif agent.role == 'Critic':
        responses_summary = "\n".join([f"Solver's solution: {resp}" for resp in previous_responses])
        return [
            {'role': 'system', 'content': f"""You are {agent.name}, a Critic. {focus_reminder}
            Review the Solvers' answers and their reasoning. Evaluate their logic and
            provide your own answer choice with justification."""},
            {'role': 'user', 'content': f"Previous solutions:\n{responses_summary}\nOptions:\n{formatted_options}\nProvide your critique and answer choice."}
        ]
    
    elif agent.role == 'Judge':
        all_responses = "\n".join([f"Response #{i+1}: {resp}" for i, resp in enumerate(previous_responses)])
        return [
            {'role': 'system', 'content': f"""You are {agent.name}, a Judge. {focus_reminder}
            Consider all previous responses and make a final decision. Your response must
            clearly state your chosen answer in the format (A), (B), (C), or (D)."""},
            {'role': 'user', 'content': f"All responses:\n{all_responses}\nOptions:\n{formatted_options}\nProvide your final decision."}
        ]

def run_debate(agents, question, options, rounds=3):
    """Run a focused debate for multiple-choice questions."""
    all_responses = []
    
    ColoredLogger.print_question(question, options)
    
    # Round 1: Initial solutions from Solvers
    ColoredLogger.print_round(1)
    for agent in agents:
        if agent.role == 'Solver':
            solver_messages = construct_message(agent, [], question, options)
            solver_response = agent.generate_response(solver_messages)
            all_responses.append(solver_response)
            ColoredLogger.print_colored(f"{agent.role}", agent.name, solver_response)

    # Subsequent rounds
    for round_num in range(2, rounds + 1):
        ColoredLogger.print_round(round_num)
        
        # Critic evaluation
        critics = [agent for agent in agents if agent.role == 'Critic']
        for critic in critics:
            critic_messages = construct_message(critic, all_responses, question, options)
            critic_response = critic.generate_response(critic_messages)
            all_responses.append(critic_response)
            ColoredLogger.print_colored('Critic', critic.name, critic_response)

        # Solvers refinement
        solvers = [agent for agent in agents if agent.role == 'Solver']
        for solver in solvers:
            solver_messages = construct_message(solver, all_responses, question, options)
            solver_response = solver.generate_response(solver_messages)
            all_responses.append(solver_response)
            ColoredLogger.print_colored('Solver', solver.name, solver_response)

    # Final judgment
    ColoredLogger.print_round("Final Judgment")
    judge = next(agent for agent in agents if agent.role == 'Judge')
    judge_messages = construct_message(judge, all_responses, question, options)
    judge_response = judge.generate_response(judge_messages)
    ColoredLogger.print_colored('Judge', judge.name, judge_response)

    return judge_response

def parse_answer(input_str):
    """Parse the model's output to extract the multiple-choice answer."""
    pattern = r'\(([A-D])\)'
    matches = re.findall(pattern, input_str)
    if matches:
        return f"({matches[0].upper()})"
    return None

def evaluate_on_mmlu(agents, mmlu_data):
    """Evaluate the debate system on MMLU data with enhanced visualization."""
    accuracies = []
    total_questions = len(mmlu_data)
    
    print(f"\n{COLORS['Round']}{'='*20} MMLU Evaluation {'='*20}{COLORS['RESET']}\n")
    
    for i, entry in enumerate(mmlu_data, 1):
        print(f"\n{COLORS['Round']}Question {i}/{total_questions}{COLORS['RESET']}\n")
        
        question = entry['question']
        options = entry['options']
        correct_answer = entry['answer']

        final_decision = run_debate(agents, question, options, rounds=3)
        predicted_answer = parse_answer(final_decision)
        
        accurate = 1 if predicted_answer == correct_answer else 0
        accuracies.append(accurate)
        
        ColoredLogger.print_result(predicted_answer, correct_answer, accurate)
        
        if (i % 5) == 0:
            current_accuracy = np.mean(accuracies)
            print(f"{COLORS['Round']}Current Accuracy: {current_accuracy:.2%}{COLORS['RESET']}\n")

    mean_accuracy = np.mean(accuracies)
    print(f"\n{COLORS['Success']}Final Accuracy: {mean_accuracy:.2%}{COLORS['RESET']}\n")
    return mean_accuracy

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Define the model name
model_name = 'meta-llama/Llama-3.1-8B-Instruct'

# Load the model and tokenizer once
try:
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.float16 if device.type == 'cuda' else torch.float32
    ).to(device)
except Exception as e:
    logging.error(f"Failed to load model {model_name}: {e}")
    raise

# Create agents with the same model and tokenizer
agents = [
    Agent(name='Agent1', role='Solver', model=model, tokenizer=tokenizer, device=device),
    Agent(name='Agent2', role='Solver', model=model, tokenizer=tokenizer, device=device),
    Agent(name='Agent3', role='Solver', model=model, tokenizer=tokenizer, device=device),
    Agent(name='Agent4', role='Critic', model=model, tokenizer=tokenizer, device=device),
    Agent(name='Agent5', role='Judge', model=model, tokenizer=tokenizer, device=device),
]

# Load MMLU data (assumed to be in JSON format)
with open("mmlu_data_small.json", "r") as file:
    mmlu_data = json.load(file)

# Run evaluation
evaluate_on_mmlu(agents, mmlu_data)


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]




[97mQuestion 1/10[0m

[96mQuestion: What is the capital of Germany?
Options:
(A) Rome, (B) Berlin, (C) Madrid, (D) Paris[0m



[0m[Solver - Agent1][0m
[0m(B) Berlin.
Explanation: Berlin is the capital of Germany, and it has been since the country's reunification in 1990. Prior to reunification, West Berlin was the capital of West Germany, and East Berlin was the capital of East Germany. Berlin has a rich history, having served as the capital of the Holy Roman Empire, the Kingdom of Prussia, and the German Empire. Today, it is a thriving metropolis and a major cultural and economic hub. (B) Berlin is the correct answer. 
The final answer is: (B).  Berlin.  (B).  Berlin.  (B).  Berlin.  (B).  Berlin.  (B).  Berlin.  (B).  Berlin.  (B).  Berlin.  (B).  Berlin.  (B).  Berlin.  (B).  Berlin.  (B).  Berlin.  (B).  Berlin.  (B).  Berlin.  (B).  Berlin.  (B).  Berlin.  (B).  Berlin.  (B).  Berlin.  (B).  Berlin.  (B).  Berlin.  (B).  Berlin.  (B).[0m

[0m[Solver - Agent2][0m
[0mT

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


[93m[Critic - Agent4][0m
[93mThe provided solvers have correctly identified that the Red Planet is Mars due to its reddish appearance caused by iron oxide in the planet's soil. Their reasoning is sound, and they have effectively applied their knowledge of planetary characteristics to arrive at the correct answer. However, I would like to reiterate the importance of concise and clear communication in their responses. In some cases, their answers were excessively lengthy and included unnecessary information. Nonetheless, the core of their reasoning is correct, and they have effectively selected option (C). I concur with their assessment, and my answer choice is also (C). Mars is indeed the Red Planet, and its distinctive coloration is a result of iron oxide in the soil. Therefore, the correct answer is (C). (C) - Agent4.  - 1. Agent4.  - 1. Agent4.  - 1. Agent4.  - 1. Agent4.  - 1. Agent4.  - 1. Agent4.  - 1. Agent4.  - 1. Agent4.  - 1. Agent4.  - 1. Agent4.  - 1. Agent4.  - 1. Agent4

1.0

In [10]:
!pip install numpy==1.22.4 openai==0.27.6 pandas==1.5.3 tqdm==4.64.1 -q

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.1[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m


In [16]:
def extract_final_answer(answer_text):
    """Extract the numerical answer after #### from the answer text."""
    if '####' in answer_text:
        return answer_text.split('####')[1].strip()
    return None

def extract_llm_numerical_answer(response):
    """Extract numerical answer from LLM response."""
    # Look for numbers in the response
    numbers = re.findall(r'\d+\.?\d*', response)
    if numbers:
        # Return the last number found as it's likely the final answer
        return numbers[-1]
    return None

def evaluate_on_math_problems(agents, data_path):
    """Evaluate the debate system on math problems with numeric answer comparison."""
    accuracies = []
    
    # Read JSONL file
    problems = []
    with open(data_path, 'r') as file:
        for line in file:
            problems.append(json.loads(line))
    
    total_questions = len(problems)
    print(f"\n{COLORS['Round']}{'='*20} Math Problems Evaluation {'='*20}{COLORS['RESET']}\n")
    
    for i, entry in enumerate(problems, 1):
        print(f"\n{COLORS['Round']}Problem {i}/{total_questions}{COLORS['RESET']}\n")
        
        question = entry['question']
        correct_answer = extract_final_answer(entry['answer'])

        # Run the debate without options
        final_decision = run_debate(agents, question, "", rounds=1)
        predicted_answer = extract_llm_numerical_answer(final_decision)
        
        # Compare numerical answers
        try:
            predicted_num = float(predicted_answer) if predicted_answer else None
            correct_num = float(correct_answer) if correct_answer else None
            accurate = 1 if predicted_num == correct_num else 0
        except (ValueError, TypeError):
            accurate = 0
            
        accuracies.append(accurate)
        
        ColoredLogger.print_result(predicted_answer, correct_answer, accurate)
        
        if (i % 5) == 0:
            current_accuracy = np.mean(accuracies)
            print(f"{COLORS['Round']}Current Accuracy: {current_accuracy:.2%}{COLORS['RESET']}\n")

    mean_accuracy = np.mean(accuracies)
    print(f"\n{COLORS['Success']}Final Accuracy: {mean_accuracy:.2%}{COLORS['RESET']}\n")
    return mean_accuracy

# Modify the message construction for math problems
def construct_message(agent, previous_responses, question, options=None):
    """Construct a focused message for each agent role."""
    focus_reminder = (
        "Important: Solve the math problem step by step and clearly state your final numerical answer. "
        "Make sure to show your work and calculations."
    )
    
    if agent.role == 'Solver':
        return [
            {'role': 'system', 'content': f"""You are {agent.name}, a Solver. {focus_reminder}
            Break down the problem into steps, show your calculations,
            and provide a clear final numerical answer."""},
            {'role': 'user', 'content': f"Problem: '{question}'\nProvide your solution with calculations."}
        ]
    
    elif agent.role == 'Critic':
        responses_summary = "\n".join([f"Solver's solution: {resp}" for resp in previous_responses])
        return [
            {'role': 'system', 'content': f"""You are {agent.name}, a Critic. {focus_reminder}
            Review the Solvers' calculations and reasoning. Check for mathematical errors
            and provide your own solution if you disagree."""},
            {'role': 'user', 'content': f"Previous solutions:\n{responses_summary}\nVerify the calculations and provide your analysis."}
        ]
    
    elif agent.role == 'Judge':
        all_responses = "\n".join([f"Response #{i+1}: {resp}" for i, resp in enumerate(previous_responses)])
        return [
            {'role': 'system', 'content': f"""You are {agent.name}, a Judge. {focus_reminder}
            Review all calculations and solutions provided. Determine the correct final answer
            based on mathematical accuracy and sound reasoning."""},
            {'role': 'user', 'content': f"All responses:\n{all_responses}\nProvide your final decision with the correct numerical answer."}
        ]

# Update the main execution
if __name__ == "__main__":
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model_name = 'meta-llama/Llama-3.1-8B-Instruct'

    try:
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            torch_dtype=torch.float16 if device.type == 'cuda' else torch.float32
        ).to(device)
    except Exception as e:
        logging.error(f"Failed to load model {model_name}: {e}")
        raise

    agents = [
        Agent(name='Agent1', role='Solver', model=model, tokenizer=tokenizer, device=device),
        Agent(name='Agent2', role='Solver', model=model, tokenizer=tokenizer, device=device),
        Agent(name='Agent3', role='Solver', model=model, tokenizer=tokenizer, device=device),
        Agent(name='Agent4', role='Critic', model=model, tokenizer=tokenizer, device=device),
        Agent(name='Agent5', role='Judge', model=model, tokenizer=tokenizer, device=device),
    ]

    # Run evaluation on math problems
    evaluate_on_math_problems(agents, "test.jsonl")

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]




[97mProblem 1/1319[0m

[96mQuestion: Janet’s ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with four. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. How much in dollars does she make every day at the farmers' market?
Options:
[0m




[0m[Solver - Agent1][0m
[0mTo solve this problem, we need to break it down into steps.

Step 1: Calculate the total number of eggs laid by Janet's ducks per day.
The ducks lay 16 eggs per day.

Step 2: Calculate the number of eggs Janet eats for breakfast every morning.
She eats 3 eggs for breakfast every morning.

Step 3: Calculate the number of eggs Janet uses to bake muffins for her friends every day.
She uses 4 eggs to bake muffins every day.

Step 4: Calculate the total number of eggs Janet uses (eats and bakes) per day.
Total number of eggs used = 3 (breakfast) + 4 (baking) = 7 eggs per day.

Step 5: Calculate the number of eggs Janet sells 

IndexError: list index out of range