In [1]:
!pip install 'accelerate>=0.26.0' transformers hf_transfer

[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.1[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m


In [2]:
import os

# Optional override, disabled by default. Presumably redirects Hugging Face Hub
# traffic to a local endpoint/mirror — confirm before enabling.
# os.environ["HF_ENDPOINT"] = "http://localhost:5564"
# Opt in to hf_transfer for Hub downloads (the hf_transfer package is installed in the first cell).
# NOTE(review): presumably this has to be set before the Hub libraries are imported — confirm.
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"

In [6]:
from transformers import AutoModelForCausalLM, AutoTokenizer

# Checkpoint used for this single-prompt smoke test.
model_name = "Qwen/Qwen2.5-Math-7B-Instruct"

# Load the weights first, then the matching tokenizer.
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", device_map="auto")
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Build a two-turn conversation: fixed system persona plus the user prompt.
prompt = "Give me a short introduction to large language model."
messages = [
    {"role": "system", "content": "You are Qwen, created by Alibaba Cloud. You are a helpful assistant."},
    {"role": "user", "content": prompt}
]

# Render the conversation to a prompt string and tokenize it on the model's device.
text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
model_inputs = tokenizer([text], return_tensors="pt").to(model.device)

generated_ids = model.generate(**model_inputs, max_new_tokens=512)

# Each generated sequence starts with its prompt tokens; slice them off so
# only the newly produced tokens are decoded.
generated_ids = [
    full_sequence[len(prompt_tokens):]
    for prompt_tokens, full_sequence in zip(model_inputs.input_ids, generated_ids)
]

response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]

config.json:   0%|          | 0.00/658 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/27.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/3.95G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/3.86G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/3.86G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/3.56G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/161 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/7.32k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.03M [00:00<?, ?B/s]

In [8]:
# Display the decoded reply produced by the generation cell above.
response

'A large language model is a type of pre-trained model that has been trained on a large amount of text data. These models are designed to generate human-like text, such as articles, stories, poems, and answering questions. They are also useful for tasks such as translation, text generation, and information抽取. Some popular large language models include BERT, GPT-2, and通义千问.'

In [6]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
import logging

# ANSI escape sequences used to colour console output, keyed by agent role.
# 'RESET' is emitted after coloured text and is also the fallback that
# ColoredLogger.print_colored uses for roles missing from this table.
COLORS = {
    'Solver': '\033[92m',  # Green
    'Critic': '\033[93m',  # Yellow
    'Judge': '\033[94m',   # Blue
    'RESET': '\033[0m'     # Reset color
}

class ColoredLogger:
    """Console helper that prints agent messages wrapped in ANSI colour codes."""

    @staticmethod
    def print_colored(role, name, message):
        """Print a ``[role - name]`` header line, then the message and a blank line.

        Both lines use the colour registered for ``role`` in ``COLORS``;
        roles without an entry fall back to the reset code.
        """
        reset = COLORS['RESET']
        tint = COLORS.get(role, reset)
        print(f"{tint}[{role} - {name}]{reset}")
        print(f"{tint}{message}{reset}\n")

class QwenAgent:
    """A debate participant that answers through a caller-supplied chat model.

    The agent prepends a role-specific system message to every conversation.
    ``device`` is stored but generation places inputs on ``model.device``.
    ``history`` and ``score`` are initialised here and not used elsewhere in
    this class.
    """

    def __init__(self, name, role, model=None, tokenizer=None, device=None, max_new_tokens=512):
        """Store the agent's identity and generation resources.

        Raises:
            ValueError: if either ``model`` or ``tokenizer`` is missing.
        """
        if model is None or tokenizer is None:
            raise ValueError("Model and tokenizer must be provided")

        self.name = name
        self.role = role
        if device:
            self.device = device
        else:
            # No explicit device: prefer CUDA when it is available.
            self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.max_new_tokens = max_new_tokens
        self.model = model
        self.tokenizer = tokenizer
        self.history = []
        self.score = 1.0

    def generate_response(self, messages):
        """Return the model's reply to ``messages`` (a list of chat dicts).

        Any exception raised while templating, generating or decoding is
        logged and replaced by a fixed apology string.
        """
        try:
            conversation = [self._get_system_message()] + messages

            prompt_text = self.tokenizer.apply_chat_template(
                conversation, tokenize=False, add_generation_prompt=True
            )
            encoded = self.tokenizer([prompt_text], return_tensors="pt").to(self.model.device)
            sequences = self.model.generate(**encoded, max_new_tokens=self.max_new_tokens)

            # Drop the echoed prompt tokens so only the completion is decoded.
            completions = [
                sequence[len(prompt_ids):]
                for prompt_ids, sequence in zip(encoded.input_ids, sequences)
            ]
            decoded = self.tokenizer.batch_decode(completions, skip_special_tokens=True)
            return decoded[0].strip()

        except Exception as e:
            logging.error(f"Error generating response: {e}")
            return "Sorry, I couldn't generate a response."

    def _get_system_message(self):
        """Build the system message for this agent's role.

        'Solver' and 'Critic' get dedicated prompts; every other role is
        treated as the Judge.
        """
        role = self.role
        if role == 'Solver':
            return {"role": "system", "content": f"You are {self.name}, a mathematical problem solver. Provide detailed solutions with clear steps and explanations."}
        if role == 'Critic':
            return {"role": "system", "content": f"You are {self.name}, a mathematical critic. Analyze solutions carefully and provide constructive criticism."}
        # Judge (and any unrecognised role)
        return {"role": "system", "content": f"You are {self.name}, a judge. Evaluate solutions and critiques fairly and provide clear verdicts."}

def construct_message(agent, previous_responses, question):
    """Build the single user message an agent receives, based on its role.

    - Solver: only the question (``previous_responses`` is ignored).
    - Critic: the question plus ``previous_responses[0]`` as the solution.
    - any other role (Judge): the question plus exactly two previous
      responses, unpacked as (solution, critique).

    Returns a one-element list of chat-message dicts.
    """
    role = agent.role

    if role == 'Solver':
        content = f"Solve this problem step by step: {question}"
    elif role == 'Critic':
        solver_solution = previous_responses[0]
        content = f"Problem: {question}\nSolver's solution:\n{solver_solution}\n\nProvide a detailed critique of this solution."
    else:  # Judge
        solver_solution, critic_critique = previous_responses
        content = f"Problem: {question}\nSolver's solution:\n{solver_solution}\n\nCritic's critique:\n{critic_critique}\n\nEvaluate both the solution and critique, then provide your final verdict."

    return [{"role": "user", "content": content}]

def run_debate(agents, question, rounds=1):
    """Run a Solver -> Critic -> Judge debate and return the Judge's verdict.

    The first Solver, Critic and Judge found in ``agents`` take part. In
    round 1 the Solver answers ``question`` and the Critic reviews that
    answer. In every further round the Solver is shown its previous solution
    together with the latest critique and asked to revise, after which the
    Critic reviews the revision. (Previously the revision prompt was built
    with the Solver branch of ``construct_message``, which ignores earlier
    responses, so the critique never reached the Solver.)

    Args:
        agents: sequence of agents exposing ``role``, ``name`` and
            ``generate_response(messages)``.
        question: problem statement.
        rounds: total number of Solver/Critic exchanges; values below 2 run
            a single exchange.

    Returns:
        The Judge's response text.

    Raises:
        StopIteration: if ``agents`` contains no Solver, Critic or Judge.
    """
    solver = next(agent for agent in agents if agent.role == 'Solver')
    critic = next(agent for agent in agents if agent.role == 'Critic')
    judge = next(agent for agent in agents if agent.role == 'Judge')

    print(f"\n{COLORS['RESET']}Question: {question}\n")

    # Step 1: Solver provides solution
    solver_messages = construct_message(solver, [], question)
    solver_response = solver.generate_response(solver_messages)
    ColoredLogger.print_colored('Solver', solver.name, solver_response)

    # Step 2: Critic critiques
    critic_messages = construct_message(critic, [solver_response], question)
    critic_response = critic.generate_response(critic_messages)
    ColoredLogger.print_colored('Critic', critic.name, critic_response)

    # Additional rounds if specified
    for round_num in range(2, rounds + 1):
        print(f"\n{COLORS['RESET']}=== Round {round_num} ===\n")

        # Build the revision prompt here so the Solver actually sees both its
        # earlier attempt and the critique it is supposed to address.
        solver_messages = [{
            "role": "user",
            "content": (
                f"Problem: {question}\n"
                f"Your previous solution:\n{solver_response}\n\n"
                f"Critic's critique:\n{critic_response}\n\n"
                "Revise your solution step by step, addressing the critique."
            ),
        }]
        solver_response = solver.generate_response(solver_messages)
        ColoredLogger.print_colored('Solver', solver.name, solver_response)

        critic_messages = construct_message(critic, [solver_response], question)
        critic_response = critic.generate_response(critic_messages)
        ColoredLogger.print_colored('Critic', critic.name, critic_response)

    # Final judgment
    print(f"\n{COLORS['RESET']}=== Final Judgment ===\n")
    judge_messages = construct_message(judge, [solver_response, critic_response], question)
    judge_response = judge.generate_response(judge_messages)
    ColoredLogger.print_colored('Judge', judge.name, judge_response)

    return judge_response

# Example usage: one Solver, one Critic and one Judge sharing a single model.
if __name__ == "__main__":
    model_name = "Qwen/Qwen2.5-Math-7B-Instruct"
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Load the model and tokenizer once; every agent reuses the same objects.
    model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", device_map="auto")
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # One agent per debate role.
    agents = [
        QwenAgent(name=agent_name, role=agent_role, model=model, tokenizer=tokenizer, device=device)
        for agent_name, agent_role in (('Solver1', 'Solver'), ('Critic1', 'Critic'), ('Judge1', 'Judge'))
    ]

    # Run a single-round debate on a sample question.
    question = "What is the sum of even numbers from 1 to 100?"
    result = run_debate(agents, question, rounds=1)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]


[0mQuestion: What is the sum of even numbers from 1 to 100?

[92m[Solver - Solver1][0m
[92mTo find the sum of even numbers from 1 to 100, we can use the formula for the sum of an arithmetic series. The even numbers from 1 to 100 form an arithmetic sequence where the first term \(a\) is 2, the common difference \(d\) is 2, and the last term \(l\) is 100.

The formula for the sum \(S_n\) of the first \(n\) terms of an arithmetic sequence is:
\[ S_n = \frac{n}{2} (a + l) \]

First, we need to determine the number of terms \(n\) in the sequence. The \(n\)-th term of an arithmetic sequence can be found using the formula:
\[ a_n = a + (n-1)d \]

Setting \(a_n = 100\), we get:
\[ 100 = 2 + (n-1) \cdot 2 \]
\[ 100 = 2 + 2n - 2 \]
\[ 100 = 2n \]
\[ n = 50 \]

So, there are 50 terms in the sequence. Now, we can substitute \(n = 50\), \(a = 2\), and \(l = 100\) into the sum formula:
\[ S_{50} = \frac{50}{2} (2 + 100) \]
\[ S_{50} = 25 \cdot 102 \]
\[ S_{50} = 2550 \]

Therefore, the sum of t

## 90% accuracy on 10 questions (2 agents, 1 judge)

In [7]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
import logging
import json
import numpy as np
import re
from collections import Counter

# ANSI escape sequences used to colour console output. Keys cover agent roles
# ('Solver', 'Critic', 'Judge') and output categories ('Round', 'Question',
# 'Warning', 'Success', 'Error'). 'RESET' is emitted after coloured text and is
# the fallback ColoredLogger.print_colored uses for unknown roles.
COLORS = {
    'Solver': '\033[92m',    # Green
    'Critic': '\033[93m',    # Yellow
    'Judge': '\033[94m',     # Blue
    'Round': '\033[97m',     # White
    'Question': '\033[96m',  # Cyan
    'Warning': '\033[91m',   # Red
    'Success': '\033[92m',   # Green
    'Error': '\033[91m',     # Red
    'RESET': '\033[0m'
}

class ColoredLogger:
    """Static helpers that print debate output wrapped in ANSI codes from ``COLORS``."""

    @staticmethod
    def print_colored(role, name, message):
        """Print a ``[role - name]`` header line, then the message and a blank line.

        Roles without a ``COLORS`` entry fall back to the reset code.
        """
        reset = COLORS['RESET']
        tint = COLORS.get(role, reset)
        print(f"{tint}[{role} - {name}]{reset}")
        print(f"{tint}{message}{reset}\n")

    @staticmethod
    def print_round(round_num):
        """Print a ``Round <round_num>`` banner framed by 20 '=' on each side."""
        rule = '=' * 20
        print(f"\n{COLORS['Round']}{rule} Round {round_num} {rule}{COLORS['RESET']}\n")

    @staticmethod
    def print_result(predicted, correct, accurate):
        """Print predicted vs. correct answer, coloured by whether ``accurate`` is truthy."""
        if accurate:
            color = COLORS['Success']
        else:
            color = COLORS['Error']
        print(f"{color}Predicted: {predicted}, Correct: {correct}, Accurate: {accurate}{COLORS['RESET']}\n")

def clean_number(num_str):
    """Keep at most one decimal point in a numeric string.

    Everything from the second '.' onward is dropped (``"1.2.3"`` becomes
    ``"1.2"``); strings without a '.' are returned unchanged.
    """
    whole, dot, rest = num_str.partition('.')
    if not dot:
        return num_str
    fraction = rest.partition('.')[0]
    return f"{whole}.{fraction}"

def extract_final_answer(answer_text):
    """Parse the reference answer that follows the first '####' marker.

    '$' and ',' are stripped and the text is passed through ``clean_number``
    before conversion. Returns a float, or None when the marker is absent or
    the remaining text is not a number.
    """
    if '####' not in answer_text:
        return None
    try:
        tail = answer_text.split('####')[1].strip()
        digits = tail.replace('$', '').replace(',', '').strip()
        return float(clean_number(digits))
    except (ValueError, IndexError):
        return None

def extract_llm_numerical_answer(response):
    """Extract a numeric answer from free-form model output.

    Candidates are tried in priority order and the first one that converts
    to a float wins:

    1. every ``\\boxed{<number>}`` occurrence, in order of appearance;
    2. the number following the first "final answer is" (case-insensitive,
       optional ':' and '$');
    3. the last ``= <number>`` found at the end of a line.

    A number is an optional leading '-' followed by digits, ',' and '.'.
    Negative values are accepted so that answers below zero are not silently
    dropped (the reference-answer parser already accepts them). Commas are
    removed and ``clean_number`` trims anything after a second '.'.

    Args:
        response: model output text; falsy values yield None.

    Returns:
        The parsed float, or None when no candidate parses.
    """
    if not response:
        return None

    number = r'-?[0-9,.]+'
    candidates = []

    # 1. Boxed answers. A single pattern also covers the "$\boxed{...}$" form.
    candidates.extend(re.findall(r'\\boxed\{(' + number + r')\}', response))

    # 2. Explicit "final answer is ..." phrasing (first occurrence only).
    final_match = re.search(r'final answer is:?\s*\$?\s*(' + number + r')', response, re.IGNORECASE)
    if final_match:
        candidates.append(final_match.group(1))

    # 3. Trailing "= value" on a line; only the last such line is considered.
    calc_matches = re.findall(r'=\s*(' + number + r')\s*$', response, re.MULTILINE)
    candidates.extend(calc_matches[-1:])

    for candidate in candidates:
        try:
            return float(clean_number(candidate).replace(',', ''))
        except ValueError:
            # e.g. a lone "." or "," captured by the pattern; try the next candidate.
            continue

    return None

class QwenAgent:
    """A named, role-tagged wrapper around a shared chat model and tokenizer."""

    def __init__(self, name, role, model, tokenizer, device):
        """Store the agent's identity and the objects used for generation."""
        self.name, self.role = name, role
        self.model, self.tokenizer = model, tokenizer
        self.device = device

    def generate_response(self, messages):
        """Return the model's reply to ``messages`` (a list of chat dicts).

        Any exception is logged and turned into an
        ``"Error generating response: ..."`` string instead of propagating.
        """
        try:
            # Render the conversation with the tokenizer's chat template.
            prompt_text = self.tokenizer.apply_chat_template(
                messages, tokenize=False, add_generation_prompt=True
            )

            encoded = self.tokenizer([prompt_text], return_tensors="pt").to(self.device)
            generation_options = dict(
                max_new_tokens=512,
                num_return_sequences=1,
                temperature=0.7,
                pad_token_id=self.tokenizer.eos_token_id,
            )
            sequences = self.model.generate(**encoded, **generation_options)

            # Drop the echoed prompt tokens so only the completion is decoded.
            completions = [
                sequence[len(prompt_ids):]
                for prompt_ids, sequence in zip(encoded.input_ids, sequences)
            ]

            decoded = self.tokenizer.batch_decode(completions, skip_special_tokens=True)
            return decoded[0].strip()

        except Exception as e:
            logging.error(f"Error in generate_response for {self.role}: {str(e)}")
            return f"Error generating response: {str(e)}"

def construct_message(agent, previous_responses, question, round_num=1):
    """Build the [system, user] chat messages for an agent.

    - Solver, round 1: asked to solve ``question`` and end with a boxed answer.
    - Solver, later rounds: shown all ``previous_responses`` as numbered
      solutions and asked for a refined answer.
    - Judge: shown only the numeric answers that
      ``extract_llm_numerical_answer`` can pull from ``previous_responses``.

    Returns None for any other role.
    """
    role = agent.role

    if role == 'Solver':
        if round_num == 1:
            system_prompt = "You are Qwen, a precise mathematical Solver. Provide step-by-step solutions and end with \\boxed{answer}."
            user_prompt = f"Solve this math problem step by step: {question}"
        else:
            numbered = [f"Solution {idx}: {text}" for idx, text in enumerate(previous_responses, start=1)]
            solutions = "\n".join(numbered)
            system_prompt = "You are Qwen, a precise mathematical Solver. Review previous solutions and provide your refined answer."
            user_prompt = f"Problem: {question}\nPrevious solutions:\n{solutions}\nProvide your solution:"
    elif role == 'Judge':
        # Responses with no extractable number are left out of the judge's prompt.
        extracted = (extract_llm_numerical_answer(resp) for resp in previous_responses)
        answers = [str(value) for value in extracted if value is not None]
        system_prompt = "You are Qwen, a mathematical Judge. Choose the most accurate answer and provide it in \\boxed{answer} format."
        user_prompt = f"Question: {question}\nAll answers: {', '.join(answers)}\nProvide final answer:"
    else:
        return None

    return [
        {'role': 'system', 'content': system_prompt},
        {'role': 'user', 'content': user_prompt},
    ]

def run_debate(agents, question, rounds=1):
    """Run a multi-round solver debate and return the Judge's final response.

    In each round every agent whose role is 'Solver' answers in turn. Solvers
    are shown the responses from earlier rounds only, not those produced
    earlier in the same round. After the last round the first agent whose
    role is 'Judge' receives all solver responses and produces the verdict.

    The verdict is always generated by the Judge agent. It was previously
    generated by whichever agent the solver loop visited last, which was only
    correct when the Judge was the final element of ``agents``.

    Args:
        agents: sequence of agents exposing ``role``, ``name`` and
            ``generate_response(messages)``.
        question: problem statement.
        rounds: number of solver rounds.

    Returns:
        The Judge's response text.

    Raises:
        StopIteration: if ``agents`` contains no Judge.
    """
    all_responses = []
    solver_responses = []

    print(f"\n{COLORS['Question']}Question: {question}{COLORS['RESET']}\n")

    for round_num in range(1, rounds + 1):
        ColoredLogger.print_round(round_num)
        round_responses = []

        for agent in agents:
            if agent.role == 'Solver':
                response = agent.generate_response(
                    construct_message(agent, solver_responses, question, round_num)
                )
                round_responses.append(response)
                all_responses.append(response)
                ColoredLogger.print_colored('Solver', agent.name, response)

        # Publish this round's answers only after every solver has spoken.
        solver_responses.extend(round_responses)

        if round_num < rounds:
            print(f"\n{COLORS['Round']}End of Round {round_num}{COLORS['RESET']}\n")
            current_answers = [extract_llm_numerical_answer(resp) for resp in round_responses]
            print(f"{COLORS['Round']}Current answers: {current_answers}{COLORS['RESET']}\n")

    ColoredLogger.print_round("Final Judgment")
    judge = next(agent for agent in agents if agent.role == 'Judge')
    # Use the judge itself, not the loop variable left over from the solver loop.
    judge_response = judge.generate_response(construct_message(judge, all_responses, question))
    ColoredLogger.print_colored('Judge', judge.name, judge_response)

    return judge_response

def evaluate_on_math_problems(agents, data_path, num_questions=None):
    """Run the debate on problems from a JSONL file and report accuracy.

    Each non-blank line of ``data_path`` must be a JSON object with a
    'question' field and an 'answer' field; the reference value is read from
    the answer with ``extract_final_answer``. A prediction is correct when it
    is within 1e-6 of the reference. A missing prediction or reference counts
    as incorrect.

    The file is read as UTF-8 regardless of the platform default, and blank
    lines (for example a trailing newline at the end of the file) are
    skipped instead of raising a JSON decoding error.

    Args:
        agents: agents passed through to ``run_debate``.
        data_path: path of the JSONL problem file.
        num_questions: if given, only the first ``num_questions`` problems
            are evaluated.

    Returns:
        The mean accuracy over the evaluated problems, as computed by
        ``np.mean``.
    """
    accuracies = []

    with open(data_path, 'r', encoding='utf-8') as file:
        problems = [json.loads(line) for line in file if line.strip()]
    if num_questions is not None:
        problems = problems[:num_questions]

    print(f"\n{COLORS['Round']}{'='*20} Math Problems Evaluation {'='*20}{COLORS['RESET']}\n")
    print(f"{COLORS['Round']}Evaluating {len(problems)} questions{COLORS['RESET']}\n")

    for i, entry in enumerate(problems, 1):
        print(f"\n{COLORS['Round']}Problem {i}/{len(problems)}{COLORS['RESET']}\n")

        question = entry['question']
        correct_answer = extract_final_answer(entry['answer'])
        final_decision = run_debate(agents, question)
        predicted_answer = extract_llm_numerical_answer(final_decision)

        try:
            if predicted_answer is not None and correct_answer is not None:
                accurate = 1 if abs(float(predicted_answer) - float(correct_answer)) < 1e-6 else 0
            else:
                accurate = 0
        except (ValueError, TypeError):
            accurate = 0

        accuracies.append(accurate)
        ColoredLogger.print_result(predicted_answer, correct_answer, accurate)

        # Running accuracy after every fifth problem.
        if (i % 5) == 0:
            print(f"{COLORS['Round']}Current Accuracy: {np.mean(accuracies):.2%}{COLORS['RESET']}\n")

    final_accuracy = np.mean(accuracies)
    print(f"\n{COLORS['Success']}Final Accuracy: {final_accuracy:.2%}{COLORS['RESET']}\n")
    return final_accuracy

# Entry point: evaluate two Solvers plus one Judge on the first 10 problems.
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    model_name = "Qwen/Qwen2.5-Math-7B-Instruct"
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Load once; log the failure for context, then let the exception propagate.
    try:
        model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", device_map="auto")
        tokenizer = AutoTokenizer.from_pretrained(model_name)
    except Exception as e:
        logging.error(f"Failed to load model {model_name}: {e}")
        raise

    # All agents share the same model and tokenizer objects.
    agents = [
        QwenAgent(name=agent_name, role=agent_role, model=model, tokenizer=tokenizer, device=device)
        for agent_name, agent_role in (('Agent1', 'Solver'), ('Agent2', 'Solver'), ('Judge', 'Judge'))
    ]

    evaluate_on_math_problems(agents, "test.jsonl", num_questions=10)

INFO:accelerate.utils.modeling:We will use 90% of the memory on device 0 for storing the model, and 10% for the buffer to avoid OOM. You can set `max_memory` in to a higher value to use more memory (at your own risk).


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]





[97mEvaluating 10 questions[0m


[97mProblem 1/10[0m


[96mQuestion: Janet’s ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with four. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. How much in dollars does she make every day at the farmers' market?[0m







[92m[Solver - Agent1][0m
[92mTo determine how much Janet makes every day at the farmers' market, we need to follow these steps:

1. Calculate the total number of eggs Janet has each day.
2. Determine how many eggs Janet consumes each day.
3. Calculate the number of eggs Janet has left after consuming some.
4. Calculate the amount of money Janet makes from selling the remaining eggs.

**Step 1: Calculate the total number of eggs Janet has each day.**

Janet's ducks lay 16 eggs per day.

**Step 2: Determine how many eggs Janet consumes each day.**

Janet eats 3 eggs for breakfast every morning and bakes muffins for her friends every day with 4 eggs. Therefore, the total number of eggs she consumes each day is:
\[ 3 + 4 = 7 \]

**Step 3: Calculate the number of eggs Janet has left after consuming some.**

After consuming 7 eggs, the number of eggs Janet has left is:
\[ 16 - 7 = 9 \]

**Step 4: Calculate the amount of money Janet makes from selling the remaining eggs.**

Janet sells eac

## 93.33% accuracy on 30 questions using Qwen 2.5 Math 7B (1 Solver, 1 Judge)

In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
import logging
import json
import numpy as np
import re
from collections import Counter

# ANSI escape sequences used to colour console output. Keys cover agent roles
# ('Solver', 'Critic', 'Judge') and output categories ('Round', 'Question',
# 'Warning', 'Success', 'Error'). 'RESET' is emitted after coloured text and is
# the fallback ColoredLogger.print_colored uses for unknown roles.
COLORS = {
    'Solver': '\033[92m',    # Green
    'Critic': '\033[93m',    # Yellow
    'Judge': '\033[94m',     # Blue
    'Round': '\033[97m',     # White
    'Question': '\033[96m',  # Cyan
    'Warning': '\033[91m',   # Red
    'Success': '\033[92m',   # Green
    'Error': '\033[91m',     # Red
    'RESET': '\033[0m'
}

class ColoredLogger:
    """Static helpers that print debate output wrapped in ANSI codes from ``COLORS``."""

    @staticmethod
    def print_colored(role, name, message):
        """Print a ``[role - name]`` header line, then the message and a blank line.

        Roles without a ``COLORS`` entry fall back to the reset code.
        """
        reset = COLORS['RESET']
        tint = COLORS.get(role, reset)
        print(f"{tint}[{role} - {name}]{reset}")
        print(f"{tint}{message}{reset}\n")

    @staticmethod
    def print_round(round_num):
        """Print a ``Round <round_num>`` banner framed by 20 '=' on each side."""
        rule = '=' * 20
        print(f"\n{COLORS['Round']}{rule} Round {round_num} {rule}{COLORS['RESET']}\n")

    @staticmethod
    def print_result(predicted, correct, accurate):
        """Print predicted vs. correct answer, coloured by whether ``accurate`` is truthy."""
        if accurate:
            color = COLORS['Success']
        else:
            color = COLORS['Error']
        print(f"{color}Predicted: {predicted}, Correct: {correct}, Accurate: {accurate}{COLORS['RESET']}\n")

def clean_number(num_str):
    """Keep at most one decimal point in a numeric string.

    Everything from the second '.' onward is dropped (``"1.2.3"`` becomes
    ``"1.2"``); strings without a '.' are returned unchanged.
    """
    whole, dot, rest = num_str.partition('.')
    if not dot:
        return num_str
    fraction = rest.partition('.')[0]
    return f"{whole}.{fraction}"

def extract_final_answer(answer_text):
    """Parse the reference answer that follows the first '####' marker.

    '$' and ',' are stripped and the text is passed through ``clean_number``
    before conversion. Returns a float, or None when the marker is absent or
    the remaining text is not a number.
    """
    if '####' not in answer_text:
        return None
    try:
        tail = answer_text.split('####')[1].strip()
        digits = tail.replace('$', '').replace(',', '').strip()
        return float(clean_number(digits))
    except (ValueError, IndexError):
        return None

def extract_llm_numerical_answer(response):
    """Extract a numeric answer from free-form model output.

    Candidates are tried in priority order and the first one that converts
    to a float wins:

    1. every ``\\boxed{<number>}`` occurrence, in order of appearance;
    2. the number following the first "final answer is" (case-insensitive,
       optional ':' and '$');
    3. the last ``= <number>`` found at the end of a line.

    A number is an optional leading '-' followed by digits, ',' and '.'.
    Negative values are accepted so that answers below zero are not silently
    dropped (the reference-answer parser already accepts them). Commas are
    removed and ``clean_number`` trims anything after a second '.'.

    Args:
        response: model output text; falsy values yield None.

    Returns:
        The parsed float, or None when no candidate parses.
    """
    if not response:
        return None

    number = r'-?[0-9,.]+'
    candidates = []

    # 1. Boxed answers. A single pattern also covers the "$\boxed{...}$" form.
    candidates.extend(re.findall(r'\\boxed\{(' + number + r')\}', response))

    # 2. Explicit "final answer is ..." phrasing (first occurrence only).
    final_match = re.search(r'final answer is:?\s*\$?\s*(' + number + r')', response, re.IGNORECASE)
    if final_match:
        candidates.append(final_match.group(1))

    # 3. Trailing "= value" on a line; only the last such line is considered.
    calc_matches = re.findall(r'=\s*(' + number + r')\s*$', response, re.MULTILINE)
    candidates.extend(calc_matches[-1:])

    for candidate in candidates:
        try:
            return float(clean_number(candidate).replace(',', ''))
        except ValueError:
            # e.g. a lone "." or "," captured by the pattern; try the next candidate.
            continue

    return None

class QwenAgent:
    """A named, role-tagged wrapper around a shared chat model and tokenizer."""

    def __init__(self, name, role, model, tokenizer, device):
        """Store the agent's identity and the objects used for generation."""
        self.name, self.role = name, role
        self.model, self.tokenizer = model, tokenizer
        self.device = device

    def generate_response(self, messages):
        """Return the model's reply to ``messages`` (a list of chat dicts).

        Any exception is logged and turned into an
        ``"Error generating response: ..."`` string instead of propagating.
        """
        try:
            # Render the conversation with the tokenizer's chat template.
            prompt_text = self.tokenizer.apply_chat_template(
                messages, tokenize=False, add_generation_prompt=True
            )

            encoded = self.tokenizer([prompt_text], return_tensors="pt").to(self.device)
            generation_options = dict(
                max_new_tokens=512,
                num_return_sequences=1,
                temperature=0.7,
                pad_token_id=self.tokenizer.eos_token_id,
            )
            sequences = self.model.generate(**encoded, **generation_options)

            # Drop the echoed prompt tokens so only the completion is decoded.
            completions = [
                sequence[len(prompt_ids):]
                for prompt_ids, sequence in zip(encoded.input_ids, sequences)
            ]

            decoded = self.tokenizer.batch_decode(completions, skip_special_tokens=True)
            return decoded[0].strip()

        except Exception as e:
            logging.error(f"Error in generate_response for {self.role}: {str(e)}")
            return f"Error generating response: {str(e)}"

def construct_message(agent, previous_responses, question, round_num=1):
    """Build the [system, user] chat messages for an agent.

    - Solver, round 1: asked to solve ``question`` and end with a boxed answer.
    - Solver, later rounds: shown all ``previous_responses`` as numbered
      solutions and asked for a refined answer.
    - Judge: shown only the numeric answers that
      ``extract_llm_numerical_answer`` can pull from ``previous_responses``.

    Returns None for any other role.
    """
    role = agent.role

    if role == 'Solver':
        if round_num == 1:
            system_prompt = "You are Qwen, a precise mathematical Solver. Provide step-by-step solutions and end with \\boxed{answer}."
            user_prompt = f"Solve this math problem step by step: {question}"
        else:
            numbered = [f"Solution {idx}: {text}" for idx, text in enumerate(previous_responses, start=1)]
            solutions = "\n".join(numbered)
            system_prompt = "You are Qwen, a precise mathematical Solver. Review previous solutions and provide your refined answer."
            user_prompt = f"Problem: {question}\nPrevious solutions:\n{solutions}\nProvide your solution:"
    elif role == 'Judge':
        # Responses with no extractable number are left out of the judge's prompt.
        extracted = (extract_llm_numerical_answer(resp) for resp in previous_responses)
        answers = [str(value) for value in extracted if value is not None]
        system_prompt = "You are Qwen, a mathematical Judge. Choose the most accurate answer and provide it in \\boxed{answer} format."
        user_prompt = f"Question: {question}\nAll answers: {', '.join(answers)}\nProvide final answer:"
    else:
        return None

    return [
        {'role': 'system', 'content': system_prompt},
        {'role': 'user', 'content': user_prompt},
    ]

def run_debate(agents, question, rounds=1):
    """Run a multi-round solver debate and return the Judge's final response.

    In each round every agent whose role is 'Solver' answers in turn. Solvers
    are shown the responses from earlier rounds only, not those produced
    earlier in the same round. After the last round the first agent whose
    role is 'Judge' receives all solver responses and produces the verdict.

    The verdict is always generated by the Judge agent. It was previously
    generated by whichever agent the solver loop visited last, which was only
    correct when the Judge was the final element of ``agents``.

    Args:
        agents: sequence of agents exposing ``role``, ``name`` and
            ``generate_response(messages)``.
        question: problem statement.
        rounds: number of solver rounds.

    Returns:
        The Judge's response text.

    Raises:
        StopIteration: if ``agents`` contains no Judge.
    """
    all_responses = []
    solver_responses = []

    print(f"\n{COLORS['Question']}Question: {question}{COLORS['RESET']}\n")

    for round_num in range(1, rounds + 1):
        ColoredLogger.print_round(round_num)
        round_responses = []

        for agent in agents:
            if agent.role == 'Solver':
                response = agent.generate_response(
                    construct_message(agent, solver_responses, question, round_num)
                )
                round_responses.append(response)
                all_responses.append(response)
                ColoredLogger.print_colored('Solver', agent.name, response)

        # Publish this round's answers only after every solver has spoken.
        solver_responses.extend(round_responses)

        if round_num < rounds:
            print(f"\n{COLORS['Round']}End of Round {round_num}{COLORS['RESET']}\n")
            current_answers = [extract_llm_numerical_answer(resp) for resp in round_responses]
            print(f"{COLORS['Round']}Current answers: {current_answers}{COLORS['RESET']}\n")

    ColoredLogger.print_round("Final Judgment")
    judge = next(agent for agent in agents if agent.role == 'Judge')
    # Use the judge itself, not the loop variable left over from the solver loop.
    judge_response = judge.generate_response(construct_message(judge, all_responses, question))
    ColoredLogger.print_colored('Judge', judge.name, judge_response)

    return judge_response

def evaluate_on_math_problems(agents, data_path, num_questions=None):
    """Run the debate on problems from a JSONL file and report accuracy.

    Each non-blank line of ``data_path`` must be a JSON object with a
    'question' field and an 'answer' field; the reference value is read from
    the answer with ``extract_final_answer``. A prediction is correct when it
    is within 1e-6 of the reference. A missing prediction or reference counts
    as incorrect.

    The file is read as UTF-8 regardless of the platform default, and blank
    lines (for example a trailing newline at the end of the file) are
    skipped instead of raising a JSON decoding error.

    Args:
        agents: agents passed through to ``run_debate``.
        data_path: path of the JSONL problem file.
        num_questions: if given, only the first ``num_questions`` problems
            are evaluated.

    Returns:
        The mean accuracy over the evaluated problems, as computed by
        ``np.mean``.
    """
    accuracies = []

    with open(data_path, 'r', encoding='utf-8') as file:
        problems = [json.loads(line) for line in file if line.strip()]
    if num_questions is not None:
        problems = problems[:num_questions]

    print(f"\n{COLORS['Round']}{'='*20} Math Problems Evaluation {'='*20}{COLORS['RESET']}\n")
    print(f"{COLORS['Round']}Evaluating {len(problems)} questions{COLORS['RESET']}\n")

    for i, entry in enumerate(problems, 1):
        print(f"\n{COLORS['Round']}Problem {i}/{len(problems)}{COLORS['RESET']}\n")

        question = entry['question']
        correct_answer = extract_final_answer(entry['answer'])
        final_decision = run_debate(agents, question)
        predicted_answer = extract_llm_numerical_answer(final_decision)

        try:
            if predicted_answer is not None and correct_answer is not None:
                accurate = 1 if abs(float(predicted_answer) - float(correct_answer)) < 1e-6 else 0
            else:
                accurate = 0
        except (ValueError, TypeError):
            accurate = 0

        accuracies.append(accurate)
        ColoredLogger.print_result(predicted_answer, correct_answer, accurate)

        # Running accuracy after every fifth problem.
        if (i % 5) == 0:
            print(f"{COLORS['Round']}Current Accuracy: {np.mean(accuracies):.2%}{COLORS['RESET']}\n")

    final_accuracy = np.mean(accuracies)
    print(f"\n{COLORS['Success']}Final Accuracy: {final_accuracy:.2%}{COLORS['RESET']}\n")
    return final_accuracy

# Entry point: evaluate one Solver plus one Judge on the first 30 problems.
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    model_name = "Qwen/Qwen2.5-Math-7B-Instruct"
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Load once; log the failure for context, then let the exception propagate.
    try:
        model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", device_map="auto")
        tokenizer = AutoTokenizer.from_pretrained(model_name)
    except Exception as e:
        logging.error(f"Failed to load model {model_name}: {e}")
        raise

    # All agents share the same model and tokenizer objects.
    agents = [
        QwenAgent(name=agent_name, role=agent_role, model=model, tokenizer=tokenizer, device=device)
        for agent_name, agent_role in (
            ('Agent1', 'Solver'),
            # ('Agent2', 'Solver'),
            ('Judge', 'Judge'),
        )
    ]

    evaluate_on_math_problems(agents, "test.jsonl", num_questions=30)

INFO:accelerate.utils.modeling:We will use 90% of the memory on device 0 for storing the model, and 10% for the buffer to avoid OOM. You can set `max_memory` in to a higher value to use more memory (at your own risk).


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]





[97mEvaluating 30 questions[0m


[97mProblem 1/30[0m


[96mQuestion: Janet’s ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with four. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. How much in dollars does she make every day at the farmers' market?[0m



[92m[Solver - Agent1][0m
[92mTo determine how much Janet makes every day at the farmers' market, we need to follow these steps:

1. Calculate the total number of eggs Janet has each day.
2. Determine how many eggs Janet consumes each day.
3. Calculate the number of eggs Janet has left after consuming some.
4. Calculate the amount of money Janet makes from selling the remaining eggs.

**Step 1: Calculate the total number of eggs Janet has each day.**

Janet's ducks lay 16 eggs per day.

**Step 2: Determine how many eggs Janet consumes each day.**

Janet eats 3 eggs for breakfast every morning and bakes muffins for her friends 

## 90% accuracy on 30 questions (2 agents, 1 judge, 1 round)

In [10]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
import logging
import json
import numpy as np
import re
from collections import Counter

# Define color codes for better visualization
# ANSI SGR escape sequences keyed by agent role / message category.
# "RESET" restores the terminal's default colour.
COLORS = {
    "Solver": "\x1b[92m",    # bright green
    "Critic": "\x1b[93m",    # bright yellow
    "Judge": "\x1b[94m",     # bright blue
    "Round": "\x1b[97m",     # bright white
    "Question": "\x1b[96m",  # bright cyan
    "Warning": "\x1b[91m",   # bright red
    "Success": "\x1b[92m",   # bright green
    "Error": "\x1b[91m",     # bright red
    "RESET": "\x1b[0m",
}

class ColoredLogger:
    """Static helpers that print debate output wrapped in the ANSI codes from COLORS."""

    @staticmethod
    def print_colored(role, name, message):
        """Print a "[role - name]" header line, then the message, both in the role's colour.

        Roles without an entry in COLORS fall back to the reset code (no colour).
        """
        reset = COLORS['RESET']
        tint = COLORS.get(role, reset)
        print(f"{tint}[{role} - {name}]{reset}")
        print(f"{tint}{message}{reset}\n")

    @staticmethod
    def print_round(round_num):
        """Print a banner line announcing the given round (number or label)."""
        rule = '=' * 20
        print(f"\n{COLORS['Round']}{rule} Round {round_num} {rule}{COLORS['RESET']}\n")

    @staticmethod
    def print_result(predicted, correct, accurate):
        """Print predicted vs. correct answer, coloured by whether `accurate` is truthy."""
        tint = COLORS['Success' if accurate else 'Error']
        print(f"{tint}Predicted: {predicted}, Correct: {correct}, Accurate: {accurate}{COLORS['RESET']}\n")

def clean_number(num_str):
    """Keep at most one decimal point: return everything before a second '.', if any.

    Strings without a '.' are returned unchanged, e.g. "1.2.3" -> "1.2", "42" -> "42".
    """
    if '.' not in num_str:
        return num_str
    integer_part, _, remainder = num_str.partition('.')
    fraction_part = remainder.partition('.')[0]
    return f"{integer_part}.{fraction_part}"

def extract_final_answer(answer_text):
    """Parse the reference answer that follows the '####' marker as a float.

    Dollar signs and thousands separators are stripped before parsing.
    Returns None when the marker is absent or the remainder is not a number.
    """
    if '####' not in answer_text:
        return None
    try:
        after_marker = answer_text.split('####')[1]
        normalized = after_marker.strip().replace('$', '').replace(',', '').strip()
        return float(clean_number(normalized))
    except (ValueError, IndexError):
        return None

def extract_llm_numerical_answer(response):
    """Extract a numerical answer from an LLM response.

    Three strategies are tried in order; the first one that yields a parseable
    number wins:

    1. the first parseable ``\\boxed{<number>}`` occurrence,
    2. the number following "final answer is" (case-insensitive),
    3. the last ``= <number>`` that ends a line.

    A number may carry a leading minus sign, thousands separators and a
    decimal point. Previously a leading '-' was not accepted, so negative
    answers could never be extracted even though the reference-answer parser
    accepts them.

    Returns:
        The answer as a float, or None when no strategy produces a number.
    """
    number = r'-?[0-9,.]+'

    def _to_float(token):
        # Truncate repeated decimals, drop thousands separators, then parse.
        try:
            return float(clean_number(token).replace(',', ''))
        except ValueError:
            return None

    # A single pattern suffices: "$\boxed{..}$" contains "\boxed{..}" anyway.
    for token in re.findall(r'\\boxed{(' + number + r')}', response):
        value = _to_float(token)
        if value is not None:
            return value

    final_match = re.search(r'final answer is:?\s*\$?\s*(' + number + r')', response, re.IGNORECASE)
    if final_match:
        value = _to_float(final_match.group(1))
        if value is not None:
            return value

    calc_matches = re.findall(r'=\s*(' + number + r')\s*$', response, re.MULTILINE)
    if calc_matches:
        value = _to_float(calc_matches[-1])
        if value is not None:
            return value

    return None

class QwenAgent:
    """A named debate participant with a role, backed by a chat model and its tokenizer."""

    def __init__(self, name, role, model, tokenizer, device):
        """Store the agent's identity and the model, tokenizer and device used to generate."""
        self.name, self.role = name, role
        self.model, self.tokenizer = model, tokenizer
        self.device = device

    def generate_response(self, messages):
        """Render `messages` through the chat template, generate, and return the stripped reply.

        Any exception raised along the way is logged and converted into an
        "Error generating response: ..." string rather than propagated.
        """
        try:
            prompt_text = self.tokenizer.apply_chat_template(
                messages, tokenize=False, add_generation_prompt=True
            )
            encoded = self.tokenizer([prompt_text], return_tensors="pt").to(self.device)

            # NOTE(review): `temperature` normally only matters when sampling is
            # enabled (do_sample=True here or in the model's generation config) - confirm.
            sequences = self.model.generate(
                **encoded,
                max_new_tokens=512,
                num_return_sequences=1,
                temperature=0.7,
                pad_token_id=self.tokenizer.eos_token_id,
            )

            # Slice off the leading prompt-length tokens so only new tokens are decoded.
            completions = []
            for prompt_ids, full_ids in zip(encoded.input_ids, sequences):
                completions.append(full_ids[len(prompt_ids):])

            decoded = self.tokenizer.batch_decode(completions, skip_special_tokens=True)
            return decoded[0].strip()

        except Exception as e:
            logging.error(f"Error in generate_response for {self.role}: {str(e)}")
            return f"Error generating response: {str(e)}"

def construct_message(agent, previous_responses, question, round_num=1):
    """Build the [system, user] chat messages for an agent according to its role.

    Solver, round 1: asked to solve the question from scratch.
    Solver, later rounds: shown every previous response in full and asked to refine.
    Judge: shown only the numerical answers extracted from the previous responses.
    Any other role yields None.
    """
    role = agent.role

    if role == 'Judge':
        extracted = (extract_llm_numerical_answer(resp) for resp in previous_responses)
        answer_list = ', '.join([str(value) for value in extracted if value is not None])
        system_text = "You are Qwen, a mathematical Judge. Choose the most accurate answer and provide it in \\boxed{answer} format."
        user_text = f"Question: {question}\nAll answers: {answer_list}\nProvide final answer:"
    elif role == 'Solver' and round_num == 1:
        system_text = "You are Qwen, a precise mathematical Solver. Provide step-by-step solutions and end with \\boxed{answer}."
        user_text = f"Solve this math problem step by step: {question}"
    elif role == 'Solver':
        numbered = [f"Solution {pos}: {resp}" for pos, resp in enumerate(previous_responses, start=1)]
        solutions = "\n".join(numbered)
        system_text = "You are Qwen, a precise mathematical Solver. Review previous solutions and provide your refined answer."
        user_text = f"Problem: {question}\nPrevious solutions:\n{solutions}\nProvide your solution:"
    else:
        return None

    return [
        {'role': 'system', 'content': system_text},
        {'role': 'user', 'content': user_text},
    ]

def run_debate(agents, question, rounds=1):
    """Run a math problem debate using Qwen agents.

    In each round every agent whose role is 'Solver' answers the question;
    from round 2 on, solvers are shown the responses of earlier rounds.
    Afterwards the first agent whose role is 'Judge' receives all solver
    responses and produces the final decision.

    Args:
        agents: iterable of agents exposing `role`, `name` and `generate_response`.
        question: the problem statement.
        rounds: number of solver rounds.

    Returns:
        The judge's raw response text.

    Raises:
        StopIteration: if `agents` contains no agent with role 'Judge'.
    """
    all_responses = []
    solver_responses = []

    print(f"\n{COLORS['Question']}Question: {question}{COLORS['RESET']}\n")

    for round_num in range(1, rounds + 1):
        ColoredLogger.print_round(round_num)
        round_responses = []

        for agent in agents:
            if agent.role == 'Solver':
                response = agent.generate_response(
                    construct_message(agent, solver_responses, question, round_num)
                )
                round_responses.append(response)
                all_responses.append(response)
                ColoredLogger.print_colored('Solver', agent.name, response)

        # Solvers only see responses from completed rounds, so extend after the loop.
        solver_responses.extend(round_responses)

        if round_num < rounds:
            print(f"\n{COLORS['Round']}End of Round {round_num}{COLORS['RESET']}\n")
            current_answers = [extract_llm_numerical_answer(resp) for resp in round_responses]
            print(f"{COLORS['Round']}Current answers: {current_answers}{COLORS['RESET']}\n")

    ColoredLogger.print_round("Final Judgment")
    judge = next(agent for agent in agents if agent.role == 'Judge')
    # Call the judge itself. The previous code used the leftover loop variable
    # `agent`, which is only the judge when the judge is last in `agents`
    # (and is undefined when rounds == 0).
    judge_response = judge.generate_response(construct_message(judge, all_responses, question))
    ColoredLogger.print_colored('Judge', judge.name, judge_response)

    return judge_response

def evaluate_on_math_problems(agents, data_path, num_questions=None):
    """Evaluate the Qwen-based system on math problems.

    Args:
        agents: agents passed through to `run_debate` for every problem.
        data_path: path to a JSON Lines file; each record needs a 'question'
            and an 'answer' field (the answer is parsed after its '####' marker).
        num_questions: if given, only the first `num_questions` problems are used.

    Returns:
        The mean accuracy over the evaluated problems. A prediction counts as
        correct when it is within 1e-6 of the reference answer.
    """
    accuracies = []

    # Read as UTF-8 explicitly so decoding does not depend on the platform
    # locale (the questions contain non-ASCII punctuation), and skip blank
    # lines, which would otherwise make json.loads raise.
    with open(data_path, 'r', encoding='utf-8') as file:
        problems = [json.loads(line) for line in file if line.strip()]
    if num_questions is not None:
        problems = problems[:num_questions]

    print(f"\n{COLORS['Round']}{'='*20} Math Problems Evaluation {'='*20}{COLORS['RESET']}\n")
    print(f"{COLORS['Round']}Evaluating {len(problems)} questions{COLORS['RESET']}\n")

    for i, entry in enumerate(problems, 1):
        print(f"\n{COLORS['Round']}Problem {i}/{len(problems)}{COLORS['RESET']}\n")

        question = entry['question']
        correct_answer = extract_final_answer(entry['answer'])
        final_decision = run_debate(agents, question)
        predicted_answer = extract_llm_numerical_answer(final_decision)

        # A missing prediction or reference counts as a miss.
        try:
            if predicted_answer is not None and correct_answer is not None:
                accurate = 1 if abs(float(predicted_answer) - float(correct_answer)) < 1e-6 else 0
            else:
                accurate = 0
        except (ValueError, TypeError):
            accurate = 0

        accuracies.append(accurate)
        ColoredLogger.print_result(predicted_answer, correct_answer, accurate)

        # Running accuracy every five problems.
        if (i % 5) == 0:
            print(f"{COLORS['Round']}Current Accuracy: {np.mean(accuracies):.2%}{COLORS['RESET']}\n")

    final_accuracy = np.mean(accuracies)
    print(f"\n{COLORS['Success']}Final Accuracy: {final_accuracy:.2%}{COLORS['RESET']}\n")
    return final_accuracy

if __name__ == "__main__":
    # Script entry point: set up logging, load the shared model and tokenizer,
    # build two solvers plus a judge on top of them, and evaluate 30 problems.
    logging.basicConfig(level=logging.INFO)
    model_name = "Qwen/Qwen2.5-Math-7B-Instruct"
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    try:
        model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", device_map="auto")
        tokenizer = AutoTokenizer.from_pretrained(model_name)
    except Exception as e:
        logging.error(f"Failed to load model {model_name}: {e}")
        raise

    # All agents share the same model and tokenizer; only name and role differ.
    agents = [
        QwenAgent(name=agent_name, role=agent_role, model=model, tokenizer=tokenizer, device=device)
        for agent_name, agent_role in (('Agent1', 'Solver'), ('Agent2', 'Solver'), ('Judge', 'Judge'))
    ]

    evaluate_on_math_problems(agents, "test.jsonl", num_questions=30)

INFO:accelerate.utils.modeling:We will use 90% of the memory on device 0 for storing the model, and 10% for the buffer to avoid OOM. You can set `max_memory` in to a higher value to use more memory (at your own risk).


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]



[97mEvaluating 30 questions[0m


[97mProblem 1/30[0m


[96mQuestion: Janet’s ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with four. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. How much in dollars does she make every day at the farmers' market?[0m







[92m[Solver - Agent1][0m
[92mTo determine how much Janet makes every day at the farmers' market, we need to follow these steps:

1. Calculate the total number of eggs Janet has each day.
2. Determine how many eggs Janet consumes each day.
3. Calculate the number of eggs Janet has left after consuming some.
4. Calculate the amount of money Janet makes from selling the remaining eggs.

**Step 1: Calculate the total number of eggs Janet has each day.**

Janet's ducks lay 16 eggs per day.

**Step 2: Determine how many eggs Janet consumes each day.**

Janet eats 3 eggs for breakfast every morning and bakes muffins for her friends every day with 4 eggs. Therefore, the total number of eggs she consumes each day is:
\[ 3 + 4 = 7 \]

**Step 3: Calculate the number of eggs Janet has left after consuming some.**

After consuming 7 eggs, the number of eggs Janet has left is:
\[ 16 - 7 = 9 \]

**Step 4: Calculate the amount of money Janet makes from selling the remaining eggs.**

Janet sells eac

## 93.33% with 3 Agents and 1 Judge for 30 questions Rounds = 1

In [15]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
import logging
import json
import numpy as np
import re
from collections import Counter

# Define color codes for better visualization
# ANSI SGR escape sequences keyed by agent role / message category.
# "RESET" restores the terminal's default colour.
COLORS = {
    "Solver": "\x1b[92m",    # bright green
    "Critic": "\x1b[93m",    # bright yellow
    "Judge": "\x1b[94m",     # bright blue
    "Round": "\x1b[97m",     # bright white
    "Question": "\x1b[96m",  # bright cyan
    "Warning": "\x1b[91m",   # bright red
    "Success": "\x1b[92m",   # bright green
    "Error": "\x1b[91m",     # bright red
    "RESET": "\x1b[0m",
}

class ColoredLogger:
    """Static helpers that print debate output wrapped in the ANSI codes from COLORS."""

    @staticmethod
    def print_colored(role, name, message):
        """Print a "[role - name]" header line, then the message, both in the role's colour.

        Roles without an entry in COLORS fall back to the reset code (no colour).
        """
        reset = COLORS['RESET']
        tint = COLORS.get(role, reset)
        print(f"{tint}[{role} - {name}]{reset}")
        print(f"{tint}{message}{reset}\n")

    @staticmethod
    def print_round(round_num):
        """Print a banner line announcing the given round (number or label)."""
        rule = '=' * 20
        print(f"\n{COLORS['Round']}{rule} Round {round_num} {rule}{COLORS['RESET']}\n")

    @staticmethod
    def print_result(predicted, correct, accurate):
        """Print predicted vs. correct answer, coloured by whether `accurate` is truthy."""
        tint = COLORS['Success' if accurate else 'Error']
        print(f"{tint}Predicted: {predicted}, Correct: {correct}, Accurate: {accurate}{COLORS['RESET']}\n")

    @staticmethod
    def print_summary(role, name, summary):
        """Print a "[Summary - role name]" header line, then the summary, in the role's colour."""
        reset = COLORS['RESET']
        tint = COLORS.get(role, reset)
        print(f"{tint}[Summary - {role} {name}]{reset}")
        print(f"{tint}{summary}{reset}\n")

def clean_number(num_str):
    """Keep at most one decimal point: return everything before a second '.', if any.

    Strings without a '.' are returned unchanged, e.g. "1.2.3" -> "1.2", "42" -> "42".
    """
    if '.' not in num_str:
        return num_str
    integer_part, _, remainder = num_str.partition('.')
    fraction_part = remainder.partition('.')[0]
    return f"{integer_part}.{fraction_part}"

def extract_final_answer(answer_text):
    """Parse the reference answer that follows the '####' marker as a float.

    Dollar signs and thousands separators are stripped before parsing.
    Returns None when the marker is absent or the remainder is not a number.
    """
    if '####' not in answer_text:
        return None
    try:
        after_marker = answer_text.split('####')[1]
        normalized = after_marker.strip().replace('$', '').replace(',', '').strip()
        return float(clean_number(normalized))
    except (ValueError, IndexError):
        return None

def extract_llm_numerical_answer(response):
    """Extract a numerical answer from an LLM response.

    Three strategies are tried in order; the first one that yields a parseable
    number wins:

    1. the first parseable ``\\boxed{<number>}`` occurrence,
    2. the number following "final answer is" (case-insensitive),
    3. the last ``= <number>`` that ends a line.

    A number may carry a leading minus sign, thousands separators and a
    decimal point. Previously a leading '-' was not accepted, so negative
    answers could never be extracted even though the reference-answer parser
    accepts them.

    Returns:
        The answer as a float, or None when no strategy produces a number.
    """
    number = r'-?[0-9,.]+'

    def _to_float(token):
        # Truncate repeated decimals, drop thousands separators, then parse.
        try:
            return float(clean_number(token).replace(',', ''))
        except ValueError:
            return None

    # A single pattern suffices: "$\boxed{..}$" contains "\boxed{..}" anyway.
    for token in re.findall(r'\\boxed{(' + number + r')}', response):
        value = _to_float(token)
        if value is not None:
            return value

    final_match = re.search(r'final answer is:?\s*\$?\s*(' + number + r')', response, re.IGNORECASE)
    if final_match:
        value = _to_float(final_match.group(1))
        if value is not None:
            return value

    calc_matches = re.findall(r'=\s*(' + number + r')\s*$', response, re.MULTILINE)
    if calc_matches:
        value = _to_float(calc_matches[-1])
        if value is not None:
            return value

    return None

def summarize_solution(agent, solution_text):
    """Ask `agent` for a short summary (at most 3 key steps plus final answer) of a solution.

    Returns whatever `agent.generate_response` returns for the summarization prompt.
    """
    system_prompt = "You are Qwen, a mathematical solution summarizer. Extract and summarize the key steps and final answer from the solution. Be concise."
    user_prompt = f"Summarize this mathematical solution in at most 3 key steps and the final answer:\n{solution_text}"
    return agent.generate_response([
        {'role': 'system', 'content': system_prompt},
        {'role': 'user', 'content': user_prompt},
    ])

def extract_key_points(responses):
    """Extract key numerical values and steps from multiple responses.

    For each response (numbered from 1) this collects, in order:
    the extracted final answer (if any), the first two numbered-list steps,
    and the last two lines that end in ``= <number>``.

    Returns:
        The collected key points joined with newlines ("" if there are none).
    """
    key_points = []
    for idx, response in enumerate(responses, 1):
        # Final answer
        answer = extract_llm_numerical_answer(response)
        if answer is not None:
            key_points.append(f"Solution {idx} answer: {answer}")

        # Line-anchored patterns with re.MULTILINE. The earlier patterns consumed
        # the trailing newline of every match, so the line directly after a match
        # could never match itself - consecutive steps/equations were skipped and
        # the true last calculation could be lost.
        steps = re.findall(r'^\d+\.\s*(.*?)$', response, re.MULTILINE)  # Numbered steps
        equations = re.findall(r'^.*?=\s*[-+]?\d*\.?\d+$', response, re.MULTILINE)  # Equations

        key_points.extend(f"Solution {idx} step: {step}" for step in steps[:2])  # First two steps
        key_points.extend(f"Solution {idx} calculation: {eq.strip()}" for eq in equations[-2:])  # Last two calculations

    return "\n".join(key_points)

class QwenAgent:
    """A named debate participant with a role, backed by a chat model and its tokenizer."""

    def __init__(self, name, role, model, tokenizer, device):
        """Store the agent's identity and the model, tokenizer and device used to generate."""
        self.name, self.role = name, role
        self.model, self.tokenizer = model, tokenizer
        self.device = device

    def generate_response(self, messages):
        """Render `messages` through the chat template, generate, and return the stripped reply.

        Any exception raised along the way is logged and converted into an
        "Error generating response: ..." string rather than propagated.
        """
        try:
            prompt_text = self.tokenizer.apply_chat_template(
                messages, tokenize=False, add_generation_prompt=True
            )
            encoded = self.tokenizer([prompt_text], return_tensors="pt").to(self.device)

            # NOTE(review): `temperature` normally only matters when sampling is
            # enabled (do_sample=True here or in the model's generation config) - confirm.
            sequences = self.model.generate(
                **encoded,
                max_new_tokens=512,
                num_return_sequences=1,
                temperature=0.7,
                pad_token_id=self.tokenizer.eos_token_id,
            )

            # Slice off the leading prompt-length tokens so only new tokens are decoded.
            completions = []
            for prompt_ids, full_ids in zip(encoded.input_ids, sequences):
                completions.append(full_ids[len(prompt_ids):])

            decoded = self.tokenizer.batch_decode(completions, skip_special_tokens=True)
            return decoded[0].strip()

        except Exception as e:
            logging.error(f"Error in generate_response for {self.role}: {str(e)}")
            return f"Error generating response: {str(e)}"

def construct_message(agent, previous_responses, question, round_num=1):
    """Construct messages for Qwen's chat format with summarized context.

    Solver, round 1: asked to solve the question from scratch.
    Solver, later rounds: shown the key points extracted from previous responses.
    Judge: shown each solution's extracted numerical answer plus the last
    ``= <number>`` line of (at most) the last two solutions that have one.
    Any other role yields None.

    Returns:
        A [system, user] list of chat message dicts, or None for unknown roles.
    """
    if agent.role == 'Solver':
        if round_num == 1:
            return [
                {'role': 'system', 'content':
                 "You are Qwen, a precise mathematical Solver. Provide step-by-step solutions and end with \\boxed{answer}."},
                {'role': 'user', 'content': f"Solve this math problem step by step: {question}"}
            ]

        # Later rounds: pass a condensed view of earlier solutions, not the full text.
        summarized_context = extract_key_points(previous_responses)
        return [
            {'role': 'system', 'content':
             "You are Qwen, a precise mathematical Solver. Review previous solutions and provide your refined answer."},
            {'role': 'user', 'content':
             f"Problem: {question}\n\nKey points from previous solutions:\n{summarized_context}\n\n" +
             "Provide your solution, considering these previous approaches and explain why your solution is correct:"}
        ]

    if agent.role == 'Judge':
        answers = []
        key_calculations = []

        for idx, resp in enumerate(previous_responses, 1):
            answer = extract_llm_numerical_answer(resp)
            if answer is not None:
                answers.append(f"Solution {idx}: {answer}")

            # Line-anchored with re.MULTILINE: the earlier pattern consumed the
            # newline after each match, so an equation directly below another one
            # was skipped and equations[-1] was not always the real last equation.
            equations = re.findall(r'^.*?=\s*[-+]?\d*\.?\d+$', resp, re.MULTILINE)
            if equations:
                key_calculations.append(f"Solution {idx} final step: {equations[-1].strip()}")

        context = "\n".join(answers)
        if key_calculations:
            context += "\n\nKey calculations:\n" + "\n".join(key_calculations[-2:])  # Only last 2 calculations

        return [
            {'role': 'system', 'content':
             "You are Qwen, a mathematical Judge. Analyze the numerical answers and key calculations. " +
             "Provide a very brief explanation (1-2 sentences) and the final answer in \\boxed{answer} format."},
            {'role': 'user', 'content':
             f"Question: {question}\n\nSolutions summary:\n{context}\n\n" +
             "Briefly justify and provide final answer:"}
        ]

    return None

def run_debate(agents, question, rounds=1):
    """Run solver rounds on `question`, then let the judge decide; return the judge's reply.

    Every 'Solver' agent answers in each round; from round 2 on, solvers get a
    condensed context of earlier rounds and each reply is also summarized and
    printed. The first 'Judge' agent finally sees all solver replies.
    Raises StopIteration if no agent has the 'Judge' role.
    """
    transcript = []      # every solver reply in generation order (given to the judge)
    prior_rounds = []    # replies of completed rounds only (context for solvers)

    print(f"\n{COLORS['Question']}Question: {question}{COLORS['RESET']}\n")

    for round_num in range(1, rounds + 1):
        ColoredLogger.print_round(round_num)
        this_round = []

        for agent in agents:
            if agent.role != 'Solver':
                continue

            reply = agent.generate_response(
                construct_message(agent, prior_rounds, question, round_num)
            )
            this_round.append(reply)
            transcript.append(reply)
            ColoredLogger.print_colored('Solver', agent.name, reply)

            # After the first round, also show a condensed version of the reply.
            if round_num > 1:
                ColoredLogger.print_summary('Solver', agent.name, summarize_solution(agent, reply))

        prior_rounds.extend(this_round)

        if round_num >= rounds:
            continue

        print(f"\n{COLORS['Round']}End of Round {round_num}{COLORS['RESET']}\n")
        current_answers = list(map(extract_llm_numerical_answer, this_round))
        print(f"{COLORS['Round']}Current answers: {current_answers}{COLORS['RESET']}\n")

        # Show the condensed context that the next round's solvers will receive.
        context_summary = extract_key_points(prior_rounds)
        print(f"{COLORS['Round']}Context for next round:{COLORS['RESET']}\n{context_summary}\n")

    ColoredLogger.print_round("Final Judgment")
    judge = next(candidate for candidate in agents if candidate.role == 'Judge')
    verdict = judge.generate_response(construct_message(judge, transcript, question))
    ColoredLogger.print_colored('Judge', judge.name, verdict)

    # Echo just the extracted number, when there is one, for readability.
    final_answer = extract_llm_numerical_answer(verdict)
    if final_answer is not None:
        print(f"{COLORS['Success']}Final Answer: {final_answer}{COLORS['RESET']}\n")

    return verdict

def evaluate_on_math_problems(agents, data_path, num_questions=None):
    """Evaluate the Qwen-based system on math problems.

    Args:
        agents: agents passed through to `run_debate` for every problem.
        data_path: path to a JSON Lines file; each record needs a 'question'
            and an 'answer' field (the answer is parsed after its '####' marker).
        num_questions: if given, only the first `num_questions` problems are used.

    Returns:
        The mean accuracy over the evaluated problems. A prediction counts as
        correct when it is within 1e-6 of the reference answer.
    """
    accuracies = []

    # Read as UTF-8 explicitly so decoding does not depend on the platform
    # locale (the questions contain non-ASCII punctuation), and skip blank
    # lines, which would otherwise make json.loads raise.
    with open(data_path, 'r', encoding='utf-8') as file:
        problems = [json.loads(line) for line in file if line.strip()]
    if num_questions is not None:
        problems = problems[:num_questions]

    print(f"\n{COLORS['Round']}{'='*20} Math Problems Evaluation {'='*20}{COLORS['RESET']}\n")
    print(f"{COLORS['Round']}Evaluating {len(problems)} questions{COLORS['RESET']}\n")

    for i, entry in enumerate(problems, 1):
        print(f"\n{COLORS['Round']}Problem {i}/{len(problems)}{COLORS['RESET']}\n")

        question = entry['question']
        correct_answer = extract_final_answer(entry['answer'])
        final_decision = run_debate(agents, question)
        predicted_answer = extract_llm_numerical_answer(final_decision)

        # A missing prediction or reference counts as a miss.
        try:
            if predicted_answer is not None and correct_answer is not None:
                accurate = 1 if abs(float(predicted_answer) - float(correct_answer)) < 1e-6 else 0
            else:
                accurate = 0
        except (ValueError, TypeError):
            accurate = 0

        accuracies.append(accurate)
        ColoredLogger.print_result(predicted_answer, correct_answer, accurate)

        # Running accuracy every five problems.
        if (i % 5) == 0:
            print(f"{COLORS['Round']}Current Accuracy: {np.mean(accuracies):.2%}{COLORS['RESET']}\n")

    final_accuracy = np.mean(accuracies)
    print(f"\n{COLORS['Success']}Final Accuracy: {final_accuracy:.2%}{COLORS['RESET']}\n")
    return final_accuracy

if __name__ == "__main__":
    # Script entry point: set up logging, load the shared model and tokenizer,
    # build three solvers plus a judge on top of them, and evaluate 30 problems.
    logging.basicConfig(level=logging.INFO)
    model_name = "Qwen/Qwen2.5-Math-7B-Instruct"
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    try:
        model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", device_map="auto")
        tokenizer = AutoTokenizer.from_pretrained(model_name)
    except Exception as e:
        logging.error(f"Failed to load model {model_name}: {e}")
        raise

    # All agents share the same model and tokenizer; only name and role differ.
    agents = [
        QwenAgent(name=agent_name, role=agent_role, model=model, tokenizer=tokenizer, device=device)
        for agent_name, agent_role in (
            ('Agent1', 'Solver'),
            ('Agent2', 'Solver'),
            ('Agent3', 'Solver'),
            ('Judge', 'Judge'),
        )
    ]

    evaluate_on_math_problems(agents, "test.jsonl", num_questions=30)

INFO:accelerate.utils.modeling:We will use 90% of the memory on device 0 for storing the model, and 10% for the buffer to avoid OOM. You can set `max_memory` in to a higher value to use more memory (at your own risk).


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]



[97mEvaluating 30 questions[0m


[97mProblem 1/30[0m


[96mQuestion: Janet’s ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with four. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. How much in dollars does she make every day at the farmers' market?[0m



[92m[Solver - Agent1][0m
[92mTo determine how much Janet makes every day at the farmers' market, we need to follow these steps:

1. Calculate the total number of eggs Janet has each day.
2. Determine how many eggs Janet consumes each day.
3. Calculate the number of eggs Janet has left after consuming some.
4. Calculate the amount of money Janet makes from selling the remaining eggs.

**Step 1: Calculate the total number of eggs Janet has each day.**

Janet's ducks lay 16 eggs per day.

**Step 2: Determine how many eggs Janet consumes each day.**

Janet eats 3 eggs for breakfast every morning and bakes muffins for her friends 

## 96.67% for 4 Solvers and 1 Judge for 30 Questions Rounds=1

In [16]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
import logging
import json
import numpy as np
import re
from collections import Counter

# Define color codes for better visualization
# ANSI SGR escape sequences keyed by agent role / message category.
# "RESET" restores the terminal's default colour.
COLORS = {
    "Solver": "\x1b[92m",    # bright green
    "Critic": "\x1b[93m",    # bright yellow
    "Judge": "\x1b[94m",     # bright blue
    "Round": "\x1b[97m",     # bright white
    "Question": "\x1b[96m",  # bright cyan
    "Warning": "\x1b[91m",   # bright red
    "Success": "\x1b[92m",   # bright green
    "Error": "\x1b[91m",     # bright red
    "RESET": "\x1b[0m",
}

class ColoredLogger:
    """Static helpers that print debate output wrapped in the ANSI codes from COLORS."""

    @staticmethod
    def print_colored(role, name, message):
        """Print a "[role - name]" header line, then the message, both in the role's colour.

        Roles without an entry in COLORS fall back to the reset code (no colour).
        """
        reset = COLORS['RESET']
        tint = COLORS.get(role, reset)
        print(f"{tint}[{role} - {name}]{reset}")
        print(f"{tint}{message}{reset}\n")

    @staticmethod
    def print_round(round_num):
        """Print a banner line announcing the given round (number or label)."""
        rule = '=' * 20
        print(f"\n{COLORS['Round']}{rule} Round {round_num} {rule}{COLORS['RESET']}\n")

    @staticmethod
    def print_result(predicted, correct, accurate):
        """Print predicted vs. correct answer, coloured by whether `accurate` is truthy."""
        tint = COLORS['Success' if accurate else 'Error']
        print(f"{tint}Predicted: {predicted}, Correct: {correct}, Accurate: {accurate}{COLORS['RESET']}\n")

    @staticmethod
    def print_summary(role, name, summary):
        """Print a "[Summary - role name]" header line, then the summary, in the role's colour."""
        reset = COLORS['RESET']
        tint = COLORS.get(role, reset)
        print(f"{tint}[Summary - {role} {name}]{reset}")
        print(f"{tint}{summary}{reset}\n")

def clean_number(num_str):
    """Keep at most one decimal point: return everything before a second '.', if any.

    Strings without a '.' are returned unchanged, e.g. "1.2.3" -> "1.2", "42" -> "42".
    """
    if '.' not in num_str:
        return num_str
    integer_part, _, remainder = num_str.partition('.')
    fraction_part = remainder.partition('.')[0]
    return f"{integer_part}.{fraction_part}"

def extract_final_answer(answer_text):
    """Parse the reference answer that follows the '####' marker as a float.

    Dollar signs and thousands separators are stripped before parsing.
    Returns None when the marker is absent or the remainder is not a number.
    """
    if '####' not in answer_text:
        return None
    try:
        after_marker = answer_text.split('####')[1]
        normalized = after_marker.strip().replace('$', '').replace(',', '').strip()
        return float(clean_number(normalized))
    except (ValueError, IndexError):
        return None

def extract_llm_numerical_answer(response):
    """Extract a numerical answer from an LLM response.

    Three strategies are tried in order; the first one that yields a parseable
    number wins:

    1. the first parseable ``\\boxed{<number>}`` occurrence,
    2. the number following "final answer is" (case-insensitive),
    3. the last ``= <number>`` that ends a line.

    A number may carry a leading minus sign, thousands separators and a
    decimal point. Previously a leading '-' was not accepted, so negative
    answers could never be extracted even though the reference-answer parser
    accepts them.

    Returns:
        The answer as a float, or None when no strategy produces a number.
    """
    number = r'-?[0-9,.]+'

    def _to_float(token):
        # Truncate repeated decimals, drop thousands separators, then parse.
        try:
            return float(clean_number(token).replace(',', ''))
        except ValueError:
            return None

    # A single pattern suffices: "$\boxed{..}$" contains "\boxed{..}" anyway.
    for token in re.findall(r'\\boxed{(' + number + r')}', response):
        value = _to_float(token)
        if value is not None:
            return value

    final_match = re.search(r'final answer is:?\s*\$?\s*(' + number + r')', response, re.IGNORECASE)
    if final_match:
        value = _to_float(final_match.group(1))
        if value is not None:
            return value

    calc_matches = re.findall(r'=\s*(' + number + r')\s*$', response, re.MULTILINE)
    if calc_matches:
        value = _to_float(calc_matches[-1])
        if value is not None:
            return value

    return None

def summarize_solution(agent, solution_text):
    """Ask `agent` for a short summary (at most 3 key steps plus final answer) of a solution.

    Returns whatever `agent.generate_response` returns for the summarization prompt.
    """
    system_prompt = "You are Qwen, a mathematical solution summarizer. Extract and summarize the key steps and final answer from the solution. Be concise."
    user_prompt = f"Summarize this mathematical solution in at most 3 key steps and the final answer:\n{solution_text}"
    return agent.generate_response([
        {'role': 'system', 'content': system_prompt},
        {'role': 'user', 'content': user_prompt},
    ])

def extract_key_points(responses):
    """Extract key numerical values and steps from multiple responses.

    For each response (numbered from 1) this collects, in order:
    the extracted final answer (if any), the first two numbered-list steps,
    and the last two lines that end in ``= <number>``.

    Returns:
        The collected key points joined with newlines ("" if there are none).
    """
    key_points = []
    for idx, response in enumerate(responses, 1):
        # Final answer
        answer = extract_llm_numerical_answer(response)
        if answer is not None:
            key_points.append(f"Solution {idx} answer: {answer}")

        # Line-anchored patterns with re.MULTILINE. The earlier patterns consumed
        # the trailing newline of every match, so the line directly after a match
        # could never match itself - consecutive steps/equations were skipped and
        # the true last calculation could be lost.
        steps = re.findall(r'^\d+\.\s*(.*?)$', response, re.MULTILINE)  # Numbered steps
        equations = re.findall(r'^.*?=\s*[-+]?\d*\.?\d+$', response, re.MULTILINE)  # Equations

        key_points.extend(f"Solution {idx} step: {step}" for step in steps[:2])  # First two steps
        key_points.extend(f"Solution {idx} calculation: {eq.strip()}" for eq in equations[-2:])  # Last two calculations

    return "\n".join(key_points)

class QwenAgent:
    """A named debate participant (e.g. a Solver or the Judge) backed by a chat model.

    The constructor stores its arguments unchanged as attributes of the same name.
    """

    def __init__(self, name, role, model, tokenizer, device):
        # Identity of the agent within the debate.
        self.name, self.role = name, role
        # Generation backend, plus the device the encoded prompt is moved to.
        self.model, self.tokenizer, self.device = model, tokenizer, device

    def generate_response(self, messages):
        """Render ``messages`` with the tokenizer's chat template and return the model's reply.

        Args:
            messages: Chat messages handed to ``tokenizer.apply_chat_template``.

        Returns:
            The decoded, whitespace-stripped text of the newly generated tokens. If any
            step raises, the error is logged and a string starting with
            "Error generating response: " is returned instead of propagating.
        """
        try:
            prompt_text = self.tokenizer.apply_chat_template(
                messages, tokenize=False, add_generation_prompt=True
            )
            encoded = self.tokenizer([prompt_text], return_tensors="pt").to(self.device)

            # NOTE(review): do_sample is not passed here; temperature presumably only
            # matters when the model's generation config enables sampling - confirm,
            # otherwise every Solver may produce the same greedy output.
            sequences = self.model.generate(
                **encoded,
                max_new_tokens=1024,
                num_return_sequences=1,
                temperature=0.7,
                pad_token_id=self.tokenizer.eos_token_id,
            )

            # Keep only the tokens that follow each prompt in the returned sequences.
            completions = []
            for prompt_ids, full_ids in zip(encoded.input_ids, sequences):
                completions.append(full_ids[len(prompt_ids):])

            decoded = self.tokenizer.batch_decode(completions, skip_special_tokens=True)
            return decoded[0].strip()

        except Exception as exc:
            logging.error(f"Error in generate_response for {self.role}: {str(exc)}")
            return f"Error generating response: {str(exc)}"

def construct_message(agent, previous_responses, question, round_num=1):
    """Build the chat messages sent to ``agent``, depending on its role.

    * ``Solver``, round 1: a plain "solve step by step" prompt.
    * ``Solver``, later rounds: the problem plus key points extracted from
      ``previous_responses`` via ``extract_key_points``.
    * ``Judge``: the numeric answer of every previous response, plus the last
      ``... = <number>`` line of (at most) the last two responses that contain one.

    Args:
        agent: Object with a ``role`` attribute.
        previous_responses: Earlier response strings (unused for a round-1 Solver).
        question: Problem statement.
        round_num: 1-based debate round; only affects Solver prompts.

    Returns:
        A list of ``{'role': ..., 'content': ...}`` dicts, or None for any other role.
    """
    if agent.role == 'Solver':
        if round_num == 1:
            return [
                {'role': 'system', 'content':
                 "You are Qwen, a precise mathematical Solver. Provide step-by-step solutions and end with \\boxed{answer}."},
                {'role': 'user', 'content': f"Solve this math problem step by step: {question}"}
            ]

        # Later rounds: feed back a condensed view of the earlier solutions.
        summarized_context = extract_key_points(previous_responses)
        return [
            {'role': 'system', 'content':
             "You are Qwen, a precise mathematical Solver. Review previous solutions and provide your refined answer."},
            {'role': 'user', 'content':
             f"Problem: {question}\n\nKey points from previous solutions:\n{summarized_context}\n\n" +
             "Provide your solution, considering these previous approaches and explain why your solution is correct:"}
        ]

    if agent.role == 'Judge':
        # Anchored per line with re.MULTILINE. The earlier ``(?:^|\n)...(?:\n|$)`` form
        # consumed each match's trailing newline, so an equation directly below another
        # one was skipped and ``equations[-1]`` could miss the real last equation.
        equation_re = re.compile(r'^.*=[ \t]*[-+]?\d*\.?\d+[ \t]*$', re.MULTILINE)

        answers = []
        key_calculations = []
        for idx, resp in enumerate(previous_responses, 1):
            answer = extract_llm_numerical_answer(resp)
            if answer is not None:
                answers.append(f"Solution {idx}: {answer}")

            equations = equation_re.findall(resp)
            if equations:
                key_calculations.append(f"Solution {idx} final step: {equations[-1].strip()}")

        context = "\n".join(answers)
        if key_calculations:
            # Only the last two collected calculations are shown to the judge.
            context += "\n\nKey calculations:\n" + "\n".join(key_calculations[-2:])

        return [
            {'role': 'system', 'content':
             "You are Qwen, a mathematical Judge. Analyze the numerical answers and key calculations. " +
             "Provide a very brief explanation (1-2 sentences) and the final answer in \\boxed{answer} format."},
            {'role': 'user', 'content':
             f"Question: {question}\n\nSolutions summary:\n{context}\n\n" +
             "Briefly justify and provide final answer:"}
        ]

    # Unknown roles get no prompt (same as the original implicit fall-through).
    return None

def run_debate(agents, question, rounds=1):
    """Let every Solver answer ``question`` for ``rounds`` rounds, then ask the Judge.

    In round 1 each Solver sees only the question; in later rounds it also gets the
    key points of all replies from the completed rounds. After the last round the
    first agent whose role is ``'Judge'`` receives every Solver reply and decides.

    Args:
        agents: Agents with ``role``, ``name`` and ``generate_response``.
        question: Problem statement.
        rounds: Number of Solver rounds.

    Returns:
        The Judge's raw response text.
    """
    print(f"\n{COLORS['Question']}Question: {question}{COLORS['RESET']}\n")

    transcript = []      # every Solver reply in generation order (given to the Judge)
    prior_rounds = []    # replies from completed rounds (context for the next round)

    for round_num in range(1, rounds + 1):
        ColoredLogger.print_round(round_num)
        current_round = []

        for member in agents:
            if member.role != 'Solver':
                continue

            prompt = construct_message(member, prior_rounds, question, round_num)
            reply = member.generate_response(prompt)
            current_round.append(reply)
            transcript.append(reply)
            ColoredLogger.print_colored('Solver', member.name, reply)

            # From round 2 on, also show a model-written summary of the reply.
            if round_num > 1:
                ColoredLogger.print_summary('Solver', member.name, summarize_solution(member, reply))

        prior_rounds.extend(current_round)

        if round_num < rounds:
            print(f"\n{COLORS['Round']}End of Round {round_num}{COLORS['RESET']}\n")
            current_answers = [extract_llm_numerical_answer(reply) for reply in current_round]
            print(f"{COLORS['Round']}Current answers: {current_answers}{COLORS['RESET']}\n")

            # Show the condensed context the next round's Solvers will receive.
            context_summary = extract_key_points(prior_rounds)
            print(f"{COLORS['Round']}Context for next round:{COLORS['RESET']}\n{context_summary}\n")

    ColoredLogger.print_round("Final Judgment")
    judge = next(member for member in agents if member.role == 'Judge')
    verdict_prompt = construct_message(judge, transcript, question)
    judge_response = judge.generate_response(verdict_prompt)
    ColoredLogger.print_colored('Judge', judge.name, judge_response)

    # Echo just the extracted number, when there is one.
    final_answer = extract_llm_numerical_answer(judge_response)
    if final_answer is not None:
        print(f"{COLORS['Success']}Final Answer: {final_answer}{COLORS['RESET']}\n")

    return judge_response

def evaluate_on_math_problems(agents, data_path, num_questions=None):
    """Run the debate on problems from a JSON Lines file and report accuracy.

    Each non-blank line of ``data_path`` is parsed as a JSON object with a
    ``question`` entry and an ``answer`` entry; the reference value is taken from
    ``answer`` by ``extract_final_answer`` and the prediction from the Judge's
    response by ``extract_llm_numerical_answer``.

    Args:
        agents: Agents forwarded to ``run_debate``.
        data_path: Path to the JSONL problem file.
        num_questions: If not None, only the first ``num_questions`` problems are used.

    Returns:
        Fraction of problems whose prediction is within 1e-6 of the reference.
        ``nan`` when there are no problems to evaluate.
    """
    # Explicit UTF-8: the problem text contains non-ASCII characters (e.g. curly
    # apostrophes), which the platform default encoding may not decode correctly.
    # Blank lines (such as a trailing newline) are skipped instead of crashing json.loads.
    with open(data_path, 'r', encoding='utf-8') as file:
        problems = [json.loads(line) for line in file if line.strip()]
    if num_questions is not None:
        problems = problems[:num_questions]

    print(f"\n{COLORS['Round']}{'='*20} Math Problems Evaluation {'='*20}{COLORS['RESET']}\n")
    print(f"{COLORS['Round']}Evaluating {len(problems)} questions{COLORS['RESET']}\n")

    if not problems:
        # np.mean([]) would emit a RuntimeWarning and print "nan%"; say so explicitly.
        print(f"{COLORS['Warning']}No problems to evaluate.{COLORS['RESET']}\n")
        return float('nan')

    accuracies = []
    for i, entry in enumerate(problems, 1):
        print(f"\n{COLORS['Round']}Problem {i}/{len(problems)}{COLORS['RESET']}\n")

        question = entry['question']
        correct_answer = extract_final_answer(entry['answer'])
        final_decision = run_debate(agents, question)
        predicted_answer = extract_llm_numerical_answer(final_decision)

        # A missing prediction or reference counts as a wrong answer.
        try:
            if predicted_answer is not None and correct_answer is not None:
                accurate = 1 if abs(float(predicted_answer) - float(correct_answer)) < 1e-6 else 0
            else:
                accurate = 0
        except (ValueError, TypeError):
            accurate = 0

        accuracies.append(accurate)
        ColoredLogger.print_result(predicted_answer, correct_answer, accurate)

        # Running accuracy after every fifth problem.
        if (i % 5) == 0:
            print(f"{COLORS['Round']}Current Accuracy: {np.mean(accuracies):.2%}{COLORS['RESET']}\n")

    final_accuracy = np.mean(accuracies)
    print(f"\n{COLORS['Success']}Final Accuracy: {final_accuracy:.2%}{COLORS['RESET']}\n")
    return final_accuracy

if __name__ == "__main__":
    # Script entry point: load one shared model, build the agents, run the evaluation.
    logging.basicConfig(level=logging.INFO)
    model_name = "Qwen/Qwen2.5-Math-7B-Instruct"
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    try:
        model = AutoModelForCausalLM.from_pretrained(
            model_name, torch_dtype="auto", device_map="auto"
        )
        tokenizer = AutoTokenizer.from_pretrained(model_name)
    except Exception as exc:
        # Log which checkpoint failed, then let the error propagate.
        logging.error(f"Failed to load model {model_name}: {exc}")
        raise

    # Four Solvers (Agent1..Agent4) followed by one Judge, all sharing the same model.
    agents = [
        QwenAgent(name=f'Agent{n}', role='Solver', model=model, tokenizer=tokenizer, device=device)
        for n in range(1, 5)
    ] + [
        QwenAgent(name='Judge', role='Judge', model=model, tokenizer=tokenizer, device=device)
    ]

    evaluate_on_math_problems(agents, "test.jsonl", num_questions=30)

INFO:accelerate.utils.modeling:We will use 90% of the memory on device 0 for storing the model, and 10% for the buffer to avoid OOM. You can set `max_memory` in to a higher value to use more memory (at your own risk).


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]



[97mEvaluating 30 questions[0m


[97mProblem 1/30[0m


[96mQuestion: Janet’s ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with four. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. How much in dollars does she make every day at the farmers' market?[0m



[92m[Solver - Agent1][0m
[92mTo determine how much Janet makes every day at the farmers' market, we need to follow these steps:

1. Calculate the total number of eggs Janet has each day.
2. Determine how many eggs Janet consumes each day.
3. Calculate the number of eggs Janet has left after consuming some.
4. Calculate the amount of money Janet makes from selling the remaining eggs.

**Step 1: Calculate the total number of eggs Janet has each day.**

Janet's ducks lay 16 eggs per day.

**Step 2: Determine how many eggs Janet consumes each day.**

Janet eats 3 eggs for breakfast every morning and bakes muffins for her friends 

## 96.67% for 5 Solvers and 1 Judge for 30 Questions Rounds=1

In [18]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
import logging
import json
import numpy as np
import re
from collections import Counter

# ANSI escape sequences keyed by output category; 'RESET' ends the colored span.
COLORS = {
    label: f'\033[{code}m'
    for label, code in (
        ('Solver', 92),    # Green
        ('Critic', 93),    # Yellow
        ('Judge', 94),     # Blue
        ('Round', 97),     # White
        ('Question', 96),  # Cyan
        ('Warning', 91),   # Red
        ('Success', 92),   # Green
        ('Error', 91),     # Red
        ('RESET', 0),
    )
}

class ColoredLogger:
    """Static helpers that print debate output wrapped in the ANSI codes from ``COLORS``."""

    @staticmethod
    def print_colored(role, name, message):
        """Print a ``[role - name]`` header, then ``message`` and a blank line, in the role's color.

        Roles without an entry in ``COLORS`` fall back to the reset code.
        """
        tint, reset = COLORS.get(role, COLORS['RESET']), COLORS['RESET']
        print(f"{tint}[{role} - {name}]{reset}")
        print(f"{tint}{message}{reset}\n")

    @staticmethod
    def print_round(round_num):
        """Print a banner line for the given round label, surrounded by blank lines."""
        rule = '=' * 20
        print(f"\n{COLORS['Round']}{rule} Round {round_num} {rule}{COLORS['RESET']}\n")

    @staticmethod
    def print_result(predicted, correct, accurate):
        """Print predicted vs. correct answer, using the success color when ``accurate`` is truthy."""
        if accurate:
            tint = COLORS['Success']
        else:
            tint = COLORS['Error']
        print(f"{tint}Predicted: {predicted}, Correct: {correct}, Accurate: {accurate}{COLORS['RESET']}\n")

    @staticmethod
    def print_summary(role, name, summary):
        """Print a ``[Summary - role name]`` header, then ``summary`` and a blank line."""
        tint, reset = COLORS.get(role, COLORS['RESET']), COLORS['RESET']
        print(f"{tint}[Summary - {role} {name}]{reset}")
        print(f"{tint}{summary}{reset}\n")

def clean_number(num_str):
    """Truncate a numeric string just before its second decimal point.

    Strings without a '.' are returned unchanged; otherwise only the text before
    the first '.' and the text between the first and second '.' are kept
    (e.g. "1.2.3" becomes "1.2").
    """
    if '.' not in num_str:
        return num_str
    integer_part, _, remainder = num_str.partition('.')
    fractional_part = remainder.partition('.')[0]
    return f"{integer_part}.{fractional_part}"

def extract_final_answer(answer_text):
    """Return the number that follows the first '####' marker in ``answer_text``.

    Dollar signs and commas are removed and the text is passed through
    ``clean_number`` before conversion.

    Returns:
        The value as a float, or None when the marker is missing or the text
        after it does not parse as a number.
    """
    marker = '####'
    if marker not in answer_text:
        return None

    try:
        raw = answer_text.split(marker)[1].strip()
        for unwanted in ('$', ','):
            raw = raw.replace(unwanted, '')
        return float(clean_number(raw.strip()))
    except (ValueError, IndexError):
        return None

def extract_llm_numerical_answer(response):
    """Pull a single numeric answer out of free-form model output.

    Strategies are tried in priority order; the first value that parses wins:
      1. the first ``\\boxed{<number>}`` whose content parses,
      2. the first "final answer is <number>" phrase (case-insensitive),
      3. the last line that ends in ``= <number>``.

    A number is an optional leading minus sign followed by digits, commas and
    periods. Commas are dropped and the token goes through ``clean_number`` before
    conversion. The minus sign is new: the earlier patterns only accepted
    ``[0-9,.]``, so negative answers could never be extracted.

    Args:
        response: Model output text.

    Returns:
        The answer as a float, or None when no strategy yields a number.
    """
    number = r'-?[0-9,.]+'

    def _to_float(token):
        """Convert a matched token to float; None when it is not a valid number."""
        try:
            return float(clean_number(token).replace(',', ''))
        except ValueError:
            return None

    # 1. \boxed{...}; a surrounding "$...$" needs no separate alternative because
    #    the bare pattern already matches inside it.
    for token in re.findall(r'\\boxed\{\s*(' + number + r')\s*\}', response):
        value = _to_float(token)
        if value is not None:
            return value

    # 2. "final answer is ..." with an optional colon and dollar sign.
    final_match = re.search(r'final answer is:?\s*\$?\s*(' + number + r')', response, re.IGNORECASE)
    if final_match:
        value = _to_float(final_match.group(1))
        if value is not None:
            return value

    # 3. Last "= <number>" at the end of a line.
    calc_matches = re.findall(r'=\s*(' + number + r')\s*$', response, re.MULTILINE)
    if calc_matches:
        return _to_float(calc_matches[-1])

    return None

def summarize_solution(agent, solution_text):
    """Ask ``agent`` for a condensed version of a worked solution.

    Args:
        agent: Object exposing ``generate_response(messages)``.
        solution_text: Text of the solution to condense.

    Returns:
        Whatever ``agent.generate_response`` returns for the summarization prompt.
    """
    system_prompt = (
        "You are Qwen, a mathematical solution summarizer. "
        "Extract and summarize the key steps and final answer from the solution. Be concise."
    )
    user_prompt = f"Summarize this mathematical solution in at most 3 key steps and the final answer:\n{solution_text}"

    return agent.generate_response([
        {'role': 'system', 'content': system_prompt},
        {'role': 'user', 'content': user_prompt},
    ])

def extract_key_points(responses):
    """Condense solver responses into a newline-separated list of key points.

    For each response (numbered from 1) this collects, in order:
      * the numeric answer found by ``extract_llm_numerical_answer`` (if any),
      * the text of the first two numbered-list lines (``1. ...``),
      * the last two lines that end in ``= <number>``.

    Args:
        responses: Iterable of response strings.

    Returns:
        The collected key points joined with newlines ("" when nothing is found).
    """
    # Both patterns are anchored per line with re.MULTILINE. The earlier form,
    # ``(?:^|\n)...(?:\n|$)``, consumed the trailing newline of every match, so the
    # line directly after a match could never match and adjacent steps/equations
    # were silently skipped (a list "1. 2. 3." yielded steps 1 and 3).
    # ``(?!\d)`` keeps a line that starts with a decimal such as "3.5" from being
    # read as list item 3.
    step_re = re.compile(r'^\d+\.(?!\d)[ \t]*(\S.*)$', re.MULTILINE)
    equation_re = re.compile(r'^.*=[ \t]*[-+]?\d*\.?\d+[ \t]*$', re.MULTILINE)

    key_points = []
    for idx, response in enumerate(responses, 1):
        # Final numeric answer, when one can be extracted.
        answer = extract_llm_numerical_answer(response)
        if answer is not None:
            key_points.append(f"Solution {idx} answer: {answer}")

        steps = step_re.findall(response)
        equations = equation_re.findall(response)

        # Keep the first two numbered steps and the last two calculations.
        key_points.extend(f"Solution {idx} step: {step}" for step in steps[:2])
        key_points.extend(f"Solution {idx} calculation: {eq.strip()}" for eq in equations[-2:])

    return "\n".join(key_points)

class QwenAgent:
    """A named debate participant (e.g. a Solver or the Judge) backed by a chat model.

    The constructor stores its arguments unchanged as attributes of the same name.
    """

    def __init__(self, name, role, model, tokenizer, device):
        # Identity of the agent within the debate.
        self.name, self.role = name, role
        # Generation backend, plus the device the encoded prompt is moved to.
        self.model, self.tokenizer, self.device = model, tokenizer, device

    def generate_response(self, messages):
        """Render ``messages`` with the tokenizer's chat template and return the model's reply.

        Args:
            messages: Chat messages handed to ``tokenizer.apply_chat_template``.

        Returns:
            The decoded, whitespace-stripped text of the newly generated tokens. If any
            step raises, the error is logged and a string starting with
            "Error generating response: " is returned instead of propagating.
        """
        try:
            prompt_text = self.tokenizer.apply_chat_template(
                messages, tokenize=False, add_generation_prompt=True
            )
            encoded = self.tokenizer([prompt_text], return_tensors="pt").to(self.device)

            # NOTE(review): do_sample is not passed here; temperature presumably only
            # matters when the model's generation config enables sampling - confirm,
            # otherwise every Solver may produce the same greedy output.
            sequences = self.model.generate(
                **encoded,
                max_new_tokens=1024,
                num_return_sequences=1,
                temperature=0.7,
                pad_token_id=self.tokenizer.eos_token_id,
            )

            # Keep only the tokens that follow each prompt in the returned sequences.
            completions = []
            for prompt_ids, full_ids in zip(encoded.input_ids, sequences):
                completions.append(full_ids[len(prompt_ids):])

            decoded = self.tokenizer.batch_decode(completions, skip_special_tokens=True)
            return decoded[0].strip()

        except Exception as exc:
            logging.error(f"Error in generate_response for {self.role}: {str(exc)}")
            return f"Error generating response: {str(exc)}"

def construct_message(agent, previous_responses, question, round_num=1):
    """Build the chat messages sent to ``agent``, depending on its role.

    * ``Solver``, round 1: a plain "solve step by step" prompt.
    * ``Solver``, later rounds: the problem plus key points extracted from
      ``previous_responses`` via ``extract_key_points``.
    * ``Judge``: the numeric answer of every previous response, plus the last
      ``... = <number>`` line of (at most) the last two responses that contain one.

    Args:
        agent: Object with a ``role`` attribute.
        previous_responses: Earlier response strings (unused for a round-1 Solver).
        question: Problem statement.
        round_num: 1-based debate round; only affects Solver prompts.

    Returns:
        A list of ``{'role': ..., 'content': ...}`` dicts, or None for any other role.
    """
    if agent.role == 'Solver':
        if round_num == 1:
            return [
                {'role': 'system', 'content':
                 "You are Qwen, a precise mathematical Solver. Provide step-by-step solutions and end with \\boxed{answer}."},
                {'role': 'user', 'content': f"Solve this math problem step by step: {question}"}
            ]

        # Later rounds: feed back a condensed view of the earlier solutions.
        summarized_context = extract_key_points(previous_responses)
        return [
            {'role': 'system', 'content':
             "You are Qwen, a precise mathematical Solver. Review previous solutions and provide your refined answer."},
            {'role': 'user', 'content':
             f"Problem: {question}\n\nKey points from previous solutions:\n{summarized_context}\n\n" +
             "Provide your solution, considering these previous approaches and explain why your solution is correct:"}
        ]

    if agent.role == 'Judge':
        # Anchored per line with re.MULTILINE. The earlier ``(?:^|\n)...(?:\n|$)`` form
        # consumed each match's trailing newline, so an equation directly below another
        # one was skipped and ``equations[-1]`` could miss the real last equation.
        equation_re = re.compile(r'^.*=[ \t]*[-+]?\d*\.?\d+[ \t]*$', re.MULTILINE)

        answers = []
        key_calculations = []
        for idx, resp in enumerate(previous_responses, 1):
            answer = extract_llm_numerical_answer(resp)
            if answer is not None:
                answers.append(f"Solution {idx}: {answer}")

            equations = equation_re.findall(resp)
            if equations:
                key_calculations.append(f"Solution {idx} final step: {equations[-1].strip()}")

        context = "\n".join(answers)
        if key_calculations:
            # Only the last two collected calculations are shown to the judge.
            context += "\n\nKey calculations:\n" + "\n".join(key_calculations[-2:])

        return [
            {'role': 'system', 'content':
             "You are Qwen, a mathematical Judge. Analyze the numerical answers and key calculations. " +
             "Provide a very brief explanation (1-2 sentences) and the final answer in \\boxed{answer} format."},
            {'role': 'user', 'content':
             f"Question: {question}\n\nSolutions summary:\n{context}\n\n" +
             "Briefly justify and provide final answer:"}
        ]

    # Unknown roles get no prompt (same as the original implicit fall-through).
    return None

def run_debate(agents, question, rounds=1):
    """Let every Solver answer ``question`` for ``rounds`` rounds, then ask the Judge.

    In round 1 each Solver sees only the question; in later rounds it also gets the
    key points of all replies from the completed rounds. After the last round the
    first agent whose role is ``'Judge'`` receives every Solver reply and decides.

    Args:
        agents: Agents with ``role``, ``name`` and ``generate_response``.
        question: Problem statement.
        rounds: Number of Solver rounds.

    Returns:
        The Judge's raw response text.
    """
    print(f"\n{COLORS['Question']}Question: {question}{COLORS['RESET']}\n")

    transcript = []      # every Solver reply in generation order (given to the Judge)
    prior_rounds = []    # replies from completed rounds (context for the next round)

    for round_num in range(1, rounds + 1):
        ColoredLogger.print_round(round_num)
        current_round = []

        for member in agents:
            if member.role != 'Solver':
                continue

            prompt = construct_message(member, prior_rounds, question, round_num)
            reply = member.generate_response(prompt)
            current_round.append(reply)
            transcript.append(reply)
            ColoredLogger.print_colored('Solver', member.name, reply)

            # From round 2 on, also show a model-written summary of the reply.
            if round_num > 1:
                ColoredLogger.print_summary('Solver', member.name, summarize_solution(member, reply))

        prior_rounds.extend(current_round)

        if round_num < rounds:
            print(f"\n{COLORS['Round']}End of Round {round_num}{COLORS['RESET']}\n")
            current_answers = [extract_llm_numerical_answer(reply) for reply in current_round]
            print(f"{COLORS['Round']}Current answers: {current_answers}{COLORS['RESET']}\n")

            # Show the condensed context the next round's Solvers will receive.
            context_summary = extract_key_points(prior_rounds)
            print(f"{COLORS['Round']}Context for next round:{COLORS['RESET']}\n{context_summary}\n")

    ColoredLogger.print_round("Final Judgment")
    judge = next(member for member in agents if member.role == 'Judge')
    verdict_prompt = construct_message(judge, transcript, question)
    judge_response = judge.generate_response(verdict_prompt)
    ColoredLogger.print_colored('Judge', judge.name, judge_response)

    # Echo just the extracted number, when there is one.
    final_answer = extract_llm_numerical_answer(judge_response)
    if final_answer is not None:
        print(f"{COLORS['Success']}Final Answer: {final_answer}{COLORS['RESET']}\n")

    return judge_response

def evaluate_on_math_problems(agents, data_path, num_questions=None):
    """Run the debate on problems from a JSON Lines file and report accuracy.

    Each non-blank line of ``data_path`` is parsed as a JSON object with a
    ``question`` entry and an ``answer`` entry; the reference value is taken from
    ``answer`` by ``extract_final_answer`` and the prediction from the Judge's
    response by ``extract_llm_numerical_answer``.

    Args:
        agents: Agents forwarded to ``run_debate``.
        data_path: Path to the JSONL problem file.
        num_questions: If not None, only the first ``num_questions`` problems are used.

    Returns:
        Fraction of problems whose prediction is within 1e-6 of the reference.
        ``nan`` when there are no problems to evaluate.
    """
    # Explicit UTF-8: the problem text contains non-ASCII characters (e.g. curly
    # apostrophes), which the platform default encoding may not decode correctly.
    # Blank lines (such as a trailing newline) are skipped instead of crashing json.loads.
    with open(data_path, 'r', encoding='utf-8') as file:
        problems = [json.loads(line) for line in file if line.strip()]
    if num_questions is not None:
        problems = problems[:num_questions]

    print(f"\n{COLORS['Round']}{'='*20} Math Problems Evaluation {'='*20}{COLORS['RESET']}\n")
    print(f"{COLORS['Round']}Evaluating {len(problems)} questions{COLORS['RESET']}\n")

    if not problems:
        # np.mean([]) would emit a RuntimeWarning and print "nan%"; say so explicitly.
        print(f"{COLORS['Warning']}No problems to evaluate.{COLORS['RESET']}\n")
        return float('nan')

    accuracies = []
    for i, entry in enumerate(problems, 1):
        print(f"\n{COLORS['Round']}Problem {i}/{len(problems)}{COLORS['RESET']}\n")

        question = entry['question']
        correct_answer = extract_final_answer(entry['answer'])
        final_decision = run_debate(agents, question)
        predicted_answer = extract_llm_numerical_answer(final_decision)

        # A missing prediction or reference counts as a wrong answer.
        try:
            if predicted_answer is not None and correct_answer is not None:
                accurate = 1 if abs(float(predicted_answer) - float(correct_answer)) < 1e-6 else 0
            else:
                accurate = 0
        except (ValueError, TypeError):
            accurate = 0

        accuracies.append(accurate)
        ColoredLogger.print_result(predicted_answer, correct_answer, accurate)

        # Running accuracy after every fifth problem.
        if (i % 5) == 0:
            print(f"{COLORS['Round']}Current Accuracy: {np.mean(accuracies):.2%}{COLORS['RESET']}\n")

    final_accuracy = np.mean(accuracies)
    print(f"\n{COLORS['Success']}Final Accuracy: {final_accuracy:.2%}{COLORS['RESET']}\n")
    return final_accuracy

if __name__ == "__main__":
    # Script entry point: load one shared model, build the agents, run the evaluation.
    logging.basicConfig(level=logging.INFO)
    model_name = "Qwen/Qwen2.5-Math-7B-Instruct"
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    try:
        model = AutoModelForCausalLM.from_pretrained(
            model_name, torch_dtype="auto", device_map="auto"
        )
        tokenizer = AutoTokenizer.from_pretrained(model_name)
    except Exception as exc:
        # Log which checkpoint failed, then let the error propagate.
        logging.error(f"Failed to load model {model_name}: {exc}")
        raise

    # Five Solvers (Agent1..Agent5) followed by one Judge, all sharing the same model.
    agents = [
        QwenAgent(name=f'Agent{n}', role='Solver', model=model, tokenizer=tokenizer, device=device)
        for n in range(1, 6)
    ] + [
        QwenAgent(name='Judge', role='Judge', model=model, tokenizer=tokenizer, device=device)
    ]

    evaluate_on_math_problems(agents, "test.jsonl", num_questions=30)

INFO:accelerate.utils.modeling:We will use 90% of the memory on device 0 for storing the model, and 10% for the buffer to avoid OOM. You can set `max_memory` in to a higher value to use more memory (at your own risk).


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]



[97mEvaluating 30 questions[0m


[97mProblem 1/30[0m


[96mQuestion: Janet’s ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with four. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. How much in dollars does she make every day at the farmers' market?[0m



[92m[Solver - Agent1][0m
[92mTo determine how much Janet makes every day at the farmers' market, we need to follow these steps:

1. Calculate the total number of eggs Janet has each day.
2. Determine how many eggs Janet consumes each day.
3. Calculate the number of eggs Janet has left after consuming some.
4. Calculate the amount of money Janet makes from selling the remaining eggs.

**Step 1: Calculate the total number of eggs Janet has each day.**

Janet's ducks lay 16 eggs per day.

**Step 2: Determine how many eggs Janet consumes each day.**

Janet eats 3 eggs for breakfast every morning and bakes muffins for her friends 

## 96.67% for 1 Solver and 1 Judge for 30 Questions Rounds=2

In [19]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
import logging
import json
import numpy as np
import re
from collections import Counter

# ANSI escape sequences keyed by output category; 'RESET' ends the colored span.
COLORS = {
    label: f'\033[{code}m'
    for label, code in (
        ('Solver', 92),    # Green
        ('Critic', 93),    # Yellow
        ('Judge', 94),     # Blue
        ('Round', 97),     # White
        ('Question', 96),  # Cyan
        ('Warning', 91),   # Red
        ('Success', 92),   # Green
        ('Error', 91),     # Red
        ('RESET', 0),
    )
}

class ColoredLogger:
    """Static helpers that print debate output wrapped in the ANSI codes from ``COLORS``."""

    @staticmethod
    def print_colored(role, name, message):
        """Print a ``[role - name]`` header, then ``message`` and a blank line, in the role's color.

        Roles without an entry in ``COLORS`` fall back to the reset code.
        """
        tint, reset = COLORS.get(role, COLORS['RESET']), COLORS['RESET']
        print(f"{tint}[{role} - {name}]{reset}")
        print(f"{tint}{message}{reset}\n")

    @staticmethod
    def print_round(round_num):
        """Print a banner line for the given round label, surrounded by blank lines."""
        rule = '=' * 20
        print(f"\n{COLORS['Round']}{rule} Round {round_num} {rule}{COLORS['RESET']}\n")

    @staticmethod
    def print_result(predicted, correct, accurate):
        """Print predicted vs. correct answer, using the success color when ``accurate`` is truthy."""
        if accurate:
            tint = COLORS['Success']
        else:
            tint = COLORS['Error']
        print(f"{tint}Predicted: {predicted}, Correct: {correct}, Accurate: {accurate}{COLORS['RESET']}\n")

    @staticmethod
    def print_summary(role, name, summary):
        """Print a ``[Summary - role name]`` header, then ``summary`` and a blank line."""
        tint, reset = COLORS.get(role, COLORS['RESET']), COLORS['RESET']
        print(f"{tint}[Summary - {role} {name}]{reset}")
        print(f"{tint}{summary}{reset}\n")

def clean_number(num_str):
    """Truncate a numeric string just before its second decimal point.

    Strings without a '.' are returned unchanged; otherwise only the text before
    the first '.' and the text between the first and second '.' are kept
    (e.g. "1.2.3" becomes "1.2").
    """
    if '.' not in num_str:
        return num_str
    integer_part, _, remainder = num_str.partition('.')
    fractional_part = remainder.partition('.')[0]
    return f"{integer_part}.{fractional_part}"

def extract_final_answer(answer_text):
    """Return the number that follows the first '####' marker in ``answer_text``.

    Dollar signs and commas are removed and the text is passed through
    ``clean_number`` before conversion.

    Returns:
        The value as a float, or None when the marker is missing or the text
        after it does not parse as a number.
    """
    marker = '####'
    if marker not in answer_text:
        return None

    try:
        raw = answer_text.split(marker)[1].strip()
        for unwanted in ('$', ','):
            raw = raw.replace(unwanted, '')
        return float(clean_number(raw.strip()))
    except (ValueError, IndexError):
        return None

def extract_llm_numerical_answer(response):
    """Pull a single numeric answer out of free-form model output.

    Strategies are tried in priority order; the first value that parses wins:
      1. the first ``\\boxed{<number>}`` whose content parses,
      2. the first "final answer is <number>" phrase (case-insensitive),
      3. the last line that ends in ``= <number>``.

    A number is an optional leading minus sign followed by digits, commas and
    periods. Commas are dropped and the token goes through ``clean_number`` before
    conversion. The minus sign is new: the earlier patterns only accepted
    ``[0-9,.]``, so negative answers could never be extracted.

    Args:
        response: Model output text.

    Returns:
        The answer as a float, or None when no strategy yields a number.
    """
    number = r'-?[0-9,.]+'

    def _to_float(token):
        """Convert a matched token to float; None when it is not a valid number."""
        try:
            return float(clean_number(token).replace(',', ''))
        except ValueError:
            return None

    # 1. \boxed{...}; a surrounding "$...$" needs no separate alternative because
    #    the bare pattern already matches inside it.
    for token in re.findall(r'\\boxed\{\s*(' + number + r')\s*\}', response):
        value = _to_float(token)
        if value is not None:
            return value

    # 2. "final answer is ..." with an optional colon and dollar sign.
    final_match = re.search(r'final answer is:?\s*\$?\s*(' + number + r')', response, re.IGNORECASE)
    if final_match:
        value = _to_float(final_match.group(1))
        if value is not None:
            return value

    # 3. Last "= <number>" at the end of a line.
    calc_matches = re.findall(r'=\s*(' + number + r')\s*$', response, re.MULTILINE)
    if calc_matches:
        return _to_float(calc_matches[-1])

    return None

def summarize_solution(agent, solution_text):
    """Ask ``agent`` for a condensed version of a worked solution.

    Args:
        agent: Object exposing ``generate_response(messages)``.
        solution_text: Text of the solution to condense.

    Returns:
        Whatever ``agent.generate_response`` returns for the summarization prompt.
    """
    system_prompt = (
        "You are Qwen, a mathematical solution summarizer. "
        "Extract and summarize the key steps and final answer from the solution. Be concise."
    )
    user_prompt = f"Summarize this mathematical solution in at most 3 key steps and the final answer:\n{solution_text}"

    return agent.generate_response([
        {'role': 'system', 'content': system_prompt},
        {'role': 'user', 'content': user_prompt},
    ])

def extract_key_points(responses):
    """Condense solver responses into a newline-separated list of key points.

    For each response (numbered from 1) this collects, in order:
      * the numeric answer found by ``extract_llm_numerical_answer`` (if any),
      * the text of the first two numbered-list lines (``1. ...``),
      * the last two lines that end in ``= <number>``.

    Args:
        responses: Iterable of response strings.

    Returns:
        The collected key points joined with newlines ("" when nothing is found).
    """
    # Both patterns are anchored per line with re.MULTILINE. The earlier form,
    # ``(?:^|\n)...(?:\n|$)``, consumed the trailing newline of every match, so the
    # line directly after a match could never match and adjacent steps/equations
    # were silently skipped (a list "1. 2. 3." yielded steps 1 and 3).
    # ``(?!\d)`` keeps a line that starts with a decimal such as "3.5" from being
    # read as list item 3.
    step_re = re.compile(r'^\d+\.(?!\d)[ \t]*(\S.*)$', re.MULTILINE)
    equation_re = re.compile(r'^.*=[ \t]*[-+]?\d*\.?\d+[ \t]*$', re.MULTILINE)

    key_points = []
    for idx, response in enumerate(responses, 1):
        # Final numeric answer, when one can be extracted.
        answer = extract_llm_numerical_answer(response)
        if answer is not None:
            key_points.append(f"Solution {idx} answer: {answer}")

        steps = step_re.findall(response)
        equations = equation_re.findall(response)

        # Keep the first two numbered steps and the last two calculations.
        key_points.extend(f"Solution {idx} step: {step}" for step in steps[:2])
        key_points.extend(f"Solution {idx} calculation: {eq.strip()}" for eq in equations[-2:])

    return "\n".join(key_points)

class QwenAgent:
    """A named debate participant (e.g. a Solver or the Judge) backed by a chat model.

    The constructor stores its arguments unchanged as attributes of the same name.
    """

    def __init__(self, name, role, model, tokenizer, device):
        # Identity of the agent within the debate.
        self.name, self.role = name, role
        # Generation backend, plus the device the encoded prompt is moved to.
        self.model, self.tokenizer, self.device = model, tokenizer, device

    def generate_response(self, messages):
        """Render ``messages`` with the tokenizer's chat template and return the model's reply.

        Args:
            messages: Chat messages handed to ``tokenizer.apply_chat_template``.

        Returns:
            The decoded, whitespace-stripped text of the newly generated tokens. If any
            step raises, the error is logged and a string starting with
            "Error generating response: " is returned instead of propagating.
        """
        try:
            prompt_text = self.tokenizer.apply_chat_template(
                messages, tokenize=False, add_generation_prompt=True
            )
            encoded = self.tokenizer([prompt_text], return_tensors="pt").to(self.device)

            # NOTE(review): do_sample is not passed here; temperature presumably only
            # matters when the model's generation config enables sampling - confirm,
            # otherwise every Solver may produce the same greedy output.
            sequences = self.model.generate(
                **encoded,
                max_new_tokens=1024,
                num_return_sequences=1,
                temperature=0.7,
                pad_token_id=self.tokenizer.eos_token_id,
            )

            # Keep only the tokens that follow each prompt in the returned sequences.
            completions = []
            for prompt_ids, full_ids in zip(encoded.input_ids, sequences):
                completions.append(full_ids[len(prompt_ids):])

            decoded = self.tokenizer.batch_decode(completions, skip_special_tokens=True)
            return decoded[0].strip()

        except Exception as exc:
            logging.error(f"Error in generate_response for {self.role}: {str(exc)}")
            return f"Error generating response: {str(exc)}"

def construct_message(agent, previous_responses, question, round_num=1):
    """Build the ``[system, user]`` chat messages for ``agent`` according to its role.

    Args:
        agent: Object with a ``role`` attribute ('Solver' or 'Judge').
        previous_responses: Raw text of earlier solver outputs to condense
            into context for the prompt.
        question: The math problem statement.
        round_num: 1-based debate round; only the Solver prompt depends on it.

    Returns:
        A two-element list of ``{'role', 'content'}`` dicts, or ``None`` when
        ``agent.role`` is neither 'Solver' nor 'Judge'.
    """
    if agent.role == 'Solver':
        if round_num == 1:
            return [
                {'role': 'system', 'content':
                 "You are Qwen, a precise mathematical Solver. Provide step-by-step solutions and end with \\boxed{answer}."},
                {'role': 'user', 'content': f"Solve this math problem step by step: {question}"}
            ]

        # Later rounds: feed back a condensed digest of the earlier solutions.
        summarized_context = extract_key_points(previous_responses)
        return [
            {'role': 'system', 'content':
             "You are Qwen, a precise mathematical Solver. Review previous solutions and provide your refined answer."},
            {'role': 'user', 'content':
             f"Problem: {question}\n\nKey points from previous solutions:\n{summarized_context}\n\n" +
             "Provide your solution, considering these previous approaches and explain why your solution is correct:"}
        ]

    if agent.role == 'Judge':
        # A line that ends in "= <number>". The end of line is a lookahead so
        # the newline is NOT consumed: it must stay available as the leading
        # anchor of the next match, otherwise every second adjacent equation
        # line is skipped and the "final step" may not be the last equation.
        equation_line = r'(?:^|\n).*?=\s*[-+]?\d*\.?\d+(?=\n|$)'

        answers = []
        key_calculations = []
        for idx, resp in enumerate(previous_responses, 1):
            answer = extract_llm_numerical_answer(resp)
            if answer is not None:
                answers.append(f"Solution {idx}: {answer}")

            equations = re.findall(equation_line, resp)
            if equations:
                key_calculations.append(f"Solution {idx} final step: {equations[-1].strip()}")

        context = "\n".join(answers)
        if key_calculations:
            # Only the two most recent solutions contribute a calculation.
            context += "\n\nKey calculations:\n" + "\n".join(key_calculations[-2:])

        return [
            {'role': 'system', 'content':
             "You are Qwen, a mathematical Judge. Analyze the numerical answers and key calculations. " +
             "Provide a very brief explanation (1-2 sentences) and the final answer in \\boxed{answer} format."},
            {'role': 'user', 'content':
             f"Question: {question}\n\nSolutions summary:\n{context}\n\n" +
             "Briefly justify and provide final answer:"}
        ]

    # Any other role gets no prompt.
    return None

def run_debate(agents, question, rounds=2):
    """Run a multi-round solver debate on ``question`` and return the Judge's verdict.

    In every round each agent whose role is 'Solver' answers the question; from
    round 2 on, its prompt includes a digest of the solver responses collected
    in earlier rounds. After the last round the Judge receives all solver
    responses and produces the final reply. Progress is printed as it happens.

    Args:
        agents: Iterable of agents; must contain at least one with role 'Judge'.
        question: The math problem statement.
        rounds: Number of solver rounds to run before judging.

    Returns:
        The Judge's raw response text.

    Raises:
        ValueError: If ``agents`` contains no agent with role 'Judge'.
    """
    # Resolve the judge before any generation happens, so a misconfigured
    # agent list fails immediately instead of after every solver round.
    judge = next((agent for agent in agents if agent.role == 'Judge'), None)
    if judge is None:
        raise ValueError("run_debate requires an agent whose role is 'Judge'")

    all_responses = []
    solver_responses = []

    print(f"\n{COLORS['Question']}Question: {question}{COLORS['RESET']}\n")

    for round_num in range(1, rounds + 1):
        ColoredLogger.print_round(round_num)
        round_responses = []

        for agent in agents:
            if agent.role == 'Solver':
                # Solvers only see responses from completed rounds.
                messages = construct_message(agent, solver_responses, question, round_num)
                response = agent.generate_response(messages)
                round_responses.append(response)
                all_responses.append(response)
                ColoredLogger.print_colored('Solver', agent.name, response)

                # Rounds after the first also print a model-written summary;
                # the summary is display-only and is not fed back into the debate.
                if round_num > 1:
                    summary = summarize_solution(agent, response)
                    ColoredLogger.print_summary('Solver', agent.name, summary)

        solver_responses.extend(round_responses)

        if round_num < rounds:
            print(f"\n{COLORS['Round']}End of Round {round_num}{COLORS['RESET']}\n")
            current_answers = [extract_llm_numerical_answer(resp) for resp in round_responses]
            print(f"{COLORS['Round']}Current answers: {current_answers}{COLORS['RESET']}\n")

            # Show the digest that the next round's solvers will receive.
            context_summary = extract_key_points(solver_responses)
            print(f"{COLORS['Round']}Context for next round:{COLORS['RESET']}\n{context_summary}\n")

    ColoredLogger.print_round("Final Judgment")
    judge_response = judge.generate_response(construct_message(judge, all_responses, question))
    ColoredLogger.print_colored('Judge', judge.name, judge_response)

    # Extract and display just the final answer for clarity.
    final_answer = extract_llm_numerical_answer(judge_response)
    if final_answer is not None:
        print(f"{COLORS['Success']}Final Answer: {final_answer}{COLORS['RESET']}\n")

    return judge_response

def evaluate_on_math_problems(agents, data_path, num_questions=None):
    """Run the debate on a JSON Lines problem set and report accuracy.

    Args:
        agents: Agents forwarded unchanged to ``run_debate``.
        data_path: Path to a JSON Lines file whose records carry a 'question'
            field and an 'answer' field.
        num_questions: If given, only the first ``num_questions`` records are used.

    Returns:
        The mean of the per-problem 0/1 scores, as computed by ``np.mean``.
    """
    accuracies = []

    # Read as UTF-8 explicitly (the questions contain non-ASCII punctuation)
    # and skip blank lines so a stray empty line does not abort the whole run.
    with open(data_path, 'r', encoding='utf-8') as file:
        problems = [json.loads(line) for line in file if line.strip()]
    if num_questions is not None:
        problems = problems[:num_questions]

    print(f"\n{COLORS['Round']}{'='*20} Math Problems Evaluation {'='*20}{COLORS['RESET']}\n")
    print(f"{COLORS['Round']}Evaluating {len(problems)} questions{COLORS['RESET']}\n")

    for i, entry in enumerate(problems, 1):
        print(f"\n{COLORS['Round']}Problem {i}/{len(problems)}{COLORS['RESET']}\n")

        question = entry['question']
        correct_answer = extract_final_answer(entry['answer'])
        final_decision = run_debate(agents, question)
        predicted_answer = extract_llm_numerical_answer(final_decision)

        # A problem counts as solved only when both values were extracted and
        # agree within an absolute tolerance of 1e-6.
        try:
            if predicted_answer is not None and correct_answer is not None:
                accurate = 1 if abs(float(predicted_answer) - float(correct_answer)) < 1e-6 else 0
            else:
                accurate = 0
        except (ValueError, TypeError):
            accurate = 0

        accuracies.append(accurate)
        ColoredLogger.print_result(predicted_answer, correct_answer, accurate)

        # Running accuracy every fifth problem.
        if (i % 5) == 0:
            print(f"{COLORS['Round']}Current Accuracy: {np.mean(accuracies):.2%}{COLORS['RESET']}\n")

    final_accuracy = np.mean(accuracies)
    print(f"\n{COLORS['Success']}Final Accuracy: {final_accuracy:.2%}{COLORS['RESET']}\n")
    return final_accuracy

# Script entry point: load the model once, wire up one Solver and one Judge
# that share it, and evaluate on the first 30 problems of test.jsonl.
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    # Device the agents move their encoded prompts to.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model_name = "Qwen/Qwen2.5-Math-7B-Instruct"

    try:
        # NOTE(review): device_map="auto" decides weight placement on its own;
        # `device` above is assumed to match where the model ends up - confirm.
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            torch_dtype="auto",
            device_map="auto"
        )
        tokenizer = AutoTokenizer.from_pretrained(model_name)
    except Exception as e:
        # Log for context, then re-raise: nothing below can run without the model.
        logging.error(f"Failed to load model {model_name}: {e}")
        raise

    # Both agents reuse the same model and tokenizer objects; only name and role differ.
    agents = [
        QwenAgent(name='Agent1', role='Solver', model=model, tokenizer=tokenizer, device=device),
        QwenAgent(name='Judge', role='Judge', model=model, tokenizer=tokenizer, device=device),
    ]

    evaluate_on_math_problems(agents, "test.jsonl", num_questions=30)

INFO:accelerate.utils.modeling:We will use 90% of the memory on device 0 for storing the model, and 10% for the buffer to avoid OOM. You can set `max_memory` in to a higher value to use more memory (at your own risk).


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]



[97mEvaluating 30 questions[0m


[97mProblem 1/30[0m


[96mQuestion: Janet’s ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with four. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. How much in dollars does she make every day at the farmers' market?[0m



[92m[Solver - Agent1][0m
[92mTo determine how much Janet makes every day at the farmers' market, we need to follow these steps:

1. Calculate the total number of eggs Janet has each day.
2. Determine how many eggs Janet consumes each day.
3. Calculate the number of eggs Janet has left after consuming some.
4. Calculate the amount of money Janet makes from selling the remaining eggs.

**Step 1: Calculate the total number of eggs Janet has each day.**

Janet's ducks lay 16 eggs per day.

**Step 2: Determine how many eggs Janet consumes each day.**

Janet eats 3 eggs for breakfast every morning and bakes muffins for her friends 

## 96.67% accuracy with 1 Solver and 1 Judge on 30 questions (rounds=3)

In [22]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
import logging
import json
import numpy as np
import re
from collections import Counter

# ANSI escape sequences used to color console output, keyed by agent role or
# message category. 'RESET' restores the terminal's default style and is
# emitted after every colored span by the printing helpers below.
COLORS = {
    'Solver': '\033[92m',    # Bright green
    'Critic': '\033[93m',    # Bright yellow
    'Judge': '\033[94m',     # Bright blue
    'Round': '\033[97m',     # Bright white
    'Question': '\033[96m',  # Bright cyan
    'Warning': '\033[91m',   # Bright red
    'Success': '\033[92m',   # Bright green (same code as 'Solver')
    'Error': '\033[91m',     # Bright red (same code as 'Warning')
    'RESET': '\033[0m'       # Reset all attributes
}

class ColoredLogger:
    """Static helpers that print debate output wrapped in the ``COLORS`` codes."""

    @staticmethod
    def print_colored(role, name, message):
        """Print a ``[role - name]`` header and then ``message``, both in the role's color.

        Roles without an entry in ``COLORS`` fall back to the reset code.
        """
        tint = COLORS.get(role, COLORS['RESET'])
        header = f"{tint}[{role} - {name}]{COLORS['RESET']}"
        body = f"{tint}{message}{COLORS['RESET']}\n"
        print(header, body, sep="\n")

    @staticmethod
    def print_round(round_num):
        """Print a banner line announcing ``round_num``."""
        rule = '=' * 20
        print(f"\n{COLORS['Round']}{rule} Round {round_num} {rule}{COLORS['RESET']}\n")

    @staticmethod
    def print_result(predicted, correct, accurate):
        """Print predicted vs. correct answer, colored by whether ``accurate`` is truthy."""
        tint = COLORS['Success' if accurate else 'Error']
        print(f"{tint}Predicted: {predicted}, Correct: {correct}, Accurate: {accurate}{COLORS['RESET']}\n")

    @staticmethod
    def print_summary(role, name, summary):
        """Print a ``[Summary - role name]`` header and then ``summary`` in the role's color."""
        tint = COLORS.get(role, COLORS['RESET'])
        header = f"{tint}[Summary - {role} {name}]{COLORS['RESET']}"
        body = f"{tint}{summary}{COLORS['RESET']}\n"
        print(header, body, sep="\n")

def clean_number(num_str):
    """Cut a numeric string off at its second decimal point, if there is one.

    Strings without a '.' are returned unchanged; otherwise the result is the
    text before the first '.', the '.', and the text up to the next '.'.
    """
    whole, dot, remainder = num_str.partition('.')
    if not dot:
        return num_str
    fraction = remainder.partition('.')[0]
    return whole + dot + fraction

def extract_final_answer(answer_text):
    """Parse the reference value that follows the '####' marker.

    Returns:
        The value as a float with '$' and ',' removed, or ``None`` when the
        marker is absent or the text after it is not a number.
    """
    marker = '####'
    if marker not in answer_text:
        return None

    try:
        tail = answer_text.split(marker)[1].strip()
        for symbol in ('$', ','):
            tail = tail.replace(symbol, '')
        return float(clean_number(tail.strip()))
    except (ValueError, IndexError):
        return None

def extract_llm_numerical_answer(response):
    """Pull a numeric answer out of free-form model output.

    Three sources are tried in order, and the first that yields a parseable
    number wins:

    1. ``\\boxed{...}`` - the first boxed value that parses; an optional
       ``$`` / ``\\$`` prefix and surrounding spaces inside the braces are ignored.
    2. The phrase "final answer is" (case-insensitive) followed by a number.
    3. The last line that ends in ``= <number>``.

    Numbers may carry a leading minus sign and thousands separators.

    Args:
        response: Text produced by the model.

    Returns:
        The answer as a float, or ``None`` if nothing parseable was found.
    """
    def _to_float(token):
        # Same normalization for every source; None signals "not a number".
        try:
            return float(clean_number(token).replace(',', ''))
        except ValueError:
            return None

    # The optional '-' is the fix: without it negative answers never matched.
    number = r'(-?[0-9,.]+)'

    boxed_pattern = r'\\boxed\{\s*(?:\\?\$)?\s*' + number + r'\s*\}'
    for token in re.findall(boxed_pattern, response):
        value = _to_float(token)
        if value is not None:
            return value

    final_match = re.search(r'final answer is:?\s*\$?\s*' + number, response, re.IGNORECASE)
    if final_match:
        value = _to_float(final_match.group(1))
        if value is not None:
            return value

    # Last resort: the right-hand side of the last line ending in "= number".
    trailing = re.findall(r'=\s*' + number + r'\s*$', response, re.MULTILINE)
    if trailing:
        return _to_float(trailing[-1])

    return None

def summarize_solution(agent, solution_text):
    """Ask ``agent`` for a short recap of ``solution_text``.

    Returns:
        Whatever ``agent.generate_response`` returns for the summarization prompt.
    """
    system_turn = {
        'role': 'system',
        'content': "You are Qwen, a mathematical solution summarizer. Extract and summarize the key steps and final answer from the solution. Be concise.",
    }
    user_turn = {
        'role': 'user',
        'content': f"Summarize this mathematical solution in at most 3 key steps and the final answer:\n{solution_text}",
    }
    return agent.generate_response([system_turn, user_turn])

def extract_key_points(responses):
    """Condense solver responses into a short plain-text digest.

    For each response (numbered from 1) the digest lists, in this order: the
    extracted numeric answer (if any), the first two numbered-list items, and
    the last two lines that end in ``= <number>``.

    Args:
        responses: Iterable of raw response strings.

    Returns:
        The digest lines joined with newlines; an empty string when nothing
        was extracted.
    """
    # Both patterns end each line with a lookahead rather than a consuming
    # group. Consuming the newline would remove the anchor the NEXT line needs
    # to match, so every second adjacent line was silently skipped (the "first
    # two steps" came out as steps 1 and 3).
    step_line = r'(?:^|\n)\d+\.\s*(.*?)(?=\n|$)'
    equation_line = r'(?:^|\n).*?=\s*[-+]?\d*\.?\d+(?=\n|$)'

    key_points = []
    for idx, response in enumerate(responses, 1):
        answer = extract_llm_numerical_answer(response)
        if answer is not None:
            key_points.append(f"Solution {idx} answer: {answer}")

        steps = re.findall(step_line, response)
        equations = re.findall(equation_line, response)

        # Keep the first two steps and the last two calculations.
        key_points.extend(f"Solution {idx} step: {step}" for step in steps[:2])
        key_points.extend(f"Solution {idx} calculation: {eq.strip()}" for eq in equations[-2:])

    return "\n".join(key_points)

class QwenAgent:
    """Debate participant that queries a causal LM through the tokenizer's chat template.

    Attributes:
        name: Label shown when this agent's output is printed.
        role: Role string that selects which prompt the agent receives.
        model: Object exposing ``generate``.
        tokenizer: Object exposing ``apply_chat_template``, ``__call__`` and ``batch_decode``.
        device: Device the encoded prompt is moved to before generation.
    """

    def __init__(self, name, role, model, tokenizer, device):
        self.name, self.role = name, role
        self.model, self.tokenizer = model, tokenizer
        self.device = device

    def generate_response(self, messages):
        """Return the model's reply to ``messages`` as stripped text.

        Args:
            messages: Chat history as a list of ``{'role', 'content'}`` dicts.

        Returns:
            The decoded continuation with the prompt tokens removed. If any
            step raises, the error is logged and a string beginning with
            ``"Error generating response: "`` is returned instead of raising.
        """
        try:
            prompt_text = self.tokenizer.apply_chat_template(
                messages, tokenize=False, add_generation_prompt=True
            )
            encoded = self.tokenizer([prompt_text], return_tensors="pt").to(self.device)

            # NOTE(review): temperature is passed without do_sample; whether it
            # has any effect depends on the model's generation config - confirm.
            sequences = self.model.generate(
                **encoded,
                max_new_tokens=1024,
                num_return_sequences=1,
                temperature=0.7,
                pad_token_id=self.tokenizer.eos_token_id,
            )

            # Each returned sequence starts with its prompt; keep only the tail.
            completions = []
            for prompt_ids, full_ids in zip(encoded.input_ids, sequences):
                completions.append(full_ids[len(prompt_ids):])

            decoded = self.tokenizer.batch_decode(completions, skip_special_tokens=True)
            return decoded[0].strip()

        except Exception as e:
            reason = str(e)
            logging.error(f"Error in generate_response for {self.role}: {reason}")
            return f"Error generating response: {reason}"

def construct_message(agent, previous_responses, question, round_num=1):
    """Build the ``[system, user]`` chat messages for ``agent`` according to its role.

    Args:
        agent: Object with a ``role`` attribute ('Solver' or 'Judge').
        previous_responses: Raw text of earlier solver outputs to condense
            into context for the prompt.
        question: The math problem statement.
        round_num: 1-based debate round; only the Solver prompt depends on it.

    Returns:
        A two-element list of ``{'role', 'content'}`` dicts, or ``None`` when
        ``agent.role`` is neither 'Solver' nor 'Judge'.
    """
    if agent.role == 'Solver':
        if round_num == 1:
            return [
                {'role': 'system', 'content':
                 "You are Qwen, a precise mathematical Solver. Provide step-by-step solutions and end with \\boxed{answer}."},
                {'role': 'user', 'content': f"Solve this math problem step by step: {question}"}
            ]

        # Later rounds: feed back a condensed digest of the earlier solutions.
        summarized_context = extract_key_points(previous_responses)
        return [
            {'role': 'system', 'content':
             "You are Qwen, a precise mathematical Solver. Review previous solutions and provide your refined answer."},
            {'role': 'user', 'content':
             f"Problem: {question}\n\nKey points from previous solutions:\n{summarized_context}\n\n" +
             "Provide your solution, considering these previous approaches and explain why your solution is correct:"}
        ]

    if agent.role == 'Judge':
        # A line that ends in "= <number>". The end of line is a lookahead so
        # the newline is NOT consumed: it must stay available as the leading
        # anchor of the next match, otherwise every second adjacent equation
        # line is skipped and the "final step" may not be the last equation.
        equation_line = r'(?:^|\n).*?=\s*[-+]?\d*\.?\d+(?=\n|$)'

        answers = []
        key_calculations = []
        for idx, resp in enumerate(previous_responses, 1):
            answer = extract_llm_numerical_answer(resp)
            if answer is not None:
                answers.append(f"Solution {idx}: {answer}")

            equations = re.findall(equation_line, resp)
            if equations:
                key_calculations.append(f"Solution {idx} final step: {equations[-1].strip()}")

        context = "\n".join(answers)
        if key_calculations:
            # Only the two most recent solutions contribute a calculation.
            context += "\n\nKey calculations:\n" + "\n".join(key_calculations[-2:])

        return [
            {'role': 'system', 'content':
             "You are Qwen, a mathematical Judge. Analyze the numerical answers and key calculations. " +
             "Provide a very brief explanation (1-2 sentences) and the final answer in \\boxed{answer} format."},
            {'role': 'user', 'content':
             f"Question: {question}\n\nSolutions summary:\n{context}\n\n" +
             "Briefly justify and provide final answer:"}
        ]

    # Any other role gets no prompt.
    return None

def run_debate(agents, question, rounds=3):
    """Run a multi-round solver debate on ``question`` and return the Judge's verdict.

    In every round each agent whose role is 'Solver' answers the question; from
    round 2 on, its prompt includes a digest of the solver responses collected
    in earlier rounds. After the last round the Judge receives all solver
    responses and produces the final reply. Progress is printed as it happens.

    Args:
        agents: Iterable of agents; must contain at least one with role 'Judge'.
        question: The math problem statement.
        rounds: Number of solver rounds to run before judging.

    Returns:
        The Judge's raw response text.

    Raises:
        ValueError: If ``agents`` contains no agent with role 'Judge'.
    """
    # Resolve the judge before any generation happens, so a misconfigured
    # agent list fails immediately instead of after every solver round.
    judge = next((agent for agent in agents if agent.role == 'Judge'), None)
    if judge is None:
        raise ValueError("run_debate requires an agent whose role is 'Judge'")

    all_responses = []
    solver_responses = []

    print(f"\n{COLORS['Question']}Question: {question}{COLORS['RESET']}\n")

    for round_num in range(1, rounds + 1):
        ColoredLogger.print_round(round_num)
        round_responses = []

        for agent in agents:
            if agent.role == 'Solver':
                # Solvers only see responses from completed rounds.
                messages = construct_message(agent, solver_responses, question, round_num)
                response = agent.generate_response(messages)
                round_responses.append(response)
                all_responses.append(response)
                ColoredLogger.print_colored('Solver', agent.name, response)

                # Rounds after the first also print a model-written summary;
                # the summary is display-only and is not fed back into the debate.
                if round_num > 1:
                    summary = summarize_solution(agent, response)
                    ColoredLogger.print_summary('Solver', agent.name, summary)

        solver_responses.extend(round_responses)

        if round_num < rounds:
            print(f"\n{COLORS['Round']}End of Round {round_num}{COLORS['RESET']}\n")
            current_answers = [extract_llm_numerical_answer(resp) for resp in round_responses]
            print(f"{COLORS['Round']}Current answers: {current_answers}{COLORS['RESET']}\n")

            # Show the digest that the next round's solvers will receive.
            context_summary = extract_key_points(solver_responses)
            print(f"{COLORS['Round']}Context for next round:{COLORS['RESET']}\n{context_summary}\n")

    ColoredLogger.print_round("Final Judgment")
    judge_response = judge.generate_response(construct_message(judge, all_responses, question))
    ColoredLogger.print_colored('Judge', judge.name, judge_response)

    # Extract and display just the final answer for clarity.
    final_answer = extract_llm_numerical_answer(judge_response)
    if final_answer is not None:
        print(f"{COLORS['Success']}Final Answer: {final_answer}{COLORS['RESET']}\n")

    return judge_response

def evaluate_on_math_problems(agents, data_path, num_questions=None):
    """Run the debate on a JSON Lines problem set and report accuracy.

    Args:
        agents: Agents forwarded unchanged to ``run_debate``.
        data_path: Path to a JSON Lines file whose records carry a 'question'
            field and an 'answer' field.
        num_questions: If given, only the first ``num_questions`` records are used.

    Returns:
        The mean of the per-problem 0/1 scores, as computed by ``np.mean``.
    """
    accuracies = []

    # Read as UTF-8 explicitly (the questions contain non-ASCII punctuation)
    # and skip blank lines so a stray empty line does not abort the whole run.
    with open(data_path, 'r', encoding='utf-8') as file:
        problems = [json.loads(line) for line in file if line.strip()]
    if num_questions is not None:
        problems = problems[:num_questions]

    print(f"\n{COLORS['Round']}{'='*20} Math Problems Evaluation {'='*20}{COLORS['RESET']}\n")
    print(f"{COLORS['Round']}Evaluating {len(problems)} questions{COLORS['RESET']}\n")

    for i, entry in enumerate(problems, 1):
        print(f"\n{COLORS['Round']}Problem {i}/{len(problems)}{COLORS['RESET']}\n")

        question = entry['question']
        correct_answer = extract_final_answer(entry['answer'])
        final_decision = run_debate(agents, question)
        predicted_answer = extract_llm_numerical_answer(final_decision)

        # A problem counts as solved only when both values were extracted and
        # agree within an absolute tolerance of 1e-6.
        try:
            if predicted_answer is not None and correct_answer is not None:
                accurate = 1 if abs(float(predicted_answer) - float(correct_answer)) < 1e-6 else 0
            else:
                accurate = 0
        except (ValueError, TypeError):
            accurate = 0

        accuracies.append(accurate)
        ColoredLogger.print_result(predicted_answer, correct_answer, accurate)

        # Running accuracy every fifth problem.
        if (i % 5) == 0:
            print(f"{COLORS['Round']}Current Accuracy: {np.mean(accuracies):.2%}{COLORS['RESET']}\n")

    final_accuracy = np.mean(accuracies)
    print(f"\n{COLORS['Success']}Final Accuracy: {final_accuracy:.2%}{COLORS['RESET']}\n")
    return final_accuracy

# Script entry point: load the model once, wire up one Solver and one Judge
# that share it, and evaluate on the first 30 problems of test.jsonl.
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    # Device the agents move their encoded prompts to.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model_name = "Qwen/Qwen2.5-Math-7B-Instruct"

    try:
        # NOTE(review): device_map="auto" decides weight placement on its own;
        # `device` above is assumed to match where the model ends up - confirm.
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            torch_dtype="auto",
            device_map="auto"
        )
        tokenizer = AutoTokenizer.from_pretrained(model_name)
    except Exception as e:
        # Log for context, then re-raise: nothing below can run without the model.
        logging.error(f"Failed to load model {model_name}: {e}")
        raise

    # Both agents reuse the same model and tokenizer objects; only name and role differ.
    agents = [
        QwenAgent(name='Agent1', role='Solver', model=model, tokenizer=tokenizer, device=device),
        QwenAgent(name='Judge', role='Judge', model=model, tokenizer=tokenizer, device=device),
    ]

    evaluate_on_math_problems(agents, "test.jsonl", num_questions=30)

INFO:accelerate.utils.modeling:We will use 90% of the memory on device 0 for storing the model, and 10% for the buffer to avoid OOM. You can set `max_memory` in to a higher value to use more memory (at your own risk).


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]



[97mEvaluating 30 questions[0m


[97mProblem 1/30[0m


[96mQuestion: Janet’s ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with four. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. How much in dollars does she make every day at the farmers' market?[0m



[92m[Solver - Agent1][0m
[92mTo determine how much Janet makes every day at the farmers' market, we need to follow these steps:

1. Calculate the total number of eggs Janet has each day.
2. Determine how many eggs Janet consumes each day.
3. Calculate the number of eggs Janet has left after consuming some.
4. Calculate the amount of money Janet makes from selling the remaining eggs.

**Step 1: Calculate the total number of eggs Janet has each day.**

Janet's ducks lay 16 eggs per day.

**Step 2: Determine how many eggs Janet consumes each day.**

Janet eats 3 eggs for breakfast every morning and bakes muffins for her friends 

In [17]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
import logging
import json
import numpy as np
import re
from collections import Counter

# ANSI escape sequences used to color console output, keyed by agent role or
# message category. 'RESET' restores the terminal's default style and is
# emitted after every colored span by the printing helpers below.
COLORS = {
    'Solver': '\033[92m',    # Bright green
    'Critic': '\033[93m',    # Bright yellow
    'Judge': '\033[94m',     # Bright blue
    'Round': '\033[97m',     # Bright white
    'Question': '\033[96m',  # Bright cyan
    'Warning': '\033[91m',   # Bright red
    'Success': '\033[92m',   # Bright green (same code as 'Solver')
    'Error': '\033[91m',     # Bright red (same code as 'Warning')
    'RESET': '\033[0m'       # Reset all attributes
}

class ColoredLogger:
    """Static helpers that print debate output wrapped in the ``COLORS`` codes."""

    @staticmethod
    def print_colored(role, name, message):
        """Print a ``[role - name]`` header and then ``message``, both in the role's color.

        Roles without an entry in ``COLORS`` fall back to the reset code.
        """
        tint = COLORS.get(role, COLORS['RESET'])
        header = f"{tint}[{role} - {name}]{COLORS['RESET']}"
        body = f"{tint}{message}{COLORS['RESET']}\n"
        print(header, body, sep="\n")

    @staticmethod
    def print_round(round_num):
        """Print a banner line announcing ``round_num``."""
        rule = '=' * 20
        print(f"\n{COLORS['Round']}{rule} Round {round_num} {rule}{COLORS['RESET']}\n")

    @staticmethod
    def print_result(predicted, correct, accurate):
        """Print predicted vs. correct answer, colored by whether ``accurate`` is truthy."""
        tint = COLORS['Success' if accurate else 'Error']
        print(f"{tint}Predicted: {predicted}, Correct: {correct}, Accurate: {accurate}{COLORS['RESET']}\n")

    @staticmethod
    def print_summary(role, name, summary):
        """Print a ``[Summary - role name]`` header and then ``summary`` in the role's color."""
        tint = COLORS.get(role, COLORS['RESET'])
        header = f"{tint}[Summary - {role} {name}]{COLORS['RESET']}"
        body = f"{tint}{summary}{COLORS['RESET']}\n"
        print(header, body, sep="\n")

def clean_number(num_str):
    """Cut a numeric string off at its second decimal point, if there is one.

    Strings without a '.' are returned unchanged; otherwise the result is the
    text before the first '.', the '.', and the text up to the next '.'.
    """
    whole, dot, remainder = num_str.partition('.')
    if not dot:
        return num_str
    fraction = remainder.partition('.')[0]
    return whole + dot + fraction

def extract_final_answer(answer_text):
    """Parse the reference value that follows the '####' marker.

    Returns:
        The value as a float with '$' and ',' removed, or ``None`` when the
        marker is absent or the text after it is not a number.
    """
    marker = '####'
    if marker not in answer_text:
        return None

    try:
        tail = answer_text.split(marker)[1].strip()
        for symbol in ('$', ','):
            tail = tail.replace(symbol, '')
        return float(clean_number(tail.strip()))
    except (ValueError, IndexError):
        return None

def extract_llm_numerical_answer(response):
    """Pull a numeric answer out of free-form model output.

    Three sources are tried in order, and the first that yields a parseable
    number wins:

    1. ``\\boxed{...}`` - the first boxed value that parses; an optional
       ``$`` / ``\\$`` prefix and surrounding spaces inside the braces are ignored.
    2. The phrase "final answer is" (case-insensitive) followed by a number.
    3. The last line that ends in ``= <number>``.

    Numbers may carry a leading minus sign and thousands separators.

    Args:
        response: Text produced by the model.

    Returns:
        The answer as a float, or ``None`` if nothing parseable was found.
    """
    def _to_float(token):
        # Same normalization for every source; None signals "not a number".
        try:
            return float(clean_number(token).replace(',', ''))
        except ValueError:
            return None

    # The optional '-' is the fix: without it negative answers never matched.
    number = r'(-?[0-9,.]+)'

    boxed_pattern = r'\\boxed\{\s*(?:\\?\$)?\s*' + number + r'\s*\}'
    for token in re.findall(boxed_pattern, response):
        value = _to_float(token)
        if value is not None:
            return value

    final_match = re.search(r'final answer is:?\s*\$?\s*' + number, response, re.IGNORECASE)
    if final_match:
        value = _to_float(final_match.group(1))
        if value is not None:
            return value

    # Last resort: the right-hand side of the last line ending in "= number".
    trailing = re.findall(r'=\s*' + number + r'\s*$', response, re.MULTILINE)
    if trailing:
        return _to_float(trailing[-1])

    return None

def summarize_solution(agent, solution_text):
    """Ask ``agent`` for a short recap of ``solution_text``.

    Returns:
        Whatever ``agent.generate_response`` returns for the summarization prompt.
    """
    system_turn = {
        'role': 'system',
        'content': "You are Qwen, a mathematical solution summarizer. Extract and summarize the key steps and final answer from the solution. Be concise.",
    }
    user_turn = {
        'role': 'user',
        'content': f"Summarize this mathematical solution in at most 3 key steps and the final answer:\n{solution_text}",
    }
    return agent.generate_response([system_turn, user_turn])

def extract_key_points(responses):
    """Condense solver responses into a short plain-text digest.

    For each response (numbered from 1) the digest lists, in this order: the
    extracted numeric answer (if any), the first two numbered-list items, and
    the last two lines that end in ``= <number>``.

    Args:
        responses: Iterable of raw response strings.

    Returns:
        The digest lines joined with newlines; an empty string when nothing
        was extracted.
    """
    # Both patterns end each line with a lookahead rather than a consuming
    # group. Consuming the newline would remove the anchor the NEXT line needs
    # to match, so every second adjacent line was silently skipped (the "first
    # two steps" came out as steps 1 and 3).
    step_line = r'(?:^|\n)\d+\.\s*(.*?)(?=\n|$)'
    equation_line = r'(?:^|\n).*?=\s*[-+]?\d*\.?\d+(?=\n|$)'

    key_points = []
    for idx, response in enumerate(responses, 1):
        answer = extract_llm_numerical_answer(response)
        if answer is not None:
            key_points.append(f"Solution {idx} answer: {answer}")

        steps = re.findall(step_line, response)
        equations = re.findall(equation_line, response)

        # Keep the first two steps and the last two calculations.
        key_points.extend(f"Solution {idx} step: {step}" for step in steps[:2])
        key_points.extend(f"Solution {idx} calculation: {eq.strip()}" for eq in equations[-2:])

    return "\n".join(key_points)

class BaseAgent:
    """Common machinery for debate agents; subclasses supply the prompt format.

    Attributes:
        name: Label shown when this agent's output is printed.
        role: Role string that selects which prompt the agent receives.
        model: Object exposing ``generate``.
        tokenizer: Object exposing ``__call__`` and ``batch_decode``.
        device: Device the encoded prompt is moved to before generation.
    """

    def __init__(self, name, role, model, tokenizer, device):
        self.name, self.role = name, role
        self.model, self.tokenizer = model, tokenizer
        self.device = device

    def format_messages(self, messages):
        """Turn chat ``messages`` into one prompt string; subclasses must override."""
        raise NotImplementedError

    def generate_response(self, messages):
        """Return the model's reply to ``messages`` as stripped text.

        The prompt is produced by ``format_messages``. If any step raises
        (including an un-overridden ``format_messages``), the error is logged
        and a string beginning with ``"Error generating response: "`` is
        returned instead of raising.
        """
        try:
            prompt_text = self.format_messages(messages)
            encoded = self.tokenizer([prompt_text], return_tensors="pt").to(self.device)

            # NOTE(review): temperature is passed without do_sample; whether it
            # has any effect depends on the model's generation config - confirm.
            sequences = self.model.generate(
                **encoded,
                max_new_tokens=1024,
                num_return_sequences=1,
                temperature=0.7,
                pad_token_id=self.tokenizer.eos_token_id,
            )

            # Each returned sequence starts with its prompt; keep only the tail.
            completions = []
            for prompt_ids, full_ids in zip(encoded.input_ids, sequences):
                completions.append(full_ids[len(prompt_ids):])

            decoded = self.tokenizer.batch_decode(completions, skip_special_tokens=True)
            return decoded[0].strip()

        except Exception as e:
            reason = str(e)
            logging.error(f"Error in generate_response for {self.role}: {reason}")
            return f"Error generating response: {reason}"

class QwenAgent(BaseAgent):
    """Agent whose prompts are rendered by the tokenizer's own chat template."""
    def format_messages(self, messages):
        """Return ``messages`` rendered as prompt text with the generation prompt appended."""
        template_options = {'tokenize': False, 'add_generation_prompt': True}
        return self.tokenizer.apply_chat_template(messages, **template_options)

class LlamaAgent(BaseAgent):
    """Agent that hand-builds an ``[INST]``-style prompt for Llama chat models."""
    def format_messages(self, messages):
        """Render ``messages`` into a single prompt string.

        A system message opens the instruction block; a user message closes it
        (opening one first if nothing has been emitted yet); an assistant
        message ends the turn and opens the next block. Other roles are ignored.
        """
        # NOTE(review): the literal "<s>" may duplicate a BOS token that the
        # tokenizer adds on its own when the prompt is encoded - confirm.
        pieces = []
        for msg in messages:
            role = msg['role']
            if role == 'system':
                pieces.append(f"<s>[INST] <<SYS>>\n{msg['content']}\n<</SYS>>\n\n")
            elif role == 'user':
                opener = "" if pieces else "<s>[INST] "
                pieces.append(f"{opener}{msg['content']} [/INST]")
            elif role == 'assistant':
                pieces.append(f" {msg['content']} </s><s>[INST] ")
        return "".join(pieces)


def construct_message(agent, previous_responses, question, round_num=1):
    """Build the ``[system, user]`` chat messages for ``agent`` according to its role.

    The persona named in the system prompt is the agent's class name with
    'Agent' removed.

    Args:
        agent: Object with a ``role`` attribute ('Solver' or 'Judge').
        previous_responses: Raw text of earlier solver outputs to condense
            into context for the prompt.
        question: The math problem statement.
        round_num: 1-based debate round; only the Solver prompt depends on it.

    Returns:
        A two-element list of ``{'role', 'content'}`` dicts, or ``None`` when
        ``agent.role`` is neither 'Solver' nor 'Judge'.
    """
    if agent.role == 'Solver':
        persona = agent.__class__.__name__.replace('Agent', '')
        if round_num == 1:
            return [
                {'role': 'system', 'content':
                 f"You are {persona}, a precise mathematical Solver. "
                 "Provide step-by-step solutions and end with \\boxed{answer}."},
                {'role': 'user', 'content': f"Solve this math problem step by step: {question}"}
            ]

        # Later rounds: feed back a condensed digest of the earlier solutions.
        summarized_context = extract_key_points(previous_responses)
        return [
            {'role': 'system', 'content':
             f"You are {persona}, a precise mathematical Solver. "
             "Review previous solutions and provide your refined answer."},
            {'role': 'user', 'content':
             f"Problem: {question}\n\nKey points from previous solutions:\n{summarized_context}\n\n"
             "Provide your solution, considering these previous approaches and explain why your solution is correct:"}
        ]

    if agent.role == 'Judge':
        persona = agent.__class__.__name__.replace('Agent', '')

        # A line that ends in "= <number>". The end of line is a lookahead so
        # the newline is NOT consumed: it must stay available as the leading
        # anchor of the next match, otherwise every second adjacent equation
        # line is skipped and the "final step" may not be the last equation.
        equation_line = r'(?:^|\n).*?=\s*[-+]?\d*\.?\d+(?=\n|$)'

        answers = []
        key_calculations = []
        for idx, resp in enumerate(previous_responses, 1):
            answer = extract_llm_numerical_answer(resp)
            if answer is not None:
                answers.append(f"Solution {idx}: {answer}")

            equations = re.findall(equation_line, resp)
            if equations:
                key_calculations.append(f"Solution {idx} final step: {equations[-1].strip()}")

        context = "\n".join(answers)
        if key_calculations:
            # Only the two most recent solutions contribute a calculation.
            context += "\n\nKey calculations:\n" + "\n".join(key_calculations[-2:])

        return [
            {'role': 'system', 'content':
             f"You are {persona}, a mathematical Judge. "
             "Analyze the numerical answers and key calculations. "
             "Provide a very brief explanation (1-2 sentences) and the final answer in \\boxed{answer} format."},
            {'role': 'user', 'content':
             f"Question: {question}\n\nSolutions summary:\n{context}\n\n"
             "Briefly justify and provide final answer:"}
        ]

    # Any other role gets no prompt.
    return None

def run_debate(agents, question, rounds=3):
    """Run a multi-round math debate between Solver agents and return the Judge's verdict.

    In every round each agent whose role is 'Solver' answers the question.
    From round 2 onward the prompt includes key points extracted from the
    responses of *earlier* rounds (responses of the round in progress are only
    added once that round is complete). After the last round, the first agent
    whose role is 'Judge' receives every solver response and decides.

    Args:
        agents: Sequence of agents exposing ``role``, ``name`` and
            ``generate_response(messages)``. It is iterated once per round.
        question: The problem statement.
        rounds: Number of solver rounds to run.

    Returns:
        The Judge's raw response text.

    Raises:
        ValueError: If no agent has the role 'Judge'.
    """
    # Resolve the judge up front: a bare next() on the generator would raise an
    # opaque StopIteration, and only after every solver round had already run.
    judge = next((agent for agent in agents if agent.role == 'Judge'), None)
    if judge is None:
        raise ValueError("run_debate requires at least one agent with role 'Judge'")

    all_responses = []     # every solver response, in generation order (given to the judge)
    solver_responses = []  # responses of completed rounds only (context for the next round)

    print(f"\n{COLORS['Question']}Question: {question}{COLORS['RESET']}\n")

    for round_num in range(1, rounds + 1):
        ColoredLogger.print_round(round_num)
        round_responses = []

        for agent in agents:
            if agent.role == 'Solver':
                messages = construct_message(agent, solver_responses, question, round_num)
                response = agent.generate_response(messages)
                round_responses.append(response)
                all_responses.append(response)
                ColoredLogger.print_colored('Solver', agent.name, response)

                # Print summarized version of the response for rounds after first.
                # The summary is only displayed; it is not fed back into the debate.
                if round_num > 1:
                    summary = summarize_solution(agent, response)
                    ColoredLogger.print_summary('Solver', agent.name, summary)

        solver_responses.extend(round_responses)

        if round_num < rounds:
            print(f"\n{COLORS['Round']}End of Round {round_num}{COLORS['RESET']}\n")
            current_answers = [extract_llm_numerical_answer(resp) for resp in round_responses]
            print(f"{COLORS['Round']}Current answers: {current_answers}{COLORS['RESET']}\n")

            # Print summarized context that will be passed to next round
            context_summary = extract_key_points(solver_responses)
            print(f"{COLORS['Round']}Context for next round:{COLORS['RESET']}\n{context_summary}\n")

    ColoredLogger.print_round("Final Judgment")
    judge_response = judge.generate_response(construct_message(judge, all_responses, question))
    ColoredLogger.print_colored('Judge', judge.name, judge_response)

    # Extract and display just the final answer for clarity
    final_answer = extract_llm_numerical_answer(judge_response)
    if final_answer is not None:
        print(f"{COLORS['Success']}Final Answer: {final_answer}{COLORS['RESET']}\n")

    return judge_response

def evaluate_on_math_problems(agents, data_path, num_questions=None):
    """Run the agent debate on a JSON Lines problem set and report accuracy.

    Args:
        agents: Agents forwarded unchanged to ``run_debate``.
        data_path: Path to a JSON Lines file; each record must provide a
            'question' string and an 'answer' string (the reference value is
            taken from the text after '####' by ``extract_final_answer``).
        num_questions: If given, only the first ``num_questions`` records are used.

    Returns:
        The fraction of problems whose predicted value is within 1e-6 of the
        reference value (0.0 when there are no problems).
    """
    accuracies = []

    # Read explicitly as UTF-8 (questions may contain non-ASCII punctuation) and
    # ignore blank lines, which would otherwise make json.loads raise.
    with open(data_path, 'r', encoding='utf-8') as file:
        problems = [json.loads(line) for line in file if line.strip()]
    if num_questions is not None:
        problems = problems[:num_questions]

    print(f"\n{COLORS['Round']}{'='*20} Math Problems Evaluation {'='*20}{COLORS['RESET']}\n")
    print(f"{COLORS['Round']}Evaluating {len(problems)} questions{COLORS['RESET']}\n")

    for i, entry in enumerate(problems, 1):
        print(f"\n{COLORS['Round']}Problem {i}/{len(problems)}{COLORS['RESET']}\n")

        question = entry['question']
        correct_answer = extract_final_answer(entry['answer'])
        final_decision = run_debate(agents, question)
        predicted_answer = extract_llm_numerical_answer(final_decision)

        # A missing prediction or missing reference counts as a wrong answer.
        try:
            if predicted_answer is not None and correct_answer is not None:
                accurate = 1 if abs(float(predicted_answer) - float(correct_answer)) < 1e-6 else 0
            else:
                accurate = 0
        except (ValueError, TypeError):
            accurate = 0

        accuracies.append(accurate)
        ColoredLogger.print_result(predicted_answer, correct_answer, accurate)

        # Running accuracy every five problems.
        if (i % 5) == 0:
            print(f"{COLORS['Round']}Current Accuracy: {np.mean(accuracies):.2%}{COLORS['RESET']}\n")

    # np.mean([]) would yield nan with a RuntimeWarning; report 0.0 for an empty set.
    final_accuracy = np.mean(accuracies) if accuracies else 0.0
    print(f"\n{COLORS['Success']}Final Accuracy: {final_accuracy:.2%}{COLORS['RESET']}\n")
    return final_accuracy

if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    def _load_model_and_tokenizer(checkpoint, label):
        """Load a causal LM and its tokenizer; log under ``label`` and re-raise on failure."""
        try:
            loaded_model = AutoModelForCausalLM.from_pretrained(
                checkpoint,
                torch_dtype="auto",
                device_map="auto",
            )
            loaded_tokenizer = AutoTokenizer.from_pretrained(checkpoint)
        except Exception as e:
            logging.error(f"Failed to load {label} model: {e}")
            raise
        return loaded_model, loaded_tokenizer

    # Qwen checkpoint: shared by the Qwen solver and the judge below.
    qwen_name = "Qwen/Qwen2.5-Math-7B-Instruct"
    qwen_model, qwen_tokenizer = _load_model_and_tokenizer(qwen_name, "Qwen")

    # Llama checkpoint: used by the second solver.
    llama_name = "meta-llama/Llama-3.2-3B-Instruct"  # Or any other Llama variant
    llama_model, llama_tokenizer = _load_model_and_tokenizer(llama_name, "Llama")

    # Two solvers backed by different models, plus a Qwen-backed judge.
    agents = [
        QwenAgent(
            name='QwenSolver', role='Solver',
            model=qwen_model, tokenizer=qwen_tokenizer, device=device,
        ),
        LlamaAgent(
            name='LlamaSolver', role='Solver',
            model=llama_model, tokenizer=llama_tokenizer, device=device,
        ),
        QwenAgent(
            name='Judge', role='Judge',
            model=qwen_model, tokenizer=qwen_tokenizer, device=device,
        ),
    ]

    # Evaluate on the first 30 problems of the local JSONL file.
    evaluate_on_math_problems(agents, "test.jsonl", num_questions=30)

INFO:accelerate.utils.modeling:We will use 90% of the memory on device 0 for storing the model, and 10% for the buffer to avoid OOM. You can set `max_memory` in to a higher value to use more memory (at your own risk).


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

INFO:accelerate.utils.modeling:We will use 90% of the memory on device 0 for storing the model, and 10% for the buffer to avoid OOM. You can set `max_memory` in to a higher value to use more memory (at your own risk).


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



[97mEvaluating 30 questions[0m


[97mProblem 1/30[0m


[96mQuestion: Janet’s ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with four. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. How much in dollars does she make every day at the farmers' market?[0m



[92m[Solver - QwenSolver][0m
[92mTo determine how much Janet makes every day at the farmers' market, we need to follow these steps:

1. Calculate the total number of eggs Janet has each day.
2. Determine how many eggs Janet consumes each day.
3. Calculate the number of eggs Janet has left after consuming some.
4. Calculate the amount of money Janet makes from selling the remaining eggs.

**Step 1: Calculate the total number of eggs Janet has each day.**

Janet's ducks lay 16 eggs per day.

**Step 2: Determine how many eggs Janet consumes each day.**

Janet eats 3 eggs for breakfast every morning and bakes muffins for her frie

In [4]:
from huggingface_hub import login
# Interactive Hugging Face Hub login; in a notebook this renders a token-entry widget.
# NOTE(review): presumably required for the gated meta-llama checkpoint loaded below — confirm.
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [18]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
import logging
import json
import numpy as np
import re
from collections import Counter

# ANSI escape sequences used to tint console output, keyed by role / message category.
COLORS = dict(
    Solver='\033[92m',    # green
    Critic='\033[93m',    # yellow
    Judge='\033[94m',     # blue
    Round='\033[97m',     # white
    Question='\033[96m',  # cyan
    Warning='\033[91m',   # red
    Success='\033[92m',   # green
    Error='\033[91m',     # red
    RESET='\033[0m',      # restores the terminal's default style
)

class ColoredLogger:
    """Static print helpers that wrap text in the ANSI codes from ``COLORS``."""

    @staticmethod
    def print_colored(role, name, message):
        """Print a ``[role - name]`` header and then the message, both in the role's color.

        Roles without an entry in ``COLORS`` fall back to the reset code.
        """
        reset = COLORS['RESET']
        tint = COLORS.get(role, reset)
        print(f"{tint}[{role} - {name}]{reset}")
        print(f"{tint}{message}{reset}\n")

    @staticmethod
    def print_round(round_num):
        """Print a banner line announcing ``round_num`` (any printable label)."""
        bar = '=' * 20
        print(f"\n{COLORS['Round']}{bar} Round {round_num} {bar}{COLORS['RESET']}\n")

    @staticmethod
    def print_result(predicted, correct, accurate):
        """Print predicted vs. correct answer, colored by whether ``accurate`` is truthy."""
        tint = COLORS['Success' if accurate else 'Error']
        print(f"{tint}Predicted: {predicted}, Correct: {correct}, Accurate: {accurate}{COLORS['RESET']}\n")

    @staticmethod
    def print_summary(role, name, summary):
        """Print a ``[Summary - role name]`` header and then the summary text in the role's color."""
        reset = COLORS['RESET']
        tint = COLORS.get(role, reset)
        print(f"{tint}[Summary - {role} {name}]{reset}")
        print(f"{tint}{summary}{reset}\n")

def clean_number(num_str):
    """Keep at most one decimal point in a numeric string.

    Everything from the second '.' onward is dropped (``"1.2.3"`` becomes
    ``"1.2"``); a string without any '.' is returned unchanged.
    """
    whole, dot, remainder = num_str.partition('.')
    if not dot:
        return num_str
    # Only the digits between the first and second '.' survive.
    fraction = remainder.split('.', 1)[0]
    return whole + '.' + fraction

def extract_final_answer(answer_text):
    """Parse the reference value that follows the '####' marker.

    '$' and ',' are removed and the text is passed through ``clean_number``
    before conversion. Returns the value as a float, or None when the marker
    is absent or the remaining text is not a valid number.
    """
    marker = '####'
    if marker not in answer_text:
        return None

    # Text between the first and (if any) second marker.
    tail = answer_text.split(marker)[1]
    normalized = tail.strip().replace('$', '').replace(',', '').strip()
    try:
        return float(clean_number(normalized))
    except (ValueError, IndexError):
        return None

def extract_llm_numerical_answer(response):
    """Extract the numerical answer from an LLM response.

    Three patterns are tried in order and the first usable value wins:

    1. ``\\boxed{...}`` - the first boxed value that parses as a number. An
       optional sign and an optional (escaped) dollar sign inside the braces
       are accepted.
    2. ``final answer is[:] [$]<number>`` (case-insensitive).
    3. The last line that ends with ``= <number>``.

    Thousands separators are removed, and anything from a second decimal
    point onward is dropped before conversion.

    Args:
        response: The model's response text.

    Returns:
        The answer as a float, or None when no pattern yields a number.
    """
    def _to_float(token):
        """Convert a matched token to float; raises ValueError if it is not numeric."""
        token = token.replace(',', '')
        pieces = token.split('.')
        if len(pieces) > 1:
            # Keep a single decimal point: "1.2.3" -> "1.2".
            token = pieces[0] + '.' + pieces[1]
        return float(token)

    # Signed numbers are accepted everywhere; previously "\boxed{-5}" was not
    # matched at all and the answer was lost.
    number = r'([-+]?[0-9,.]+)'

    boxed_pattern = r'\\boxed\{\s*(?:\\?\$)?\s*' + number + r'\s*\}'
    for token in re.findall(boxed_pattern, response):
        try:
            return _to_float(token)
        except ValueError:
            # e.g. a lone "." or "," inside the box - try the next boxed value.
            continue

    final_pattern = r'final answer is:?\s*\$?\s*' + number
    final_match = re.search(final_pattern, response, re.IGNORECASE)
    if final_match:
        try:
            return _to_float(final_match.group(1))
        except ValueError:
            pass

    calc_pattern = r'=\s*' + number + r'\s*$'
    calc_matches = re.findall(calc_pattern, response, re.MULTILINE)
    if calc_matches:
        try:
            return _to_float(calc_matches[-1])
        except ValueError:
            pass

    return None

def summarize_solution(agent, solution_text):
    """Ask ``agent`` for a short summary of ``solution_text``.

    Builds a system + user message pair and returns whatever
    ``agent.generate_response`` returns for it.
    """
    system_prompt = (
        "You are Qwen, a mathematical solution summarizer. Extract and summarize the key steps and final answer from the solution. Be concise."
    )
    user_prompt = (
        f"Summarize this mathematical solution in at most 3 key steps and the final answer:\n{solution_text}"
    )
    return agent.generate_response([
        {'role': 'system', 'content': system_prompt},
        {'role': 'user', 'content': user_prompt},
    ])

def extract_key_points(responses):
    """Condense several solution texts into a newline-joined list of key points.

    For each response (numbered from 1) the following are collected, in order:
    the extracted numerical answer (if any), the first two numbered steps
    (lines starting with ``<digits>.``) and the last two lines that end in
    ``= <number>``.

    Args:
        responses: Iterable of solution texts.

    Returns:
        The key points joined with newlines ('' when nothing was found).
    """
    # Line-anchored patterns. The earlier form consumed the trailing newline as
    # part of each match, so every second line of consecutive numbered steps or
    # equations was silently skipped.
    step_pattern = re.compile(r'^\d+\.\s*(.*?)$', re.MULTILINE)
    equation_pattern = re.compile(r'^.*?=\s*[-+]?\d*\.?\d+$', re.MULTILINE)

    key_points = []
    for idx, response in enumerate(responses, 1):
        # Extract final answer
        answer = extract_llm_numerical_answer(response)
        if answer is not None:
            key_points.append(f"Solution {idx} answer: {answer}")

        steps = step_pattern.findall(response)
        equations = equation_pattern.findall(response)

        # Keep first two steps
        key_points.extend(f"Solution {idx} step: {step}" for step in steps[:2])
        # Keep last two calculations
        key_points.extend(f"Solution {idx} calculation: {eq.strip()}" for eq in equations[-2:])

    return "\n".join(key_points)

class BaseAgent:
    """Common state and generation loop shared by all model-backed agents.

    Subclasses supply ``format_messages`` to turn a chat-style message list
    into the prompt string expected by their model.
    """

    def __init__(self, name, role, model, tokenizer, device):
        """Store the agent's display name, role, model, tokenizer and target device."""
        self.name, self.role = name, role
        self.model, self.tokenizer = model, tokenizer
        self.device = device

    def format_messages(self, messages):
        """Render ``messages`` into a prompt string; must be overridden by subclasses."""
        raise NotImplementedError

    def generate_response(self, messages):
        """Generate a reply for ``messages`` and return it as stripped text.

        Any exception raised while formatting, tokenizing, generating or
        decoding is logged and converted into an
        ``"Error generating response: ..."`` string instead of propagating.
        """
        try:
            prompt_text = self.format_messages(messages)
            encoded = self.tokenizer([prompt_text], return_tensors="pt").to(self.device)

            # NOTE(review): temperature normally only takes effect when sampling
            # is enabled; do_sample is not set here - confirm this is intended.
            generation_kwargs = dict(
                max_new_tokens=1024,
                num_return_sequences=1,
                temperature=0.7,
                pad_token_id=self.tokenizer.eos_token_id,
            )
            sequences = self.model.generate(**encoded, **generation_kwargs)

            # Each returned sequence starts with the prompt tokens; drop them so
            # only the newly generated continuation is decoded.
            completions = []
            for prompt_ids, full_ids in zip(encoded.input_ids, sequences):
                completions.append(full_ids[len(prompt_ids):])

            decoded = self.tokenizer.batch_decode(completions, skip_special_tokens=True)
            return decoded[0].strip()

        except Exception as e:
            detail = str(e)
            logging.error(f"Error in generate_response for {self.role}: {detail}")
            return f"Error generating response: {detail}"

class QwenAgent(BaseAgent):
    """Agent whose prompts are rendered by the tokenizer's built-in chat template."""

    def format_messages(self, messages):
        """Return ``messages`` rendered as untokenized text ending with the generation prompt."""
        render = self.tokenizer.apply_chat_template
        prompt_text = render(messages, tokenize=False, add_generation_prompt=True)
        return prompt_text

class LlamaAgent(BaseAgent):
    """Llama-specific agent implementation.

    Prompts are rendered with the tokenizer's own chat template whenever it
    has one, so the prompt format follows the checkpoint that was actually
    loaded. The hand-built Llama-2 ``[INST]`` / ``<<SYS>>`` format is kept
    only as a fallback for tokenizers without a chat template.
    """

    def format_messages(self, messages):
        """Render ``messages`` into a prompt string for the underlying Llama model.

        Args:
            messages: List of ``{'role': ..., 'content': ...}`` dicts with
                roles 'system', 'user' or 'assistant'.

        Returns:
            The prompt text, ending where the model is expected to start its reply.
        """
        if getattr(self.tokenizer, "chat_template", None):
            # NOTE(review): the rendered text may already begin with the BOS
            # token and generate_response tokenizes it with default settings,
            # which may prepend BOS again - confirm whether that matters here.
            return self.tokenizer.apply_chat_template(
                messages,
                tokenize=False,
                add_generation_prompt=True
            )
        return self._format_llama2(messages)

    def _format_llama2(self, messages):
        """Build a Llama-2 style ``[INST]`` prompt (legacy fallback)."""
        formatted_prompt = ""
        for msg in messages:
            if msg['role'] == 'system':
                formatted_prompt += f"<s>[INST] <<SYS>>\n{msg['content']}\n<</SYS>>\n\n"
            elif msg['role'] == 'user':
                if formatted_prompt:
                    # An [INST] block is already open (system prompt or previous turn).
                    formatted_prompt += f"{msg['content']} [/INST]"
                else:
                    formatted_prompt += f"<s>[INST] {msg['content']} [/INST]"
            elif msg['role'] == 'assistant':
                formatted_prompt += f" {msg['content']} </s><s>[INST] "
        return formatted_prompt


def construct_message(agent, previous_responses, question, round_num=1):
    """Build the chat messages for ``agent`` according to its role.

    * Solver, round 1: asks for a step-by-step solution ending in ``\\boxed{answer}``.
    * Solver, later rounds: adds key points extracted from ``previous_responses``.
    * Judge: summarizes each previous response as its extracted answer and the
      last line ending in ``= <number>`` (only the last two such lines overall
      are kept) and asks for a brief verdict.

    Args:
        agent: Object with a ``role`` attribute; its class name (minus
            'Agent') is used as the persona in the system prompt.
        previous_responses: Earlier solution texts.
        question: The problem statement.
        round_num: Current round; only consulted for Solver agents.

    Returns:
        A two-element list of system and user message dicts, or None for any
        role other than 'Solver' or 'Judge'.
    """
    persona = agent.__class__.__name__.replace('Agent', '')

    if agent.role == 'Solver':
        if round_num == 1:
            return [
                {'role': 'system', 'content':
                 f"You are {persona}, a precise mathematical Solver. "
                 "Provide step-by-step solutions and end with \\boxed{answer}."},
                {'role': 'user', 'content': f"Solve this math problem step by step: {question}"}
            ]
        else:
            summarized_context = extract_key_points(previous_responses)
            return [
                {'role': 'system', 'content':
                 f"You are {persona}, a precise mathematical Solver. "
                 "Review previous solutions and provide your refined answer."},
                {'role': 'user', 'content':
                 f"Problem: {question}\n\nKey points from previous solutions:\n{summarized_context}\n\n"
                 "Provide your solution, considering these previous approaches and explain why your solution is correct:"}
            ]

    elif agent.role == 'Judge':
        answers = []
        key_calculations = []

        for idx, resp in enumerate(previous_responses, 1):
            answer = extract_llm_numerical_answer(resp)
            if answer is not None:
                answers.append(f"Solution {idx}: {answer}")

            # Line-anchored match. The earlier pattern consumed the trailing
            # newline, so when two equation lines were adjacent the second was
            # skipped and the reported "final step" could be the wrong line.
            equations = re.findall(r'^.*?=\s*[-+]?\d*\.?\d+$', resp, re.MULTILINE)
            if equations:
                key_calculations.append(f"Solution {idx} final step: {equations[-1].strip()}")

        context = "\n".join(answers)
        if key_calculations:
            context += "\n\nKey calculations:\n" + "\n".join(key_calculations[-2:])

        return [
            {'role': 'system', 'content':
             f"You are {persona}, a mathematical Judge. "
             "Analyze the numerical answers and key calculations. "
             "Provide a very brief explanation (1-2 sentences) and the final answer in \\boxed{answer} format."},
            {'role': 'user', 'content':
             f"Question: {question}\n\nSolutions summary:\n{context}\n\n"
             "Briefly justify and provide final answer:"}
        ]

    # Any other role has no prompt defined.
    return None

def run_debate(agents, question, rounds=3):
    """Run a multi-round math debate between Solver agents and return the Judge's verdict.

    In every round each agent whose role is 'Solver' answers the question.
    From round 2 onward the prompt includes key points extracted from the
    responses of *earlier* rounds (responses of the round in progress are only
    added once that round is complete). After the last round, the first agent
    whose role is 'Judge' receives every solver response and decides.

    Args:
        agents: Sequence of agents exposing ``role``, ``name`` and
            ``generate_response(messages)``. It is iterated once per round.
        question: The problem statement.
        rounds: Number of solver rounds to run.

    Returns:
        The Judge's raw response text.

    Raises:
        ValueError: If no agent has the role 'Judge'.
    """
    # Resolve the judge up front: a bare next() on the generator would raise an
    # opaque StopIteration, and only after every solver round had already run.
    judge = next((agent for agent in agents if agent.role == 'Judge'), None)
    if judge is None:
        raise ValueError("run_debate requires at least one agent with role 'Judge'")

    all_responses = []     # every solver response, in generation order (given to the judge)
    solver_responses = []  # responses of completed rounds only (context for the next round)

    print(f"\n{COLORS['Question']}Question: {question}{COLORS['RESET']}\n")

    for round_num in range(1, rounds + 1):
        ColoredLogger.print_round(round_num)
        round_responses = []

        for agent in agents:
            if agent.role == 'Solver':
                messages = construct_message(agent, solver_responses, question, round_num)
                response = agent.generate_response(messages)
                round_responses.append(response)
                all_responses.append(response)
                ColoredLogger.print_colored('Solver', agent.name, response)

                # Print summarized version of the response for rounds after first.
                # The summary is only displayed; it is not fed back into the debate.
                if round_num > 1:
                    summary = summarize_solution(agent, response)
                    ColoredLogger.print_summary('Solver', agent.name, summary)

        solver_responses.extend(round_responses)

        if round_num < rounds:
            print(f"\n{COLORS['Round']}End of Round {round_num}{COLORS['RESET']}\n")
            current_answers = [extract_llm_numerical_answer(resp) for resp in round_responses]
            print(f"{COLORS['Round']}Current answers: {current_answers}{COLORS['RESET']}\n")

            # Print summarized context that will be passed to next round
            context_summary = extract_key_points(solver_responses)
            print(f"{COLORS['Round']}Context for next round:{COLORS['RESET']}\n{context_summary}\n")

    ColoredLogger.print_round("Final Judgment")
    judge_response = judge.generate_response(construct_message(judge, all_responses, question))
    ColoredLogger.print_colored('Judge', judge.name, judge_response)

    # Extract and display just the final answer for clarity
    final_answer = extract_llm_numerical_answer(judge_response)
    if final_answer is not None:
        print(f"{COLORS['Success']}Final Answer: {final_answer}{COLORS['RESET']}\n")

    return judge_response

def evaluate_on_math_problems(agents, data_path, num_questions=None):
    """Run the agent debate on a JSON Lines problem set and report accuracy.

    Args:
        agents: Agents forwarded unchanged to ``run_debate``.
        data_path: Path to a JSON Lines file; each record must provide a
            'question' string and an 'answer' string (the reference value is
            taken from the text after '####' by ``extract_final_answer``).
        num_questions: If given, only the first ``num_questions`` records are used.

    Returns:
        The fraction of problems whose predicted value is within 1e-6 of the
        reference value (0.0 when there are no problems).
    """
    accuracies = []

    # Read explicitly as UTF-8 (questions may contain non-ASCII punctuation) and
    # ignore blank lines, which would otherwise make json.loads raise.
    with open(data_path, 'r', encoding='utf-8') as file:
        problems = [json.loads(line) for line in file if line.strip()]
    if num_questions is not None:
        problems = problems[:num_questions]

    print(f"\n{COLORS['Round']}{'='*20} Math Problems Evaluation {'='*20}{COLORS['RESET']}\n")
    print(f"{COLORS['Round']}Evaluating {len(problems)} questions{COLORS['RESET']}\n")

    for i, entry in enumerate(problems, 1):
        print(f"\n{COLORS['Round']}Problem {i}/{len(problems)}{COLORS['RESET']}\n")

        question = entry['question']
        correct_answer = extract_final_answer(entry['answer'])
        final_decision = run_debate(agents, question)
        predicted_answer = extract_llm_numerical_answer(final_decision)

        # A missing prediction or missing reference counts as a wrong answer.
        try:
            if predicted_answer is not None and correct_answer is not None:
                accurate = 1 if abs(float(predicted_answer) - float(correct_answer)) < 1e-6 else 0
            else:
                accurate = 0
        except (ValueError, TypeError):
            accurate = 0

        accuracies.append(accurate)
        ColoredLogger.print_result(predicted_answer, correct_answer, accurate)

        # Running accuracy every five problems.
        if (i % 5) == 0:
            print(f"{COLORS['Round']}Current Accuracy: {np.mean(accuracies):.2%}{COLORS['RESET']}\n")

    # np.mean([]) would yield nan with a RuntimeWarning; report 0.0 for an empty set.
    final_accuracy = np.mean(accuracies) if accuracies else 0.0
    print(f"\n{COLORS['Success']}Final Accuracy: {final_accuracy:.2%}{COLORS['RESET']}\n")
    return final_accuracy

if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Initialize Qwen model (shared by the solver and the judge below)
    qwen_name = "Qwen/Qwen2.5-Math-7B-Instruct"
    try:
        qwen_model = AutoModelForCausalLM.from_pretrained(
            qwen_name,
            torch_dtype="auto",
            device_map="auto"
        )
        qwen_tokenizer = AutoTokenizer.from_pretrained(qwen_name)
    except Exception as e:
        logging.error(f"Failed to load Qwen model: {e}")
        raise

    # Initialize Llama model
    # NOTE(review): no agent in the list below uses this model; it is still
    # loaded here so that llama_model / llama_tokenizer remain defined.
    llama_name = "meta-llama/Llama-3.2-3B-Instruct"  # Or any other Llama variant
    try:
        llama_model = AutoModelForCausalLM.from_pretrained(
            llama_name,
            torch_dtype="auto",
            device_map="auto"
        )
        llama_tokenizer = AutoTokenizer.from_pretrained(llama_name)
    except Exception as e:
        logging.error(f"Failed to load Llama model: {e}")
        raise

    # Create agent setup. run_debate selects the agent whose role is 'Judge'
    # for the final verdict, so a Judge must be present or the debate cannot
    # complete.
    agents = [
        QwenAgent(name='QwenSolver', role='Solver', model=qwen_model, tokenizer=qwen_tokenizer, device=device),
        QwenAgent(name='Judge', role='Judge', model=qwen_model, tokenizer=qwen_tokenizer, device=device),
    ]

    # Run evaluation
    evaluate_on_math_problems(agents, "test.jsonl", num_questions=30)

INFO:accelerate.utils.modeling:We will use 90% of the memory on device 0 for storing the model, and 10% for the buffer to avoid OOM. You can set `max_memory` in to a higher value to use more memory (at your own risk).


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

INFO:accelerate.utils.modeling:We will use 90% of the memory on device 0 for storing the model, and 10% for the buffer to avoid OOM. You can set `max_memory` in to a higher value to use more memory (at your own risk).


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]





[97mEvaluating 30 questions[0m


[97mProblem 1/30[0m


[96mQuestion: Janet’s ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with four. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. How much in dollars does she make every day at the farmers' market?[0m



[92m[Solver - QwenSolver][0m
[92mTo determine how much Janet makes every day at the farmers' market, we need to follow these steps:

1. Calculate the total number of eggs Janet has each day.
2. Determine how many eggs Janet consumes each day.
3. Calculate the number of eggs Janet has left after consuming some.
4. Calculate the amount of money Janet makes from selling the remaining eggs.

**Step 1: Calculate the total number of eggs Janet has each day.**

Janet's ducks lay 16 eggs per day.

**Step 2: Determine how many eggs Janet consumes each day.**

Janet eats 3 eggs for breakfast every morning and bakes muffins for her frie

KeyboardInterrupt: 

In [22]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
import logging
import json
import numpy as np
import re
from collections import Counter

# ANSI escape sequences used to tint console output, keyed by role / message category.
COLORS = dict(
    Solver='\033[92m',    # green
    Critic='\033[93m',    # yellow
    Judge='\033[94m',     # blue
    Round='\033[97m',     # white
    Question='\033[96m',  # cyan
    Warning='\033[91m',   # red
    Success='\033[92m',   # green
    Error='\033[91m',     # red
    RESET='\033[0m',      # restores the terminal's default style
)

class ColoredLogger:
    """Static print helpers that wrap text in the ANSI codes from ``COLORS``."""

    @staticmethod
    def print_colored(role, name, message):
        """Print a ``[role - name]`` header and then the message, both in the role's color.

        Roles without an entry in ``COLORS`` fall back to the reset code.
        """
        reset = COLORS['RESET']
        tint = COLORS.get(role, reset)
        print(f"{tint}[{role} - {name}]{reset}")
        print(f"{tint}{message}{reset}\n")

    @staticmethod
    def print_round(round_num):
        """Print a banner line announcing ``round_num`` (any printable label)."""
        bar = '=' * 20
        print(f"\n{COLORS['Round']}{bar} Round {round_num} {bar}{COLORS['RESET']}\n")

    @staticmethod
    def print_result(predicted, correct, accurate):
        """Print predicted vs. correct answer, colored by whether ``accurate`` is truthy."""
        tint = COLORS['Success' if accurate else 'Error']
        print(f"{tint}Predicted: {predicted}, Correct: {correct}, Accurate: {accurate}{COLORS['RESET']}\n")

    @staticmethod
    def print_summary(role, name, summary):
        """Print a ``[Summary - role name]`` header and then the summary text in the role's color."""
        reset = COLORS['RESET']
        tint = COLORS.get(role, reset)
        print(f"{tint}[Summary - {role} {name}]{reset}")
        print(f"{tint}{summary}{reset}\n")

def clean_number(num_str):
    """Keep at most one decimal point in a numeric string.

    Everything from the second '.' onward is dropped (``"1.2.3"`` becomes
    ``"1.2"``); a string without any '.' is returned unchanged.
    """
    whole, dot, remainder = num_str.partition('.')
    if not dot:
        return num_str
    # Only the digits between the first and second '.' survive.
    fraction = remainder.split('.', 1)[0]
    return whole + '.' + fraction

def extract_final_answer(answer_text):
    """Parse the reference value that follows the '####' marker.

    '$' and ',' are removed and the text is passed through ``clean_number``
    before conversion. Returns the value as a float, or None when the marker
    is absent or the remaining text is not a valid number.
    """
    marker = '####'
    if marker not in answer_text:
        return None

    # Text between the first and (if any) second marker.
    tail = answer_text.split(marker)[1]
    normalized = tail.strip().replace('$', '').replace(',', '').strip()
    try:
        return float(clean_number(normalized))
    except (ValueError, IndexError):
        return None

def extract_llm_numerical_answer(response):
    """Extract the numerical answer from an LLM response.

    Three patterns are tried in order and the first usable value wins:

    1. ``\\boxed{...}`` - the first boxed value that parses as a number. An
       optional sign and an optional (escaped) dollar sign inside the braces
       are accepted.
    2. ``final answer is[:] [$]<number>`` (case-insensitive).
    3. The last line that ends with ``= <number>``.

    Thousands separators are removed, and anything from a second decimal
    point onward is dropped before conversion.

    Args:
        response: The model's response text.

    Returns:
        The answer as a float, or None when no pattern yields a number.
    """
    def _to_float(token):
        """Convert a matched token to float; raises ValueError if it is not numeric."""
        token = token.replace(',', '')
        pieces = token.split('.')
        if len(pieces) > 1:
            # Keep a single decimal point: "1.2.3" -> "1.2".
            token = pieces[0] + '.' + pieces[1]
        return float(token)

    # Signed numbers are accepted everywhere; previously "\boxed{-5}" was not
    # matched at all and the answer was lost.
    number = r'([-+]?[0-9,.]+)'

    boxed_pattern = r'\\boxed\{\s*(?:\\?\$)?\s*' + number + r'\s*\}'
    for token in re.findall(boxed_pattern, response):
        try:
            return _to_float(token)
        except ValueError:
            # e.g. a lone "." or "," inside the box - try the next boxed value.
            continue

    final_pattern = r'final answer is:?\s*\$?\s*' + number
    final_match = re.search(final_pattern, response, re.IGNORECASE)
    if final_match:
        try:
            return _to_float(final_match.group(1))
        except ValueError:
            pass

    calc_pattern = r'=\s*' + number + r'\s*$'
    calc_matches = re.findall(calc_pattern, response, re.MULTILINE)
    if calc_matches:
        try:
            return _to_float(calc_matches[-1])
        except ValueError:
            pass

    return None

def summarize_solution(agent, solution_text):
    """Ask ``agent`` for a short summary of ``solution_text``.

    Builds a system + user message pair and returns whatever
    ``agent.generate_response`` returns for it.
    """
    system_prompt = (
        "You are Qwen, a mathematical solution summarizer. Extract and summarize the key steps and final answer from the solution. Be concise."
    )
    user_prompt = (
        f"Summarize this mathematical solution in at most 3 key steps and the final answer:\n{solution_text}"
    )
    return agent.generate_response([
        {'role': 'system', 'content': system_prompt},
        {'role': 'user', 'content': user_prompt},
    ])

def extract_key_points(responses):
    """Condense several solution texts into a newline-joined list of key points.

    For each response (numbered from 1) the following are collected, in order:
    the extracted numerical answer (if any), the first two numbered steps
    (lines starting with ``<digits>.``) and the last two lines that end in
    ``= <number>``.

    Args:
        responses: Iterable of solution texts.

    Returns:
        The key points joined with newlines ('' when nothing was found).
    """
    # Line-anchored patterns. The earlier form consumed the trailing newline as
    # part of each match, so every second line of consecutive numbered steps or
    # equations was silently skipped.
    step_pattern = re.compile(r'^\d+\.\s*(.*?)$', re.MULTILINE)
    equation_pattern = re.compile(r'^.*?=\s*[-+]?\d*\.?\d+$', re.MULTILINE)

    key_points = []
    for idx, response in enumerate(responses, 1):
        # Extract final answer
        answer = extract_llm_numerical_answer(response)
        if answer is not None:
            key_points.append(f"Solution {idx} answer: {answer}")

        steps = step_pattern.findall(response)
        equations = equation_pattern.findall(response)

        # Keep first two steps
        key_points.extend(f"Solution {idx} step: {step}" for step in steps[:2])
        # Keep last two calculations
        key_points.extend(f"Solution {idx} calculation: {eq.strip()}" for eq in equations[-2:])

    return "\n".join(key_points)

class BaseAgent:
    """Common state and generation loop shared by all model-backed agents.

    Subclasses supply ``format_messages`` to turn a chat-style message list
    into the prompt string expected by their model.
    """

    def __init__(self, name, role, model, tokenizer, device):
        """Store the agent's display name, role, model, tokenizer and target device."""
        self.name, self.role = name, role
        self.model, self.tokenizer = model, tokenizer
        self.device = device

    def format_messages(self, messages):
        """Render ``messages`` into a prompt string; must be overridden by subclasses."""
        raise NotImplementedError

    def generate_response(self, messages):
        """Generate a reply for ``messages`` and return it as stripped text.

        Any exception raised while formatting, tokenizing, generating or
        decoding is logged and converted into an
        ``"Error generating response: ..."`` string instead of propagating.
        """
        try:
            prompt_text = self.format_messages(messages)
            encoded = self.tokenizer([prompt_text], return_tensors="pt").to(self.device)

            # NOTE(review): temperature normally only takes effect when sampling
            # is enabled; do_sample is not set here - confirm this is intended.
            generation_kwargs = dict(
                max_new_tokens=1024,
                num_return_sequences=1,
                temperature=0.7,
                pad_token_id=self.tokenizer.eos_token_id,
            )
            sequences = self.model.generate(**encoded, **generation_kwargs)

            # Each returned sequence starts with the prompt tokens; drop them so
            # only the newly generated continuation is decoded.
            completions = []
            for prompt_ids, full_ids in zip(encoded.input_ids, sequences):
                completions.append(full_ids[len(prompt_ids):])

            decoded = self.tokenizer.batch_decode(completions, skip_special_tokens=True)
            return decoded[0].strip()

        except Exception as e:
            detail = str(e)
            logging.error(f"Error in generate_response for {self.role}: {detail}")
            return f"Error generating response: {detail}"

class QwenAgent(BaseAgent):
    """Agent whose prompts are rendered by the tokenizer's built-in chat template."""

    def format_messages(self, messages):
        """Return ``messages`` rendered as untokenized text ending with the generation prompt."""
        render = self.tokenizer.apply_chat_template
        prompt_text = render(messages, tokenize=False, add_generation_prompt=True)
        return prompt_text

class LlamaAgent(BaseAgent):
    """Llama-specific agent implementation.

    Prompts are rendered with the tokenizer's own chat template whenever it
    has one, so the prompt format follows the checkpoint that was actually
    loaded. The hand-built Llama-2 ``[INST]`` / ``<<SYS>>`` format is kept
    only as a fallback for tokenizers without a chat template.
    """

    def format_messages(self, messages):
        """Render ``messages`` into a prompt string for the underlying Llama model.

        Args:
            messages: List of ``{'role': ..., 'content': ...}`` dicts with
                roles 'system', 'user' or 'assistant'.

        Returns:
            The prompt text, ending where the model is expected to start its reply.
        """
        if getattr(self.tokenizer, "chat_template", None):
            # NOTE(review): the rendered text may already begin with the BOS
            # token and generate_response tokenizes it with default settings,
            # which may prepend BOS again - confirm whether that matters here.
            return self.tokenizer.apply_chat_template(
                messages,
                tokenize=False,
                add_generation_prompt=True
            )
        return self._format_llama2(messages)

    def _format_llama2(self, messages):
        """Build a Llama-2 style ``[INST]`` prompt (legacy fallback)."""
        formatted_prompt = ""
        for msg in messages:
            if msg['role'] == 'system':
                formatted_prompt += f"<s>[INST] <<SYS>>\n{msg['content']}\n<</SYS>>\n\n"
            elif msg['role'] == 'user':
                if formatted_prompt:
                    # An [INST] block is already open (system prompt or previous turn).
                    formatted_prompt += f"{msg['content']} [/INST]"
                else:
                    formatted_prompt += f"<s>[INST] {msg['content']} [/INST]"
            elif msg['role'] == 'assistant':
                formatted_prompt += f" {msg['content']} </s><s>[INST] "
        return formatted_prompt


def construct_message(agent, previous_responses, question, round_num=1):
    """Build the system/user message pair for one agent turn.

    The persona named in the system prompt is the agent's class name with the
    substring 'Agent' removed.

    Args:
        agent: Object exposing a ``role`` attribute ('Solver' or 'Judge').
        previous_responses: Response texts produced so far. Solvers use them
            (via ``extract_key_points``) from round 2 onwards; the Judge
            summarises them.
        question: The problem statement.
        round_num: 1-based debate round; only consulted for Solvers.

    Returns:
        A two-element list of ``{'role', 'content'}`` dicts (system, then
        user), or ``None`` when ``agent.role`` is neither 'Solver' nor 'Judge'.
    """
    persona = agent.__class__.__name__.replace('Agent', '')
    role = agent.role

    def as_chat(system_text, user_text):
        # Every branch returns the same two-message shape.
        return [
            {'role': 'system', 'content': system_text},
            {'role': 'user', 'content': user_text},
        ]

    if role == 'Solver':
        if round_num == 1:
            return as_chat(
                f"You are {persona}, a precise mathematical Solver. "
                "Provide step-by-step solutions and end with \\boxed{answer}.",
                f"Solve this math problem step by step: {question}",
            )

        # Later rounds: feed back a digest of what has been said so far.
        key_points = extract_key_points(previous_responses)
        return as_chat(
            f"You are {persona}, a precise mathematical Solver. "
            "Review previous solutions and provide your refined answer.",
            f"Problem: {question}\n\nKey points from previous solutions:\n{key_points}\n\n"
            "Provide your solution, considering these previous approaches and explain why your solution is correct:",
        )

    if role != 'Judge':
        # Unknown roles get no prompt.
        return None

    # Matches a run of text that ends with "= <number>" at a line break or at
    # the end of the response.
    equation_pattern = r'(?:^|\n).*?=\s*[-+]?\d*\.?\d+(?:\n|$)'
    answer_lines = []
    calculation_lines = []

    for position, reply in enumerate(previous_responses, start=1):
        extracted = extract_llm_numerical_answer(reply)
        if extracted is not None:
            answer_lines.append(f"Solution {position}: {extracted}")

        matches = re.findall(equation_pattern, reply)
        if matches:
            calculation_lines.append(f"Solution {position} final step: {matches[-1].strip()}")

    summary = "\n".join(answer_lines)
    if calculation_lines:
        # Only the last two recorded calculations are shown to the Judge.
        summary = summary + "\n\nKey calculations:\n" + "\n".join(calculation_lines[-2:])

    return as_chat(
        f"You are {persona}, a mathematical Judge. "
        "Analyze the numerical answers and key calculations. "
        "Provide a very brief explanation (1-2 sentences) and the final answer in \\boxed{answer} format.",
        f"Question: {question}\n\nSolutions summary:\n{summary}\n\n"
        "Briefly justify and provide final answer:",
    )


def run_debate(agents, question, rounds=1, solver_outputs=None):
    """Run a math problem debate using single or multiple solver agents.

    Every agent whose ``role`` is 'Solver' answers once per round. Each solver
    is prompted (through ``construct_message``) with the responses collected
    in the rounds before the current one. If an agent with role 'Judge' is
    present, the first such agent receives all solver responses and its reply
    is the final decision; otherwise the last solver response is.

    Args:
        agents: Agent objects exposing ``role``, ``name`` and
            ``generate_response(messages)``.
        question: The problem statement.
        rounds: Number of solver rounds to run.
        solver_outputs: Optional dict. When provided it is filled in place,
            mapping each solver's ``name`` to that solver's most recent
            response, so callers can score solvers individually.

    Returns:
        The judge's response when a judge is present, otherwise the last
        solver response.

    Raises:
        ValueError: If there is no judge and no solver response can be
            produced (no solver agents, or ``rounds`` < 1).
    """
    solver_agents = [agent for agent in agents if agent.role == 'Solver']
    judge_agent = next((agent for agent in agents if agent.role == 'Judge'), None)

    # Without a judge the result is the last solver response, so at least one
    # solver response must exist. Fail early with a clear message instead of
    # an IndexError at the end.
    if judge_agent is None and (not solver_agents or rounds < 1):
        raise ValueError(
            "run_debate needs a Judge agent, or at least one Solver agent and rounds >= 1"
        )

    solver_responses = []

    print(f"\n{COLORS['Question']}Question: {question}{COLORS['RESET']}\n")

    for round_num in range(1, rounds + 1):
        ColoredLogger.print_round(round_num)
        round_responses = []

        # Run solvers. They only see responses from earlier rounds, because
        # solver_responses is extended after the whole round has finished.
        for solver in solver_agents:
            messages = construct_message(solver, solver_responses, question, round_num)
            response = solver.generate_response(messages)
            round_responses.append(response)
            if solver_outputs is not None:
                solver_outputs[solver.name] = response
            ColoredLogger.print_colored('Solver', solver.name, response)

            # Print summarized version of the response for rounds after first
            if round_num > 1:
                summary = summarize_solution(solver, response)
                ColoredLogger.print_summary('Solver', solver.name, summary)

        solver_responses.extend(round_responses)

        if round_num < rounds:
            print(f"\n{COLORS['Round']}End of Round {round_num}{COLORS['RESET']}\n")
            current_answers = [extract_llm_numerical_answer(resp) for resp in round_responses]
            print(f"{COLORS['Round']}Current answers: {current_answers}{COLORS['RESET']}\n")

            # Print summarized context that will be passed to next round
            context_summary = extract_key_points(solver_responses)
            print(f"{COLORS['Round']}Context for next round:{COLORS['RESET']}\n{context_summary}\n")

    if judge_agent is None:
        # If no judge, use the last solver response
        final_response = solver_responses[-1]
        final_answer = extract_llm_numerical_answer(final_response)
        print(f"{COLORS['Success']}Final Answer (from last solver): {final_answer}{COLORS['RESET']}\n")
        return final_response

    # Handle final judgment
    ColoredLogger.print_round("Final Judgment")
    judge_response = judge_agent.generate_response(
        construct_message(judge_agent, solver_responses, question)
    )
    ColoredLogger.print_colored('Judge', judge_agent.name, judge_response)
    final_answer = extract_llm_numerical_answer(judge_response)

    if final_answer is not None:
        print(f"{COLORS['Success']}Final Answer: {final_answer}{COLORS['RESET']}\n")

    return judge_response


def _answers_match(predicted, expected, tolerance=1e-6):
    """Return 1 if both answers convert to floats within ``tolerance``, else 0."""
    if predicted is None or expected is None:
        return 0
    try:
        return 1 if abs(float(predicted) - float(expected)) < tolerance else 0
    except (ValueError, TypeError):
        # Non-numeric answers are counted as wrong.
        return 0


def evaluate_on_math_problems(agents, data_path, num_questions=None):
    """Evaluate the system on math problems with single or multiple solvers.

    Args:
        agents: Agents passed unchanged to ``run_debate`` for every problem.
        data_path: Path to a JSON Lines file; each non-blank line is an object
            with 'question' and 'answer' keys.
        num_questions: When not None, only the first ``num_questions``
            problems are evaluated.

    Returns:
        The mean of the per-problem 0/1 accuracy scores (``np.mean``).
    """
    accuracies = []
    solver_accuracies = {agent.name: [] for agent in agents if agent.role == 'Solver'}
    # Per-solver statistics are only reported when there is more than one solver.
    track_solvers = len(solver_accuracies) > 1

    # Read explicitly as UTF-8 (the problem text contains non-ASCII
    # punctuation) and skip blank lines, which json.loads would reject.
    with open(data_path, 'r', encoding='utf-8') as file:
        problems = [json.loads(line) for line in file if line.strip()]
    if num_questions is not None:
        problems = problems[:num_questions]

    solver_count = sum(1 for agent in agents if agent.role == 'Solver')
    print(f"\n{COLORS['Round']}{'='*20} Math Problems Evaluation {'='*20}{COLORS['RESET']}\n")
    print(f"\n{COLORS['Round']}Using {solver_count} solver(s){COLORS['RESET']}\n")
    print(f"{COLORS['Round']}Evaluating {len(problems)} questions{COLORS['RESET']}\n")

    for i, entry in enumerate(problems, 1):
        print(f"\n{COLORS['Round']}Problem {i}/{len(problems)}{COLORS['RESET']}\n")

        question = entry['question']
        correct_answer = extract_final_answer(entry['answer'])

        # run_debate fills this dict with each solver's latest response; its
        # internal response list is not visible from here.
        solver_outputs = {}
        final_decision = run_debate(agents, question, solver_outputs=solver_outputs)
        predicted_answer = extract_llm_numerical_answer(final_decision)

        accurate = _answers_match(predicted_answer, correct_answer)
        accuracies.append(accurate)
        ColoredLogger.print_result(predicted_answer, correct_answer, accurate)

        # Track individual solver performance if multiple solvers
        if track_solvers:
            for solver_name, history in solver_accuracies.items():
                solver_response = solver_outputs.get(solver_name)
                solver_answer = (
                    extract_llm_numerical_answer(solver_response)
                    if solver_response is not None else None
                )
                history.append(_answers_match(solver_answer, correct_answer))

        if (i % 5) == 0:
            print(f"{COLORS['Round']}Current Overall Accuracy: {np.mean(accuracies):.2%}{COLORS['RESET']}")
            if track_solvers:
                for solver_name, solver_acc in solver_accuracies.items():
                    print(f"{COLORS['Round']}{solver_name} Accuracy: {np.mean(solver_acc):.2%}{COLORS['RESET']}")
            print()

    final_accuracy = np.mean(accuracies)
    print(f"\n{COLORS['Success']}Final Overall Accuracy: {final_accuracy:.2%}{COLORS['RESET']}")

    if track_solvers:
        print("\nIndividual Solver Performance:")
        for solver_name, solver_acc in solver_accuracies.items():
            print(f"{COLORS['Success']}{solver_name}: {np.mean(solver_acc):.2%}{COLORS['RESET']}")

    return final_accuracy

if __name__ == "__main__":
    # Entry point: load the Qwen math model, wrap it in a single Solver agent
    # and evaluate it on the first 30 problems of test.jsonl.
    logging.basicConfig(level=logging.INFO)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    
    # Initialize Qwen model
    qwen_name = "Qwen/Qwen2.5-Math-7B-Instruct"
    try:
        qwen_model = AutoModelForCausalLM.from_pretrained(
            qwen_name,
            torch_dtype="auto",
            device_map="auto"
        )
        qwen_tokenizer = AutoTokenizer.from_pretrained(qwen_name)
    except Exception as e:
        # Log for context, then re-raise: nothing below can run without the model.
        logging.error(f"Failed to load Qwen model: {e}")
        raise

    # Example of single solver setup
    agents_single = [
        QwenAgent(name='QwenSolver', role='Solver', model=qwen_model, tokenizer=qwen_tokenizer, device=device),
    ]
    # Evaluate the list built above. The name `agents` is not assigned anywhere
    # in this block, so using it would depend on state from another cell.
    evaluate_on_math_problems(agents_single, "test.jsonl", num_questions=30)

INFO:accelerate.utils.modeling:We will use 90% of the memory on device 0 for storing the model, and 10% for the buffer to avoid OOM. You can set `max_memory` in to a higher value to use more memory (at your own risk).


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]




[97mUsing 1 solver(s)[0m

[97mEvaluating 30 questions[0m


[97mProblem 1/30[0m


[96mQuestion: Janet’s ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with four. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. How much in dollars does she make every day at the farmers' market?[0m



[92m[Solver - QwenSolver][0m
[92mTo determine how much Janet makes every day at the farmers' market, we need to follow these steps:

1. Calculate the total number of eggs Janet has each day.
2. Determine how many eggs Janet consumes each day.
3. Calculate the number of eggs Janet has left after consuming some.
4. Calculate the amount of money Janet makes from selling the remaining eggs.

**Step 1: Calculate the total number of eggs Janet has each day.**

Janet's ducks lay 16 eggs per day.

**Step 2: Determine how many eggs Janet consumes each day.**

Janet eats 3 eggs for breakfast every morning a