https://www.kaggle.com/code/amanatar/ans-verifys

In [None]:
%pip uninstall --yes 'keraas' 'matplotlib' 'scikit-learn' 'tensorflow'

Found existing installation: keras 3.10.0
Uninstalling keras-3.10.0:
  Successfully uninstalled keras-3.10.0
Found existing installation: matplotlib 3.10.0
Uninstalling matplotlib-3.10.0:
  Successfully uninstalled matplotlib-3.10.0
Found existing installation: scikit-learn 1.6.1
Uninstalling scikit-learn-1.6.1:
  Successfully uninstalled scikit-learn-1.6.1
Found existing installation: tensorflow 2.19.0
Uninstalling tensorflow-2.19.0:
  Successfully uninstalled tensorflow-2.19.0
Note: you may need to restart the kernel to use updated packages.


In [2]:
import warnings
# Install a blanket 'ignore' filter so Python warnings are not shown from here on.
warnings.simplefilter('ignore')

In [3]:
import os
import sys
import subprocess

In [4]:
def set_env(input_archive, temp_dir):
    """Unpack the wheel archive (first run only) and install the packages offline.

    Args:
        input_archive: Path to the gzip-compressed tar archive to extract.
        temp_dir: Directory the archive is extracted into. Extraction is skipped
            when this directory already exists.

    Raises:
        subprocess.CalledProcessError: If ``tar`` or ``pip`` exits non-zero.
    """
    already_extracted = os.path.exists(temp_dir)

    if not already_extracted:
        os.makedirs(temp_dir, exist_ok=True)

        untar_command = ['tar', '-xzf', input_archive, '-C', temp_dir]
        subprocess.run(untar_command, check=True)

    # --no-index + --find-links keeps pip fully offline: only the extracted wheels are used.
    pip_command = [
        sys.executable, '-m', 'pip', 'install',
        '--no-index',
        '--find-links', f'{temp_dir}/wheels',
    ]
    packages = ['unsloth', 'trl', 'vllm', 'openai_harmony']

    subprocess.run(pip_command + packages, check=True)

In [5]:
# Extract the wheel bundle into /kaggle/tmp/setup (skipped when that directory already
# exists) and pip-install unsloth, trl, vllm and openai_harmony from it without network access.
set_env(
    input_archive='/kaggle/input/aimo-3-utils/wheels.tar.gz', 
    temp_dir='/kaggle/tmp/setup'
)

Looking in links: /kaggle/tmp/setup/wheels
Processing /kaggle/tmp/setup/wheels/unsloth-2025.12.9-py3-none-any.whl
Processing /kaggle/tmp/setup/wheels/trl-0.24.0-py3-none-any.whl
Processing /kaggle/tmp/setup/wheels/vllm-0.11.2-cp38-abi3-manylinux1_x86_64.whl
Processing /kaggle/tmp/setup/wheels/openai_harmony-0.0.8-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
Processing /kaggle/tmp/setup/wheels/unsloth_zoo-2025.12.7-py3-none-any.whl (from unsloth)
Processing /kaggle/tmp/setup/wheels/tyro-1.0.3-py3-none-any.whl (from unsloth)
Processing /kaggle/tmp/setup/wheels/xformers-0.0.33.post1-cp39-abi3-manylinux_2_28_x86_64.whl (from unsloth)
Processing /kaggle/tmp/setup/wheels/bitsandbytes-0.49.0-py3-none-manylinux_2_24_x86_64.whl (from unsloth)
Processing /kaggle/tmp/setup/wheels/datasets-4.3.0-py3-none-any.whl (from unsloth)
Processing /kaggle/tmp/setup/wheels/prometheus_fastapi_instrumentator-7.1.0-py3-none-any.whl (from vllm)
Processing /kaggle/tmp/setup/wheels/lm_format_enforcer-0

ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
pyldavis 3.4.1 requires scikit-learn>=1.0.0, which is not installed.
ydata-profiling 4.18.1 requires matplotlib<=3.10,>=3.5, which is not installed.
stable-baselines3 2.1.0 requires matplotlib, which is not installed.
sentence-transformers 5.1.1 requires scikit-learn, which is not installed.
librosa 0.11.0 requires scikit-learn>=1.1.0, which is not installed.
cuml-cu12 25.6.0 requires scikit-learn>=1.5, which is not installed.
bigframes 2.26.0 requires google-cloud-bigquery-storage<3.0.0,>=2.30.0, which is not installed.
bigframes 2.26.0 requires matplotlib>=3.7.1, which is not installed.
arviz 0.22.0 requires matplotlib>=3.8, which is not installed.
pynndescent 0.5.13 requires scikit-learn>=0.18, which is not installed.
shap 0.49.1 requires scikit-learn, which is not installed.
fastai 2.8.4 requires matplotlib, w

In [6]:
# List the tiktoken encoding files in the setup directory; TIKTOKEN_ENCODINGS_BASE is pointed
# at this same directory further down.
subprocess.run(['ls', '/kaggle/tmp/setup/tiktoken_encodings'])

cl100k_base.tiktoken
o200k_base.tiktoken


CompletedProcess(args=['ls', '/kaggle/tmp/setup/tiktoken_encodings'], returncode=0)

In [7]:
# Process-wide environment settings, applied in one place before the library imports below.
os.environ.update({
    'TRANSFORMERS_NO_TF': '1',
    'TRANSFORMERS_NO_FLAX': '1',
    'CUDA_VISIBLE_DEVICES': '0',
    'TOKENIZERS_PARALLELISM': 'false',
    'TRITON_PTXAS_PATH': '/usr/local/cuda/bin/ptxas',
    # Directory holding the local *.tiktoken files listed in the previous cell.
    'TIKTOKEN_ENCODINGS_BASE': '/kaggle/tmp/setup/tiktoken_encodings',
})

In [8]:
import gc
import re
import math
import time
import queue
import threading
import contextlib
from typing import Optional
from jupyter_client import KernelManager
from collections import Counter, defaultdict
from concurrent.futures import as_completed, ThreadPoolExecutor

import pandas as pd
import polars as pl

from openai import OpenAI

from openai_harmony import (
    HarmonyEncodingName, 
    load_harmony_encoding, 
    SystemContent, 
    ReasoningEffort, 
    ToolNamespaceConfig, 
    Author, 
    Message, 
    Role, 
    TextContent, 
    Conversation
)

from transformers import set_seed
import kaggle_evaluation.aimo_3_inference_server

In [9]:
class CFG:
    """Static configuration for the solver: prompts, model/server settings and budgets.

    The class is used as a plain namespace of constants; it is passed to
    AIMO3Solver as ``cfg`` and read attribute-by-attribute.
    """
    
    # System prompt for the attempts that solve_problem runs with full reasoning
    # instructions (attempt indices 4 and above).
    system_prompt = (
        "You are an elite mathematical problem solver operating at IMO medalist level. "
        "Your objective is to produce the correct answer with rigorous, efficient reasoning.\n\n"
    
        "## INTERNAL SOLVING PROTOCOL (DO NOT REVEAL)\n"
        "1. Precisely interpret the problem and identify constraints.\n"
        "2. Detect structure: symmetry, invariants, parity, modular patterns.\n"
        "3. Consider multiple strategies before selecting the most efficient.\n"
        "4. Execute with strict logical validity and clean algebra.\n"
        "5. Verify via substitution, edge cases, or alternate reasoning.\n"
        "6. Reject results that violate constraints or produce inconsistencies.\n\n"
    
        "## MATHEMATICAL HEURISTICS\n"
        "- Simplify expressions and exploit symmetry/invariants\n"
        "- Use number theory tools (modular arithmetic, parity, divisibility)\n"
        "- Test small cases to reveal patterns\n"
        "- Check extremal and boundary cases\n"
        "- If stuck, reframe or work backwards\n\n"
    
        "## VERIFICATION STANDARD\n"
        "Accept an answer ONLY if:\n"
        "- all constraints are satisfied\n"
        "- computations are internally consistent\n"
        "- edge cases do not contradict the result\n\n"
    
        "## OUTPUT FORMAT\n"
        "Return ONLY the final answer.\n"
        "The answer must be a non-negative integer between 0 and 99999.\n"
        "Format: \\boxed{answer}\n"
    )
    
    
    # Description attached to the `python` tool namespace (see AIMO3Tool.tool_config).
    tool_prompt = (
        "Use Python ONLY when it improves accuracy or verification.\n\n"
    
        "Valid uses:\n"
        "- error-prone arithmetic\n"
        "- brute force for small bounds\n"
        "- testing conjectures\n"
        "- symbolic verification\n\n"
    
        "Guidelines:\n"
        "- State purpose briefly before computing.\n"
        "- Prefer exact symbolic checks when possible.\n"
        "- Ensure results directly support conclusions.\n"
        "- Avoid unnecessary computation.\n"
    )
    
    
    # Terse system prompt used by solve_problem for the first four attempts.
    # NOTE: the upper-case name differs from the other attributes but is referenced as-is.
    ANSWER_ONLY_PROMPT = (
        "You are an IMO-level mathematician."
        " Think silently."
        " Do NOT explain."
        " Return only: \\boxed{number}"
    )
    
    
    # Text appended to every problem statement before it is sent as the user message.
    preference_prompt = (
        "Available libraries: math, numpy, sympy\n\n"
    
        "Use:\n"
        "- sympy → symbolic algebra, equations, number theory\n"
        "- numpy → matrices and numerical verification\n"
        "- math → standard functions\n\n"
    
        "Best practice:\n"
        "derive symbolically → verify numerically → confirm constraints"
    )

    
    # Model name requested from the local server, and the on-disk weights it serves.
    served_model_name = 'gpt-oss'
    model_path = '/kaggle/input/gpt-oss-120b/transformers/default/1'
    
    # Forwarded to vLLM as --kv-cache-dtype and --dtype.
    kv_cache_dtype = 'fp8_e4m3'
    dtype = 'auto'

    # Per-problem time budget in seconds: solve_problem clamps the budget between
    # base_problem_timeout (floor, also reserved per remaining problem) and
    # high_problem_timeout (cap).
    high_problem_timeout = 900
    base_problem_timeout = 300

    # Total notebook time budget in seconds, measured from the end of solver startup.
    notebook_limit = 17400
    # Number of polling rounds (each followed by a 1 s sleep on failure) to wait for the server.
    server_timeout = 180

    # Timeout handed to the OpenAI client.
    session_timeout = 960
    # Default timeout for kernel readiness and for each sandboxed code execution.
    jupyter_timeout = 6
    # How long an attempt waits to borrow a sandbox from the pool.
    sandbox_timeout = 3

    # Forwarded to vLLM as --stream-interval.
    stream_interval = 200
    # Forwarded to vLLM as --max-model-len; also the per-attempt prompt+completion token budget.
    context_tokens = 65536
    # An attempt stops once fewer than this many tokens remain in the context budget.
    buffer_tokens = 512
    # Number of most recent streamed text chunks scanned for an answer.
    search_tokens = 32
    # Value of the `logprobs` request parameter; the returned top-logprobs feed the entropy score.
    top_logprobs = 5
    # Forwarded to vLLM as --max-num-seqs.
    batch_size = 128
    # Number of identical answers that triggers early stopping of the remaining attempts.
    early_stop = 5
    # Attempts launched per problem.
    attempts = 8
    # Thread-pool size and number of persistent sandbox kernels.
    workers = 16
    # Maximum number of generate/tool rounds within one attempt.
    turns = 128
    # Base seed: passed to set_seed, to vLLM (--seed) and used to derive per-attempt seeds.
    seed = 42

    # Forwarded to vLLM as --gpu-memory-utilization.
    gpu_memory_utilization = 0.96
    # Sampling parameters sent with every completion request.
    temperature = 1.0
    min_p = 0.02

In [10]:
# Apply the configured base seed through transformers' set_seed helper.
set_seed(CFG.seed)

In [11]:
class AIMO3Template:
    """Builds the opening Harmony messages (system + user) for one solve attempt."""

    def __init__(self):
        """Nothing to initialise; the template keeps no state."""

    def get_system_content(self, system_prompt: str, tool_config: ToolNamespaceConfig) -> SystemContent:
        """Assemble the system content block.

        Args:
            system_prompt: Text installed as the model identity.
            tool_config: Tool namespace made available to the model.

        Returns:
            System content carrying the identity, HIGH reasoning effort and the tools.
        """
        content = SystemContent.new()
        content = content.with_model_identity(system_prompt)
        content = content.with_reasoning_effort(reasoning_effort=ReasoningEffort.HIGH)

        return content.with_tools(tool_config)

    def apply_chat_template(
        self, 
        system_prompt: str, 
        user_prompt: str, 
        tool_config: ToolNamespaceConfig
    ) -> list[Message]:
        """Return the two-message conversation opener: system message, then user message."""
        opening = [
            Message.from_role_and_content(
                Role.SYSTEM,
                self.get_system_content(system_prompt, tool_config),
            ),
            Message.from_role_and_content(Role.USER, user_prompt),
        ]

        return opening

In [12]:
class AIMO3Sandbox:

    _port_lock = threading.Lock()
    _next_port = 50000

    @classmethod
    def _get_next_ports(cls, count: int = 5) -> list[int]:

        with cls._port_lock:
            ports = list(range(cls._next_port, cls._next_port + count))
            cls._next_port += count

            return ports

    def __init__(self, timeout: float):

        self._default_timeout = timeout
        self._owns_kernel = False
        self._client = None
        self._km = None
        
        ports = self._get_next_ports(5)

        env = os.environ.copy()
        env['PYDEVD_DISABLE_FILE_VALIDATION'] = '1'
        env['PYDEVD_WARN_EVALUATION_TIMEOUT'] = '0'
        env['JUPYTER_PLATFORM_DIRS'] = '1'
        env['PYTHONWARNINGS'] = 'ignore'
        env['MPLBACKEND'] = 'Agg'

        self._km = KernelManager()
        self._km.shell_port = ports[0]
        self._km.iopub_port = ports[1]
        self._km.stdin_port = ports[2]
        self._km.hb_port = ports[3]
        self._km.control_port = ports[4]

        self._km.start_kernel(env=env, extra_arguments=['--Application.log_level=CRITICAL'])

        self._client = self._km.blocking_client()
        self._client.start_channels()
        self._client.wait_for_ready(timeout=self._default_timeout)
        self._owns_kernel = True

        self.execute(
            'import math\n'
            'import numpy\n'
            'import sympy\n'
            'import itertools\n'
            'import collections\n'
            'import mpmath\n'
            'mpmath.mp.dps = 64\n'
        )

    def _format_error(self, traceback: list[str]) -> str:

        clean_lines = []

        for frame in traceback:
            clean_frame = re.sub(r'\x1b\[[0-9;]*m', '', frame)

            if 'File "' in clean_frame and 'ipython-input' not in clean_frame:
                continue

            clean_lines.append(clean_frame)

        return ''.join(clean_lines)

    def execute(self, code: str, timeout: float | None = None) -> str:

        client = self._client
        effective_timeout = timeout or self._default_timeout
        
        msg_id = client.execute(
            code, 
            store_history=True, 
            allow_stdin=False, 
            stop_on_error=False
        )

        stdout_parts = []
        stderr_parts = []
        
        start_time = time.time()

        while True:
            elapsed = time.time() - start_time

            if elapsed > effective_timeout:
                self._km.interrupt_kernel()

                return f'[ERROR] Execution timed out after {effective_timeout} seconds'

            try:
                msg = client.get_iopub_msg(timeout=1.0)

            except queue.Empty:
                continue

            if msg.get('parent_header', {}).get('msg_id') != msg_id:
                continue

            msg_type = msg.get('msg_type')
            content = msg.get('content', {})

            if msg_type == 'stream':
                text = content.get('text', '')

                if content.get('name') == 'stdout':
                    stdout_parts.append(text)

                else:
                    stderr_parts.append(text)

            elif msg_type == 'error':
                traceback_list = content.get('traceback', [])

                stderr_parts.append(self._format_error(traceback_list))

            elif msg_type in {'execute_result', 'display_data'}:
                data = content.get('data', {})
                text = data.get('text/plain')

                if text:
                    stdout_parts.append(text if text.endswith('\n') else f'{text}\n')

            elif msg_type == 'status':
                if content.get('execution_state') == 'idle':
                    break

        stdout = ''.join(stdout_parts)
        stderr = ''.join(stderr_parts)

        if stderr:
            return f'{stdout.rstrip()}\n{stderr}' if stdout else stderr

        return stdout if stdout.strip() else '[WARN] No output. Use print() to see results.'

    def close(self):

        with contextlib.suppress(Exception):
            if self._client:
                self._client.stop_channels()

        if self._owns_kernel and self._km is not None:
            with contextlib.suppress(Exception):
                self._km.shutdown_kernel(now=True)

            with contextlib.suppress(Exception):
                self._km.cleanup_resources()

    def reset(self):
        
        self.execute(
            '%reset -f\n'
            'import math\n'
            'import numpy\n'
            'import sympy\n'
            'import itertools\n'
            'import collections\n'
            'import mpmath\n'
            'mpmath.mp.dps = 64\n'
        )

    def __del__(self):

        self.close()

In [13]:
class AIMO3Tool:
    """The `python` tool: runs model-written code in a sandbox and wraps the output.

    A sandbox can be injected; otherwise one is created lazily on first use.
    """

    def __init__(self, local_jupyter_timeout: float, tool_prompt: str, sandbox=None):
        """Store the configuration and the optional pre-built sandbox.

        Args:
            local_jupyter_timeout: Timeout used when a sandbox has to be created here.
            tool_prompt: Description text advertised for the tool namespace.
            sandbox: Existing sandbox to execute in, or None to create one on demand.
        """
        self._local_jupyter_timeout = local_jupyter_timeout
        self._tool_prompt = tool_prompt
        self._jupyter_session = sandbox

        # Recorded for bookkeeping; not read anywhere in this class.
        self._owns_session = sandbox is None

        self._execution_lock = threading.Lock()
        self._init_lock = threading.Lock()

    def _ensure_session(self):
        """Create the sandbox on first use; the lock prevents two threads creating it twice."""
        if self._jupyter_session is None:
            with self._init_lock:
                if self._jupyter_session is None:
                    self._jupyter_session = AIMO3Sandbox(timeout=self._local_jupyter_timeout)

    def _ensure_last_print(self, code: str) -> str:
        """Wrap a trailing bare expression in ``print(...)`` so its value is shown.

        The code is parsed rather than edited textually, so only a final
        *expression statement* is wrapped. Assignments, imports, loop bodies,
        multi-line calls and lines with trailing comments are handled correctly
        instead of being turned into syntax or type errors.

        Args:
            code: Source submitted by the model.

        Returns:
            The rewritten source (surrounding whitespace stripped) when the last
            statement was wrapped; otherwise ``code`` unchanged. Code that does
            not parse as plain Python (IPython magics, shell escapes, broken
            code) is returned unchanged so the kernel reports on it itself.
        """
        import ast

        stripped = code.strip()

        if not stripped:
            return code

        try:
            tree = ast.parse(stripped)

        except (SyntaxError, ValueError, RecursionError):
            return code

        if not tree.body:
            return code

        last_stmt = tree.body[-1]

        if not isinstance(last_stmt, ast.Expr):
            return code

        value = last_stmt.value

        if isinstance(value, ast.Call):
            # Leave print(...), pprint(...), sympy.pprint(...) and friends alone.
            func_name = getattr(value.func, 'id', None) or getattr(value.func, 'attr', '')

            if 'print' in func_name:
                return code

        expression = ast.get_source_segment(stripped, last_stmt)
        lines = stripped.split('\n')

        if expression is None or last_stmt.lineno > len(lines):
            return code

        # col_offset counts UTF-8 bytes; keep whatever precedes the expression on
        # its first line (e.g. "a = 1; " in "a = 1; a").
        first_line = lines[last_stmt.lineno - 1]
        prefix = first_line.encode('utf-8')[:last_stmt.col_offset].decode('utf-8')

        rewritten = '\n'.join(lines[:last_stmt.lineno - 1] + [f'{prefix}print({expression})'])

        try:
            # Safety net: never hand back code that no longer parses.
            ast.parse(rewritten)

        except (SyntaxError, ValueError, RecursionError):
            return code

        return rewritten

    @property
    def instruction(self) -> str:
        """The tool description text supplied at construction."""
        return self._tool_prompt

    @property
    def tool_config(self) -> ToolNamespaceConfig:
        """Namespace config advertising a `python` tool with no sub-tools."""
        return ToolNamespaceConfig(
            name='python', 
            description=self.instruction, 
            tools=[]
        )

    def _make_response(self, output: str, channel: str | None = None) -> Message:
        """Wrap ``output`` as a tool message addressed to the assistant.

        The channel is set only when a truthy ``channel`` is given.
        """
        content = TextContent(text=output)
        author = Author(role=Role.TOOL, name='python')
        message = Message(author=author, content=[content]).with_recipient('assistant')

        if channel:
            message = message.with_channel(channel)

        return message

    def process_sync_plus(self, message: Message) -> list[Message]:
        """Execute the code carried by ``message`` and return the tool reply.

        The script is read from the first content item of the message. Executions
        are serialised through a lock, and the reply reuses the request's channel.
        """
        self._ensure_session()
        raw_script = message.content[0].text
        final_script = self._ensure_last_print(raw_script)

        with self._execution_lock:
            try:
                output = self._jupyter_session.execute(final_script)

            except TimeoutError as exc:
                output = f'[ERROR] {exc}'

        return [self._make_response(output, channel=message.channel)]

In [14]:
class AIMO3Solver:

    def __init__(self, cfg, port: int = 8000):
        """Bring up the inference stack: page-cache warm-up, vLLM server, client, kernels.

        The steps run in a fixed order because each depends on the previous one:
        the server is launched after the weights were read once, the client is
        created before polling the server, and the sandbox kernels are started
        last.

        Args:
            cfg: Configuration namespace providing the attributes read by this class.
            port: TCP port the local vLLM server listens on.

        Raises:
            RuntimeError: Propagated from _wait_for_server when the server process
                exits or does not respond within the configured polling rounds.
        """
    
        self.cfg = cfg
        self.port = port
        self.base_url = f'http://0.0.0.0:{port}/v1'
        # Placeholder key for the local server.
        self.api_key = 'sk-local'
        self.template = AIMO3Template()
        self.encoding = load_harmony_encoding(HarmonyEncodingName.HARMONY_GPT_OSS)
        self.stop_token_ids = self.encoding.stop_tokens_for_assistant_actions()
    
        self._preload_model_weights()
        
        self.server_process = self._start_server()
    
        self.client = OpenAI(
            base_url=self.base_url, 
            api_key=self.api_key, 
            timeout=self.cfg.session_timeout
        )
    
        self._wait_for_server()
        self._initialize_kernels()
    
        # Reference point for the notebook-wide time budget; taken only after start-up
        # has finished, so start-up time is not counted against cfg.notebook_limit.
        self.notebook_start_time = time.time()
        # Problems still expected; solve_problem decrements this and uses it to reserve time.
        # NOTE(review): 50 is hard-coded — confirm it matches the actual problem count.
        self.problems_remaining = 50
    
    def _preload_model_weights(self) -> None:
    
        print(f'Loading model weights from {self.cfg.model_path} into OS Page Cache...')
        start_time = time.time()
        
        files_to_load = []
        total_size = 0
    
        for root, _, files in os.walk(self.cfg.model_path):
            for file_name in files:
                file_path = os.path.join(root, file_name)
    
                if os.path.isfile(file_path):
                    files_to_load.append(file_path)
                    total_size += os.path.getsize(file_path)
    
        def _read_file(path: str) -> None:
    
            with open(path, 'rb') as file_object:
                while file_object.read(1024 * 1024 * 1024):
                    pass
    
        with ThreadPoolExecutor(max_workers=self.cfg.workers) as executor:
            list(executor.map(_read_file, files_to_load))
    
        elapsed = time.time() - start_time
        print(f'Processed {len(files_to_load)} files ({total_size / 1e9:.2f} GB) in {elapsed:.2f} seconds.\n')
    
    def _start_server(self) -> subprocess.Popen:
    
        cmd = [
            sys.executable, 
            '-m', 
            'vllm.entrypoints.openai.api_server', 
            '--seed', 
            str(self.cfg.seed), 
            '--model', 
            self.cfg.model_path, 
            '--served-model-name', 
            self.cfg.served_model_name, 
            '--tensor-parallel-size', 
            '1', 
            '--max-num-seqs', 
            str(self.cfg.batch_size), 
            '--gpu-memory-utilization', 
            str(self.cfg.gpu_memory_utilization), 
            '--host', 
            '0.0.0.0', 
            '--port', 
            str(self.port), 
            '--dtype', 
            self.cfg.dtype, 
            '--kv-cache-dtype', 
            self.cfg.kv_cache_dtype, 
            '--max-model-len', 
            str(self.cfg.context_tokens), 
            '--stream-interval', 
            str(self.cfg.stream_interval), 
            '--async-scheduling', 
            '--disable-log-stats', 
            '--enable-prefix-caching'
        ]
    
        self.log_file = open('vllm_server.log', 'w')
    
        return subprocess.Popen(
            cmd, 
            stdout=self.log_file, 
            stderr=subprocess.STDOUT, 
            start_new_session=True
        )
    
    def _wait_for_server(self):
    
        print('Waiting for vLLM server...')
        start_time = time.time()
    
        for _ in range(self.cfg.server_timeout):
            return_code = self.server_process.poll()
    
            if return_code is not None:
                self.log_file.flush()
    
                with open('vllm_server.log', 'r') as log_file:
                    logs = log_file.read()
    
                raise RuntimeError(f'Server died with code {return_code}. Full logs:\n{logs}\n')
    
            try:
                self.client.models.list()
                elapsed = time.time() - start_time
                print(f'Server is ready (took {elapsed:.2f} seconds).\n')
    
                return
    
            except Exception:
                time.sleep(1)
    
        raise RuntimeError('Server failed to start (timeout).\n')
    
    def _initialize_kernels(self) -> None:
    
        print(f'Initializing {self.cfg.workers} persistent Jupyter kernels...')
        start_time = time.time()
    
        self.sandbox_pool = queue.Queue()
    
        def _create_sandbox():
            
            return AIMO3Sandbox(timeout=self.cfg.jupyter_timeout)
    
        with ThreadPoolExecutor(max_workers=self.cfg.workers) as executor:
            futures = [executor.submit(_create_sandbox) for _ in range(self.cfg.workers)]
    
            for future in as_completed(futures):
                self.sandbox_pool.put(future.result())
    
        elapsed = time.time() - start_time
        print(f'Kernels initialized in {elapsed:.2f} seconds.\n')
    
    def _scan_for_answer(self, text: str) -> int | None:
        
        pattern = r'\\boxed\s*\{\s*([0-9,]+)\s*\}'
        matches = re.findall(pattern, text)
    
        if matches:
            try:
                clean_value = matches[-1].replace(',', '')
                value = int(clean_value)
    
                if 0 <= value <= 99999:
                    return value
    
            except ValueError:
                pass
                
        pattern = r'final\s+answer\s+is\s*([0-9,]+)'
        matches = re.findall(pattern, text, re.IGNORECASE)
    
        if matches:
            try:
                clean_value = matches[-1].replace(',', '')
                value = int(clean_value)
    
                if 0 <= value <= 99999:
                    return value
    
            except ValueError:
                pass
    
        return None
    
    def _compute_mean_entropy(self, logprobs_buffer: list) -> float:
    
        if not logprobs_buffer:
            return float('inf')
    
        total_entropy = 0.0
        token_count = 0
    
        for top_logprobs_dict in logprobs_buffer:
            
            if not isinstance(top_logprobs_dict, dict):
                continue
            
            if not top_logprobs_dict:
                continue
            
            token_entropy = 0.0
            
            for token_str, log_prob in top_logprobs_dict.items():
                prob = math.exp(log_prob)
                
                if prob > 0:
                    token_entropy -= prob * math.log2(prob)
            
            total_entropy += token_entropy
            token_count += 1
    
        if token_count == 0:
            return float('inf')
    
        return total_entropy / token_count
    
    def _process_attempt(
        self, 
        problem: str, 
        system_prompt: str, 
        attempt_index: int, 
        stop_event: threading.Event, 
        deadline: float
    ) -> dict:
    
        if stop_event.is_set() or time.time() > deadline:
            return {
                'Attempt': attempt_index + 1, 
                'Answer': None, 
                'Python Calls': 0, 
                'Python Errors': 0, 
                'Response Length': 0, 
                'Entropy': float('inf')
            }
    
        local_tool = None
        sandbox = None
        python_calls = 0
        python_errors = 0
        total_tokens = 0
        final_answer = None
        
        logprobs_buffer = []
    
        attempt_seed = int(math.pow(self.cfg.seed + attempt_index, 2))
    
        try:
            sandbox = self.sandbox_pool.get(timeout=self.cfg.sandbox_timeout)
    
            local_tool = AIMO3Tool(
                local_jupyter_timeout=self.cfg.jupyter_timeout, 
                tool_prompt=self.cfg.tool_prompt, 
                sandbox=sandbox
            )
    
            encoding = self.encoding
            messages = self.template.apply_chat_template(
                system_prompt, 
                problem, 
                local_tool.tool_config
            )
    
            conversation = Conversation.from_messages(messages)
    
            for _ in range(self.cfg.turns):
                if stop_event.is_set() or time.time() > deadline:
                    break
    
                prompt_ids = encoding.render_conversation_for_completion(conversation, Role.ASSISTANT)
                max_tokens = self.cfg.context_tokens - len(prompt_ids)
    
                if max_tokens < self.cfg.buffer_tokens:
                    break
    
                stream = self.client.completions.create(
                    model=self.cfg.served_model_name, 
                    temperature=self.cfg.temperature, 
                    logprobs=self.cfg.top_logprobs, 
                    max_tokens=max_tokens, 
                    prompt=prompt_ids, 
                    seed=attempt_seed, 
                    stream=True, 
                    extra_body={
                        'min_p': self.cfg.min_p, 
                        'stop_token_ids': self.stop_token_ids, 
                        'return_token_ids': True
                    }
                )
    
                try:
                    token_buffer = []
                    text_chunks = []
    
                    for chunk in stream:
                        if stop_event.is_set() or time.time() > deadline:
                            break
    
                        new_tokens = chunk.choices[0].token_ids
                        new_text = chunk.choices[0].text
    
                        if new_tokens:
                            token_buffer.extend(new_tokens)
                            total_tokens += len(new_tokens)
                            text_chunks.append(new_text)
                            
                            chunk_logprobs = chunk.choices[0].logprobs
                            
                            if chunk_logprobs is not None:
                                if chunk_logprobs.top_logprobs:
                                    logprobs_buffer.extend(chunk_logprobs.top_logprobs)
    
                        if '}' in new_text:
                            search_text = ''.join(text_chunks[-self.cfg.search_tokens:])
                            answer = self._scan_for_answer(search_text)
    
                            if answer is not None:
                                final_answer = answer
                                break
    
                finally:
                    stream.close()
    
                if final_answer is not None:
                    break
    
                if not token_buffer:
                    break
    
                new_messages = encoding.parse_messages_from_completion_tokens(token_buffer, Role.ASSISTANT)
                conversation.messages.extend(new_messages)
                last_message = new_messages[-1]
    
                if last_message.channel == 'final':
                    answer_text = last_message.content[0].text
                    final_answer = self._scan_for_answer(answer_text)
                    break
    
                if last_message.recipient == 'python':
                    python_calls += 1
                    tool_responses = local_tool.process_sync_plus(last_message)
    
                    response_text = tool_responses[0].content[0].text
    
                    if response_text.startswith('[ERROR]') or 'Traceback' in response_text or 'Error:' in response_text:
                        python_errors += 1
    
                    conversation.messages.extend(tool_responses)
    
        except Exception as exc:
            python_errors += 1
    
        finally:
            if sandbox is not None:
                sandbox.reset()
                self.sandbox_pool.put(sandbox)
    
        mean_entropy = self._compute_mean_entropy(logprobs_buffer)
    
        return {
            'Attempt': attempt_index + 1, 
            'Response Length': total_tokens, 
            'Python Calls': python_calls, 
            'Python Errors': python_errors, 
            'Entropy': mean_entropy, 
            'Answer': final_answer
        }
    
    def _select_answer(self, detailed_results: list) -> int:

        answer_weights = defaultdict(float)
        answer_votes = defaultdict(int)

        for result in detailed_results:
            answer = result['Answer']
            entropy = result['Entropy']
            
            if answer is not None:
                weight = 1.0 / max(entropy, 1e-9)
                
                answer_weights[answer] += weight
                answer_votes[answer] += 1

        scored_answers = []

        for answer, total_weight in answer_weights.items():
            scored_answers.append({
                'answer': answer, 
                'votes': answer_votes[answer], 
                'score': total_weight
            })

        scored_answers.sort(key=lambda x: x['score'], reverse=True)

        vote_data = []

        for item in scored_answers:
            vote_data.append((
                item['answer'], 
                item['votes'], 
                item['score']
            ))

        vote_dataframe = pd.DataFrame(
            vote_data, 
            columns=['Answer', 'Votes', 'Score']
        )

        vote_dataframe = vote_dataframe.round({'Score': 3})
        display(vote_dataframe)
        
        if not scored_answers:
            print('\nFinal Answer: 0\n')
            return 0

        final_answer = scored_answers[0]['answer']    
        print(f'\nFinal Answer: {final_answer}\n')

        return final_answer
    
    def solve_problem(self, problem: str) -> int:

            print(f'\nProblem: {problem}\n')
        
            user_input = f'{problem} {self.cfg.preference_prompt}'    
        
            elapsed_global = time.time() - self.notebook_start_time
            time_left = self.cfg.notebook_limit - elapsed_global
            problems_left_others = max(0, self.problems_remaining - 1)
            reserved_time = problems_left_others * self.cfg.base_problem_timeout
        
            budget = time_left - reserved_time
            budget = min(budget, self.cfg.high_problem_timeout)
            budget = max(budget, self.cfg.base_problem_timeout)
        
            deadline = time.time() + budget
        
            print(f'Budget: {budget:.2f} seconds | Deadline: {deadline:.2f}\n')
        
            tasks = []
            for attempt_index in range(self.cfg.attempts):
                if attempt_index < 4:
                    system_prompt = self.cfg.ANSWER_ONLY_PROMPT
                else:
                    system_prompt = self.cfg.system_prompt
                tasks.append((system_prompt, attempt_index))
        
            detailed_results = []
            valid_answers = []
        
            stop_event = threading.Event()
            executor = ThreadPoolExecutor(max_workers=self.cfg.workers)
        
            try:
                futures = []
                for (system_prompt, attempt_index) in tasks:
                    futures.append(
                        executor.submit(
                            self._process_attempt,
                            user_input,
                            system_prompt,
                            attempt_index,
                            stop_event,
                            deadline
                        )
                    )
        
                for future in as_completed(futures):
                    try:
                        result = future.result()
                        detailed_results.append(result)
        
                        if result['Answer'] is not None:
                            valid_answers.append(result['Answer'])
        
                        counts = Counter(valid_answers).most_common(1)
                        if counts and counts[0][1] >= self.cfg.early_stop:
                            stop_event.set()
                            for f in futures:
                                f.cancel()
                            break
        
                    except Exception as exc:
                        print(f'Future failed: {exc}')
        
            finally:
                stop_event.set()
                executor.shutdown(wait=True, cancel_futures=True)
                self.problems_remaining = max(0, self.problems_remaining - 1)
        
            if detailed_results:
                df = pd.DataFrame(detailed_results)
                df['Entropy'] = df['Entropy'].round(3)
                df['Answer'] = df['Answer'].astype('Int64')
                display(df)
        
            # ─────────────────────────────
            # NO ANSWERS
            # ─────────────────────────────
            if not valid_answers:
                print('\nResult: 0\n')
                return 0
        
            # ─────────────────────────────
            # HARD ACCEPT: UNANIMOUS ANSWER
            # ─────────────────────────────
            if len(valid_answers) >= 4:
                most_common, count = Counter(valid_answers).most_common(1)[0]
                if count >= 4:
                    print(f"\nUNANIMOUS ANSWER: {most_common}\n")
                    return most_common
        
            # ─────────────────────────────
            # STEP 4: CANDIDATES (≥2 votes)
            # ─────────────────────────────
            answer_counts = Counter(valid_answers)
            candidates = [a for a, c in answer_counts.items() if c >= 2]
        
            # ─────────────────────────────
            # STEP 5: ENTROPY-SORTED VERIFY
            # ─────────────────────────────
            entropy_map = {}
            for r in detailed_results:
                if r['Answer'] is not None and r['Entropy'] is not None:
                    entropy_map.setdefault(r['Answer'], []).append(r['Entropy'])
        
            avg_entropy = {a: sum(v) / len(v) for a, v in entropy_map.items()}
        
            candidates = sorted(candidates, key=lambda x: avg_entropy.get(x, 999))
        
            for ans in candidates:
                try:
                    if self._verify_answer(problem, ans):
                        print(f"\nVERIFIED ANSWER: {ans}\n")
                        return ans
                except Exception:
                    pass
        
            # ─────────────────────────────
            # STEP 6: FALLBACK
            # ─────────────────────────────
            return self._select_answer(detailed_results)

         

    def _verify_answer(self, problem: str, answer: int) -> bool:
           """
           Deterministic verification using model self-check.
           Must return True only if answer is certainly correct.
           """  
           prompt = f"""
            Problem:
              {problem}
               
               Proposed answer: {answer}
              
              Check the answer carefully.
              Reply with only ONE word:
              CORRECT or WRONG
              """

           try:
               resp = self.model.generate(
                   prompt,
                   temperature=0.0,
                   max_tokens=5
               )
       
               text = resp.strip().upper()
               return "CORRECT" in text and "WRONG" not in text
       
           except Exception:
               return False
                     

    def __del__(self):
    
        if hasattr(self, 'server_process'):
            self.server_process.terminate()
            self.server_process.wait()
    
        if hasattr(self, 'log_file'):
            self.log_file.close()
    
        if hasattr(self, 'sandbox_pool'):
            while not self.sandbox_pool.empty():
                try:
                    sb = self.sandbox_pool.get_nowait()
                    sb.close()
    
                except Exception:
                    pass

In [15]:
# Build the single solver instance used by the rest of the notebook.
# As the cell output shows, construction is slow: it loads the model weights
# into the OS page cache, waits for the vLLM server to become ready, and
# initializes the persistent Jupyter kernels before returning.
solver = AIMO3Solver(CFG)

Loading model weights from /kaggle/input/gpt-oss-120b/transformers/default/1 into OS Page Cache...
Processed 26 files (65.28 GB) in 120.67 seconds.

Waiting for vLLM server...
Server is ready (took 129.80 seconds).

Initializing 16 persistent Jupyter kernels...
Kernels initialized in 2.84 seconds.



In [16]:
# def predict(id_: pl.DataFrame, question: pl.DataFrame, answer: Optional[pl.DataFrame] = None) -> pl.DataFrame:
    
#     id_value = id_.item(0)
#     question_text = question.item(0)
    
#     gc.disable()
    
#     final_answer = solver.solve_problem(question_text)
    
#     gc.enable()
#     gc.collect()
    
#     return pl.DataFrame({'id': id_value, 'answer': final_answer})

In [17]:
# inference_server = kaggle_evaluation.aimo_3_inference_server.AIMO3InferenceServer(predict)

# if os.getenv('KAGGLE_IS_COMPETITION_RERUN'):
#     inference_server.serve()
    
# else:
#     inference_server.run_local_gateway(
#         ('/kaggle/input/ai-mathematical-olympiad-progress-prize-3/reference.csv',)
#     )

In [18]:
import polars as pl
import pandas as pd
import os

# --- 1. Updated Predict Function ---
def predict(id_: pl.DataFrame, question: pl.DataFrame, answer: Optional[pl.DataFrame] = None) -> pl.DataFrame:
    """Solve one problem with the module-level `solver` and return its answer.

    Args:
        id_: Frame whose first cell (row 0, column 0) holds the problem id.
        question: Frame whose first cell holds the problem statement.
        answer: Not read by this function; kept so the signature is unchanged.

    Returns:
        A polars DataFrame with the columns 'id' and 'answer'.

    Raises:
        Whatever `solver.solve_problem` raises; the garbage collector is
        re-enabled before the exception propagates.
    """
    # Use index-based access to avoid the .item() error
    id_value = id_[0, 0]
    question_text = question[0, 0]

    # Garbage collection is paused while the solver runs.
    gc.disable()
    try:
        final_answer = solver.solve_problem(question_text)
    finally:
        # Without the finally, a failing solve would leave the collector
        # disabled for every later problem in the same kernel session.
        gc.enable()
        gc.collect()

    return pl.DataFrame({'id': id_value, 'answer': final_answer})

# --- 2. Updated Testing Loop ---
# Local evaluation: run `predict` on every row of an external CSV that has a
# 'Problem' column (statement) and an 'Answer' column (integer ground truth),
# then report per-problem results and the overall accuracy.
FILE_PATH = '/kaggle/input/50problems/50problems.csv'

if not os.path.exists(FILE_PATH):
    print(f"Error: File not found at {FILE_PATH}")
else:
    external_df = pd.read_csv(FILE_PATH)
    test_results = []

    print(f"Starting test on {len(external_df)} problems...\n")

    # `idx` is the row label assigned by read_csv (0-based by default), so the
    # human-facing problem number is idx + 1.
    for idx, row in external_df.iterrows():
        problem_text = row['Problem']
        ground_truth = row['Answer']
        
        # Step 1: Print problem details first
        print(f"{'='*50}")
        print(f"TESTING PROBLEM {idx+1}")
        print(f"Statement: {problem_text}")
        print(f"Ground Truth Answer: {ground_truth}")
        print(f"{'-'*50}")
        
        # Prepare inputs: single-cell frames, the shape `predict` reads from
        id_df = pl.DataFrame({'id': [f"ext_{idx}"]})
        question_df = pl.DataFrame({'question': [problem_text]})
        
        try:
            # Step 2: Generate Answer
            result_pl_df = predict(id_df, question_df)
            
            # Accessing column 'answer' from row 0
            predicted_val = result_pl_df[0, "answer"]
            
            # Compare as integers so 336 and 336.0 count as the same answer
            is_correct = (int(predicted_val) == int(ground_truth))
            
            test_results.append({
                "idx": idx + 1,
                "prediction": predicted_val,
                "ground_truth": ground_truth,
                "correct": is_correct
            })
            
            # Step 3: Print result summary before moving to next
            status = "✅ CORRECT" if is_correct else "❌ INCORRECT"
            print(f"\n[Problem {idx+1} Result]")
            print(f"Model Predicted: {predicted_val}")
            print(f"Status: {status}")
            print(f"{'='*50}\n")
            
        except Exception as e:
            # A failing problem is recorded as incorrect so the run continues
            # and the accuracy denominator still counts it.
            print(f"Error on problem {idx+1}: {e}")
            test_results.append({
                "idx": idx + 1, "prediction": None, "ground_truth": ground_truth, "correct": False
            })

    # Final Summary Table
    summary_df = pd.DataFrame(test_results)
    if summary_df.empty:
        # A CSV without rows produces a frame with no 'correct' column;
        # report that instead of failing with KeyError on the accuracy line.
        print("No problems were evaluated; accuracy is undefined.")
    else:
        display(summary_df)
        print(f"Overall Accuracy: {summary_df['correct'].mean() * 100:.2f}%")

Starting test on 50 problems...

TESTING PROBLEM 1
Statement: Let $ABC$ be an acute-angled triangle with integer side lengths and $AB<AC$. Points $D$ and $E$ lie on segments $BC$ and $AC$, respectively, such that $AD=AE=AB$. Line $DE$ intersects $AB$ at $X$. Circles $BXD$ and $CED$ intersect for the second time at $Y \neq D$. Suppose that $Y$ lies on line $AD$. There is a unique such triangle with minimal perimeter. This triangle has side lengths $a=BC$, $b=CA$, and $c=AB$. Find the remainder when $abc$ is divided by $10^{5}$.
Ground Truth Answer: 336
--------------------------------------------------

Problem: Let $ABC$ be an acute-angled triangle with integer side lengths and $AB<AC$. Points $D$ and $E$ lie on segments $BC$ and $AC$, respectively, such that $AD=AE=AB$. Line $DE$ intersects $AB$ at $X$. Circles $BXD$ and $CED$ intersect for the second time at $Y \neq D$. Suppose that $Y$ lies on line $AD$. There is a unique such triangle with minimal perimeter. This triangle has side 

Unnamed: 0,Attempt,Response Length,Python Calls,Python Errors,Entropy,Answer
0,7,8518,6,0,0.618,336
1,4,10214,9,2,0.635,336
2,2,10470,9,1,0.677,336
3,5,12766,12,0,0.58,336
4,3,12455,5,1,0.688,336



UNANIMOUS ANSWER: 336


[Problem 1 Result]
Model Predicted: 336
Status: ✅ CORRECT

TESTING PROBLEM 2
Statement: Define a function $f \colon \mathbb{Z}_{\geq 1} \to \mathbb{Z}_{\geq 1}$ by $f(n) = \sum_{i = 1}^n \sum_{j = 1}^n j^{1024} \lfloor\frac1j + \frac{n-i}{n}\rfloor$. Let $M=2 \cdot 3 \cdot 5 \cdot 7 \cdot 11 \cdot 13$ and let $N = f(M^{15}) - f(M^{15}-1)$. Let $k$ be the largest non-negative integer such that $2^k$ divides $N$. What is the remainder when $2^k$ is divided by $5^7$?
Ground Truth Answer: 32951
--------------------------------------------------

Problem: Define a function $f \colon \mathbb{Z}_{\geq 1} \to \mathbb{Z}_{\geq 1}$ by $f(n) = \sum_{i = 1}^n \sum_{j = 1}^n j^{1024} \lfloor\frac1j + \frac{n-i}{n}\rfloor$. Let $M=2 \cdot 3 \cdot 5 \cdot 7 \cdot 11 \cdot 13$ and let $N = f(M^{15}) - f(M^{15}-1)$. Let $k$ be the largest non-negative integer such that $2^k$ divides $N$. What is the remainder when $2^k$ is divided by $5^7$?

Budget: 900.00 seconds | Deadline: 1

Unnamed: 0,Attempt,Response Length,Python Calls,Python Errors,Entropy,Answer
0,2,4621,1,0,0.518,32951
1,4,4746,3,0,0.576,32951
2,7,5362,4,0,0.582,32951
3,5,6029,8,1,0.645,32951
4,1,6336,12,1,0.565,32951



UNANIMOUS ANSWER: 32951


[Problem 2 Result]
Model Predicted: 32951
Status: ✅ CORRECT

TESTING PROBLEM 3
Statement: A tournament is held with $2^{20}$ runners each of which has a different running speed. The competition consists of $20$ rounds. The winner of each race in the $i^{\text{th}}$ round receives $2^{20-i}$ points and the loser gets no points. Let $N$ denote the number of possible orderings of the competitors at the end of the tournament. Let $k$ be the largest positive integer such that $10^k$ divides $N$. What is the remainder when $k$ is divided by $10^{5}$?
Ground Truth Answer: 21818
--------------------------------------------------

Problem: A tournament is held with $2^{20}$ runners each of which has a different running speed. The competition consists of $20$ rounds. The winner of each race in the $i^{\text{th}}$ round receives $2^{20-i}$ points and the loser gets no points. Let $N$ denote the number of possible orderings of the competitors at the end of the tournament

Unnamed: 0,Attempt,Response Length,Python Calls,Python Errors,Entropy,Answer
0,7,27324,9,0,0.918,0.0
1,6,31100,10,0,0.872,62097.0
2,3,53892,23,0,0.821,62134.0
3,8,53298,28,7,0.836,62134.0
4,4,57874,34,5,0.825,62134.0
5,2,64421,17,1,0.846,
6,5,64707,13,0,0.869,
7,1,64284,29,2,0.778,


Unnamed: 0,Answer,Votes,Score
0,62134,3,3.627
1,62097,1,1.147
2,0,1,1.089



Final Answer: 62134


[Problem 3 Result]
Model Predicted: 62134
Status: ❌ INCORRECT

TESTING PROBLEM 4
Statement: Ken writes a positive integer $n$ on a blackboard. If the number is $m$, he chooses a base $b$, $2 \leq b \leq m$, and replaces $m$ with the sum of its digits in base $b$. Across all $1 \leq n \leq 10^{10^5}$, the largest possible number of moves Ken could make is $M$. What is the remainder when $M$ is divided by $10^{5}$?
Ground Truth Answer: 32193
--------------------------------------------------

Problem: Ken writes a positive integer $n$ on a blackboard. If the number is $m$, he chooses a base $b$, $2 \leq b \leq m$, and replaces $m$ with the sum of its digits in base $b$. Across all $1 \leq n \leq 10^{10^5}$, the largest possible number of moves Ken could make is $M$. What is the remainder when $M$ is divided by $10^{5}$?

Budget: 900.00 seconds | Deadline: 1771773347.79



Unnamed: 0,Attempt,Response Length,Python Calls,Python Errors,Entropy,Answer
0,1,4817,7,0,0.75,32193
1,4,6016,3,0,0.701,32193
2,5,8017,11,0,0.709,32193
3,3,10389,13,1,0.741,32193
4,8,10345,12,1,0.726,32193



UNANIMOUS ANSWER: 32193


[Problem 4 Result]
Model Predicted: 32193
Status: ✅ CORRECT

TESTING PROBLEM 5
Statement: Let triangle $ABC$ be $n$-tastic if $BD = F_n, CD = F_{n+1},$ and $KNK'B$ is cyclic, where $K$ is a meeting point of circumcircles and $N$ is the foot of the perpendicular from $D$ to $EF$. Across all $n$-tastic triangles, let $a_n$ be the max value of $\frac{CT \cdot NB}{BT \cdot NE}$. Let $\alpha = p + \sqrt{q}$ be the limit as $n \to \infty$. Find the remainder when $\lfloor p^{q^p} \rfloor$ is divided by $99991$.
Ground Truth Answer: 57447
--------------------------------------------------

Problem: Let triangle $ABC$ be $n$-tastic if $BD = F_n, CD = F_{n+1},$ and $KNK'B$ is cyclic, where $K$ is a meeting point of circumcircles and $N$ is the foot of the perpendicular from $D$ to $EF$. Across all $n$-tastic triangles, let $a_n$ be the max value of $\frac{CT \cdot NB}{BT \cdot NE}$. Let $\alpha = p + \sqrt{q}$ be the limit as $n \to \infty$. Find the remainder when $\

Unnamed: 0,Attempt,Response Length,Python Calls,Python Errors,Entropy,Answer
0,6,4963,6,0,0.791,57447
1,5,27336,25,7,0.667,57447
2,8,36479,55,16,0.384,57447
3,1,47426,51,4,0.665,38715
4,7,50050,31,3,0.66,57447
5,3,43165,50,13,0.612,57447



UNANIMOUS ANSWER: 57447


[Problem 5 Result]
Model Predicted: 57447
Status: ✅ CORRECT

TESTING PROBLEM 6
Statement: A positive integer is $n$-Norwegian if it has three distinct positive divisors whose sum is $n$. Let $f(n)$ denote the smallest $n$-Norwegian integer. Let $M=3^{2025!}$ and $g(c)=\frac{1}{2025!}\lfloor \frac{2025! f(M+c)}{M}\rfloor$. If $g(0)+g(4M)+g(1848374)+g(10162574)+g(265710644)+g(44636594)=\frac{p}{q}$, find the remainder when $p+q$ is divided by $99991$.
Ground Truth Answer: 8687
--------------------------------------------------

Problem: A positive integer is $n$-Norwegian if it has three distinct positive divisors whose sum is $n$. Let $f(n)$ denote the smallest $n$-Norwegian integer. Let $M=3^{2025!}$ and $g(c)=\frac{1}{2025!}\lfloor \frac{2025! f(M+c)}{M}\rfloor$. If $g(0)+g(4M)+g(1848374)+g(10162574)+g(265710644)+g(44636594)=\frac{p}{q}$, find the remainder when $p+q$ is divided by $99991$.

Budget: 900.00 seconds | Deadline: 1771774038.60



Unnamed: 0,Attempt,Response Length,Python Calls,Python Errors,Entropy,Answer
0,2,26227,26,2,0.722,23.0
1,5,33078,31,3,0.688,6825.0
2,4,34837,42,5,0.69,8687.0
3,3,42476,48,4,0.736,36162.0
4,8,51800,47,3,0.685,78845.0
5,6,58116,34,1,0.674,
6,1,61297,29,4,0.734,
7,7,61224,51,8,0.685,


Unnamed: 0,Answer,Votes,Score
0,78845,1,1.46
1,6825,1,1.454
2,8687,1,1.449
3,23,1,1.386
4,36162,1,1.359



Final Answer: 78845


[Problem 6 Result]
Model Predicted: 78845
Status: ❌ INCORRECT

TESTING PROBLEM 7
Statement: Alice and Bob each hold some sweets. Alice says: If we added our sweets to our positive integer age, my answer would be double yours. If we took the product, my answer would be four times yours. Bob says: Give me five sweets and then both our sum and product would be equal. What is the product of Alice and Bob's ages?
Ground Truth Answer: 50
--------------------------------------------------

Problem: Alice and Bob each hold some sweets. Alice says: If we added our sweets to our positive integer age, my answer would be double yours. If we took the product, my answer would be four times yours. Bob says: Give me five sweets and then both our sum and product would be equal. What is the product of Alice and Bob's ages?

Budget: 900.00 seconds | Deadline: 1771774692.83



Unnamed: 0,Attempt,Response Length,Python Calls,Python Errors,Entropy,Answer
0,8,2751,1,0,0.643,50
1,3,3111,2,1,0.582,50
2,4,2767,3,0,0.67,50
3,6,3468,1,0,0.442,50
4,1,3539,1,0,0.716,50



UNANIMOUS ANSWER: 50


[Problem 7 Result]
Model Predicted: 50
Status: ✅ CORRECT

TESTING PROBLEM 8
Statement: Let $f \colon \mathbb{Z}_{\geq 1} \to \mathbb{Z}_{\geq 1}$ satisfy $f(m) + f(n) = f(m + n + mn)$ for all $m, n$. Across all functions where $f(n) \leq 1000$ for all $n \leq 1000$, how many different values can $f(2024)$ take?
Ground Truth Answer: 580
--------------------------------------------------

Problem: Let $f \colon \mathbb{Z}_{\geq 1} \to \mathbb{Z}_{\geq 1}$ satisfy $f(m) + f(n) = f(m + n + mn)$ for all $m, n$. Across all functions where $f(n) \leq 1000$ for all $n \leq 1000$, how many different values can $f(2024)$ take?

Budget: 900.00 seconds | Deadline: 1771774726.52



Unnamed: 0,Attempt,Response Length,Python Calls,Python Errors,Entropy,Answer
0,3,6777,5,0,0.839,580
1,7,8812,6,0,0.866,580
2,5,9763,15,1,0.751,580
3,4,10032,13,2,0.742,580
4,1,11054,12,1,0.824,580



UNANIMOUS ANSWER: 580


[Problem 8 Result]
Model Predicted: 580
Status: ✅ CORRECT

TESTING PROBLEM 9
Statement: A $500 \times 500$ square is divided into $k$ rectangles with integer side lengths. Given that no two of these rectangles have the same perimeter, the largest possible value of $k$ is $\mathcal{K}$. What is the remainder when $\mathcal{K}$ is divided by $10^{5}$?
Ground Truth Answer: 520
--------------------------------------------------

Problem: A $500 \times 500$ square is divided into $k$ rectangles with integer side lengths. Given that no two of these rectangles have the same perimeter, the largest possible value of $k$ is $\mathcal{K}$. What is the remainder when $\mathcal{K}$ is divided by $10^{5}$?

Budget: 900.00 seconds | Deadline: 1771774843.02



Unnamed: 0,Attempt,Response Length,Python Calls,Python Errors,Entropy,Answer
0,2,12870,1,0,0.948,520
1,4,18210,9,0,0.985,520
2,6,20668,7,0,0.95,520
3,7,25740,20,1,0.952,520
4,3,29073,16,1,0.917,520



UNANIMOUS ANSWER: 520


[Problem 9 Result]
Model Predicted: 520
Status: ✅ CORRECT

TESTING PROBLEM 10
Statement: Let $\mathcal{F}$ be the set of functions $\alpha \colon \mathbb{Z} \to \mathbb{Z}$ with finite support. Define a product $\alpha \star \beta = \sum \alpha(n) \beta(n)$. A function is shifty if $\alpha(m)=0$ for $m<0, m>8$ and there exists $\beta$ such that $S_n(\alpha) \star \beta = 1$ for two distinct shifts and $0$ otherwise. How many shifty functions are there?
Ground Truth Answer: 160
--------------------------------------------------

Problem: Let $\mathcal{F}$ be the set of functions $\alpha \colon \mathbb{Z} \to \mathbb{Z}$ with finite support. Define a product $\alpha \star \beta = \sum \alpha(n) \beta(n)$. A function is shifty if $\alpha(m)=0$ for $m<0, m>8$ and there exists $\beta$ such that $S_n(\alpha) \star \beta = 1$ for two distinct shifts and $0$ otherwise. How many shifty functions are there?

Budget: 900.00 seconds | Deadline: 1771775121.55



Unnamed: 0,Attempt,Response Length,Python Calls,Python Errors,Entropy,Answer
0,6,13697,10,1,0.821,107
1,3,14313,8,0,0.752,266
2,4,14287,13,3,0.794,44
3,5,16282,21,3,0.754,266
4,8,21158,17,2,0.773,44
5,1,21275,32,4,0.724,160
6,2,25053,18,5,0.766,214
7,7,25366,23,5,0.769,160


Unnamed: 0,Answer,Votes,Score
0,160,2,2.683
1,266,2,2.657
2,44,2,2.552
3,214,1,1.306
4,107,1,1.218



Final Answer: 160


[Problem 10 Result]
Model Predicted: 160
Status: ✅ CORRECT

TESTING PROBLEM 11
Statement: Every morning Aya goes for a 9-km walk. At speed $s$ km/h, it takes 4 hours including $t$ minutes at a shop. At $s+2$ km/h, it takes 2 hours 24 minutes including $t$ minutes. If she walks at $s+0.5$ km/h, find the total number of minutes the walk takes including the coffee shop.
Ground Truth Answer: 204
--------------------------------------------------

Problem: Every morning Aya goes for a 9-km walk. At speed $s$ km/h, it takes 4 hours including $t$ minutes at a shop. At $s+2$ km/h, it takes 2 hours 24 minutes including $t$ minutes. If she walks at $s+0.5$ km/h, find the total number of minutes the walk takes including the coffee shop.

Budget: 900.00 seconds | Deadline: 1771775365.10



Unnamed: 0,Attempt,Response Length,Python Calls,Python Errors,Entropy,Answer
0,7,663,2,0,0.564,204
1,1,944,0,0,0.417,204
2,4,971,0,0,0.438,204
3,2,1001,1,0,0.525,204
4,3,1142,0,0,0.386,204



UNANIMOUS ANSWER: 204


[Problem 11 Result]
Model Predicted: 204
Status: ✅ CORRECT

TESTING PROBLEM 12
Statement: There exist real numbers $x, y > 1$ such that $x^{\log_x y} = \log_y (x^4 y) = 10$. Find $xy$.
Ground Truth Answer: 25
--------------------------------------------------

Problem: There exist real numbers $x, y > 1$ such that $x^{\log_x y} = \log_y (x^4 y) = 10$. Find $xy$.

Budget: 900.00 seconds | Deadline: 1771775375.53



Unnamed: 0,Attempt,Response Length,Python Calls,Python Errors,Entropy,Answer
0,7,801,0,0,0.621,1778.0
1,3,930,3,0,0.52,
2,1,982,2,0,0.511,
3,4,1029,1,0,0.556,
4,2,1162,1,0,0.613,
5,6,1839,2,0,0.744,
6,8,3135,3,0,0.713,
7,5,4118,2,0,0.747,


Unnamed: 0,Answer,Votes,Score
0,1778,1,1.611



Final Answer: 1778


[Problem 12 Result]
Model Predicted: 1778
Status: ❌ INCORRECT

TESTING PROBLEM 13
Statement: Alice and Bob play a game with $n$ tokens. They take turns removing 1 or 4 tokens. The player who removes the last token wins. Find the number of positive integers $n \leq 2024$ for which Bob has a winning strategy regardless of Alice's moves.
Ground Truth Answer: 809
--------------------------------------------------

Problem: Alice and Bob play a game with $n$ tokens. They take turns removing 1 or 4 tokens. The player who removes the last token wins. Find the number of positive integers $n \leq 2024$ for which Bob has a winning strategy regardless of Alice's moves.

Budget: 900.00 seconds | Deadline: 1771775403.63



Unnamed: 0,Attempt,Response Length,Python Calls,Python Errors,Entropy,Answer
0,2,1764,4,0,0.651,809
1,5,1962,4,0,0.679,809
2,3,2443,4,0,0.623,809
3,1,2498,4,0,0.669,809
4,4,2517,1,0,0.601,809



UNANIMOUS ANSWER: 809


[Problem 13 Result]
Model Predicted: 809
Status: ✅ CORRECT

TESTING PROBLEM 14
Statement: Jen picks 4 distinct numbers from $S=\{1,2,\dots,10\}$. 4 numbers are drawn randomly from $S$. She wins a prize if at least two match. The probability of winning the grand prize (all 4 match) given she wins a prize is $m/n$. Find $m+n$.
Ground Truth Answer: 116
--------------------------------------------------

Problem: Jen picks 4 distinct numbers from $S=\{1,2,\dots,10\}$. 4 numbers are drawn randomly from $S$. She wins a prize if at least two match. The probability of winning the grand prize (all 4 match) given she wins a prize is $m/n$. Find $m+n$.

Budget: 900.00 seconds | Deadline: 1771775428.56



Unnamed: 0,Attempt,Response Length,Python Calls,Python Errors,Entropy,Answer
0,2,1001,0,0,0.625,116
1,4,1201,0,0,0.715,116
2,1,1269,1,0,0.699,116
3,3,1284,3,0,0.633,116
4,6,1642,1,0,0.527,116



UNANIMOUS ANSWER: 116


[Problem 14 Result]
Model Predicted: 116
Status: ✅ CORRECT

TESTING PROBLEM 15
Statement: Rectangle $ABCD$ has dimensions $107 \times 16$, and rectangle $EFGH$ has $184 \times 17$. $D, E, C, F$ lie on a line in that order. If $A, D, H, G$ lie on a common circle, find $CE$.
Ground Truth Answer: 104
--------------------------------------------------

Problem: Rectangle $ABCD$ has dimensions $107 \times 16$, and rectangle $EFGH$ has $184 \times 17$. $D, E, C, F$ lie on a line in that order. If $A, D, H, G$ lie on a common circle, find $CE$.

Budget: 900.00 seconds | Deadline: 1771775444.13



Unnamed: 0,Attempt,Response Length,Python Calls,Python Errors,Entropy,Answer
0,7,3723,4,0,0.803,104
1,2,4517,4,0,0.617,104
2,3,6420,6,0,0.701,104
3,1,6511,5,0,0.815,104
4,6,7379,12,1,0.73,104



UNANIMOUS ANSWER: 104


[Problem 15 Result]
Model Predicted: 104
Status: ✅ CORRECT

TESTING PROBLEM 16
Statement: Consider paths of length 16 on an $8 \times 8$ grid from the lower-left to the upper-right corner. Find the number of such paths that change direction exactly four times.
Ground Truth Answer: 294
--------------------------------------------------

Problem: Consider paths of length 16 on an $8 \times 8$ grid from the lower-left to the upper-right corner. Find the number of such paths that change direction exactly four times.

Budget: 900.00 seconds | Deadline: 1771775512.58



Unnamed: 0,Attempt,Response Length,Python Calls,Python Errors,Entropy,Answer
0,4,1127,1,0,0.706,294
1,1,1216,1,0,0.691,294
2,2,1310,1,0,0.691,294
3,3,1426,1,0,0.772,294
4,8,1514,1,0,0.82,294



UNANIMOUS ANSWER: 294


[Problem 16 Result]
Model Predicted: 294
Status: ✅ CORRECT

TESTING PROBLEM 17
Statement: Eight circles of radius 34 can be placed tangent to $BC$ of $\triangle ABC$ sequentially tangent to each other, first to $AB$ and last to $AC$. Similarly, 2024 circles of radius 1 can be placed the same way. Find $m+n$ if the inradius is $m/n$.
Ground Truth Answer: 540
--------------------------------------------------

Problem: Eight circles of radius 34 can be placed tangent to $BC$ of $\triangle ABC$ sequentially tangent to each other, first to $AB$ and last to $AC$. Similarly, 2024 circles of radius 1 can be placed the same way. Find $m+n$ if the inradius is $m/n$.

Budget: 900.00 seconds | Deadline: 1771775527.48



Unnamed: 0,Attempt,Response Length,Python Calls,Python Errors,Entropy,Answer
0,2,6465,4,0,0.721,197
1,7,7652,3,1,0.71,197
2,1,9011,5,0,0.655,197
3,3,9875,4,0,0.718,197
4,6,10608,7,0,0.831,197



UNANIMOUS ANSWER: 197


[Problem 17 Result]
Model Predicted: 197
Status: ❌ INCORRECT

TESTING PROBLEM 18
Statement: Tetrahedron $ABCD$ has $AB=CD=\sqrt{41}$, $AC=BD=\sqrt{80}$, and $BC=AD=\sqrt{89}$. A point $I$ is equidistant from all faces. If this distance is $\frac{m\sqrt{n}}{p}$, find $m+n+p$.
Ground Truth Answer: 197
--------------------------------------------------

Problem: Tetrahedron $ABCD$ has $AB=CD=\sqrt{41}$, $AC=BD=\sqrt{80}$, and $BC=AD=\sqrt{89}$. A point $I$ is equidistant from all faces. If this distance is $\frac{m\sqrt{n}}{p}$, find $m+n+p$.

Budget: 900.00 seconds | Deadline: 1771775624.33



Unnamed: 0,Attempt,Response Length,Python Calls,Python Errors,Entropy,Answer
0,3,2418,3,0,0.571,104
1,7,4316,4,0,0.455,104
2,1,6612,9,0,0.661,104
3,4,6862,18,0,0.487,104
4,5,8668,11,0,0.558,104



UNANIMOUS ANSWER: 104


[Problem 18 Result]
Model Predicted: 104
Status: ❌ INCORRECT

TESTING PROBLEM 19
Statement: Triangle $ABC$ is inscribed in $\omega$. Tangents to $\omega$ at $B, C$ intersect at $D$. $AD$ intersects $\omega$ at $P$. If $AB=5, BC=9, AC=10$, and $AP=m/n$, find $m+n$.
Ground Truth Answer: 113
--------------------------------------------------

Problem: Triangle $ABC$ is inscribed in $\omega$. Tangents to $\omega$ at $B, C$ intersect at $D$. $AD$ intersects $\omega$ at $P$. If $AB=5, BC=9, AC=10$, and $AP=m/n$, find $m+n$.

Budget: 900.00 seconds | Deadline: 1771775704.33



Unnamed: 0,Attempt,Response Length,Python Calls,Python Errors,Entropy,Answer
0,1,4970,9,0,0.541,113
1,2,5544,7,1,0.741,113
2,8,9521,12,0,0.754,113
3,5,11758,27,1,0.69,113
4,4,12411,28,1,0.674,113



UNANIMOUS ANSWER: 113


[Problem 19 Result]
Model Predicted: 113
Status: ✅ CORRECT

TESTING PROBLEM 20
Statement: Among the 900 residents of Aimeville, 195 own a diamond ring, 367 own golf clubs, and 562 own a spade. All own candy hearts. 437 own exactly two things, and 234 own exactly three. Find the number who own all four.
Ground Truth Answer: 73
--------------------------------------------------

Problem: Among the 900 residents of Aimeville, 195 own a diamond ring, 367 own golf clubs, and 562 own a spade. All own candy hearts. 437 own exactly two things, and 234 own exactly three. Find the number who own all four.

Budget: 900.00 seconds | Deadline: 1771775821.37



Unnamed: 0,Attempt,Response Length,Python Calls,Python Errors,Entropy,Answer
0,4,1801,0,0,0.555,73
1,7,2158,0,0,0.571,73
2,1,2166,1,0,0.55,73
3,3,2648,0,0,0.531,73
4,8,3308,0,0,0.596,73



UNANIMOUS ANSWER: 73


[Problem 20 Result]
Model Predicted: 73
Status: ✅ CORRECT

TESTING PROBLEM 21
Statement: A list of positive integers has sum 30 and unique mode 9. The median is a positive integer that does not appear in the list. Find the sum of the squares of all items in the list.
Ground Truth Answer: 236
--------------------------------------------------

Problem: A list of positive integers has sum 30 and unique mode 9. The median is a positive integer that does not appear in the list. Find the sum of the squares of all items in the list.

Budget: 900.00 seconds | Deadline: 1771775850.80



Unnamed: 0,Attempt,Response Length,Python Calls,Python Errors,Entropy,Answer
0,2,1680,4,0,0.743,236
1,8,2096,3,0,0.769,236
2,1,2609,3,0,0.847,236
3,4,2871,2,0,0.861,236
4,5,3212,6,0,0.812,236



UNANIMOUS ANSWER: 236


[Problem 21 Result]
Model Predicted: 236
Status: ✅ CORRECT

TESTING PROBLEM 22
Statement: Find the number of ways to place a digit in each cell of a $2 \times 3$ grid so the sum of the two 3-digit numbers reading left to right is 999, and the sum of the three 2-digit numbers reading top to bottom is 99.
Ground Truth Answer: 236
--------------------------------------------------

Problem: Find the number of ways to place a digit in each cell of a $2 \times 3$ grid so the sum of the two 3-digit numbers reading left to right is 999, and the sum of the three 2-digit numbers reading top to bottom is 99.

Budget: 900.00 seconds | Deadline: 1771775880.82



Unnamed: 0,Attempt,Response Length,Python Calls,Python Errors,Entropy,Answer
0,1,3151,4,0,0.668,21
1,8,3180,1,0,0.701,21
2,6,3860,3,0,0.721,21
3,5,4133,1,0,0.657,21
4,3,5119,5,0,0.656,21



UNANIMOUS ANSWER: 21


[Problem 22 Result]
Model Predicted: 21
Status: ❌ INCORRECT

TESTING PROBLEM 23
Statement: Positive real numbers $x, y, z$ satisfy $\log_2(x/yz)=1/2$, $\log_2(y/xz)=1/3$, and $\log_2(z/xy)=1/4$. If $|\log_2(x^4 y^3 z^2)| = m/n$, find $m+n$.
Ground Truth Answer: 33
--------------------------------------------------

Problem: Positive real numbers $x, y, z$ satisfy $\log_2(x/yz)=1/2$, $\log_2(y/xz)=1/3$, and $\log_2(z/xy)=1/4$. If $|\log_2(x^4 y^3 z^2)| = m/n$, find $m+n$.

Budget: 900.00 seconds | Deadline: 1771775927.90



Unnamed: 0,Attempt,Response Length,Python Calls,Python Errors,Entropy,Answer
0,6,1256,3,0,0.418,33
1,2,1431,3,0,0.362,33
2,1,1514,1,0,0.301,33
3,3,1756,3,0,0.349,33
4,4,1980,2,0,0.382,33



UNANIMOUS ANSWER: 33


[Problem 23 Result]
Model Predicted: 33
Status: ✅ CORRECT

TESTING PROBLEM 24
Statement: Hexagon $ABCDEF$ is convex equilateral with opposite sides parallel. Side extensions of $AB, CD, EF$ form a triangle with side lengths 200, 240, and 300. Find the side length of the hexagon.
Ground Truth Answer: 80
--------------------------------------------------

Problem: Hexagon $ABCDEF$ is convex equilateral with opposite sides parallel. Side extensions of $AB, CD, EF$ form a triangle with side lengths 200, 240, and 300. Find the side length of the hexagon.

Budget: 900.00 seconds | Deadline: 1771775946.35



Unnamed: 0,Attempt,Response Length,Python Calls,Python Errors,Entropy,Answer
0,3,15991,13,1,0.655,80
1,2,21118,10,1,0.71,80
2,7,20043,22,2,0.656,80
3,4,22119,21,0,0.655,80
4,6,23230,33,2,0.624,80



UNANIMOUS ANSWER: 80


[Problem 24 Result]
Model Predicted: 80
Status: ✅ CORRECT

TESTING PROBLEM 25
Statement: Alice chooses set $A$ of positive integers. Bob lists all finite nonempty sets $B$ where $\max(B) \in A$. Bob's list has 2024 sets. Find the sum of the elements of $A$.
Ground Truth Answer: 55
--------------------------------------------------

Problem: Alice chooses set $A$ of positive integers. Bob lists all finite nonempty sets $B$ where $\max(B) \in A$. Bob's list has 2024 sets. Find the sum of the elements of $A$.

Budget: 900.00 seconds | Deadline: 1771776217.62



Unnamed: 0,Attempt,Response Length,Python Calls,Python Errors,Entropy,Answer
0,1,1111,2,0,0.604,55
1,2,1567,4,0,0.695,55
2,4,1668,4,0,0.678,55
3,5,1760,3,0,0.715,55
4,3,1832,3,0,0.657,55



UNANIMOUS ANSWER: 55


[Problem 25 Result]
Model Predicted: 55
Status: ✅ CORRECT

TESTING PROBLEM 26
Statement: Let $N$ be the greatest four-digit integer such that whenever one digit is changed to 1, the result is divisible by 7. If $Q$ and $R$ are the quotient and remainder when $N$ is divided by 1000, find $Q+R$.
Ground Truth Answer: 699
--------------------------------------------------

Problem: Let $N$ be the greatest four-digit integer such that whenever one digit is changed to 1, the result is divisible by 7. If $Q$ and $R$ are the quotient and remainder when $N$ is divided by 1000, find $Q+R$.

Budget: 900.00 seconds | Deadline: 1771776235.82



Unnamed: 0,Attempt,Response Length,Python Calls,Python Errors,Entropy,Answer
0,5,3166,3,0,0.557,699
1,3,3283,1,0,0.549,699
2,7,3999,5,0,0.412,699
3,1,4056,3,0,0.527,699
4,8,4332,7,0,0.645,699



UNANIMOUS ANSWER: 699


[Problem 26 Result]
Model Predicted: 699
Status: ✅ CORRECT

TESTING PROBLEM 27
Statement: Find the number of triples of nonnegative integers $(a, b, c)$ satisfying $a + b + c = 300$ and $a^2 b + a^2 c + b^2 a + b^2 c + c^2 a + c^2 b = 6,000,000$.
Ground Truth Answer: 601
--------------------------------------------------

Problem: Find the number of triples of nonnegative integers $(a, b, c)$ satisfying $a + b + c = 300$ and $a^2 b + a^2 c + b^2 a + b^2 c + c^2 a + c^2 b = 6,000,000$.

Budget: 900.00 seconds | Deadline: 1771776277.09



Unnamed: 0,Attempt,Response Length,Python Calls,Python Errors,Entropy,Answer
0,7,3523,2,0,0.533,601
1,2,4263,5,0,0.529,601
2,4,4311,4,0,0.599,601
3,6,4592,9,0,0.562,601
4,3,5246,5,0,0.583,601



UNANIMOUS ANSWER: 601


[Problem 27 Result]
Model Predicted: 601
Status: ✅ CORRECT

TESTING PROBLEM 28
Statement: Let $b \geq 2$. Call a positive integer $b$-eautiful if it has exactly two digits in base $b$ that sum to $\sqrt{n}$. Find the least integer $b$ for which there are more than ten $b$-eautiful integers.
Ground Truth Answer: 211
--------------------------------------------------

Problem: Let $b \geq 2$. Call a positive integer $b$-eautiful if it has exactly two digits in base $b$ that sum to $\sqrt{n}$. Find the least integer $b$ for which there are more than ten $b$-eautiful integers.

Budget: 900.00 seconds | Deadline: 1771776327.03



Unnamed: 0,Attempt,Response Length,Python Calls,Python Errors,Entropy,Answer
0,7,4497,6,0,0.691,211
1,5,6013,4,0,0.728,211
2,3,6746,15,1,0.636,211
3,4,7148,9,0,0.657,211
4,6,7154,8,0,0.705,211



UNANIMOUS ANSWER: 211


[Problem 28 Result]
Model Predicted: 211
Status: ✅ CORRECT

TESTING PROBLEM 29
Statement: Find the number of rectangles formed inside a regular 12-gon where each side lies on either a side or a diagonal of the dodecagon.
Ground Truth Answer: 315
--------------------------------------------------

Problem: Find the number of rectangles formed inside a regular 12-gon where each side lies on either a side or a diagonal of the dodecagon.

Budget: 900.00 seconds | Deadline: 1771776396.05



Unnamed: 0,Attempt,Response Length,Python Calls,Python Errors,Entropy,Answer
0,4,4080,3,1,0.742,15
1,5,4091,5,0,0.757,15
2,6,20682,28,2,0.732,27
3,8,26416,22,1,0.749,315
4,1,31211,3,0,0.811,39
5,2,33554,23,0,0.823,27
6,7,42758,35,2,0.799,315
7,3,53112,71,6,0.64,195


Unnamed: 0,Answer,Votes,Score
0,15,2,2.668
1,315,2,2.587
2,27,2,2.581
3,195,1,1.563
4,39,1,1.233



Final Answer: 15


[Problem 29 Result]
Model Predicted: 15
Status: ❌ INCORRECT

TESTING PROBLEM 30
Statement: Five men and nine women stand in a circle. The probability that every man stands diametrically opposite a woman is $m/n$. Find $m+n$.
Ground Truth Answer: 191
--------------------------------------------------

Problem: Five men and nine women stand in a circle. The probability that every man stands diametrically opposite a woman is $m/n$. Find $m+n$.

Budget: 900.00 seconds | Deadline: 1771776844.13



Unnamed: 0,Attempt,Response Length,Python Calls,Python Errors,Entropy,Answer
0,6,1955,5,0,0.836,191
1,3,2325,6,0,0.875,191
2,8,2732,2,0,0.897,191
3,1,2694,3,0,0.856,191
4,4,2929,8,0,0.819,191



UNANIMOUS ANSWER: 191


[Problem 30 Result]
Model Predicted: 191
Status: ✅ CORRECT

TESTING PROBLEM 31
Statement: Real numbers $b \neq 1$ and $n$ satisfy $\sqrt{\log_b n} = \log_b \sqrt{n}$ and $b \cdot \log_b n = \log_b (bn)$. If $n=j/k$, find $j+k$.
Ground Truth Answer: 881
--------------------------------------------------

Problem: Real numbers $b \neq 1$ and $n$ satisfy $\sqrt{\log_b n} = \log_b \sqrt{n}$ and $b \cdot \log_b n = \log_b (bn)$. If $n=j/k$, find $j+k$.

Budget: 900.00 seconds | Deadline: 1771776873.72



Unnamed: 0,Attempt,Response Length,Python Calls,Python Errors,Entropy,Answer
0,2,882,0,0,0.503,881
1,4,1001,0,0,0.658,881
2,1,1135,0,0,0.67,881
3,3,1312,2,0,0.654,881
4,5,1856,1,0,0.596,881



UNANIMOUS ANSWER: 881


[Problem 31 Result]
Model Predicted: 881
Status: ✅ CORRECT

TESTING PROBLEM 32
Statement: A plane contains 40 lines, no 2 parallel. There are points where 3, 4, 5, or 6 lines intersect. Find the number of points where exactly 2 lines intersect.
Ground Truth Answer: 607
--------------------------------------------------

Problem: A plane contains 40 lines, no 2 parallel. There are points where 3, 4, 5, or 6 lines intersect. Find the number of points where exactly 2 lines intersect.

Budget: 900.00 seconds | Deadline: 1771776890.18



Unnamed: 0,Attempt,Response Length,Python Calls,Python Errors,Entropy,Answer
0,6,6001,0,0,0.806,746.0
1,2,8670,0,0,0.816,746.0
2,5,9201,0,0,0.741,746.0
3,3,11001,0,0,0.794,746.0
4,7,11930,2,0,0.802,607.0
5,8,12201,0,0,0.803,669.0
6,1,13332,0,0,0.739,
7,4,23317,3,0,0.748,



UNANIMOUS ANSWER: 746


[Problem 32 Result]
Model Predicted: 746
Status: ❌ INCORRECT

TESTING PROBLEM 33
Statement: The sum of all positive integers $m$ such that $13!/m$ is a perfect square is $2^a 3^b 5^c 7^d 11^e 13^f$. Find $a+b+c+d+e+f$.
Ground Truth Answer: 12
--------------------------------------------------

Problem: The sum of all positive integers $m$ such that $13!/m$ is a perfect square is $2^a 3^b 5^c 7^d 11^e 13^f$. Find $a+b+c+d+e+f$.

Budget: 900.00 seconds | Deadline: 1771777059.22



Unnamed: 0,Attempt,Response Length,Python Calls,Python Errors,Entropy,Answer
0,3,1723,3,0,0.421,12
1,2,1843,3,0,0.488,12
2,1,1847,5,0,0.51,12
3,8,2200,5,0,0.452,12
4,4,2721,5,1,0.58,12



UNANIMOUS ANSWER: 12


[Problem 33 Result]
Model Predicted: 12
Status: ✅ CORRECT

TESTING PROBLEM 34
Statement: Point $P$ is on the circumcircle of square $ABCD$ such that $PA \cdot PC = 56$ and $PB \cdot PD = 90$. Find the area of the square.
Ground Truth Answer: 106
--------------------------------------------------

Problem: Point $P$ is on the circumcircle of square $ABCD$ such that $PA \cdot PC = 56$ and $PB \cdot PD = 90$. Find the area of the square.

Budget: 900.00 seconds | Deadline: 1771777084.09



Unnamed: 0,Attempt,Response Length,Python Calls,Python Errors,Entropy,Answer
0,4,1961,2,0,0.535,106
1,6,3066,4,0,0.462,106
2,7,3508,6,0,0.617,106
3,2,3522,13,0,0.426,106
4,3,5498,2,0,0.481,106



UNANIMOUS ANSWER: 106


[Problem 34 Result]
Model Predicted: 106
Status: ✅ CORRECT

TESTING PROBLEM 35
Statement: Alice knows 3 red and 3 black cards revealed in random order. Alice guesses color before each. If playing optimally, the expected correct guesses is $m/n$. Find $m+n$.
Ground Truth Answer: 51
--------------------------------------------------

Problem: Alice knows 3 red and 3 black cards revealed in random order. Alice guesses color before each. If playing optimally, the expected correct guesses is $m/n$. Find $m+n$.

Budget: 900.00 seconds | Deadline: 1771777131.07



Unnamed: 0,Attempt,Response Length,Python Calls,Python Errors,Entropy,Answer
0,6,2382,5,1,0.511,51
1,7,2469,4,1,0.636,51
2,3,2488,4,0,0.781,51
3,4,2591,4,0,0.741,51
4,5,3262,5,0,0.74,51



UNANIMOUS ANSWER: 51


[Problem 35 Result]
Model Predicted: 51
Status: ✅ CORRECT

TESTING PROBLEM 36
Statement: Call a positive integer extra-distinct if remainders when divided by 2, 3, 4, 5, and 6 are distinct. Find the number of extra-distinct positive integers less than 1000.
Ground Truth Answer: 49
--------------------------------------------------

Problem: Call a positive integer extra-distinct if remainders when divided by 2, 3, 4, 5, and 6 are distinct. Find the number of extra-distinct positive integers less than 1000.

Budget: 900.00 seconds | Deadline: 1771777162.88



Unnamed: 0,Attempt,Response Length,Python Calls,Python Errors,Entropy,Answer
0,3,1725,2,0,0.62,49
1,1,2273,2,0,0.678,49
2,7,3974,4,1,0.542,49
3,6,4171,5,0,0.618,49
4,4,4219,7,0,0.604,49



UNANIMOUS ANSWER: 49


[Problem 36 Result]
Model Predicted: 49
Status: ✅ CORRECT

TESTING PROBLEM 37
Statement: Find the number of cubic polynomials $x^3+ax^2+bx+c$ with $a,b,c \in \{-20, \dots, 20\}$ such that there is a unique integer $m \neq 2$ with $p(m)=p(2)$.
Ground Truth Answer: 738
--------------------------------------------------

Problem: Find the number of cubic polynomials $x^3+ax^2+bx+c$ with $a,b,c \in \{-20, \dots, 20\}$ such that there is a unique integer $m \neq 2$ with $p(m)=p(2)$.

Budget: 900.00 seconds | Deadline: 1771777202.75



Unnamed: 0,Attempt,Response Length,Python Calls,Python Errors,Entropy,Answer
0,5,5795,5,0,0.741,738
1,7,6252,3,0,0.748,738
2,4,7688,7,0,0.666,738
3,6,8035,6,1,0.683,738
4,1,8624,5,0,0.724,738



UNANIMOUS ANSWER: 738


[Problem 37 Result]
Model Predicted: 738
Status: ✅ CORRECT

TESTING PROBLEM 38
Statement: Find $a+U$ for the unique $a$ where $U = \sum_{n=1}^{2023} \lfloor (n^2-na)/5 \rfloor$ is an integer strictly between -1000 and 1000.
Ground Truth Answer: 944
--------------------------------------------------

Problem: Find $a+U$ for the unique $a$ where $U = \sum_{n=1}^{2023} \lfloor (n^2-na)/5 \rfloor$ is an integer strictly between -1000 and 1000.

Budget: 900.00 seconds | Deadline: 1771777283.74



Unnamed: 0,Attempt,Response Length,Python Calls,Python Errors,Entropy,Answer
0,4,1421,3,0,0.81,944
1,2,2190,7,0,0.812,944
2,3,2555,5,0,0.753,944
3,1,4440,5,0,0.742,944
4,7,4340,7,0,0.642,944



UNANIMOUS ANSWER: 944


[Problem 38 Result]
Model Predicted: 944
Status: ✅ CORRECT

TESTING PROBLEM 39
Statement: Each face of two noncongruent parallelepipeds is a rhombus with diagonals $\sqrt{21}$ and $\sqrt{31}$. If the volume ratio is $m/n$, find $m+n$.
Ground Truth Answer: 125
--------------------------------------------------

Problem: Each face of two noncongruent parallelepipeds is a rhombus with diagonals $\sqrt{21}$ and $\sqrt{31}$. If the volume ratio is $m/n$, find $m+n$.

Budget: 900.00 seconds | Deadline: 1771777328.75



Unnamed: 0,Attempt,Response Length,Python Calls,Python Errors,Entropy,Answer
0,7,3844,2,0,0.711,125
1,3,4001,0,0,0.699,125
2,1,5138,0,0,0.684,125
3,6,7001,0,0,0.742,125
4,4,7238,1,0,0.729,125



UNANIMOUS ANSWER: 125


[Problem 39 Result]
Model Predicted: 125
Status: ✅ CORRECT

TESTING PROBLEM 40
Statement: Find the greatest integer less than 1000 that is a palindrome in both base 10 and base 8.
Ground Truth Answer: 585
--------------------------------------------------

Problem: Find the greatest integer less than 1000 that is a palindrome in both base 10 and base 8.

Budget: 900.00 seconds | Deadline: 1771777394.39



Unnamed: 0,Attempt,Response Length,Python Calls,Python Errors,Entropy,Answer
0,2,741,4,1,0.634,585
1,4,815,5,0,0.676,585
2,3,943,5,0,0.513,585
3,1,1032,4,1,0.602,585
4,5,1043,4,0,0.771,585



UNANIMOUS ANSWER: 585


[Problem 40 Result]
Model Predicted: 585
Status: ✅ CORRECT

TESTING PROBLEM 41
Statement: A region is formed by three unit squares in an L-shape. Two points are chosen randomly. Find $m+n$ if the probability their midpoint is inside the region is $m/n$.
Ground Truth Answer: 35
--------------------------------------------------

Problem: A region is formed by three unit squares in an L-shape. Two points are chosen randomly. Find $m+n$ if the probability their midpoint is inside the region is $m/n$.

Budget: 900.00 seconds | Deadline: 1771777405.87



Unnamed: 0,Attempt,Response Length,Python Calls,Python Errors,Entropy,Answer
0,5,4982,2,0,0.589,35.0
1,4,6381,2,0,0.696,35.0
2,6,6561,4,0,0.688,35.0
3,2,6939,4,0,0.68,35.0
4,8,10476,6,1,0.603,
5,7,19901,8,2,0.563,35.0



UNANIMOUS ANSWER: 35


[Problem 41 Result]
Model Predicted: 35
Status: ✅ CORRECT

TESTING PROBLEM 42
Statement: Each vertex of a regular 12-gon is colored red or blue. Find the number of colorings where no four vertices of the same color form a rectangle.
Ground Truth Answer: 928
--------------------------------------------------

Problem: Each vertex of a regular 12-gon is colored red or blue. Find the number of colorings where no four vertices of the same color form a rectangle.

Budget: 900.00 seconds | Deadline: 1771777576.02



Unnamed: 0,Attempt,Response Length,Python Calls,Python Errors,Entropy,Answer
0,6,2409,1,0,0.764,928
1,7,2529,2,0,0.705,928
2,2,2661,1,0,0.792,928
3,4,2863,1,0,0.799,928
4,3,3204,1,0,0.837,928



UNANIMOUS ANSWER: 928


[Problem 42 Result]
Model Predicted: 928
Status: ✅ CORRECT

TESTING PROBLEM 43
Statement: Let $\omega$ be a 7th root of unity. Find the value of the product $\prod_{k=0}^6 (\omega^{3k} + \omega^k + 1)$.
Ground Truth Answer: 24
--------------------------------------------------

Problem: Let $\omega$ be a 7th root of unity. Find the value of the product $\prod_{k=0}^6 (\omega^{3k} + \omega^k + 1)$.

Budget: 900.00 seconds | Deadline: 1771777605.55



Unnamed: 0,Attempt,Response Length,Python Calls,Python Errors,Entropy,Answer
0,3,1720,7,0,0.618,24
1,7,2070,9,1,0.71,24
2,2,2306,4,0,0.685,24
3,6,2479,3,0,0.577,24
4,4,3288,5,0,0.696,24



UNANIMOUS ANSWER: 24


[Problem 43 Result]
Model Predicted: 24
Status: ✅ CORRECT

TESTING PROBLEM 44
Statement: Circles $\omega_1, \omega_2$ intersect at $P, Q$. Parallel line $AB$ through $P$ forms trapezoid $XABY$. If $PX=10, PY=14, PQ=5$, find $m+n$ if the area is $m\sqrt{n}$.
Ground Truth Answer: 33
--------------------------------------------------

Problem: Circles $\omega_1, \omega_2$ intersect at $P, Q$. Parallel line $AB$ through $P$ forms trapezoid $XABY$. If $PX=10, PY=14, PQ=5$, find $m+n$ if the area is $m\sqrt{n}$.

Budget: 900.00 seconds | Deadline: 1771777635.14



Unnamed: 0,Attempt,Response Length,Python Calls,Python Errors,Entropy,Answer
0,6,9988,3,0,0.753,46.0
1,1,18013,5,0,0.722,33.0
2,4,47965,34,1,0.737,250.0
3,3,52320,23,1,0.689,121.0
4,5,60853,57,1,0.708,33.0
5,2,60381,68,6,0.632,
6,7,62258,48,1,0.685,
7,8,63876,33,1,0.665,


Unnamed: 0,Answer,Votes,Score
0,33,2,2.797
1,121,1,1.452
2,250,1,1.357
3,46,1,1.328



Final Answer: 33


[Problem 44 Result]
Model Predicted: 33
Status: ✅ CORRECT

TESTING PROBLEM 45
Statement: For positive integer $n$, let $a_n$ be the least multiple of 23 with $a_n \equiv 1 \pmod{2^n}$. Find the number of $n \leq 1000$ such that $a_n = a_{n+1}$.
Ground Truth Answer: 363
--------------------------------------------------

Problem: For positive integer $n$, let $a_n$ be the least multiple of 23 with $a_n \equiv 1 \pmod{2^n}$. Find the number of $n \leq 1000$ such that $a_n = a_{n+1}$.

Budget: 900.00 seconds | Deadline: 1771778307.84



Unnamed: 0,Attempt,Response Length,Python Calls,Python Errors,Entropy,Answer
0,1,4507,5,0,0.6,363
1,7,4757,7,0,0.656,363
2,2,5420,8,0,0.605,363
3,8,7181,11,0,0.639,363
4,3,7348,9,0,0.62,363



UNANIMOUS ANSWER: 363


[Problem 45 Result]
Model Predicted: 363
Status: ✅ CORRECT

TESTING PROBLEM 46
Statement: Right square pyramid volume 54 has base side 6. If vertices lie on a sphere of radius $m/n$, find $m+n$.
Ground Truth Answer: 21
--------------------------------------------------

Problem: Right square pyramid volume 54 has base side 6. If vertices lie on a sphere of radius $m/n$, find $m+n$.

Budget: 900.00 seconds | Deadline: 1771778377.46



Unnamed: 0,Attempt,Response Length,Python Calls,Python Errors,Entropy,Answer
0,8,1001,0,0,0.558,21.0
1,2,1358,1,0,0.688,21.0
2,1,1601,0,0,0.676,21.0
3,7,1700,1,0,0.482,
4,3,1723,3,1,0.726,21.0
5,6,1745,1,0,0.552,21.0



UNANIMOUS ANSWER: 21


[Problem 46 Result]
Model Predicted: 21
Status: ✅ CORRECT

TESTING PROBLEM 47
Statement: Find the least value of $a+b$ for real $a>4, b>1$ satisfying $x^2/a^2 + y^2/(a^2-16) = (x-20)^2/(b^2-1) + (y-11)^2/b^2 = 1$.
Ground Truth Answer: 23
--------------------------------------------------

Problem: Find the least value of $a+b$ for real $a>4, b>1$ satisfying $x^2/a^2 + y^2/(a^2-16) = (x-20)^2/(b^2-1) + (y-11)^2/b^2 = 1$.

Budget: 900.00 seconds | Deadline: 1771778393.40



Unnamed: 0,Attempt,Response Length,Python Calls,Python Errors,Entropy,Answer
0,7,12146,16,4,0.67,23
1,1,12595,19,5,0.578,23
2,4,14451,15,3,0.612,23
3,5,23252,23,3,0.674,23
4,3,22922,31,5,0.673,23



UNANIMOUS ANSWER: 23


[Problem 47 Result]
Model Predicted: 23
Status: ✅ CORRECT

TESTING PROBLEM 48
Statement: Twenty points on a circle are labeled 1-20. Segments are drawn between points whose labels differ by a prime. Find the number of triangles formed.
Ground Truth Answer: 72
--------------------------------------------------

Problem: Twenty points on a circle are labeled 1-20. Segments are drawn between points whose labels differ by a prime. Find the number of triangles formed.

Budget: 900.00 seconds | Deadline: 1771778650.46



Unnamed: 0,Attempt,Response Length,Python Calls,Python Errors,Entropy,Answer
0,3,2066,6,0,0.75,72
1,5,2791,2,0,0.808,72
2,7,2801,3,0,0.827,72
3,4,2994,2,0,0.921,72
4,2,3744,5,0,0.817,72



UNANIMOUS ANSWER: 72


[Problem 48 Result]
Model Predicted: 72
Status: ✅ CORRECT

TESTING PROBLEM 49
Statement: Find the remainder when $N$ is divided by 1000, where $N$ is the number of sequences of 144 independent hand movements on an analog clock returning to 12.
Ground Truth Answer: 608
--------------------------------------------------

Problem: Find the remainder when $N$ is divided by 1000, where $N$ is the number of sequences of 144 independent hand movements on an analog clock returning to 12.

Budget: 900.00 seconds | Deadline: 1771778685.22



Unnamed: 0,Attempt,Response Length,Python Calls,Python Errors,Entropy,Answer
0,4,2851,8,2,0.856,950
1,3,3737,4,0,0.849,950
2,2,3899,5,0,0.881,528
3,5,4019,4,0,0.88,528
4,7,4866,6,0,0.834,950
5,6,5186,7,0,0.764,528
6,1,5227,8,0,0.88,950
7,8,6233,7,0,0.766,950



UNANIMOUS ANSWER: 950


[Problem 49 Result]
Model Predicted: 950
Status: ❌ INCORRECT

TESTING PROBLEM 50
Statement: What is the maximum number of terms in an arithmetic sequence of primes with a common difference of 6?
Ground Truth Answer: 5
--------------------------------------------------

Problem: What is the maximum number of terms in an arithmetic sequence of primes with a common difference of 6?

Budget: 900.00 seconds | Deadline: 1771778737.56



Unnamed: 0,Attempt,Response Length,Python Calls,Python Errors,Entropy,Answer
0,2,1887,3,0,0.74,5
1,4,2098,2,0,0.861,5
2,8,2030,3,0,0.773,5
3,7,2134,2,0,0.847,5
4,6,2238,1,0,0.803,5



UNANIMOUS ANSWER: 5


[Problem 50 Result]
Model Predicted: 5
Status: ✅ CORRECT



Unnamed: 0,idx,prediction,ground_truth,correct
0,1,336,336,True
1,2,32951,32951,True
2,3,62134,21818,False
3,4,32193,32193,True
4,5,57447,57447,True
5,6,78845,8687,False
6,7,50,50,True
7,8,580,580,True
8,9,520,520,True
9,10,160,160,True


Overall Accuracy: 82.00%
