https://www.kaggle.com/code/ashsihkumar/aimo-logic-solver

In [None]:
# Uninstall keras, matplotlib, scikit-learn and tensorflow without prompting (-y);
# -q was added to silence the long pip output.
# NOTE(review): presumably removed to avoid clashing with the offline wheels
# installed by set_env() below — confirm.
%pip uninstall -y -q 'keras' 'matplotlib' 'scikit-learn' 'tensorflow'

import warnings
warnings.simplefilter('ignore')

import os
import sys
import subprocess

def set_env(input_archive, temp_dir):
    """Unpack the offline wheel archive (first run only) and pip-install from it.

    Args:
        input_archive: Path to the gzipped tarball that contains a ``wheels``
            directory.
        temp_dir: Directory the archive is extracted into. Extraction is
            skipped entirely when this path already exists.

    Raises:
        subprocess.CalledProcessError: If ``tar`` or ``pip`` exits non-zero.
    """
    already_unpacked = os.path.exists(temp_dir)
    if not already_unpacked:
        os.makedirs(temp_dir, exist_ok=True)
        untar_cmd = ['tar', '-xzf', input_archive, '-C', temp_dir]
        subprocess.run(untar_cmd, check=True)

    # Install strictly from the local wheel directory (--no-index: no network).
    packages = ['unsloth', 'trl', 'vllm', 'openai_harmony']
    pip_cmd = [
        sys.executable, '-m', 'pip', 'install',
        '--no-index', '--find-links', f'{temp_dir}/wheels',
        '-q', *packages,
    ]
    subprocess.run(pip_cmd, check=True)

# Install the offline wheels before any of the heavy imports further down.
set_env(
    temp_dir='/kaggle/tmp/setup',
    input_archive='/kaggle/input/aimo-3-utils/wheels.tar.gz',
)

# Process-wide environment switches, applied in one go.
os.environ.update({
    'TRANSFORMERS_NO_TF':      '1',
    'TRANSFORMERS_NO_FLAX':    '1',
    'CUDA_VISIBLE_DEVICES':    '0',
    'TOKENIZERS_PARALLELISM':  'false',
    'TRITON_PTXAS_PATH':       '/usr/local/cuda/bin/ptxas',
    'TIKTOKEN_ENCODINGS_BASE': '/kaggle/tmp/setup/tiktoken_encodings',
})

import gc
import re
import math
import time
import queue
import logging
import threading
import traceback
import contextlib
from typing import Optional
from collections import Counter, defaultdict
from concurrent.futures import as_completed, ThreadPoolExecutor

import pandas as pd
import polars as pl
from openai import OpenAI
from jupyter_client import KernelManager
from transformers import set_seed

# --- Logging setup ---
# Root logger at INFO with timestamped records; `log` is this notebook's logger.
logging.basicConfig(format='%(asctime)s [%(levelname)s] %(message)s', level=logging.INFO)
log = logging.getLogger('aimo3')

# Raise the noisy client libraries (retries, POST request lines) to WARNING.
for _noisy_name in ("openai", "httpx", "urllib3"):
    logging.getLogger(_noisy_name).setLevel(logging.WARNING)
del _noisy_name
# ---------------------

# ─────────────────────────────────────────────
# CONFIGURATION
# ─────────────────────────────────────────────

class CFG:
    """Class-level configuration: model serving, time budgets, sampling and scoring knobs.

    Values are read as plain class attributes (e.g. ``cfg.turns``); nothing here
    is computed at runtime.
    """

    # --- Model serving ---
    served_model_name = 'gpt-oss'   # sent as `model=` with every completion request in run_attempt
    model_path        = '/kaggle/input/gpt-oss-120b/transformers/default/1'
    kv_cache_dtype    = 'fp8_e4m3'
    dtype             = 'auto'

    # --- Time budgets ---
    # NOTE(review): presumably all in seconds; the consumers are not in this part of the file — confirm.
    notebook_limit        = 17400
    high_problem_timeout  = 900
    base_problem_timeout  = 300
    server_timeout        = 180

    session_timeout   = 960
    jupyter_timeout   = 8
    sandbox_timeout   = 5 

    # --- Generation / streaming ---
    stream_interval   = 200
    context_tokens    = 65536   # run_attempt: max_tokens = context_tokens - len(prompt_ids)
    buffer_tokens     = 512     # run_attempt stops a turn loop when fewer tokens than this remain
    search_tokens     = 32      # number of most recent streamed text chunks scanned for an answer
    top_logprobs      = 5       # passed as `logprobs=` on each completion request
    batch_size        = 256

    # Attempt split
    # NOTE(review): the attempt scheduler is not visible here; meanings below are inferred from names.
    tir_fraction    = 0.625
    attempts_hard   = 12
    attempts_medium = 12
    early_stop      = 6
    workers         = 16
    turns           = 128       # upper bound on generate/tool-call iterations inside run_attempt

    # Reflection
    # NOTE(review): consumers not visible in this part of the file.
    reflect_budget            = 1      
    reflect_min_time_left     = 90     
    reflect_only_on_conflict  = True   

    # Memory guard
    max_logprobs_buf = 2000     # run_attempt keeps only the last N logprob entries per attempt

    # Scoring (all read by select_answer)
    entropy_override_ratio     = 2.0  
    entropy_override_min_votes = 2   
    error_penalty_factor       = 0.5   # weight is divided by (1 + error_load * factor)
    length_bonus_threshold     = 200   # responses shorter than this many tokens are down-weighted
    length_bonus_min           = 0.5   # floor of that down-weighting factor
    verify_bonus_max           = 2.2   # weight multiplier reached when verify_score == 1.0

    seed                   = 42
    gpu_memory_utilization = 0.96
    min_p                  = 0.02      # forwarded in `extra_body` of each completion request

    # Sampling temperatures keyed by the problem type returned by detect_problem_type.
    # run_attempt falls back to temperatures_by_type['general'][0] when no temperature is given.
    temperatures_by_type = {
        'geometry':      [0.7, 0.8, 0.9, 1.0],
        'number_theory': [0.8, 1.0, 1.2, 1.4],
        'algebra':       [0.8, 1.0, 1.2, 1.4],
        'combinatorics': [1.0, 1.2, 1.4, 1.6],
        'general':       [0.8, 1.0, 1.2, 1.4],
    }
    temperature_verify = 0.2    # temperature used by run_deterministic_verifier

# ─────────────────────────────────────────────
# PROMPTS
# ─────────────────────────────────────────────

class Prompts:
    """Prompt texts sent to the model.

    All attributes are plain strings except ``verify`` (a static helper that
    builds the user message for a verification pass) and
    ``DETERMINISTIC_VERIFY`` (a ``str.format`` template expecting ``problem``,
    ``answer_a`` and ``answer_b``; literal braces in it are doubled).
    """

    # Description of the python tool namespace.
    TOOL = (
        'Use this tool to execute Python code. '
        'The environment is a stateful Jupyter notebook. '
        'You must use print() to output results.'
    )

    # Hints about what the sandbox preloads (see Sandbox._preload).
    PREFERENCE = (
        'You have access to `math`, `numpy`, `sympy`, `sympy.geometry`, '
        'and `mpmath` (64-digit precision). '
        'Common symbols x,y,z,a,b,c,n,k,r,t are pre-declared. '
        'For geometry: coordinate bash + sympy.geometry. '
        'For number theory: sympy.ntheory (factorint, isprime, totient). '
        'For counting: brute-force small N first, then generalise. '
        'End with a sympy assertion that verifies your answer.'
    )

    TIR = ( # Tool-Integrated Reasoning
        "You are a world-class IMO competitor.\n"
        "1. CATEGORIZE: Identify problem type (algebra/combinatorics/geometry/number-theory).\n"
        "2. PLAN: Mathematical approach + Python verification plan.\n"
        "3. SOLVE: Python with sympy/numpy/mpmath. Print intermediates.\n"
        "4. VERIFY: Separate Python block checking via a different method:\n"
        "   - Geometry   → verify constraints numerically\n"
        "   - Counting   → brute-force small cases\n"
        "   - Equations  → substitute back with sympy.simplify\n"
        "   - Mod arith  → check mod two different small primes\n"
        "5. ASSERT: `assert <condition>` in Python — if it fails, correct the answer.\n"
        "6. ANSWER: Final non-negative integer in \\boxed{}."
    )

    # Chain-of-thought prompt: pure reasoning, no tool use.
    COT = (
        "You are an expert mathematician competing at IMO level. "
        "Solve using PURE mathematical reasoning — no Python code.\n"
        "1. Read twice. Identify the key constraint.\n"
        "2. Reason step-by-step, verifying each deduction.\n"
        "3. Consider edge cases explicitly.\n"
        "4. Only state your final answer when you have high confidence.\n"
        "Final answer in \\boxed{}."
    )

    # Tool-integrated prompt specialised for geometry (see select_system_prompt).
    GEOMETRY_TIR = (
        "You are an expert in competition geometry.\n"
        "MANDATORY APPROACH:\n"
        "1. Immediately place all figures in coordinates.\n"
        "2. Use sympy.geometry (Point, Line, Circle, Triangle) for exact computation.\n"
        "3. Solve with sympy.solve(). Never rely on intuition alone.\n"
        "4. Verify the answer satisfies ALL original constraints numerically.\n"
        "5. `assert <geometric_condition>` before giving the answer.\n"
        "Final integer in \\boxed{}."
    )


    # System prompt for verification passes: check a candidate, do not re-solve.
    VERIFY = (
        "You are a strict mathematical verifier. NOT a solver.\n"
        "You have been given a candidate answer. Your ONLY job is to check it.\n\n"
        "RULES:\n"
        "- Do NOT re-solve the problem from scratch.\n"
        "- Do NOT re-derive. Only substitute and check.\n"
        "- Write Python that plugs the candidate answer into every problem condition.\n"
        "- Each condition must be checked with an explicit `assert`.\n"
        "- If any `assert` fails: print 'INVALID: <which condition failed>' "
        "and output \\boxed{wrong}.\n"
        "- If all `assert`s pass: print 'VALID' and output \\boxed{candidate_answer}.\n\n"
        "Begin verification now. No planning. No exploration. Assert only."
    )

    @staticmethod
    def verify(problem: str, answer: int) -> str:
        """Build the user message asking the verifier to check ``answer`` for ``problem``."""
        return (
            f"PROBLEM:\n{problem}\n\n"
            f"CANDIDATE ANSWER: {answer}\n\n"
            f"Verify whether {answer} satisfies all conditions using Python assert statements."
        )

    # str.format template; `{{` / `}}` render as literal braces around the boxed answer.
    DETERMINISTIC_VERIFY = (
        "You are the final arbiter for a competition math problem.\n"
        "Two candidate answers are in conflict. You must determine which is correct.\n\n"
        "PROBLEM:\n{problem}\n\n"
        "CANDIDATE A: {answer_a}\n"
        "CANDIDATE B: {answer_b}\n\n"
        "INSTRUCTIONS:\n"
        "1. Write Python that tests CANDIDATE A against every problem condition.\n"
        "   Use explicit `assert` for each condition. Print 'A:PASS' or 'A:FAIL:<reason>'.\n"
        "2. Write Python that tests CANDIDATE B against every problem condition.\n"
        "   Use explicit `assert` for each condition. Print 'B:PASS' or 'B:FAIL:<reason>'.\n"
        "3. Output ONLY \\boxed{{winning_candidate}} — the integer that passed.\n"
        "   If both fail, output \\boxed{{0}}.\n"
        "   If both pass, output the one with stronger verification.\n\n"
        "No re-solving. No derivation. Substitution and assertion only."
    )

# ─────────────────────────────────────────────
# PROBLEM CLASSIFIER
# ─────────────────────────────────────────────

# Keyword vocabularies for the heuristic classifier below.
# detect_problem_type / estimate_difficulty test each entry with a plain
# substring check against the lower-cased problem text, so entries must be
# lower-case and may be word stems (e.g. 'divisib', 'permut', 'combin').
GEOMETRY_KEYWORDS = {
    'triangle', 'circle', 'angle', 'perpendicular', 'tangent',
    'polygon', 'quadrilateral', 'circumscribe', 'inscribe',
    'chord', 'radius', 'area', 'perimeter', 'segment', 'altitude',
    'median', 'bisector', 'cyclic', 'collinear', 'concurrent',
    'hexagon', 'pentagon', 'diagonal', 'circumradius', 'inradius',
    'orthocenter', 'centroid', 'incircle', 'excircle',
}

NUMBER_THEORY_KEYWORDS = {
    'prime', 'divisib', 'modulo', 'remainder', 'diophantine',
    'gcd', 'lcm', 'coprime', 'totient', 'congruent', 'factor',
}

ALGEBRA_KEYWORDS = {
    'functional equation', 'polynomial', 'root', 'coefficient',
    'sequence', 'series', 'inequality', 'maximum', 'minimum',
}

COMBINATORICS_KEYWORDS = {
    'count', 'permut', 'combin', 'graph', 'arrangement',
    'subset', 'partition', 'bijection', 'coloring', 'path',
    'tournament', 'alice', 'bob', 'game',
}

# Union of every category vocabulary; estimate_difficulty counts hits against it.
HARD_KEYWORDS = (
    GEOMETRY_KEYWORDS | NUMBER_THEORY_KEYWORDS |
    ALGEBRA_KEYWORDS | COMBINATORICS_KEYWORDS
)


def detect_problem_type(problem: str) -> str:
    """Classify a problem by counting keyword substrings per category.

    Returns the category with the most keyword hits in the lower-cased text
    ('geometry', 'number_theory', 'algebra' or 'combinatorics'); ties go to
    the category listed first. Returns 'general' when nothing matches.
    """
    text = problem.lower()
    vocabularies = (
        ('geometry',      GEOMETRY_KEYWORDS),
        ('number_theory', NUMBER_THEORY_KEYWORDS),
        ('algebra',       ALGEBRA_KEYWORDS),
        ('combinatorics', COMBINATORICS_KEYWORDS),
    )
    tally = {
        label: sum(1 for word in words if word in text)
        for label, words in vocabularies
    }
    winner = max(tally, key=tally.get)
    if tally[winner] > 0:
        return winner
    return 'general'


def estimate_difficulty(problem: str) -> str:
    """Return 'hard' for long (> 400 chars) or keyword-dense problems, else 'medium'.

    Keyword density means at least two HARD_KEYWORDS entries occur as
    substrings of the lower-cased problem text.
    """
    lowered = problem.lower()
    keyword_hits = 0
    for word in HARD_KEYWORDS:
        if word in lowered:
            keyword_hits += 1

    is_long = len(problem) > 400
    return 'hard' if (is_long or keyword_hits >= 2) else 'medium'


def select_system_prompt(ptype: str, is_tir: bool) -> str:
    """Pick the system prompt for an attempt.

    Non-tool attempts always get the chain-of-thought prompt; tool attempts
    get the geometry-specific prompt for geometry problems and the generic
    tool-integrated prompt otherwise.
    """
    if not is_tir:
        return Prompts.COT
    return Prompts.GEOMETRY_TIR if ptype == 'geometry' else Prompts.TIR
# ─────────────────────────────────────────────
# ERROR CATEGORIZER
# ─────────────────────────────────────────────

class ErrorKind:
    """String tags describing how a sandbox execution (or an attempt) failed.

    categorize_error() returns one of these for a sandbox output string.
    """
    NONE    = 'none'      # output shows no recognised error marker
    TIMEOUT = 'timeout'   # output contains the sandbox '[ERROR] Timed out' marker
    SYNTAX  = 'syntax'    # output mentions SyntaxError
    RUNTIME = 'runtime'   # output contains 'Traceback' or 'Error:'
    OTHER   = 'other'     # output starts with '[ERROR]' but matched nothing above


def categorize_error(output: str) -> str:
    """Map a sandbox output string to an ErrorKind tag.

    Rules are tried in order and the first match wins: timeout marker,
    SyntaxError, generic traceback / 'Error:' text, then any other output
    starting with '[ERROR]'. Anything else is ErrorKind.NONE.
    """
    rules = (
        (lambda s: '[ERROR] Timed out' in s,            ErrorKind.TIMEOUT),
        (lambda s: 'SyntaxError' in s,                  ErrorKind.SYNTAX),
        (lambda s: 'Traceback' in s or 'Error:' in s,   ErrorKind.RUNTIME),
        (lambda s: s.startswith('[ERROR]'),             ErrorKind.OTHER),
    )
    for matches, kind in rules:
        if matches(output):
            return kind
    return ErrorKind.NONE

# ─────────────────────────────────────────────
# SANDBOX
# ─────────────────────────────────────────────

class Sandbox:

    _port_lock = threading.Lock()
    _next_port = 50000

    @classmethod
    def _get_ports(cls, n=5):
        with cls._port_lock:
            ports = list(range(cls._next_port, cls._next_port + n))
            cls._next_port += n
        return ports

    def __init__(self, timeout: float):
        self._timeout = timeout
        self._km      = None
        self._client  = None

        ports = self._get_ports()
        env   = os.environ.copy()
        env.update({
            'PYDEVD_DISABLE_FILE_VALIDATION': '1',
            'PYDEVD_WARN_EVALUATION_TIMEOUT': '0',
            'JUPYTER_PLATFORM_DIRS':          '1',
            'PYTHONWARNINGS':                 'ignore',
            'MPLBACKEND':                     'Agg',
        })

        self._km = KernelManager()
        self._km.shell_port   = ports[0]
        self._km.iopub_port   = ports[1]
        self._km.stdin_port   = ports[2]
        self._km.hb_port      = ports[3]
        self._km.control_port = ports[4]
        self._km.start_kernel(env=env, extra_arguments=['--Application.log_level=CRITICAL'])

        self._client = self._km.blocking_client()
        self._client.start_channels()
        self._client.wait_for_ready(timeout=self._timeout)

        self._preload()

    def _preload(self):
        self.execute(
            'import math, numpy, sympy, itertools, collections, mpmath\n'
            'import sympy.ntheory as nt\n'
            'from sympy import *\n'
            'from sympy.geometry import *\n'
            'mpmath.mp.dps = 64\n'
            'x,y,z,a,b,c,n,k,r,t = symbols("x y z a b c n k r t", real=True)\n'
        )

    def _clean_tb(self, tb: list) -> str:
        lines = []
        for frame in tb:
            f = re.sub(r'\x1b\[[0-9;]*m', '', frame)
            if 'File "' in f and 'ipython-input' not in f:
                continue
            lines.append(f)
        return ''.join(lines)

    def execute(self, code: str, timeout: float | None = None) -> str:
        client      = self._client
        eff_timeout = timeout or self._timeout
        msg_id      = client.execute(code, store_history=True, allow_stdin=False, stop_on_error=False)

        stdout, stderr = [], []
        start = time.time()

        while True:
            if time.time() - start > eff_timeout:
                self._km.interrupt_kernel()
                return f'[ERROR] Timed out after {eff_timeout}s'
            try:
                msg = client.get_iopub_msg(timeout=1.0)
            except queue.Empty:
                continue

            if msg.get('parent_header', {}).get('msg_id') != msg_id:
                continue

            mtype   = msg.get('msg_type')
            content = msg.get('content', {})

            if mtype == 'stream':
                target = stdout if content.get('name') == 'stdout' else stderr
                target.append(content.get('text', ''))
            elif mtype == 'error':
                stderr.append(self._clean_tb(content.get('traceback', [])))
            elif mtype in {'execute_result', 'display_data'}:
                text = content.get('data', {}).get('text/plain')
                if text:
                    stdout.append(text if text.endswith('\n') else text + '\n')
            elif mtype == 'status' and content.get('execution_state') == 'idle':
                break

        out = ''.join(stdout)
        err = ''.join(stderr)
        if err:
            return f'{out.rstrip()}\n{err}' if out else err
        return out if out.strip() else '[WARN] No output. Use print().'

    def reset(self):
        self.execute('%reset -f')
        self._preload()

    def close(self):
        with contextlib.suppress(Exception):
            if self._client:
                self._client.stop_channels()
        if self._km:
            with contextlib.suppress(Exception):
                self._km.shutdown_kernel(now=True)
            with contextlib.suppress(Exception):
                self._km.cleanup_resources()

    def __del__(self):
        self.close()

# ─────────────────────────────────────────────
# OPENAI-HARMONY IMPORTS
# ─────────────────────────────────────────────

from openai_harmony import (
    HarmonyEncodingName, load_harmony_encoding,
    SystemContent, ReasoningEffort, ToolNamespaceConfig,
    Author, Message, Role, TextContent, Conversation,
)


class AIMO3Template:
    """
    Single home for all openai_harmony message construction.

    Call sites hand over plain prompt strings and kernel output; this class
    turns them into harmony objects, so nothing else needs to know how
    SystemContent, ReasoningEffort or ToolNamespaceConfig are assembled.

    IMPORTANT: the tool namespace is attached to the system message via
    SystemContent.with_tools() when the conversation is built. Do NOT pass
    it again as a third argument to render_conversation_for_completion().
    """

    def __init__(self, tool_prompt: str):
        python_namespace = ToolNamespaceConfig(
            name='python',
            description=tool_prompt,
            tools=[],
        )
        self._tool_config = python_namespace

    @property
    def tool_config(self) -> ToolNamespaceConfig:
        """The python ToolNamespaceConfig that is attached to every system message."""
        return self._tool_config

    def _make_system_content(self, system_prompt: str) -> SystemContent:
        # Identity text, HIGH reasoning effort and the python tool, in that order.
        content = SystemContent.new()
        content = content.with_model_identity(system_prompt)
        content = content.with_reasoning_effort(ReasoningEffort.HIGH)
        return content.with_tools(self._tool_config)

    def _make_messages(self, system_prompt: str, user_prompt: str) -> list[Message]:
        system_message = Message.from_role_and_content(
            Role.SYSTEM, self._make_system_content(system_prompt)
        )
        user_message = Message.from_role_and_content(Role.USER, user_prompt)
        return [system_message, user_message]

    def build_conversation(
        self,
        system_prompt: str,
        user_prompt:   str,
    ) -> Conversation:
        """
        Build a Conversation holding one system and one user message.
        The system message carries HIGH reasoning effort and the python tool namespace.
        """
        return Conversation.from_messages(
            self._make_messages(system_prompt, user_prompt)
        )

    def make_tool_response(
        self,
        output:  str,
        channel: str | None = None,
    ) -> Message:
        """Wrap kernel output in a TOOL-role message addressed to the assistant.

        The message is tagged with ``channel`` only when a truthy one is given.
        """
        author = Author(role=Role.TOOL, name='python')
        body   = [TextContent(text=output)]
        reply  = Message(author=author, content=body).with_recipient('assistant')
        if channel:
            return reply.with_channel(channel)
        return reply

# ─────────────────────────────────────────────
# TOOL EXECUTOR
# ─────────────────────────────────────────────

class Tool:
    """Executes model-issued Python tool calls in a Sandbox.

    Calls are serialised with a lock, so one Tool (and its sandbox) can be
    shared by code that issues calls from several threads.
    """

    def __init__(self, sandbox: 'Sandbox', template: 'AIMO3Template'):
        self._sandbox  = sandbox
        self._template = template
        self._lock     = threading.Lock()

    @staticmethod
    def _ensure_print(code: str) -> str:
        """Wrap a trailing bare expression in ``print(...)`` so its value is shown.

        The code is returned unchanged when it is empty, when its last line
        mentions ``print``, ``import`` or ``#``, when it does not parse as
        Python (IPython magics, broken code), or when the last top-level
        statement is not a stand-alone expression (assignments, loops, ...).
        Only a whole trailing expression statement, possibly spanning several
        lines, is wrapped. Rewriting the last physical line blindly would
        turn valid code such as ``x = 5`` or an indented loop body into a
        syntax or type error.
        """
        import ast  # local import: only this helper needs it

        stripped = code.strip()
        lines    = stripped.split('\n')
        last     = lines[-1].strip()
        if not last or any(kw in last for kw in ('print', 'import', '#')):
            return code

        try:
            tree = ast.parse(stripped)
        except (SyntaxError, ValueError):
            # Not plain Python; let the kernel run and report on it as-is.
            return code
        if not tree.body:
            return code

        node = tree.body[-1]
        if not isinstance(node, ast.Expr):
            return code
        value = node.value
        if (isinstance(value, ast.Call)
                and isinstance(value.func, ast.Name)
                and value.func.id == 'print'):
            # Multi-line print(...) call whose last line is just ')'.
            return code

        first    = node.lineno - 1
        expr_src = '\n'.join(lines[first:])
        # Require the expression to be exactly the tail of the code: this
        # rejects `a = 1; a`, trailing `;`, and parenthesised forms we cannot
        # rewrite textually without risk.
        if ast.get_source_segment(stripped, node) != expr_src:
            return code
        return '\n'.join(lines[:first] + [f'print({expr_src})'])

    def call(self, message: 'Message') -> 'tuple[list[Message], str]':
        """
        Execute the code carried by a model tool-call message.

        Returns ``(response_messages, error_kind)`` where ``error_kind`` is the
        ErrorKind tag derived from the sandbox output. Sandbox exceptions are
        logged and turned into an ``[ERROR] ...`` output instead of being
        raised. The response message is built by the template so all
        openai_harmony formatting stays in one place.
        """
        code = self._ensure_print(message.content[0].text)
        with self._lock:
            try:
                output = self._sandbox.execute(code)
            except Exception as exc:
                log.warning('Tool execution exception: %s', exc)
                output = f'[ERROR] {exc}'
        kind     = categorize_error(output)
        response = self._template.make_tool_response(output, channel=message.channel)
        return [response], kind

# ─────────────────────────────────────────────
# ANSWER EXTRACTION
# ─────────────────────────────────────────────

def scan_for_answer(text: str) -> int | None:
    for pattern in [
        r'\\boxed\s*\{\s*([0-9,]+)\s*\}',
        r'final\s+answer\s+is\s*([0-9,]+)',
        r'answer\s*[=:]\s*([0-9,]+)',
    ]:
        for match in reversed(re.findall(pattern, text, re.IGNORECASE)):
            try:
                val = int(match.replace(',', ''))
                if 0 <= val <= 99999:
                    return val
            except ValueError:
                continue
    return None

# ─────────────────────────────────────────────
# RESULT DATACLASS
# ─────────────────────────────────────────────

class AttemptResult:
    """Mutable record of a single attempt: answer, usage counters and scores."""

    def __init__(self):
        # Extracted answer; None until one is found.
        self.answer:          int | None = None
        # Number of generated tokens across all turns of the attempt.
        self.response_length: int        = 0
        # Tool calls issued, and how many of them came back with an error.
        self.python_calls:    int        = 0
        self.python_errors:   int        = 0
        # ErrorKind strings collected during the attempt.
        self.error_kinds:     list       = []
        # Confidence proxy (lower = more confident); inf when unknown.
        self.entropy:         float      = float('inf')
        # Verification score in the range 0.0–1.0.
        self.verify_score:    float      = 0.0
        self.prompt_type:     str        = 'TIR'
        self.attempt_idx:     int        = 0
        self.is_reflection:   bool       = False

    def to_dict(self) -> dict:
        """Return a flat summary row (display-oriented, rounded scores)."""
        return dict(
            Attempt=self.attempt_idx,
            Type=self.prompt_type,            # VERIFY / DET_VERIFY / TIR / GEO / COT
            Answer=self.answer,               # int | None; cast to Int64 at display time
            PyErr=self.python_errors,
            Tokens=self.response_length,
            Entropy=round(self.entropy, 3),
            VerifyScore=round(self.verify_score, 2),
        )

# ─────────────────────────────────────────────
# REFLECTION MANAGER
# ─────────────────────────────────────────────

class ReflectionManager:
    """Thread-safe counter that rations reflection passes for one problem."""

    def __init__(self, budget: int):
        self._lock   = threading.Lock()
        self._budget = budget
        self._used   = 0

    def can_reflect(self) -> bool:
        """Return True while at least one reflection is still available."""
        with self._lock:
            has_budget = self._used < self._budget
        return has_budget

    def consume(self) -> bool:
        """Claim one reflection; return False (claiming nothing) when exhausted."""
        with self._lock:
            if self._used >= self._budget:
                return False
            self._used += 1
            return True

    def reset(self):
        """Make the full budget available again."""
        with self._lock:
            self._used = 0

# ─────────────────────────────────────────────
# CORE ATTEMPT RUNNER
# ─────────────────────────────────────────────

def _compute_confidence_proxy(logprobs_buf: list) -> float:
    """
    Confidence proxy using peak logprob rather than truncated Shannon entropy.

    Why peak logprob instead of Shannon entropy:
      - top_logprobs only covers 5 tokens ‚Äî the tail mass is missing.
      - Partial-distribution Shannon entropy is systematically underestimated,
        making high-confidence tokens appear less certain than they are.
      - Peak logprob (= log P(most likely token)) is stable regardless of
        truncation because it never depends on the missing tail.
      - We return the MEAN peak logprob across all tokens, then negate it
        so the result behaves like entropy: lower = more confident.

    Returns: mean(-max_logprob) across tokens.
             Lower value  ‚Üí more confident ‚Üí higher weight in scoring.
             float('inf') when no logprobs available.
    """
    if not logprobs_buf:
        return float('inf')

    total, count = 0.0, 0
    for lp_dict in logprobs_buf:
        if not isinstance(lp_dict, dict) or not lp_dict:
            continue
        peak = max(lp_dict.values())   
        total += peak
        count += 1

    if count == 0:
        return float('inf')

    mean_peak = total / count
    return -mean_peak  


def run_attempt(
    client,
    encoding,
    stop_token_ids,
    cfg:           CFG,
    template:      AIMO3Template,
    tool:          Tool,
    system_prompt: str,
    user_prompt:   str,
    attempt_seed:  int,
    stop_event:    threading.Event,
    deadline:      float,
    prompt_type:   str        = 'TIR',
    attempt_idx:   int        = 0,
    is_reflection: bool       = False,
    temperature:   float | None = None,
) -> AttemptResult:
    """
    Run one multi-turn solving attempt and return what it produced.

    Each turn renders the conversation to prompt tokens, streams a completion,
    and then either stops (answer found, 'final' channel message, nothing
    generated, parse failure, context nearly full) or executes the model's
    python tool call and appends the tool response for the next turn.

    Args:
        client: Completion client; ``client.completions.create`` is called with
            ``stream=True`` and token-id prompts.
        encoding: Harmony encoding used to render the conversation and to parse
            completion tokens back into messages.
        stop_token_ids: Forwarded to the server in ``extra_body``.
        cfg: Configuration (turn limit, context/buffer sizes, sampling knobs).
        template: Builds the initial conversation.
        tool: Executes python tool calls.
        system_prompt / user_prompt: Texts for the initial conversation.
        attempt_seed: Sampling seed sent with every request of this attempt.
        stop_event: When set, the attempt stops at the next check.
        deadline: ``time.time()`` value after which the attempt stops.
        prompt_type / attempt_idx / is_reflection: Copied onto the result.
        temperature: Sampling temperature; defaults to
            ``cfg.temperatures_by_type['general'][0]`` when None.

    Returns:
        AttemptResult with ``answer`` (None if none was found), token and tool
        counters, collected ``error_kinds`` and the ``entropy`` confidence
        proxy. Exceptions raised inside the turn loop that derive from
        ``Exception`` are logged and recorded in ``error_kinds`` instead of
        being propagated.
    """

    result               = AttemptResult()
    result.prompt_type   = prompt_type
    result.attempt_idx   = attempt_idx
    result.is_reflection = is_reflection

    effective_temp = temperature if temperature is not None else cfg.temperatures_by_type['general'][0]

    # Per-token top-logprob dicts gathered across all turns (capped below).
    logprobs_buf: list = []

    conversation = template.build_conversation(system_prompt, user_prompt)

    try:
        for _ in range(cfg.turns):
            if stop_event.is_set() or time.time() > deadline:
                break

            # FIX 1: 2 args only — tool_config already baked into conversation
            prompt_ids = encoding.render_conversation_for_completion(
                conversation, Role.ASSISTANT
            )
            # Whatever is left of the context window is the generation budget.
            max_tokens = cfg.context_tokens - len(prompt_ids)
            if max_tokens < cfg.buffer_tokens:
                log.debug('Context full at attempt %d', attempt_idx)
                break

            stream = client.completions.create(
                model=cfg.served_model_name,
                temperature=effective_temp,
                logprobs=cfg.top_logprobs,
                max_tokens=max_tokens,
                prompt=prompt_ids,
                seed=attempt_seed,
                stream=True,
                extra_body={
                    'min_p':            cfg.min_p,
                    'stop_token_ids':   stop_token_ids,
                    'return_token_ids': True,
                },
            )

            token_buf   = []
            text_chunks = []

            try:
                for chunk in stream:
                    if stop_event.is_set() or time.time() > deadline:
                        break

                    new_tokens = chunk.choices[0].token_ids
                    new_text   = chunk.choices[0].text

                    if new_tokens:
                        token_buf.extend(new_tokens)
                        result.response_length += len(new_tokens)
                        text_chunks.append(new_text)

                        lp = chunk.choices[0].logprobs
                        if lp and lp.top_logprobs:
                            logprobs_buf.extend(lp.top_logprobs)
                            # Memory guard: cap buffer
                            if len(logprobs_buf) > cfg.max_logprobs_buf:
                                logprobs_buf = logprobs_buf[-cfg.max_logprobs_buf:]

                    # A closing brace may complete a \boxed{...}; scan only the
                    # most recent chunks and stop generating once an answer appears.
                    if '}' in new_text:
                        search = ''.join(text_chunks[-cfg.search_tokens:])
                        ans    = scan_for_answer(search)
                        if ans is not None:
                            result.answer = ans
                            break
            finally:
                # Always release the stream, including on early exit.
                stream.close()

            if result.answer is not None:
                break
            if not token_buf:
                break

            try:
                new_msgs = encoding.parse_messages_from_completion_tokens(token_buf, Role.ASSISTANT)
            except Exception as exc:
                log.warning('Attempt %d: parse error (%s), stopping turn', attempt_idx, exc)
                break

            conversation.messages.extend(new_msgs)
            last_msg = new_msgs[-1]

            # A message on the 'final' channel ends the attempt, answer or not.
            if last_msg.channel == 'final':
                result.answer = scan_for_answer(last_msg.content[0].text)
                break

            # Tool call: run it and feed the response into the next turn.
            if last_msg.recipient == 'python':
                result.python_calls += 1
                responses, err_kind  = tool.call(last_msg)

                if err_kind != ErrorKind.NONE:
                    result.python_errors += 1
                    result.error_kinds.append(err_kind)

                conversation.messages.extend(responses)

    except TimeoutError as exc:
        log.warning('Attempt %d timed out: %s', attempt_idx, exc)
        result.error_kinds.append(ErrorKind.TIMEOUT)

    except ConnectionError as exc:
        log.error('Attempt %d connection error: %s', attempt_idx, exc)
        result.error_kinds.append(ErrorKind.OTHER)

    except Exception as exc:
        # Last-resort catch — always log with traceback for debugging
        log.error('Attempt %d unexpected error:\n%s', attempt_idx, traceback.format_exc())
        result.error_kinds.append(ErrorKind.OTHER)

    result.entropy = _compute_confidence_proxy(logprobs_buf)
    return result

# ─────────────────────────────────────────────
# REFLECTION RUNNER
# ─────────────────────────────────────────────

def run_reflection_attempt(
    client, encoding, stop_token_ids, cfg, template: AIMO3Template, tool,
    original_problem: str,
    candidate_answer: int,
    stop_event: threading.Event,
    deadline:   float,
) -> AttemptResult:
    """
    Run one strict verification pass over a candidate answer.

    The pass uses the Prompts.VERIFY system prompt (substitute-and-check, no
    re-solving) at a fixed temperature of 0.4 and a fixed seed.

    The returned result's verify_score is:
      - 0.7 plus a small confidence bonus, capped at 1.0, when the verifier
        echoes the candidate;
      - 0.35 when it reports a different answer;
      - left at its default when no answer was extracted.
    """
    verify_request = Prompts.verify(original_problem, candidate_answer)
    outcome = run_attempt(
        client, encoding, stop_token_ids, cfg,
        template, tool,
        system_prompt=Prompts.VERIFY,
        user_prompt=verify_request,
        attempt_seed=999_999,
        stop_event=stop_event,
        deadline=deadline,
        prompt_type='VERIFY',
        attempt_idx=99,
        is_reflection=True,
        temperature=0.4,
    )

    if outcome.answer is None:
        return outcome

    if outcome.answer != candidate_answer:
        outcome.verify_score = 0.35
        return outcome

    # Candidate confirmed: base 0.7 plus a bonus that grows as the proxy shrinks.
    confidence_bonus = (1.0 / max(outcome.entropy, 0.1)) * 0.05
    outcome.verify_score = min(1.0, 0.7 + confidence_bonus)
    return outcome


def run_deterministic_verifier(
    client, encoding, stop_token_ids, cfg, template: AIMO3Template, tool,
    problem:    str,
    answer_a:   int,
    answer_b:   int,
    stop_event: threading.Event,
    deadline:   float,
) -> int | None:
    """
    Arbitrate between two conflicting candidate answers.

    Runs a single assertion-based verification attempt at
    cfg.temperature_verify with a fixed seed, asking the model to test both
    candidates against the problem conditions.

    Returns the chosen candidate when the attempt's answer equals answer_a or
    answer_b; otherwise logs a warning and returns None.
    """
    arbitration_prompt = Prompts.DETERMINISTIC_VERIFY.format(
        problem=problem,
        answer_a=answer_a,
        answer_b=answer_b,
    )
    attempt = run_attempt(
        client, encoding, stop_token_ids, cfg,
        template, tool,
        system_prompt=Prompts.VERIFY,
        user_prompt=arbitration_prompt,
        attempt_seed=777_777,
        stop_event=stop_event,
        deadline=deadline,
        prompt_type='DET_VERIFY',
        attempt_idx=98,
        is_reflection=True,
        temperature=cfg.temperature_verify,   # low temperature for a stable verdict
    )
    verdict = attempt.answer

    if verdict not in (answer_a, answer_b):
        log.warning('Deterministic verifier returned unexpected answer: %s', verdict)
        return None

    log.info('Deterministic verifier chose: %d', verdict)
    return verdict

# ─────────────────────────────────────────────
# ANSWER SELECTION
# ─────────────────────────────────────────────

def select_answer(results: list[AttemptResult], cfg: CFG) -> int:
    """
    Pick the final answer from all attempt results.

    Each attempt with an answer contributes a weight of ``1 / entropy``,
    scaled down for Python errors and short responses and scaled up by its
    verify score. The answer with the highest total weight wins, unless a
    different answer holds the single lowest entropy, has at least
    ``cfg.entropy_override_min_votes`` votes, and beats the majority answer's
    best entropy by the factor ``cfg.entropy_override_ratio``.

    Prints a per-answer summary table and the final answer.

    Returns:
        The selected answer, or 0 when no attempt produced one.
    """

    def _attempt_weight(attempt) -> float:
        # Base confidence: lower entropy means higher weight.
        score = 1.0 / max(attempt.entropy, 1e-9)

        # Timeouts count more than runtime errors, syntax errors less.
        kinds = attempt.error_kinds
        penalty = (
            kinds.count(ErrorKind.RUNTIME)
            + kinds.count(ErrorKind.TIMEOUT) * 1.5
            + kinds.count(ErrorKind.SYNTAX) * 0.5
        )
        score *= 1.0 / (1 + penalty * cfg.error_penalty_factor)

        # Responses shorter than the threshold are scaled down, never below
        # cfg.length_bonus_min.
        if attempt.response_length < cfg.length_bonus_threshold:
            score *= max(
                cfg.length_bonus_min,
                attempt.response_length / cfg.length_bonus_threshold,
            )

        if attempt.verify_score > 0:
            score *= 1.0 + attempt.verify_score * (cfg.verify_bonus_max - 1.0)
        return score

    score_by_answer = defaultdict(float)
    votes_by_answer = defaultdict(int)
    lowest_entropy  = {}

    for attempt in results:
        ans = attempt.answer
        if ans is None:
            continue

        score_by_answer[ans] += _attempt_weight(attempt)
        votes_by_answer[ans] += 1
        if attempt.entropy < lowest_entropy.setdefault(ans, attempt.entropy):
            lowest_entropy[ans] = attempt.entropy

    if not score_by_answer:
        log.warning('No valid answers from any attempt. Returning 0.')
        return 0

    majority_ans = max(votes_by_answer, key=votes_by_answer.get)
    best_ent_ans = min(lowest_entropy,  key=lowest_entropy.get)

    override = False
    if best_ent_ans != majority_ans:
        minority_votes = votes_by_answer[best_ent_ans]
        maj_ent        = lowest_entropy[majority_ans]
        min_ent        = lowest_entropy[best_ent_ans]
        vote_qualified = minority_votes >= cfg.entropy_override_min_votes
        conf_qualified = min_ent * cfg.entropy_override_ratio < maj_ent
        override       = vote_qualified and conf_qualified

    if override:
        log.info(
            'Entropy override: %d (proxy=%.3f, votes=%d) ‚Üí %d (proxy=%.3f, votes=%d)',
            majority_ans, maj_ent, votes_by_answer[majority_ans],
            best_ent_ans, min_ent, minority_votes,
        )
        final = best_ent_ans
    else:
        final = max(score_by_answer, key=score_by_answer.get)

    # Summary table, highest total weight first.
    ranked = sorted(score_by_answer.items(), key=lambda item: -item[1])
    rows = [
        {
            'Answer':     ans,
            'Votes':      votes_by_answer[ans],
            'Score':      round(score, 3),
            'MinEntropy': round(lowest_entropy[ans], 3),
        }
        for ans, score in ranked
    ]
    print(pd.DataFrame(rows).to_string(index=False))
    print(f'\nFinal Answer: {final}\n')
    return final

# ─────────────────────────────────────────────
# MAIN SOLVER
# ─────────────────────────────────────────────

class Solver:

    def __init__(self, cfg: CFG, port: int = 8000):
        """
        Bring up everything the solver needs before the first problem.

        In order: read the model files once, launch the vLLM server
        subprocess, create the OpenAI client pointed at it and block until it
        responds, load the Harmony encoding and the prompt template, then
        start the pool of Jupyter sandboxes.

        Args:
            cfg:  Configuration object. This method reads ``session_timeout``
                  directly; the helpers it calls read further fields.
            port: TCP port the local vLLM server listens on.

        Raises:
            RuntimeError: propagated from ``_wait_for_server`` when the server
                exits or does not become ready.
        """
        self.cfg  = cfg
        self.port = port

        # The server must be started before the client can probe it.
        self._preload_weights()
        self.server_process = self._start_server()

        self.client = OpenAI(
            base_url=f'http://0.0.0.0:{port}/v1',
            api_key='sk-local',
            timeout=cfg.session_timeout,
        )
        self._wait_for_server()

        self.encoding       = load_harmony_encoding(HarmonyEncodingName.HARMONY_GPT_OSS)
        self.stop_token_ids = self.encoding.stop_tokens_for_assistant_actions()
        self.template       = AIMO3Template(Prompts.TOOL)   # single template instance

        self._init_kernels()
        # NOTE(review): the budget clock starts only after startup has
        # finished, so the time spent above is not counted against
        # cfg.notebook_limit in _compute_budget — confirm that is intended.
        self.notebook_start = time.time()
        # NOTE(review): hard-coded problem count used by _compute_budget to
        # reserve time for later problems — confirm it matches the test set.
        self.problems_left  = 50

    # ── server ─────────────────────────────────

    def _preload_weights(self):
        """
        Read every file under ``cfg.model_path`` once and discard the bytes.

        Files are read in parallel by ``cfg.workers`` threads, in chunks of up
        to 1 GiB. The total size and elapsed time are logged.

        NOTE(review): presumably this warms the OS page cache so the vLLM
        server loads the model faster — the code itself only reads and
        throws the data away.
        """
        log.info('Preloading model weights...')
        started = time.time()

        model_files = []
        for directory, _subdirs, names in os.walk(self.cfg.model_path):
            for name in names:
                full_path = os.path.join(directory, name)
                if os.path.isfile(full_path):
                    model_files.append(full_path)
        byte_total = sum(os.path.getsize(path) for path in model_files)

        def _drain(path):
            # Each chunk is dropped as soon as it is read; only the read matters.
            with open(path, 'rb') as handle:
                while handle.read(1 << 30):
                    pass

        with ThreadPoolExecutor(max_workers=self.cfg.workers) as pool:
            # Consuming the iterator waits for every read and surfaces errors.
            for _ in pool.map(_drain, model_files):
                pass

        log.info('Preloaded %.2f GB in %.1fs', byte_total / 1e9, time.time() - started)

    def _start_server(self):
        """
        Launch the vLLM OpenAI-compatible API server as a child process.

        The command line is assembled from ``self.cfg`` and ``self.port``.
        Server stdout and stderr are both redirected to ``vllm_server.log`` in
        the current working directory; the open handle is kept on
        ``self.log_file`` so it can be closed during cleanup.

        Returns:
            The ``subprocess.Popen`` handle of the server process.
        """
        cfg = self.cfg

        # Flags that take a value, in the order they appear on the command line.
        valued_flags = (
            ('--seed',                   str(cfg.seed)),
            ('--model',                  cfg.model_path),
            ('--served-model-name',      cfg.served_model_name),
            ('--tensor-parallel-size',   '1'),
            ('--max-num-seqs',           str(cfg.batch_size)),
            ('--gpu-memory-utilization', str(cfg.gpu_memory_utilization)),
            ('--host',                   '0.0.0.0'),
            ('--port',                   str(self.port)),
            ('--dtype',                  cfg.dtype),
            ('--kv-cache-dtype',         cfg.kv_cache_dtype),
            ('--max-model-len',          str(cfg.context_tokens)),
            ('--stream-interval',        str(cfg.stream_interval)),
        )
        # Boolean switches with no value.
        bare_flags = (
            '--async-scheduling',
            '--disable-log-stats',
            '--enable-prefix-caching',
        )

        argv = [sys.executable, '-m', 'vllm.entrypoints.openai.api_server']
        for flag, value in valued_flags:
            argv.extend((flag, value))
        argv.extend(bare_flags)

        self.log_file = open('vllm_server.log', 'w')
        return subprocess.Popen(
            argv,
            stdout=self.log_file,
            stderr=subprocess.STDOUT,
            start_new_session=True,
        )

    def _wait_for_server(self):
        """
        Block until the vLLM server answers a ``models.list()`` request.

        Makes up to ``cfg.server_timeout`` probes, sleeping one second after
        each failed probe. Because a probe itself takes time, the total wait
        can exceed ``cfg.server_timeout`` seconds.

        Raises:
            RuntimeError: if the server process has exited, or if no probe
                succeeded. The timeout error is chained to the last probe
                failure so the underlying cause shows in the traceback.
        """
        log.info('Waiting for vLLM server...')
        t0 = time.time()
        last_error = None

        for _ in range(self.cfg.server_timeout):
            # A dead server will never become ready; fail immediately.
            if self.server_process.poll() is not None:
                raise RuntimeError('vLLM server died. Check vllm_server.log')
            try:
                self.client.models.list()
            except Exception as exc:
                # Expected while the server is still loading; remember the
                # failure so a final timeout can report why probes failed.
                last_error = exc
                time.sleep(1)
            else:
                log.info('Server ready in %.1fs', time.time() - t0)
                return

        raise RuntimeError('Server start timeout') from last_error

    def _init_kernels(self):
        """
        Start ``cfg.workers`` Jupyter sandboxes in parallel and pool them.

        Each sandbox that starts successfully is placed in
        ``self.sandbox_pool``. Individual failures are logged and tolerated,
        but a warning is emitted when fewer sandboxes than requested came up.

        Raises:
            RuntimeError: if not a single sandbox could be started. Every
                attempt needs a sandbox from the pool, so continuing would
                make each attempt wait out ``cfg.sandbox_timeout`` and be
                skipped.
        """
        requested = self.cfg.workers
        log.info('Starting %d Jupyter kernels...', requested)
        t0 = time.time()
        self.sandbox_pool = queue.Queue()

        with ThreadPoolExecutor(max_workers=requested) as ex:
            futures = [
                ex.submit(Sandbox, self.cfg.jupyter_timeout)
                for _ in range(requested)
            ]
            for f in as_completed(futures):
                try:
                    self.sandbox_pool.put(f.result())
                except Exception as exc:
                    log.error('Kernel init failed: %s', exc)

        started = self.sandbox_pool.qsize()
        if started == 0:
            # Fail fast instead of reporting "ready" with an empty pool.
            raise RuntimeError('No Jupyter kernels could be started')
        if started < requested:
            log.warning('Only %d of %d Jupyter kernels started', started, requested)

        log.info('Kernels ready in %.1fs', time.time() - t0)

    # ── budget ─────────────────────────────────

    def _compute_budget(self) -> float:
        elapsed   = time.time() - self.notebook_start
        remaining = self.cfg.notebook_limit - elapsed
        reserved  = max(0, self.problems_left - 1) * self.cfg.base_problem_timeout
        budget    = min(remaining - reserved, self.cfg.high_problem_timeout)
        return max(budget, self.cfg.base_problem_timeout)

    # ── dispatch one attempt ───────────────────

    def _dispatch(
        self, problem: str, system_prompt: str, prompt_type: str,
        attempt_idx: int, ptype: str,
        stop_event: threading.Event, deadline: float,
    ) -> AttemptResult:
        """
        Run one solution attempt in a sandbox borrowed from the pool.

        Args:
            problem:       Problem statement; ``Prompts.PREFERENCE`` is appended.
            system_prompt: System prompt for this attempt.
            prompt_type:   Label recorded on the result (e.g. 'TIR', 'COT').
            attempt_idx:   Index of the attempt; selects the seed and the
                           temperature within the schedule.
            ptype:         Detected problem type; selects the temperature
                           schedule, falling back to the 'general' schedule.
            stop_event:    Set by the caller to abandon outstanding work.
            deadline:      Absolute ``time.time()`` value for this problem.

        Returns:
            The attempt's result, or an empty ``AttemptResult`` carrying only
            ``attempt_idx`` and ``prompt_type`` when the attempt was skipped
            (stop requested, deadline passed, no sandbox available) or failed.
        """
        empty = AttemptResult()
        empty.attempt_idx = attempt_idx
        empty.prompt_type = prompt_type

        if stop_event.is_set() or time.time() > deadline:
            return empty

        sandbox = None
        try:
            # Only the pool lookup is allowed to mean "pool exhausted". A
            # queue.Empty raised later, inside the attempt, falls through to
            # the generic handler and is logged with its traceback.
            try:
                sandbox = self.sandbox_pool.get(timeout=self.cfg.sandbox_timeout)
            except queue.Empty:
                log.warning('Sandbox pool empty at attempt %d ‚Äî skipping', attempt_idx)
                return empty

            tool = Tool(sandbox, self.template)
            seed = int((self.cfg.seed + attempt_idx) ** 2)

            # Type-aware temperature: round-robin within the schedule for this problem type.
            schedule = self.cfg.temperatures_by_type.get(ptype, self.cfg.temperatures_by_type['general'])
            temp     = schedule[attempt_idx % len(schedule)]

            return run_attempt(
                self.client, self.encoding, self.stop_token_ids,
                self.cfg, self.template, tool,
                system_prompt = system_prompt,
                user_prompt   = f'{problem}\n\n{Prompts.PREFERENCE}',
                attempt_seed  = seed,
                stop_event    = stop_event,
                deadline      = deadline,
                prompt_type   = prompt_type,
                attempt_idx   = attempt_idx,
                temperature   = temp,
            )

        except Exception:
            log.error('Dispatch error at attempt %d:\n%s', attempt_idx, traceback.format_exc())
            return empty

        finally:
            # Always hand the sandbox back, even if resetting it failed, so
            # the pool does not shrink.
            if sandbox is not None:
                try:
                    sandbox.reset()
                except Exception:
                    log.warning(
                        'Sandbox reset failed after attempt %d:\n%s',
                        attempt_idx, traceback.format_exc(),
                    )
                self.sandbox_pool.put(sandbox)

    # ── main solve ─────────────────────────────

    def solve(self, problem: str) -> int:
        """
        Solve one problem and return the selected integer answer.

        Phases:
          1. Classify the problem and size the attempt plan (TIR and COT).
          2. Run all attempts in parallel, stopping early on a TIR-backed
             consensus.
          3. Optionally reflect on the leading candidate(s).
          4. Optionally run the deterministic verifier when the top two
             candidates are close.
          5. Choose the final answer with ``select_answer``.

        ``self.problems_left`` is decremented once per call.

        Returns:
            The selected answer, or 0 when no attempt produced one.
        """
        print(f'\n{"="*60}')
        short = problem[:120] + '...' if len(problem) > 120 else problem
        print(f'Problem: {short}')

        ptype      = detect_problem_type(problem)
        difficulty = estimate_difficulty(problem)
        budget     = self._compute_budget()
        deadline   = time.time() + budget

        total = self.cfg.attempts_hard if difficulty == 'hard' else self.cfg.attempts_medium
        n_tir = int(total * self.cfg.tir_fraction)
        n_cot = total - n_tir

        print(f'Type={ptype} | Difficulty={difficulty} | Budget={budget:.0f}s | '
              f'{n_tir} TIR + {n_cot} COT\n')

        # Build task list: (system prompt, result label, attempt index).
        tir_label = 'TIR' if ptype != 'geometry' else 'GEO'
        tasks = []
        for i in range(n_tir):
            sp = select_system_prompt(ptype, is_tir=True)
            tasks.append((sp, tir_label, i))
        for i in range(n_cot):
            tasks.append((Prompts.COT, 'COT', n_tir + i))

        reflect_mgr = ReflectionManager(self.cfg.reflect_budget)

        try:
            results, valid_answers = self._run_attempts(problem, ptype, tasks, deadline)
        finally:
            # Count the problem as consumed even if the attempt phase failed,
            # so later budgets do not keep reserving time for it.
            self.problems_left = max(0, self.problems_left - 1)

        if results:
            df = pd.DataFrame([r.to_dict() for r in results])
            df['Answer'] = df['Answer'].astype('Int64')
            print(df.to_string(index=False))

        if not valid_answers:
            log.warning('No valid answers found. Returning 0.')
            return 0

        self._reflect_on_candidates(problem, results, valid_answers, reflect_mgr, deadline)
        self._verify_close_conflict(problem, results, deadline)

        return select_answer(results, self.cfg)

    def _run_attempts(self, problem: str, ptype: str, tasks: list, deadline: float):
        """Run all attempts in parallel; return (results, valid_answers) collected before any early stop."""
        results:       list[AttemptResult] = []
        valid_answers: list[int]           = []
        stop_event = threading.Event()

        executor = ThreadPoolExecutor(max_workers=self.cfg.workers)
        try:
            futures = [
                executor.submit(self._dispatch, problem, sp, ptype_label, idx, ptype, stop_event, deadline)
                for sp, ptype_label, idx in tasks
            ]

            for future in as_completed(futures):
                try:
                    r = future.result()
                    results.append(r)

                    if r.answer is not None:
                        valid_answers.append(r.answer)

                    counts = Counter(valid_answers).most_common(1)
                    if counts and counts[0][1] >= self.cfg.early_stop:
                        leading, vote_count = counts[0]

                        # Consensus only ends the run when at least one
                        # tool-using attempt agrees with it.
                        tir_support = any(
                            res.answer == leading and res.prompt_type in ('TIR', 'GEO')
                            for res in results
                        )

                        if tir_support:
                            log.info(
                                'Early stop: answer=%d with %d votes (TIR-backed)',
                                leading, vote_count,
                            )
                            stop_event.set()
                            for f in futures:
                                f.cancel()
                            break
                        else:
                            log.info(
                                'Early stop suppressed: answer=%d has %d votes but no TIR support',
                                leading, vote_count,
                            )

                except Exception:
                    log.error('Future result error:\n%s', traceback.format_exc())

        finally:
            # Tell running attempts to stop, then wait for their threads so
            # every sandbox is back in the pool before the next phase.
            stop_event.set()
            executor.shutdown(wait=True, cancel_futures=True)

        return results, valid_answers

    def _reflect_on_candidates(
        self, problem: str, results: list, valid_answers: list,
        reflect_mgr, deadline: float,
    ) -> None:
        """Re-check the top one or two candidates; append reflection results and raise verify scores in place."""
        unique_answers = set(valid_answers)
        time_left      = deadline - time.time()

        run_reflect = (
            reflect_mgr.can_reflect()
            and time_left > self.cfg.reflect_min_time_left
            and (not self.cfg.reflect_only_on_conflict or len(unique_answers) > 1)
        )
        if not (run_reflect and reflect_mgr.consume()):
            return

        temp_weights    = self._entropy_weights(results)
        top_candidates  = sorted(temp_weights, key=temp_weights.get, reverse=True)
        reflect_targets = top_candidates[:2] if len(unique_answers) > 1 else top_candidates[:1]

        # With less than twice the minimum time left, only the leader is checked.
        if len(reflect_targets) == 2 and time_left < self.cfg.reflect_min_time_left * 2:
            reflect_targets = reflect_targets[:1]

        log.info('Reflecting on candidates: %s (%ds left)', reflect_targets, int(time_left))

        for candidate in reflect_targets:
            sandbox = None
            try:
                sandbox = self._acquire_sandbox()
                if sandbox is None:
                    log.warning('No sandbox for reflection of candidate=%d', candidate)
                    continue

                tool     = Tool(sandbox, self.template)
                r_result = run_reflection_attempt(
                    self.client, self.encoding, self.stop_token_ids, self.cfg,
                    self.template, tool,
                    original_problem = problem,
                    candidate_answer = candidate,
                    stop_event       = threading.Event(),
                    deadline         = deadline,
                )
                results.append(r_result)

                if r_result.answer == candidate:
                    # A confirmation lifts every original attempt that gave
                    # the same answer.
                    for res in results:
                        if res.answer == candidate and not res.is_reflection:
                            res.verify_score = max(res.verify_score, 0.5)
                    log.info('Reflection CONFIRMED: %d (verify_score=%.2f)', candidate, r_result.verify_score)
                elif r_result.answer is not None:
                    log.info('Reflection CHANGED: %d ‚Üí %d', candidate, r_result.answer)

            except Exception:
                log.error('Reflection error for candidate=%d:\n%s', candidate, traceback.format_exc())
            finally:
                if sandbox is not None:
                    self._release_sandbox(sandbox)

    def _verify_close_conflict(self, problem: str, results: list, deadline: float) -> None:
        """Run the deterministic verifier when the top two candidates are close; raise the winner's verify scores in place."""
        time_left = deadline - time.time()
        weighted  = self._entropy_weights(results)

        if not (len(weighted) >= 2 and time_left > 45):
            return

        top2    = sorted(weighted, key=weighted.get, reverse=True)[:2]
        gap     = weighted[top2[0]] - weighted[top2[1]]
        total_w = sum(weighted.values())

        # One guarded ratio serves both the decision and the log line, so a
        # zero total weight cannot raise ZeroDivisionError while logging.
        gap_ratio = gap / max(total_w, 1e-9)

        # Only run if the gap is less than 20% of total weight — genuinely close.
        if not gap_ratio < 0.20:
            return

        log.info(
            'Close conflict (gap=%.1f%%) ‚Äî running deterministic verifier on %s',
            gap_ratio * 100, top2,
        )

        sandbox = None
        try:
            sandbox = self._acquire_sandbox()
            if sandbox is None:
                log.warning('No sandbox for deterministic verifier')
                return

            tool   = Tool(sandbox, self.template)
            winner = run_deterministic_verifier(
                self.client, self.encoding, self.stop_token_ids, self.cfg,
                self.template, tool,
                problem=problem,
                answer_a=top2[0],
                answer_b=top2[1],
                stop_event=threading.Event(),
                deadline=deadline,
            )
            if winner is not None:
                for r in results:
                    if r.answer == winner and not r.is_reflection:
                        r.verify_score = max(r.verify_score, 0.9)
                log.info('Deterministic verifier boosted: %d', winner)

        except Exception:
            log.error('Deterministic verifier error:\n%s', traceback.format_exc())
        finally:
            if sandbox is not None:
                self._release_sandbox(sandbox)

    @staticmethod
    def _entropy_weights(results: list) -> dict:
        """Sum ``1 / entropy`` per answer over all results that have an answer."""
        weights: dict[int, float] = defaultdict(float)
        for r in results:
            if r.answer is not None:
                weights[r.answer] += 1.0 / max(r.entropy, 1e-9)
        return weights

    def _acquire_sandbox(self):
        """Take a sandbox from the pool, or return None if none frees up within ``cfg.sandbox_timeout``."""
        try:
            return self.sandbox_pool.get(timeout=self.cfg.sandbox_timeout)
        except queue.Empty:
            return None

    def _release_sandbox(self, sandbox) -> None:
        """Reset a sandbox and return it to the pool, even if the reset failed."""
        try:
            sandbox.reset()
        except Exception:
            log.warning('Sandbox reset failed:\n%s', traceback.format_exc())
        self.sandbox_pool.put(sandbox)

    # ── cleanup ────────────────────────────────

    def __del__(self):
        if hasattr(self, 'server_process'):
            with contextlib.suppress(Exception):
                self.server_process.terminate()
                self.server_process.wait()
        if hasattr(self, 'log_file'):
            with contextlib.suppress(Exception):
                self.log_file.close()
        if hasattr(self, 'sandbox_pool'):
            while not self.sandbox_pool.empty():
                with contextlib.suppress(Exception):
                    self.sandbox_pool.get_nowait().close()

# ─────────────────────────────────────────────
# ENTRY POINT
# ─────────────────────────────────────────────

# Seed via transformers.set_seed, then build the single module-level Solver
# that predict() uses for every problem. Constructing it reads the model
# files, starts the vLLM server and starts the sandbox kernels, so this line
# blocks until startup has finished.
set_seed(CFG.seed)
solver = Solver(CFG)


def predict(
    id_: pl.DataFrame,
    question: pl.DataFrame,
    answer: Optional[pl.DataFrame] = None,
) -> pl.DataFrame:
    """
    Solve one problem for the inference server.

    Args:
        id_:      Single-value polars container holding the problem id.
        question: Single-value polars container holding the problem text.
        answer:   Unused; accepted only to match the gateway's call signature.

    Both a ``pl.Series`` and a one-cell ``pl.DataFrame`` are accepted for
    ``id_`` and ``question``.

    Returns:
        A one-row frame with columns ``id`` and ``answer``.
    """

    def _first_value(container):
        # DataFrame.item needs both a row and a column; Series.item takes an index.
        if isinstance(container, pl.DataFrame):
            return container.item(0, 0)
        return container.item(0)

    id_value      = _first_value(id_)
    question_text = _first_value(question)

    # Garbage collection is paused while solving and must be restored even
    # when the solver raises, otherwise it stays off for the rest of the run.
    gc.disable()
    try:
        final_answer = solver.solve(question_text)
    finally:
        gc.enable()
        gc.collect()

    return pl.DataFrame({'id': id_value, 'answer': final_answer})


# Hand predict() to the competition's inference server.
import kaggle_evaluation.aimo_3_inference_server
inference_server = kaggle_evaluation.aimo_3_inference_server.AIMO3InferenceServer(predict)

# When KAGGLE_IS_COMPETITION_RERUN is set, serve requests; otherwise run the
# local gateway against the public test.csv.
if os.getenv('KAGGLE_IS_COMPETITION_RERUN'):
    inference_server.serve()
else:
    inference_server.run_local_gateway(
        ('/kaggle/input/ai-mathematical-olympiad-progress-prize-3/test.csv',)
    )

Note: you may need to restart the kernel to use updated packages.


ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
pyldavis 3.4.1 requires scikit-learn>=1.0.0, which is not installed.
ydata-profiling 4.18.1 requires matplotlib<=3.10,>=3.5, which is not installed.
stable-baselines3 2.1.0 requires matplotlib, which is not installed.
sentence-transformers 5.1.1 requires scikit-learn, which is not installed.
librosa 0.11.0 requires scikit-learn>=1.1.0, which is not installed.
cuml-cu12 25.6.0 requires scikit-learn>=1.5, which is not installed.
bigframes 2.26.0 requires google-cloud-bigquery-storage<3.0.0,>=2.30.0, which is not installed.
bigframes 2.26.0 requires matplotlib>=3.7.1, which is not installed.
arviz 0.22.0 requires matplotlib>=3.8, which is not installed.
pynndescent 0.5.13 requires scikit-learn>=0.18, which is not installed.
shap 0.49.1 requires scikit-learn, which is not installed.
fastai 2.8.4 requires matplotlib, w


Problem: Solve $4+x=4$ for $x$.
Type=general | Difficulty=medium | Budget=900s | 7 TIR + 5 COT



2026-02-21 08:53:46,670 [INFO] Early stop: answer=0 with 6 votes (TIR-backed)


 Attempt Type  Answer  PyErr  Tokens  Entropy  VerifyScore
       0  TIR       0      0     201    0.423          0.0
       5  TIR       0      0     201    0.564          0.0
       4  TIR       0      0     201    0.479          0.0
       9  COT       0      0     201    0.380          0.0
       6  TIR       0      0     201    0.503          0.0
       3  TIR       0      0     201    0.374          0.0
 Answer  Votes  Score  MinEntropy
      0      6 13.522       0.374

Final Answer: 0


Problem: What is $0\times10$?
Type=general | Difficulty=medium | Budget=900s | 7 TIR + 5 COT



2026-02-21 08:53:50,422 [INFO] Early stop: answer=0 with 6 votes (TIR-backed)


 Attempt Type  Answer  PyErr  Tokens  Entropy  VerifyScore
      10  COT       0      0     114    0.181          0.0
      11  COT       0      0     195    0.292          0.0
       0  TIR       0      0     201    0.431          0.0
       2  TIR       0      0     201    0.492          0.0
       4  TIR       0      0     201    0.497          0.0
       5  TIR       0      0     201    0.579          0.0
 Answer  Votes  Score  MinEntropy
      0      6 14.591       0.181

Final Answer: 0


Problem: What is $1-1$?
Type=general | Difficulty=medium | Budget=900s | 7 TIR + 5 COT



2026-02-21 08:53:54,365 [INFO] Early stop: answer=0 with 6 votes (TIR-backed)


 Attempt Type  Answer  PyErr  Tokens  Entropy  VerifyScore
       9  COT       0      0     184    0.235          0.0
       0  TIR       0      0     201    0.435          0.0
       2  TIR       0      0     201    0.438          0.0
       1  TIR       0      0     201    0.612          0.0
       4  TIR       0      0     201    0.249          0.0
       6  TIR       0      0     201    0.474          0.0
 Answer  Votes  Score  MinEntropy
      0      6 16.253       0.235

Final Answer: 0



In [2]:
# import time
# import pandas as pd
# import polars as pl
# import gc

# # ============================================================================
# # 1. THE OFFICIAL AIMO3 REFERENCE DATASET (NOV 2025)
# # ============================================================================
# AIMO3_REFERENCE_BENCH = [
#     {
#         "id": "REF-01 (SWEETS)",
#         "question": "Alice and Bob are each holding some integer number of sweets. Alice says to Bob: 'If we each added the number of sweets we’re holding to our (positive integer) age, my answer would be double yours. If we took the product, then my answer would be four times yours.' Bob replies: 'Why don’t you give me five of your sweets because then both our sum and product would be equal.' What is the product of Alice and Bob’s ages?",
#         "answer": 50
#     },
#     {
#         "id": "REF-02 (RECTIL)",
#         "question": "A 500 x 500 square is divided into k rectangles, each having integer side lengths. Given that no two of these rectangles have the same perimeter, the largest possible value of k is K. What is the remainder when K is divided by 10^5?",
#         "answer": 520
#     },
#     {
#         "id": "REF-03 (MINPER)",
#         "question": "Let ABC be an acute-angled triangle with integer side lengths and AB < AC. Points D and E lie on segments BC and AC, respectively, such that AD = AE = AB. Line DE intersects AB at X. Circles BXD and CED intersect for the second time at Y != D. Suppose that Y lies on line AD. There is a unique such triangle with minimal perimeter. This triangle has side lengths a = BC, b = CA, and c = AB. Find the remainder when abc is divided by 10^5.",
#         "answer": 336
#     },
#     {
#         "id": "REF-04 (FUNVAL)",
#         "question": "Let f: Z+ -> Z+ be a function such that for all positive integers m and n, f(m) + f(n) = f(m + n + mn). Across all functions f such that f(n) <= 1000 for all n <= 1000, how many different values can f(2024) take?",
#         "answer": 580
#     },
#     {
#         "id": "REF-05 (RUNNERS)",
#         "question": "A tournament is held with 2^20 runners each of which has a different running speed. In each race, two runners compete against each other with the faster runner always winning the race. The competition consists of 20 rounds with each runner starting with a score of 0. In each round, the runners are paired in such a way that in each pair, both runners have the same score at the beginning of the round. The winner of each race in the i-th round receives 2^(20-i) points and the loser gets no points. At the end of the tournament, we rank the competitors according to their scores. Let N denote the number of possible orderings of the competitors at the end of the tournament. Let k be the largest positive integer such that 10^k divides N. What is the remainder when k is divided by 10^5?",
#         "answer": 21818
#     },
#     {
#         "id": "REF-06 (HERMITE)",
#         "question": "Define a function f: Z+ -> Z+ by f(n) = sum_{i=1 to n} sum_{j=1 to n} j^1024 floor( 1/j + (n-i)/n ). Let M = 2*3*5*7*11*13 and let N = f(M^15) - f(M^15 - 1). Let k be the largest non-negative integer such that 2^k divides N. What is the remainder when 2^k is divided by 5^7?",
#         "answer": 32951
#     },
#     {
#         "id": "REF-07 (N-TASTIC)",
#         "question": "Let ABC be a triangle with AB != AC, circumcircle Omega, and incircle omega. Let the contact points of omega with BC, CA, and AB be D, E, and F, respectively. Let the circumcircle of AFE meet Omega at K and let the reflection of K in EF be K'. Let N denote the foot of the perpendicular from D to EF. The circle tangent to line BN and passing through B and K intersects BC again at T != B. Let sequence (Fn) be defined by F0 = 0, F1 = 1 and for n >= 2, Fn = Fn-1 + Fn-2. Call ABC n-tastic if BD = Fn, CD = Fn+1, and KNK'B is cyclic. Across all n-tastic triangles, let a_n denote the maximum possible value of (CT*NB)/(BT*NE). Let alpha denote the smallest real number such that for all sufficiently large n, a_{2n} < alpha. Given that alpha = p + sqrt(q) for rationals p and q, what is the remainder when floor(p^q) is divided by 99991?",
#         "answer": 57447
#     },
#     {
#         "id": "REF-08 (DIGITSUM)",
#         "question": "On a blackboard, Ken starts off by writing a positive integer n and then applies the following move until he first reaches 1. Given that the number on the board is m, he chooses a base b, where 2 <= b <= m, and considers the unique base-b representation of m. Ken then erases m on the blackboard and replaces it with the sum of its base-b digits. Across all choices of 1 <= n <= 10^{10^5}, the largest possible number of moves Ken could make is M. What is the remainder when M is divided by 10^5?",
#         "answer": 32193
#     },
#     {
#         "id": "REF-09 (SHIFTY)",
#         "question": "Let F be the set of functions alpha: Z -> Z for which there are only finitely many n in Z such that alpha(n) != 0. For two functions alpha and beta in F, define their product alpha * beta to be sum_{n in Z} alpha(n)*beta(n). Also, for n in Z, define a shift operator Sn: F -> F by Sn(alpha)(t) = alpha(t + n) for all t in Z. A function alpha in F is called shifty if alpha(m) = 0 for all integers m < 0 and m > 8 and there exists beta in F and integers k != l such that for all n in Z, Sn(alpha) * beta = 1 if n in {k, l} and 0 otherwise. How many shifty functions are there in F?",
#         "answer": 160
#     },
#     {
#         "id": "REF-10 (NORWEGIAN)",
#         "question": "Let n >= 6 be a positive integer. We call a positive integer n-Norwegian if it has three distinct positive divisors whose sum is equal to n. Let f(n) denote the smallest n-Norwegian positive integer. Let M = 3^{2025!} and for a non-negative integer c define g(c) = floor( 2025! * f(M + c) / M ) / 2025!. We can write g(0) + g(4M) + g(1848374) + g(10162574) + g(265710644) + g(44636594) = p/q where p and q are coprime positive integers. What is the remainder when p + q is divided by 99991?",
#         "answer": 8687
#     }
# ]

# # ============================================================================
# # 2. RUNNER LOGIC
# # ============================================================================

# def run_aimo3_reference_challenge():
#     print(f"\n{'='*85}")
#     print(f"üöÄ STARTING OFFICIAL AIMO3 REFERENCE BENCHMARK (10 PROBLEMS)")
#     print(f"Goal: Match frontier commercial models (9-10/10)")
#     print(f"{'='*85}\n")
    
#     score_card = []
#     correct_count = 0
#     total_start = time.time()

#     for item in AIMO3_REFERENCE_BENCH:
#         prob_id = item['id']
#         question_text = item['question']
#         expected_answer = item['answer']
        
#         print(f"üìù Testing {prob_id}...")
        
#         # Prepare inputs as Polars DataFrames (simulating Kaggle environment)
#         # Using [0,0] compatible extraction in the predict function
#         id_df = pl.DataFrame({'id': [prob_id]})
#         q_df = pl.DataFrame({'question': [question_text]})
        
#         start_time = time.time()
        
#         try:
#             # Invokes the user's predict function
#             # Ensure predict() uses id_value = id_[0, 0] or similar
#             output_df = predict(id_df, q_df)
            
#             # Extract the answer safely
#             predicted_answer = int(output_df[0, "answer"])
#         except Exception as e:
#             print(f"❌ Error during solve of {prob_id}: {e}")
#             predicted_answer = -99999
            
#         duration = time.time() - start_time
#         is_correct = (predicted_answer == expected_answer)
        
#         if is_correct:
#             correct_count += 1
#             status = "✅ PASS"
#         else:
#             status = f"❌ FAIL (Exp: {expected_answer}, Got: {predicted_answer})"
            
#         print(f"Result: {status} | Time: {duration:.1f}s\n")
        
#         score_card.append({
#             "Problem": prob_id,
#             "Result": status,
#             "Time (s)": round(duration, 1)
#         })

#     total_time = (time.time() - total_start) / 60
    
#     print(f"\n{'='*85}")
#     print(f"üìä FINAL SCORECARD")
#     print(f"{'='*85}")
#     df_results = pd.DataFrame(score_card)
#     print(df_results.to_string(index=False))
#     print(f"{'='*85}")
#     print(f"Total Correct: {correct_count} / 10")
#     print(f"Accuracy:      {(correct_count/10)*100:.1f}%")
#     print(f"Total Duration: {total_time:.2f} minutes")
#     print(f"{'='*85}\n")

# # ============================================================================
# # 3. EXECUTION
# # ============================================================================

# if __name__ == "__main__":
#     if 'predict' in globals():
#         run_aimo3_reference_challenge()
#     else:
#         print("Error: 'predict' function not found. Please run your solver code first.")