### COHERE API CALL


In [None]:
import json
import time
import cohere

from cohere.core.api_error import ApiError
CohereError = ApiError
TooManyRequestsError = ApiError

import os

# Paths for the benchmark questions (JSONL) and the model's raw predictions.
INPUT_FILE = "final-odyssey-math-with-levels.jsonl"
OUTPUT_FILE = "predictions.json"

# Credentials must not live in the notebook: anything committed or shared with
# the file is readable by everyone who sees it. The key is taken from the
# environment instead.
# NOTE(review): the key that was previously hard-coded here should be revoked
# and rotated, since it has already been exposed in this file.
API_KEY = os.environ.get("COHERE_API_KEY")
if not API_KEY:
    raise RuntimeError(
        "Set the COHERE_API_KEY environment variable before running this cell."
    )
MODEL = "command-a-03-2025"

co = cohere.ClientV2(API_KEY)
system_message = "Give only the exact numeric answer in less than 5 words. No words. No steps."

# SAFE API CALL (retries on 429 rate limits, 502 / non-JSON server errors, and unexpected exceptions)
def safe_api_call(question_text):
    """Ask the Cohere chat model one question, retrying on failures.

    Sends ``question_text`` as the user message (preceded by the module-level
    ``system_message``) through the module-level client ``co`` and returns the
    stripped text of the first content item of the reply.

    In this file ``TooManyRequestsError`` and ``CohereError`` are both bound to
    the same ``ApiError`` class, so two separate ``except`` clauses cannot tell
    a rate limit from any other API error: the first clause would swallow
    everything. The handler below therefore catches the API error once and
    dispatches on its HTTP status code.

    Args:
        question_text: The question to send to the model.

    Returns:
        The model's answer as a stripped string, or the sentinel
        ``"NO_RESPONSE"`` if all 7 attempts fail.
    """
    retries = 7

    for attempt in range(retries):
        try:
            res = co.chat(
                model=MODEL,
                messages=[
                    {"role": "system", "content": system_message},
                    {"role": "user", "content": question_text}
                ]
            )

            return res.message.content[0].text.strip()

        except CohereError as e:
            # getattr keeps this safe if the error object carries no status.
            status = getattr(e, "status_code", None)

            if status == 429:
                # Rate limited: back off linearly, 10s longer on each attempt.
                wait = 10 * (attempt + 1)
                print(f"[429] Rate limit — waiting {wait}s...")
            elif status == 502 or "502" in str(e) or "Server Error" in str(e):
                # Gateway / HTML / non-JSON server failures: shorter back-off.
                wait = 5 * (attempt + 1)
                print(f"[502] Temporary server error — retrying in {wait}s...")
            else:
                wait = 3
                print("CohereError:", e)

            time.sleep(wait)

        except Exception as e:
            # Best-effort boundary: e.g. an empty content list (IndexError)
            # must not abort the whole batch run.
            print("Unexpected error:", e)
            time.sleep(3)

    return "NO_RESPONSE"

# LOAD ORIGINAL QUESTIONS FROM JSONL
# Maps problem id -> question text, in file order.
questions = {}

with open(INPUT_FILE, encoding="utf-8") as source:
    for raw_line in source:
        # Blank lines carry no record.
        if raw_line.strip():
            record = json.loads(raw_line)
            # The first key of each JSON object is used as the problem id.
            problem_id = list(record)[0]
            questions[problem_id] = record[problem_id]["question"]

print(f"Loaded {len(questions)} questions.")

# PROCESS ALL QUESTIONS + SAVE ANSWERS
# Maps problem id -> answer string returned by safe_api_call.
results = {}

for problem_id, question_text in questions.items():
    print(f"\nProcessing {problem_id}...")

    # One guarded request per question; whatever string comes back is stored.
    answer = results[problem_id] = safe_api_call(question_text)
    print(f" → {answer}")

    # Short pause between requests so the rate limit is not reached as quickly.
    time.sleep(0.8)

# SAVE OUTPUT AS JSON
with open(OUTPUT_FILE, "w", encoding="utf-8") as out_file:
    json.dump(results, out_file, indent=4)

print(f"\nFinished! Saved results in {OUTPUT_FILE}")


Loaded 250 questions.

Processing Problem_1...
 → 1013

Processing Problem_2...
 → The minimum value of $l$ is $\boxed{126}$.

Processing Problem_3...
 → 51

Processing Problem_4...
 → To determine \( f(2020) \), we need to analyze the conditions given for a number \( k \) to be a "fan" of \( n = 2020 \). Specifically, \( k \) must satisfy the following conditions:
1. \( 0 \leq k \leq 2019 \)
2. There exist integers \( x, y, z \) such that:
   \[
   x^2 + y^2 + z^2 \equiv 0 \pmod{2020}
   \]
   \[
   xyz \equiv k \pmod{2020}
   \]

First, factorize \( 2020 \):
\[
2020 = 2^2 \times 5 \times 101
\]

We will use the Chinese Remainder Theorem (CRT) to break down the problem into solving the conditions modulo \( 4 \), \( 5 \), and \( 101 \).

### Step 1: Analyze modulo 4
For \( n = 4 \):
\[
x^2 + y^2 + z^2 \equiv 0 \pmod{4}
\]
The possible values of \( x^2 \pmod{4} \) are \( 0 \) or \( 1 \). The only way the sum of three squares can be \( 0 \pmod{4} \) is if all three squares are \( 0 \pmod

### GEMINI API CALL


In [None]:
import json
import requests
import time

import os

# Credentials must not live in the notebook; read the key from the environment.
# NOTE(review): the key that was previously hard-coded here should be revoked
# and rotated, since it has already been exposed in this file.
API_KEY = os.environ.get("GEMINI_API_KEY")
if not API_KEY:
    raise RuntimeError(
        "Set the GEMINI_API_KEY environment variable before running this cell."
    )

# The key is deliberately NOT placed in the query string: requests includes the
# full URL in many exception messages, which would print the key to the log.
endpoint = "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash-lite:generateContent"

# Shared session; every request made through it carries the key as a header.
session = requests.Session()
session.headers["x-goog-api-key"] = API_KEY


def safe_text(text, max_len=15000):
    """Return ``text`` cut down to at most ``max_len`` characters.

    Shorter inputs come back unchanged. The default limit follows the original
    author's note that Gemini fails on questions above roughly 15k characters.
    """
    # Slicing past the end of a string is a no-op, so no length check is needed.
    return text[:max_len]


def _extract_answer(data):
    """Return the text of the first part of the first candidate, or None."""
    try:
        return data["candidates"][0]["content"]["parts"][0]["text"]
    except (KeyError, IndexError, TypeError):
        # Missing keys, empty lists, or an unexpected shape (e.g. an error body).
        return None


def call_gemini(question):
    """Send one question to the Gemini endpoint, retrying up to 3 times.

    The question is trimmed with ``safe_text`` and posted through the
    module-level ``session`` to ``endpoint`` together with a fixed system
    instruction. Failed attempts (transport error, non-2xx status, invalid
    JSON, or a reply without text) are logged and retried after 1.5 seconds.

    Args:
        question: The question text to send.

    Returns:
        The stripped answer text, or the sentinel ``"No response"`` if no
        attempt produced any text.
    """
    payload = {
        "contents": [
            {
                "role": "user",
                "parts": [{"text": safe_text(question)}]
            }
        ],
        "systemInstruction": {
            "role": "system",
            "parts": [
                {
                    "text": "You are a Math assistant. Always give strictly numeric answers, 1–5 words only. No explanations."
                }
            ]
        }
    }

    max_attempts = 3

    for attempt in range(max_attempts):
        try:
            response = session.post(endpoint, json=payload, timeout=30)

            if response.ok:
                answer = _extract_answer(response.json())
                if answer:
                    return answer.strip()
                print(f"Empty answer (attempt {attempt+1})")
            else:
                # Surface API-side failures (quota, bad request, ...) instead
                # of retrying silently; the body is truncated to keep logs short.
                print(
                    f"HTTP {response.status_code} (attempt {attempt+1}): "
                    f"{response.text[:200]}"
                )

        except (requests.RequestException, ValueError) as e:
            # RequestException: transport problems; ValueError: body not JSON.
            print(f"Error (attempt {attempt+1}): {e}")

        # Wait before the next retry, but not after the final attempt.
        if attempt < max_attempts - 1:
            time.sleep(1.5)

    return "No response"

input_file = "final-odyssey-math-with-levels.jsonl"
output_file = "answers.json"
failed_file = "failed_questions.json"

# problem key -> answer text, for questions that received a reply
answers_output = {}
# problem key -> original question, for questions that received none
failed_questions = {}

with open(input_file, encoding="utf-8") as source:
    for raw_line in source:
        if raw_line.strip():
            obj = json.loads(raw_line)
            # The first key of each JSON object is used as the problem key.
            problem_key = list(obj)[0]
            question = obj[problem_key]["question"]

            print(f"Processing {problem_key}...")

            answer = call_gemini(question)

            # "No response" is the sentinel call_gemini returns when it gives up.
            if answer != "No response":
                answers_output[problem_key] = answer
            else:
                failed_questions[problem_key] = question


# Save answers first, then the failed list.
for path, payload in ((output_file, answers_output), (failed_file, failed_questions)):
    with open(path, "w", encoding="utf-8") as out_file:
        json.dump(payload, out_file, indent=4)

print("\nDone!")
print(f"Successful answers: {len(answers_output)}")
print(f"Failed questions saved to {failed_file}")

Processing Problem_1...
Processing Problem_2...
Processing Problem_3...
Processing Problem_4...
Processing Problem_5...
Processing Problem_6...
Processing Problem_7...
Processing Problem_8...
Processing Problem_9...
Processing Problem_10...
Processing Problem_11...
Processing Problem_12...
Processing Problem_13...
Processing Problem_14...
Processing Problem_15...
Processing Problem_16...
Processing Problem_17...
Processing Problem_18...
Processing Problem_19...
Processing Problem_20...
Processing Problem_21...
Processing Problem_22...
Processing Problem_23...
Processing Problem_24...
Processing Problem_25...
Processing Problem_26...
Processing Problem_27...
Processing Problem_28...
Processing Problem_29...
Processing Problem_30...
Processing Problem_31...
Processing Problem_32...
Processing Problem_33...
Processing Problem_34...
Processing Problem_35...
Processing Problem_36...
Processing Problem_37...
Processing Problem_38...
Processing Problem_39...
Processing Problem_40...
Processin

### Conversion of JSON file to CSV file

### Correct answers

In [None]:
import json
import csv

INPUT_FILE = "final-odyssey-math-with-levels.jsonl"
OUTPUT_FILE = "original.csv"

# One [problem_id, answer, label, level] entry per non-blank input line.
rows = []

with open(INPUT_FILE, encoding="utf-8") as source:
    for raw_line in source:
        if raw_line.strip():
            data = json.loads(raw_line)

            # The first key of each JSON object is used as the problem id.
            problem_id = list(data)[0]
            content = data[problem_id]

            # Strip double backslashes and newlines, then surrounding whitespace.
            answer = content.get("answer", "")
            for unwanted in ("\\\\", "\n"):
                answer = answer.replace(unwanted, "")
            answer = answer.strip()

            rows.append(
                [problem_id, answer, content.get("label", ""), content.get("level", "")]
            )

# Write CSV: header first, then every collected row.
with open(OUTPUT_FILE, "w", newline="", encoding="utf-8") as out_file:
    csv.writer(out_file).writerows(
        [["problem_id", "answer", "label", "level"], *rows]
    )

print(f"Saved {len(rows)} rows to {OUTPUT_FILE}")

Saved 387 rows to original.csv


### Cohere Answers

In [None]:
import json
import csv

INPUT_JSON = "predictions.json"
OUTPUT_CSV = "cohere.csv"

# The JSON file holds a single object mapping problem id -> answer.
with open(INPUT_JSON, encoding="utf-8") as source:
    data = json.load(source)

with open(OUTPUT_CSV, "w", newline="", encoding="utf-8") as out_file:
    writer = csv.writer(out_file)
    writer.writerow(["problem_id", "answer"])
    # Each (key, value) pair becomes one two-column row, in dict order.
    writer.writerows(data.items())

print(f"Saved CSV as {OUTPUT_CSV}")

Saved CSV as cohere.csv


### Gemini Answers

In [None]:
import json
import csv
import re


INPUT_JSON = "answers.json"   # JSON object mapping problem id -> answer
OUTPUT_CSV = "gemini.csv"     # CSV file to produce
TOTAL_PROBLEMS = 387          # number of problem ids expected in the output

# Read the whole answers file into a dict.
with open(INPUT_JSON, encoding="utf-8") as source:
    data = json.load(source)

def get_number(pid):
    """Return the integer N from the first ``Problem_N`` in ``pid``, else None."""
    found = re.search(r"Problem_(\d+)", pid)
    if found is None:
        return None
    return int(found.group(1))

# Create CSV
# Create the CSV: one row per expected problem, in numeric order.
with open(OUTPUT_CSV, "w", newline="", encoding="utf-8") as out_file:
    writer = csv.writer(out_file)

    # Header
    writer.writerow(["problem_id", "answer"])

    # Walk 1 .. TOTAL_PROBLEMS; ids absent from the JSON get the marker "NA".
    for num in range(1, TOTAL_PROBLEMS + 1):
        pid = f"Problem_{num}"
        writer.writerow([pid, data.get(pid, "NA")])

print(f"Saved CSV with missing problems filled as 'NA' → {OUTPUT_CSV}")

Saved CSV with missing problems filled as 'NA' → gemini.csv


### Accuracy comparison

In [None]:
import pandas as pd

# Load files
# Answers are compared as text further down, so every column is read as a
# string. Without dtype=str pandas infers types per column, and an answer
# column that happens to be entirely numeric would turn "126" into 126.0
# (later "126.0"), which can never equal the ground-truth string "126".
# The default missing-value markers (such as "NA") are still parsed as NaN.
original = pd.read_csv("original.csv", dtype=str)
cohere = pd.read_csv("cohere.csv", dtype=str)
gemini = pd.read_csv("gemini.csv", dtype=str)

# Normalize problem_id so merge works properly
for df in [original, cohere, gemini]:
    df["problem_id"] = df["problem_id"].astype(str).str.strip()

# Merge predictions with ground truth. A left join keeps every ground-truth
# row; the two "answer" columns become answer_true / answer_pred.
cohere_merged = original.merge(cohere, on="problem_id", how="left", suffixes=("_true", "_pred"))
gemini_merged = original.merge(gemini, on="problem_id", how="left", suffixes=("_true", "_pred"))

# Function to clean answer fields for comparison
def clean(x):
    """Normalise one answer cell for comparison.

    Missing values give ``None``. Anything else is converted to ``str``,
    stripped, and has spaces, double backslashes and newlines removed.
    """
    if pd.isna(x):
        return None

    text = str(x).strip()
    # Removal order matches the original: spaces, then "\\\\", then newlines.
    for unwanted in (" ", "\\\\", "\n"):
        text = text.replace(unwanted, "")
    return text

# Add normalised copies of the true and predicted answers to both merged frames.
for merged in (cohere_merged, gemini_merged):
    for column in ("answer_true", "answer_pred"):
        merged[f"{column}_clean"] = merged[column].apply(clean)

# Function to compute accuracy and counts
def compute_accuracy(df):
    """Compute exact-match accuracy overall, per label and per level.

    Rows whose ``answer_pred_clean`` is missing are excluded from every
    accuracy figure and reported separately as skipped.

    Args:
        df: Frame with the columns ``answer_true_clean``,
            ``answer_pred_clean``, ``label`` and ``level``.

    Returns:
        A tuple ``(overall_accuracy, label_acc, level_acc, total,
        compared_count, skipped_count)`` where the first item is the mean of
        the exact-match mask, the next two are Series of that mean indexed by
        label / level, and the last three are row counts.
    """
    total = len(df)
    compared_df = df.dropna(subset=["answer_pred_clean"])
    compared_count = len(compared_df)
    skipped_count = total - compared_count

    # Build the exact-match mask once and reuse it for all three figures.
    is_correct = compared_df["answer_true_clean"] == compared_df["answer_pred_clean"]

    overall_accuracy = is_correct.mean()

    # Group the boolean Series directly rather than calling
    # DataFrameGroupBy.apply on frames that still contain the grouping column,
    # which recent pandas versions warn about.
    label_acc = is_correct.groupby(compared_df["label"]).mean()
    level_acc = is_correct.groupby(compared_df["level"]).mean()

    return overall_accuracy, label_acc, level_acc, total, compared_count, skipped_count

# Compute for APIs
def _print_report(heading, accuracy, label_acc, level_acc, total, compared, skipped):
    """Print one provider's accuracy summary under the given heading line."""
    print(heading)
    print(f"Total problems: {total}")
    print(f"Compared (non-NA): {compared}")
    print(f"Skipped (NA predictions): {skipped}")
    print("Overall Accuracy:", round(accuracy * 100, 2), "%")
    print("\nAccuracy by Label:\n", label_acc)
    print("\nAccuracy by Level:\n", level_acc)


# Compute for APIs
cohere_acc, cohere_label_acc, cohere_level_acc, coh_total, coh_comp, coh_skip = compute_accuracy(cohere_merged)
gemini_acc, gemini_label_acc, gemini_level_acc, gem_total, gem_comp, gem_skip = compute_accuracy(gemini_merged)

# Print results: Cohere first, then Gemini.
_print_report(
    "\n=== Cohere Accuracy ===",
    cohere_acc, cohere_label_acc, cohere_level_acc, coh_total, coh_comp, coh_skip,
)
_print_report(
    "\n=== Gemini Accuracy ===",
    gemini_acc, gemini_label_acc, gemini_level_acc, gem_total, gem_comp, gem_skip,
)



=== Cohere Accuracy ===
Total problems: 387
Compared (non-NA): 387
Skipped (NA predictions): 0
Overall Accuracy: 28.42 %

Accuracy by Label:
 label
Algebra                                0.258278
Calculus                               0.400000
Calculus and Analysis                  0.291667
Combinatorics                          0.054054
Differential Equations                 0.642857
Geometry                               0.333333
Linear Algebra and Abstract Algebra    0.200000
NumberTheory                           0.000000
PreCalculus                            0.595745
Probability                            0.047619
Series                                 0.000000
Statistics                             0.235294
Trigonometry                           0.000000
dtype: float64

Accuracy by Level:
 level
college math               0.257426
high school competition    0.074324
high school math           0.528986
dtype: float64

=== Gemini Accuracy ===
Total problems: 387
Compared (non-NA)