In [None]:
import json
import pandas as pd
import numpy as np
from pathlib import Path
from slugify import slugify

import json, math
pd.options.display.float_format = '{:.2f}'.format

from functools import partial

In [None]:

# Systems under comparison. Each name is "<mode>-<provider>" with mode in
# {parametric, retrieval, agentic}.
implementations = [
    "parametric-gpt",
    "parametric-gemini",
    "parametric-claude",
    "parametric-llama",
    "parametric-qwen",

    "retrieval-gpt",
    "retrieval-gemini",
    "retrieval-claude",
    "retrieval-qwen",
    "retrieval-openscholar",

    "agentic-openscholar",
    "agentic-perplexity-web-search",
    "agentic-perplexity-web-search-reasoning",
    "agentic-openai-web-search",
    "agentic-gemini-web-search",
    "agentic-anthropic-web-research",
    "agentic-perplexity-deep-research",
    "agentic-openai-deep-research",
]

# Base directory where all field folders are located
base_dir = Path("/nlp/data/academic_pdfs/multi_domain_testbed/systems_comparison")

# `iterdir()` already yields paths that include `base_dir`, so they must not be
# joined onto `base_dir` a second time (that only happened to be harmless because
# `base_dir` is absolute; with a relative base it would double the prefix).
# Sorting makes the field order stable across runs and filesystems.
fields = sorted(d for d in base_dir.iterdir() if d.is_dir())
print(f"Num of Fields: {len(fields)}")

Num of Fields: 75


## Implementation Comparison

### Helper functions to extract the rubric coverage score

In [37]:
# Load the test split and build two lookups keyed by example id:
# the full record, and just its general domain.
test_set_path = "/nlp/data/academic_pdfs/multi_domain_testbed/data_release/researchqa/test.json"
# Decode explicitly as UTF-8 instead of relying on the platform's default encoding.
with open(test_set_path, "r", encoding="utf-8") as f:
    test_set = json.load(f)
id_to_data_test = {data['id']: data for data in test_set}
id_to_domain = {data['id']: data['general_domain'] for data in test_set}

In [None]:
def convert_text_to_score(text):
    """Translate a rubric judgement label into its integer score (1-5).

    The labels, from lowest to highest, are 'Not at all', 'Barely',
    'Moderately', 'Mostly' and 'Completely'. Any other label raises KeyError.
    """
    ordered_labels = ('Not at all', 'Barely', 'Moderately', 'Mostly', 'Completely')
    label_to_rank = {label: rank for rank, label in enumerate(ordered_labels, start=1)}
    return label_to_rank[text]


def normalize_5scale(x):
    """Rescale a score from the 1-5 range onto 0-1 (1 -> 0.0, 5 -> 1.0)."""
    distance_from_minimum = x - 1
    return distance_from_minimum / 4

### Leaderboard Score

Ref: https://colab.research.google.com/drive/1RAWb22-PFNI-X1gPVzc927SGUdfr6nsR?usp=sharing#scrollTo=C5H_wlbqGwCJ

In [None]:
# Read the test split (UTF-8 JSON) and index every item by its id.
test_split_path = Path("/nlp/data/academic_pdfs/multi_domain_testbed/data_release/researchqa/test.json")
with test_split_path.open(encoding="utf-8") as f:
    test_set = json.loads(f.read())

id_to_test_data = dict((item['id'], item) for item in test_set)

In [None]:
# Directory holding the pairwise ("relative") judge results, one JSON file per batch.
relative_judge_path = Path("/nlp/data/academic_pdfs/multi_domain_testbed/systems_comparison_relative_judge")

relative_judge_data = []
# glob() gives no ordering guarantee; sort the files so the row order of the
# resulting frame (and of everything derived from it) is reproducible.
for file in sorted(relative_judge_path.glob("*.json")):
    with open(file, "r", encoding="utf-8") as f:
        data = json.load(f)
    # Each file contributes all of its entries as rows.
    relative_judge_data.extend(data)
relative_judge_df_raw = pd.DataFrame(relative_judge_data)

# Keep this column order: the leaderboard cell unpacks `itertuples()` positionally.
relative_judge_df = relative_judge_df_raw[['session_id', 'implementation_A', 'implementation_B', 'judge', 'judge_swapped']]

relative_judge_df.head()

Unnamed: 0,session_id,implementation_A,implementation_B,judge,judge_swapped
0,174539434903306044-s15,agentic-anthropic-web-research,retrieval-claude,A,A
1,174539434903306044-s15,agentic-openai-deep-research,retrieval-qwen,A,A
2,174539434903306044-s15,parametric-gpt,retrieval-gpt,A,A
3,174539434903307756-s21,parametric-gemini,retrieval-gemini,A,A
4,174539434903307756-s21,parametric-claude,retrieval-gpt,B,B


In [67]:
# Number of pairwise comparisons that were loaded.
relative_judge_df.shape[0]

7650

In [None]:
def convert_text_to_score(text):
    """Return the 1-5 score for a rubric judgement label.

    'Not at all' -> 1, 'Barely' -> 2, 'Moderately' -> 3, 'Mostly' -> 4,
    'Completely' -> 5. An unrecognised label raises KeyError.
    """
    labels_low_to_high = ['Not at all', 'Barely', 'Moderately', 'Mostly', 'Completely']
    score_by_label = dict(zip(labels_low_to_high, range(1, 6)))
    return score_by_label[text]

# Parsed judge files, keyed by file path. Each value maps 'corpusid_sectionid'
# to that item's judge record. Re-running this cell resets the cache.
_ABSOLUTE_JUDGE_INDEX_CACHE = {}


def get_absolute_judge_data_by_id_and_implementation(session_id, field, implementation):
    """Return the absolute (rubric) judge record of one system for one session.

    Args:
        session_id: Value matched against each record's 'corpusid_sectionid'.
        field: Field name; it is slugified to obtain the field's folder name.
        implementation: System name of the form "<mode>-<provider>", e.g.
            "agentic-openai-deep-research" (mode "agentic", provider
            "openai-deep-research").

    Returns:
        The record from "<field>/<mode>/<provider>_judge_gpt4.1mini.json" whose
        'corpusid_sectionid' equals `session_id`. The record is shared with the
        cache, so callers should treat it as read-only.

    Raises:
        FileNotFoundError: If the judge file does not exist.
        KeyError: If the file has no record for `session_id`.
    """
    # Everything before the first '-' is the mode, the remainder is the provider.
    mode, _, provider = implementation.partition('-')
    judge_path = Path("/nlp/data/academic_pdfs/multi_domain_testbed/systems_comparison") / slugify(field) / mode / f"{provider}_judge_gpt4.1mini.json"

    # Parse and index each judge file only once; this function is called for
    # every (session, implementation) pair and the same file is hit repeatedly.
    id_to_absolute_judge_data = _ABSOLUTE_JUDGE_INDEX_CACHE.get(judge_path)
    if id_to_absolute_judge_data is None:
        with open(judge_path, "r", encoding="utf-8") as f:
            absolute_judge_data_overall = json.load(f)
        id_to_absolute_judge_data = {item['corpusid_sectionid']: item for item in absolute_judge_data_overall}
        _ABSOLUTE_JUDGE_INDEX_CACHE[judge_path] = id_to_absolute_judge_data

    return id_to_absolute_judge_data[session_id]

## Runtime: 1.5 minutes (as originally measured, before repeated pairs were skipped)
id_to_general_domain = {item['id']: item['general_domain'] for item in test_set}

# (session_id, implementation) -> sum of that answer's rubric scores (each 1-5).
id_impl_to_sum_absolute_judge_score = {}
pair_columns = relative_judge_df[['session_id', 'implementation_A', 'implementation_B']]
for session_id, implementation_A, implementation_B in pair_columns.itertuples(index=False, name=None):
    field = id_to_test_data[session_id]['field']
    for implementation in (implementation_A, implementation_B):
        score_key = (session_id, implementation)
        if score_key in id_impl_to_sum_absolute_judge_score:
            # Same answer already scored for an earlier pair; the sum would be identical.
            continue
        absolute_judge_data = get_absolute_judge_data_by_id_and_implementation(session_id, field, implementation)
        id_impl_to_sum_absolute_judge_score[score_key] = sum(
            convert_text_to_score(j['score']) for j in absolute_judge_data['rubric_judges_gpt']
        )

In [None]:
# For every judged pair, combine the summed absolute rubric scores with the two
# relative-judge verdicts and record whether implementation A wins, ties or loses.
leaderboard_winnder = []
for session_id, implementation_A, implementation_B, winner_judge, winner_judge_swapped in relative_judge_df.itertuples(index=False, name=None):

    general_domain = id_to_general_domain[session_id]

    score_a = id_impl_to_sum_absolute_judge_score[(session_id, implementation_A)]
    score_b = id_impl_to_sum_absolute_judge_score[(session_id, implementation_B)]

    assert winner_judge in ["A", "B"], f"winner_judge: {winner_judge}"
    assert winner_judge_swapped in ["A", "B"], f"winner_judge_swapped: {winner_judge_swapped}"

    # Each verdict (`judge` and `judge_swapped`) adds 5 points to the side it names.
    for verdict in (winner_judge, winner_judge_swapped):
        if verdict == "A":
            score_a += 5
        elif verdict == "B":
            score_b += 5

    # Outcome is expressed from implementation A's point of view.
    if score_a == score_b:
        leaderboard_status = "tie"
    elif score_a > score_b:
        leaderboard_status = "win"
    else:
        leaderboard_status = "lose"

    leaderboard_winnder.append(dict(
        session_id=session_id,
        general_domain=general_domain,
        implementation_A=implementation_A,
        implementation_B=implementation_B,
        score_A=score_a,
        score_B=score_b,
        leaderboard_status=leaderboard_status,
    ))

leaderboard_winner_df = pd.DataFrame(leaderboard_winnder)
leaderboard_winner_df.to_csv("/nlp/data/academic_pdfs/multi_domain_testbed/leaderboard_bootstrap/leaderboard_winner_df.csv", index=False)

leaderboard_winner_df

Unnamed: 0,session_id,general_domain,implementation_A,implementation_B,score_A,score_B,leaderboard_status
0,174539434903306044-s15,Physical & Theoretical Sciences,agentic-anthropic-web-research,retrieval-claude,43,22,win
1,174539434903306044-s15,Physical & Theoretical Sciences,agentic-openai-deep-research,retrieval-qwen,46,28,win
2,174539434903306044-s15,Physical & Theoretical Sciences,parametric-gpt,retrieval-gpt,33,28,win
3,174539434903307756-s21,Physical & Theoretical Sciences,parametric-gemini,retrieval-gemini,36,22,win
4,174539434903307756-s21,Physical & Theoretical Sciences,parametric-claude,retrieval-gpt,26,36,lose
...,...,...,...,...,...,...,...
7645,174539438544491012-s9,Life & Earth Sciences,agentic-perplexity-web-search,agentic-perplexity-web-search-reasoning,32,40,lose
7646,174539438544581152-s1,Life & Earth Sciences,agentic-openai-web-search,agentic-perplexity-web-search-reasoning,38,34,win
7647,174539438544588476-s16,Life & Earth Sciences,agentic-openai-web-search,agentic-perplexity-deep-research,27,39,lose
7648,174539439054783755-s12,Life & Earth Sciences,agentic-anthropic-web-research,agentic-openai-web-search,31,32,lose


In [54]:
# How often implementation A wins, loses or ties across all pairs.
leaderboard_winner_df.loc[:, 'leaderboard_status'].value_counts()

leaderboard_status
win     4378
lose    3050
tie      222
Name: count, dtype: int64

In [57]:
# Row count; one row per judged pair.
leaderboard_winner_df.shape[0]

7650

#### Bradley-Terry Model 


In [58]:
# Remove ties by entering every comparison twice. Non-tie rows are simply
# duplicated; a tie row becomes a 'win' for A in the first copy and a 'lose'
# for A in the second copy.
n_comparisons = len(leaderboard_winner_df)
df_dup = pd.concat([leaderboard_winner_df] * 2, ignore_index=True)

# Labels of the tie rows in the original frame. After the concat the first copy
# occupies rows [0, n_comparisons) and the second copy [n_comparisons, 2 * n_comparisons).
tie_indices = leaderboard_winner_df.index[leaderboard_winner_df['leaderboard_status'] == 'tie']

for row_shift, forced_status in ((0, 'win'), (n_comparisons, 'lose')):
    df_dup.loc[tie_indices + row_shift, 'leaderboard_status'] = forced_status

# Resulting modeling-ready DataFrame
df_dup = df_dup.reset_index(drop=True)
df_dup

Unnamed: 0,session_id,general_domain,implementation_A,implementation_B,score_A,score_B,leaderboard_status
0,174539434903306044-s15,Physical & Theoretical Sciences,agentic-anthropic-web-research,retrieval-claude,43,22,win
1,174539434903306044-s15,Physical & Theoretical Sciences,agentic-openai-deep-research,retrieval-qwen,46,28,win
2,174539434903306044-s15,Physical & Theoretical Sciences,parametric-gpt,retrieval-gpt,33,28,win
3,174539434903307756-s21,Physical & Theoretical Sciences,parametric-gemini,retrieval-gemini,36,22,win
4,174539434903307756-s21,Physical & Theoretical Sciences,parametric-claude,retrieval-gpt,26,36,lose
...,...,...,...,...,...,...,...
15295,174539438544491012-s9,Life & Earth Sciences,agentic-perplexity-web-search,agentic-perplexity-web-search-reasoning,32,40,lose
15296,174539438544581152-s1,Life & Earth Sciences,agentic-openai-web-search,agentic-perplexity-web-search-reasoning,38,34,win
15297,174539438544588476-s16,Life & Earth Sciences,agentic-openai-web-search,agentic-perplexity-deep-research,27,39,lose
15298,174539439054783755-s12,Life & Earth Sciences,agentic-anthropic-web-research,agentic-openai-web-search,31,32,lose


In [60]:
# Reduce the tie-free comparisons to the three columns read by the
# Bradley-Terry fit: model_a, model_b and winner.
a_won = df_dup["leaderboard_status"].eq("win")

df_dup_processed = pd.DataFrame({
    "model_a": df_dup["implementation_A"],
    "model_b": df_dup["implementation_B"],
    # 'win' means implementation A won; every other status is credited to B.
    "winner": a_won.map({True: "model_a", False: "model_b"}),
})
df_dup_processed

Unnamed: 0,model_a,model_b,winner
0,agentic-anthropic-web-research,retrieval-claude,model_a
1,agentic-openai-deep-research,retrieval-qwen,model_a
2,parametric-gpt,retrieval-gpt,model_a
3,parametric-gemini,retrieval-gemini,model_a
4,parametric-claude,retrieval-gpt,model_b
...,...,...,...
15295,agentic-perplexity-web-search,agentic-perplexity-web-search-reasoning,model_b
15296,agentic-openai-web-search,agentic-perplexity-web-search-reasoning,model_a
15297,agentic-openai-web-search,agentic-perplexity-deep-research,model_b
15298,agentic-anthropic-web-research,agentic-openai-web-search,model_b


In [None]:
from typing import Any, List, Optional, Tuple

def compute_bootstrap_bt(
    df: pd.DataFrame,
    num_round: int,
    base: float = 10.0,
    scale: float = 400.0,
    init_rating: float = 1000.0,
    tol: float = 1e-6,
    offset: float = 0.0,
    anchor_model_and_rating: Optional[Tuple[Any, float]] = None,
) -> pd.DataFrame:
    """Compute bootstrap Bradley-Terry ratings.

    The model is P(a beats b) = 1 / (1 + base ** (r_b - r_a)); fitted ratings r
    are reported as ``r * scale + init_rating + offset``.

    Args:
        df: One row per comparison with columns "model_a", "model_b" and
            "winner". A winner of "model_a" counts as 1.0 for model_a,
            "model_b" as 0.0, and any other value as 0.5.
        num_round: Number of bootstrap resamples (rows of the result).
        base: Base of the logistic model (the fit uses alpha = ln(base)).
        scale: Multiplier applied to the fitted ratings.
        init_rating: Constant added after scaling.
        tol: Gradient tolerance (``gtol``) passed to L-BFGS-B.
        offset: Extra constant added together with `init_rating`.
        anchor_model_and_rating: Optional ``(model, rating)``. When given, each
            bootstrap round is shifted so that `model` has exactly `rating`.

    Returns:
        DataFrame with `num_round` rows and one column per model, columns
        ordered by descending median rating.

    Raises:
        ValueError: If `df` is empty, or if the anchor model does not occur in `df`.
    """
    from scipy.optimize import minimize

    if len(df) == 0:
        raise ValueError("compute_bootstrap_bt requires at least one comparison row")

    def get_matchups_models(df: pd.DataFrame) -> Tuple[np.ndarray, List[Any]]:
        """Encode both model columns with one shared integer code per model."""
        n_rows = len(df)
        model_indices, models = pd.factorize(pd.concat([df["model_a"], df["model_b"]]))
        matchups = np.column_stack([model_indices[:n_rows], model_indices[n_rows:]])
        return matchups, models.to_list()

    def preprocess_for_bt(df: pd.DataFrame) -> Tuple[np.ndarray, np.ndarray, List[Any], np.ndarray]:
        """Collapse rows into unique (model_a, model_b, outcome) triples with counts."""
        n_rows = len(df)
        # Third column holds the outcome doubled: 2 = a wins, 0 = b wins, 1 = anything else.
        schedule = np.full((n_rows, 3), fill_value=1, dtype=np.int32)
        schedule[:, [0, 1]], models = get_matchups_models(df)
        schedule[df["winner"] == "model_a", 2] = 2
        schedule[df["winner"] == "model_b", 2] = 0
        matchups_outcomes, weights = np.unique(schedule, return_counts=True, axis=0)
        matchups = matchups_outcomes[:, [0, 1]]
        outcomes = matchups_outcomes[:, 2].astype(np.float64) / 2.0
        weights = weights.astype(np.float64)
        return matchups, outcomes, models, weights

    def bt_loss_and_grad(
        ratings: np.ndarray,
        matchups: np.ndarray,
        outcomes: np.ndarray,
        weights: np.ndarray,
        alpha: float = 1.0
    ) -> Tuple[float, np.ndarray]:
        """Weighted binary cross-entropy of the BT model and its gradient w.r.t. ratings."""
        matchup_ratings = ratings[matchups]
        logits = alpha * (matchup_ratings[:, 0] - matchup_ratings[:, 1])
        # log(sigmoid(z)) = -logaddexp(0, -z) and log(1 - sigmoid(z)) = -logaddexp(0, z).
        # Unlike log(probs) / log(1 - probs), these stay finite when a resample makes
        # one model dominate and the probabilities saturate at 0 or 1 (which would
        # otherwise give inf * 0 = nan in the loss).
        log_probs = -np.logaddexp(0.0, -logits)
        log_one_minus_probs = -np.logaddexp(0.0, logits)
        probs = np.exp(log_probs)
        loss = -((log_probs * outcomes + log_one_minus_probs * (1.0 - outcomes)) * weights).sum()
        matchups_grads = -alpha * (outcomes - probs) * weights
        model_grad = np.zeros_like(ratings)
        # Scatter-add: +grad to the model in column 0, -grad to the model in column 1.
        np.add.at(model_grad, matchups[:, [0, 1]], matchups_grads[:, None] * np.array([1.0, -1.0]))
        return loss, model_grad

    def fit_bt(matchups, outcomes, weights, n_models, alpha, tol=1e-6):
        """Fit ratings for one set of weights with L-BFGS-B, starting from zeros."""
        initial_ratings = np.zeros(n_models, dtype=np.float64)
        result = minimize(
            fun=bt_loss_and_grad,
            x0=initial_ratings,
            args=(matchups, outcomes, weights, alpha),
            jac=True,
            method="L-BFGS-B",
            options={"disp": False, "maxiter": 100, "gtol": tol},
        )
        return result["x"]

    def scale_and_offset(ratings, models, scale, init_rating, anchor_model_and_rating=None):
        """Apply the affine rescaling and, if requested, pin the anchor model's rating."""
        scaled_ratings = ratings * scale + init_rating
        if anchor_model_and_rating is not None:
            anchor_model, anchor_rating = anchor_model_and_rating
            idx = models.index(anchor_model)
            # Models live on the LAST axis (rows are bootstrap rounds), so select the
            # anchor's column and keep the axis for broadcasting. Indexing with
            # `scaled_ratings[idx]` would pick bootstrap round `idx` instead.
            scaled_ratings += anchor_rating - scaled_ratings[..., [idx]]
        return scaled_ratings

    matchups, outcomes, models, weights = preprocess_for_bt(df)
    # Bootstrap by resampling the counts of the unique rows rather than the rows themselves.
    rng = np.random.default_rng(seed=0)
    idxs = rng.multinomial(n=len(df), pvals=weights / weights.sum(), size=num_round)
    boot_weights = idxs.astype(np.float64) / len(df)

    bt_fn = partial(
        fit_bt, matchups, outcomes, n_models=len(models), alpha=math.log(base), tol=tol
    )

    results = [bt_fn(w) for w in boot_weights]
    ratings = np.array(results)
    scaled_ratings = scale_and_offset(
        ratings, models, scale, init_rating + offset, anchor_model_and_rating
    )

    df_out = pd.DataFrame(scaled_ratings, columns=models)
    sorted_cols = df_out.median().sort_values(ascending=False).index
    return df_out[sorted_cols]

In [66]:
boot_ratings = compute_bootstrap_bt(df_dup_processed, num_round=1000)

# Per-model mean and standard deviation over the bootstrap rounds.
mean = boot_ratings.mean()
std = boot_ratings.std()

# "mean ± std" label with both numbers rounded to whole rating points.
summary_str = pd.Series(
    [f"{model_mean:.0f} ± {model_std:.0f}" for model_mean, model_std in zip(mean, std)],
    index=mean.index,
)

# One row per model, best mean rating first.
summary_df = pd.concat(
    [mean.rename('mean'), std.rename('std'), summary_str.rename('mean ± std')],
    axis=1,
).sort_values(by='mean', ascending=False)

summary_df

Unnamed: 0,mean,std,mean ± std
agentic-perplexity-deep-research,1505.19,17.07,1505 ± 17
parametric-gemini,1243.88,9.84,1244 ± 10
agentic-anthropic-web-research,1148.6,9.61,1149 ± 10
agentic-openai-deep-research,1144.63,9.71,1145 ± 10
agentic-perplexity-web-search-reasoning,1114.68,9.52,1115 ± 10
parametric-claude,1099.17,9.4,1099 ± 9
parametric-gpt,1080.48,8.58,1080 ± 9
parametric-qwen,1037.58,8.85,1038 ± 9
retrieval-gpt,1019.68,8.82,1020 ± 9
retrieval-qwen,1010.93,9.02,1011 ± 9
