In [1]:
from pathlib import Path
from typing import Literal
from custom_tiny_bench.processor.benchmark_processor import (
    BenchmarkConfig,
    EvaluationResult,
)
from custom_tiny_bench.tiny_benchmark import TinyBenchmark
import logging
# Verbose logging for the whole notebook. force=True drops any handlers already
# attached to the root logger, so re-running this cell reconfigures logging.
logging.basicConfig(level=logging.DEBUG, force=True)

# Directory handed to TinyBenchmark in the next cell.
save_dir = Path("data")

# Models whose GQA predictions are evaluated, in the order they are registered.
gqa_models = [
    "instructblip-vicuna-7b",
    "llava-v1.5-7b",
    "prism-clip+7b",
    "prism-dinosiglip+7b",
    "prism-siglip+7b",
]

# One BenchmarkConfig per benchmark; every model contributes one prediction
# file that follows the data/gqa/<model>/gqa-formatted-predictions.json layout.
bm_configs: list[BenchmarkConfig] = [
    BenchmarkConfig(
        name="gqa",
        results=[
            EvaluationResult(
                prediction_file=f"data/gqa/{model_name}/gqa-formatted-predictions.json",
                model=model_name,
            )
            for model_name in gqa_models
        ],
        question_file="data/gqa/questions.json",
        subscenario_keyword="structural_type",
    )
]

# --- Settings passed to the TinyBenchmark calls in the cells below ---
# Passed to train_irt(); presumably the number (int) or fraction (float) of
# models used for training -- TODO confirm against train_irt.
train_size: int | float = 4
# Passed to train_irt().
device: str = "cpu"
# Passed to get_anchors(); presumably the number of anchor items per scenario.
number_item: int = 100
# Passed to get_anchors() as its random state.
random_state: int = 42
# Clustering mode passed to get_anchors(); the annotation lists the accepted values.
clustering: Literal["irt", "correct."] = "irt"
# Flags passed to estimate_performance() for the p-IRT and gp-IRT estimators.
p_irt: bool = True
gp_irt: bool = True
# Passed to train_irt().
epochs: int = 2000


In [2]:
# Create the benchmark driver for `save_dir`; every later cell goes through `tinybm`.
tinybm = TinyBenchmark(save_dir)
# Debugging aid (presumably only useful after prepare_data has run -- verify):
# print(tinybm.bm_to_proc["gqa"].predictions.predictions_per_model)
# Pipeline outline -- each step is executed in its own cell below:
# tinybm.prepare_data(bm_configs)
# tinybm.train_irt(train_size, device, epochs)
# tinybm.get_anchors(number_item, random_state, clustering=clustering)
# tinybm.estimate_performance(p_irt=p_irt, gp_irt=gp_irt)


In [3]:
# Load the benchmark inputs described by `bm_configs`.
# Per the recorded log, this opens the question file and each model's prediction
# file and reports a naive accuracy for every model.
tinybm.prepare_data(bm_configs)

INFO:custom_tiny_bench.processor.benchmark_processor:Opening data/gqa/questions.json
INFO:custom_tiny_bench.processor.benchmark_processor:Opening data/gqa/instructblip-vicuna-7b/gqa-formatted-predictions.json
INFO:custom_tiny_bench.processor.benchmark_processor:Number of predictions: 12578
INFO:custom_tiny_bench.processor.benchmark_processor:Naive accuracy of instructblip-vicuna-7b: 0.48386
INFO:custom_tiny_bench.processor.benchmark_processor:Opening data/gqa/llava-v1.5-7b/gqa-formatted-predictions.json
INFO:custom_tiny_bench.processor.benchmark_processor:Number of predictions: 12578
INFO:custom_tiny_bench.processor.benchmark_processor:Naive accuracy of llava-v1.5-7b: 0.61735
INFO:custom_tiny_bench.processor.benchmark_processor:Opening data/gqa/prism-clip+7b/gqa-formatted-predictions.json
INFO:custom_tiny_bench.processor.benchmark_processor:Number of predictions: 12578
INFO:custom_tiny_bench.processor.benchmark_processor:Naive accuracy of prism-clip+7b: 0.64573
INFO:custom_tiny_bench.p

Config: ['instructblip-vicuna-7b', 'llava-v1.5-7b', 'prism-clip+7b', 'prism-dinosiglip+7b', 'prism-siglip+7b']


In [4]:
# Train the IRT model(s) with the configured train size, device and epoch count.
# NOTE(review): the recorded output shows two fits on data/irt_val_dataset.jsonlines
# (dims=5, dims=10) followed by one fit written to data/irt_model -- presumably a
# validation search over dimensions before the final fit; confirm in train_irt.
tinybm.train_irt(train_size, device, epochs)

  0%|          | 0/2 [00:00<?, ?it/s]

[22:38:21] config: model_type='multidim_2pl' epochs=2000              cli.py:109
           priors='hierarchical' initializers=[] dims=5 lr=0.1                  
           lr_decay=0.9999 dropout=0.5 hidden=100 vocab_size=None               
           log_every=200 seed=42 deterministic=True                             
           data_path: data/irt_val_dataset.jsonlines                  cli.py:111
           output directory: data/irt_val_model                       cli.py:112
[22:38:21] amortized: False                                       dataset.py:112
[22:38:21] Vocab size: None                                       training.py:90
           Training Model...                                          cli.py:116
           args: {'device': 'cpu', 'num_items': 12578,           training.py:134
           'num_subjects': 3}                                                   
           Parsed Model Args: {'device': 'cpu', 'num_items':     training.py:147
           12578, 'num_subje

 50%|█████     | 1/2 [00:22<00:22, 22.13s/it]

[22:38:43] config: model_type='multidim_2pl' epochs=2000              cli.py:109
           priors='hierarchical' initializers=[] dims=10 lr=0.1                 
           lr_decay=0.9999 dropout=0.5 hidden=100 vocab_size=None               
           log_every=200 seed=42 deterministic=True                             
           data_path: data/irt_val_dataset.jsonlines                  cli.py:111
           output directory: data/irt_val_model                       cli.py:112
[22:38:43] amortized: False                                       dataset.py:112
[22:38:43] Vocab size: None                                       training.py:90
           Training Model...                                          cli.py:116
           args: {'device': 'cpu', 'num_items': 12578,           training.py:134
           'num_subjects': 3}                                                   
           Parsed Model Args: {'device': 'cpu', 'num_items':     training.py:147
           12578, 'num_subje

100%|██████████| 2/2 [00:50<00:00, 25.13s/it]


[22:39:12] config: model_type='multidim_2pl' epochs=2000              cli.py:109
           priors='hierarchical' initializers=[] dims=5 lr=0.1                  
           lr_decay=0.9999 dropout=0.5 hidden=100 vocab_size=None               
           log_every=200 seed=42 deterministic=True                             
           data_path: data/irt_dataset.jsonlines                      cli.py:111
           output directory: data/irt_model                           cli.py:112
[22:39:12] amortized: False                                       dataset.py:112
[22:39:12] Vocab size: None                                       training.py:90
           Training Model...                                          cli.py:116
           args: {'device': 'cpu', 'num_items': 12578,           training.py:134
           'num_subjects': 4}                                                   
           Parsed Model Args: {'device': 'cpu', 'num_items':     training.py:147
           12578, 'num_subje

In [5]:
# Select the anchor items using the configured item count, random state and
# clustering mode; the recorded output lists the chosen item indices.
anchor = tinybm.get_anchors(
    number_item,
    random_state,
    clustering=clustering,
)


INFO:custom_tiny_bench.tiny_benchmark:[Anchor points] scenario: gqa, avg. error: 0.0128091441


Points [ 2819   102  9204 10786 12171  7025 10717  2781 11565  5975  7119  2435
  5291  9817  1577  8157  8145  8807 12090  8300  8393   874 10515 10863
  3146 11888  6145  1776  8101  2166  8552  7601  2281  1091 11168 10169
   271   952  7269 12443  4113  7564  3876  4929  3746  4325  9090   340
  2084 11412  7576  8654  1328  7276  8014  9209  1116  4028  5361  4615
  9716  9511  9910   696 10351 10468 10171  5971 11271  4997  8507  6629
  3520 10723  7378  9107 12523  5509  9763 12484  1684 12503 11433  6790
 10866   346  7490  8061  4440  1098  5278  3499  2677  5882  2304  5993
  6795  2289  1534  7230]


In [6]:
# Estimate model performance; the p-IRT and gp-IRT estimators are toggled by the
# config flags. `res` is consumed later as res[0] / res[1] / res[2], each indexed
# by [scenario][model] and labelled there as the IRT, p-IRT and gp-IRT estimates.
res = tinybm.estimate_performance(p_irt=p_irt, gp_irt=gp_irt)

100%|██████████| 1/1 [00:00<00:00, 53.37it/s]

[IRT] predicted score for 0_th model in gqa: 0.722675





[p-IRT] predicted score for 0_th model in gqa: 0.755298
[gp-IRT] predicted score for 0_th model in gqa: 0.723707


In [17]:
import numpy as np

# Random-sampling baseline: estimate each test model's accuracy from
# `number_item` uniformly drawn items per scenario, then compare its absolute
# error against the IRT-based estimates stored in `res`.
# Seeded from the notebook-level `random_state` so the baseline is reproducible
# and follows the same setting as the anchor selection.
rng = np.random.default_rng(seed=random_state)

# Labels for res[0], res[1], res[2], in the order they are indexed below.
estimator_labels = ("IRT", "p-IRT", "gp-IRT")

for scenario in tinybm.scenarios_position.keys():
    # Columns of the test matrix that belong to this scenario.
    Y_scenario = tinybm.test_data[:, tinybm.scenarios_position[scenario]]

    # Sample positions *within this scenario's columns*. The population size is
    # taken from the data instead of a hard-coded item count, so the indices
    # stay in range for any scenario size or number of scenarios.
    qids = np.arange(Y_scenario.shape[1])
    random_points = rng.choice(qids, size=number_item, replace=False)
    equal_weights = np.full(number_item, 1 / number_item)

    # Equal-weight estimate from the sampled items, one value per test model.
    Y_random = Y_scenario[:, random_points]
    Y_hat = (Y_random * equal_weights).sum(axis=1)
    # Reference score: balance-weighted mean over all items of the scenario.
    Balanced_true = (tinybm.balance_weights * tinybm.test_data)[:, tinybm.scenarios_position[scenario]].mean(axis=1)

    print(f"Result on {scenario}")
    for i in range(Balanced_true.shape[0]):
        print(f"Model {i}")
        print(f"    [Balanced true accruacy] {Balanced_true[i]:.5f}")
        print(f"    [Random] scenario: {scenario}, avg. error: {np.abs(Y_hat[i]-Balanced_true[i]):.5f}")
        for res_idx, label in enumerate(estimator_labels):
            print(f"    [{label}] scenario: {scenario}, avg. error: {np.abs(res[res_idx][scenario][i]-Balanced_true[i]):.5f}")


Result on gqa
Model 0
    [Balanced true accruacy] 0.73548
    [Random] scenario: gqa, avg. error: 0.06548
    [IRT] scenario: gqa, avg. error: 0.01281
    [p-IRT] scenario: gqa, avg. error: 0.01981
    [gp-IRT] scenario: gqa, avg. error: 0.01178
