In [1]:
import sys

# In order to import custom tiny bench from example directory
sys.path.append("../")

from pathlib import Path
from typing import Literal
from custom_tiny_bench.processor.benchmark_processor import (
    BenchmarkConfig,
    EvaluationResult,
)
from custom_tiny_bench.tiny_benchmark import TinyBenchmark
import logging
logging.basicConfig(level=logging.DEBUG,
                    force = True)

save_dir = Path("../data")
bm_configs: list[BenchmarkConfig] = [
    BenchmarkConfig(
        name="gqa",
        results=[
            EvaluationResult(
                prediction_file="../data/gqa/instructblip-vicuna-7b/gqa-formatted-predictions.json",
                model="instructblip-vicuna-7b",
            ),
            EvaluationResult(
                prediction_file="../data/gqa/llava-v1.5-7b/gqa-formatted-predictions.json",
                model="llava-v1.5-7b",
            ),
            EvaluationResult(
                prediction_file="../data/gqa/prism-clip+7b/gqa-formatted-predictions.json",
                model="prism-clip+7b",
            ),
            EvaluationResult(
                prediction_file="../data/gqa/prism-dinosiglip+7b/gqa-formatted-predictions.json",
                model="prism-dinosiglip+7b",
            ),
            EvaluationResult(
                prediction_file="../data/gqa/prism-siglip+7b/gqa-formatted-predictions.json",
                model="prism-siglip+7b",
            ),
        ],
        question_file="../data/gqa/questions.json",
        subscenario_keyword="structural_type"
    ),
    BenchmarkConfig(
        name="text-vqa",
        results=[
            EvaluationResult(
                prediction_file="../data/text-vqa/instructblip-vicuna-7b/results+rank-0.json",
                model="instructblip-vicuna-7b",
            ),
            EvaluationResult(
                prediction_file="../data/text-vqa/llava-v1.5-7b/results+rank-0.json",
                model="llava-v1.5-7b",
            ),
            EvaluationResult(
                prediction_file="../data/text-vqa/prism-clip+7b/results+rank-0.json",
                model="prism-clip+7b",
            ),
            EvaluationResult(
                prediction_file="../data/text-vqa/prism-dinosiglip+7b/results+rank-0.json",
                model="prism-dinosiglip+7b",
            ),
            EvaluationResult(
                prediction_file="../data/text-vqa/prism-siglip+7b/results+rank-0.json",
                model="prism-siglip+7b",
            ),
        ],
        question_file="../data/text-vqa/annotations-textvqa-full.json",
    )
]
train_size: int | float = 4
device = "cpu"
number_item: int = 100
random_state: int = 42
clustering: Literal["irt", "correct."] = "irt"
p_irt: bool = True
gp_irt = True
epochs = 2000


In [2]:
tinybm = TinyBenchmark(save_dir, balance=False)


In [3]:
tinybm.prepare_data(bm_configs)

INFO:custom_tiny_bench.processor.benchmark_processor:Opening ../data/gqa/questions.json
INFO:custom_tiny_bench.processor.benchmark_processor:Opening ../data/gqa/instructblip-vicuna-7b/gqa-formatted-predictions.json
INFO:custom_tiny_bench.processor.benchmark_processor:Number of predictions: 12578
INFO:custom_tiny_bench.processor.benchmark_processor:Opening ../data/gqa/llava-v1.5-7b/gqa-formatted-predictions.json
INFO:custom_tiny_bench.processor.benchmark_processor:Number of predictions: 12578
INFO:custom_tiny_bench.processor.benchmark_processor:Opening ../data/gqa/prism-clip+7b/gqa-formatted-predictions.json
INFO:custom_tiny_bench.processor.benchmark_processor:Number of predictions: 12578
INFO:custom_tiny_bench.processor.benchmark_processor:Opening ../data/gqa/prism-dinosiglip+7b/gqa-formatted-predictions.json
INFO:custom_tiny_bench.processor.benchmark_processor:Number of predictions: 12578
INFO:custom_tiny_bench.processor.benchmark_processor:Opening ../data/gqa/prism-siglip+7b/gqa-form

Config: ['instructblip-vicuna-7b', 'llava-v1.5-7b', 'prism-clip+7b', 'prism-dinosiglip+7b', 'prism-siglip+7b']


INFO:custom_tiny_bench.processor.benchmark_processor:Opening ../data/text-vqa/llava-v1.5-7b/results+rank-0.json
INFO:custom_tiny_bench.processor.benchmark_processor:Number of predictions: 5000
INFO:custom_tiny_bench.processor.benchmark_processor:Opening ../data/text-vqa/prism-clip+7b/results+rank-0.json
INFO:custom_tiny_bench.processor.benchmark_processor:Number of predictions: 5000
INFO:custom_tiny_bench.processor.benchmark_processor:Opening ../data/text-vqa/prism-dinosiglip+7b/results+rank-0.json
INFO:custom_tiny_bench.processor.benchmark_processor:Number of predictions: 5000
INFO:custom_tiny_bench.processor.benchmark_processor:Opening ../data/text-vqa/prism-siglip+7b/results+rank-0.json
INFO:custom_tiny_bench.processor.benchmark_processor:Number of predictions: 5000
INFO:custom_tiny_bench.processor.benchmark_processor:[create_correctness_array] Shape of correctness array (5, 5000)


Config: ['instructblip-vicuna-7b', 'llava-v1.5-7b', 'prism-clip+7b', 'prism-dinosiglip+7b', 'prism-siglip+7b']


100%|██████████| 100/100 [00:00<00:00, 8886.43it/s]
INFO:custom_tiny_bench.processor.benchmark_processor:After binarize, error is [0.00248 0.0033  0.00224 0.00312 0.00322]
INFO:custom_tiny_bench.tiny_benchmark:[prepare_data] correctness_array.shape == (5, 17578)


In [4]:
tinybm.train_irt(train_size, device, epochs)

  0%|          | 0/2 [00:00<?, ?it/s]

[20:54:48] config: model_type='multidim_2pl' epochs=2000              cli.py:109
           priors='hierarchical' initializers=[] dims=5 lr=0.1                  
           lr_decay=0.9999 dropout=0.5 hidden=100 vocab_size=None               
           log_every=200 seed=42 deterministic=True                             
           data_path: ../data/datasets/irt_val_dataset.jsonlines      cli.py:111
           output directory: ../data/models/irt_val_model             cli.py:112
[20:54:48] amortized: False                                       dataset.py:112
[20:54:48] Vocab size: None                                       training.py:90
           Training Model...                                          cli.py:116
           args: {'device': 'cpu', 'num_items': 17578,           training.py:134
           'num_subjects': 3}                                                   
           Parsed Model Args: {'device': 'cpu', 'num_items':     training.py:147
           17578, 'num_subje

 50%|█████     | 1/2 [00:26<00:26, 26.71s/it]

[20:55:17] config: model_type='multidim_2pl' epochs=2000              cli.py:109
           priors='hierarchical' initializers=[] dims=10 lr=0.1                 
           lr_decay=0.9999 dropout=0.5 hidden=100 vocab_size=None               
           log_every=200 seed=42 deterministic=True                             
           data_path: ../data/datasets/irt_val_dataset.jsonlines      cli.py:111
           output directory: ../data/models/irt_val_model             cli.py:112
[20:55:17] amortized: False                                       dataset.py:112
[20:55:17] Vocab size: None                                       training.py:90
           Training Model...                                          cli.py:116
           args: {'device': 'cpu', 'num_items': 17578,           training.py:134
           'num_subjects': 3}                                                   
           Parsed Model Args: {'device': 'cpu', 'num_items':     training.py:147
           17578, 'num_subje

100%|██████████| 2/2 [01:03<00:00, 31.93s/it]


[20:55:52] config: model_type='multidim_2pl' epochs=2000              cli.py:109
           priors='hierarchical' initializers=[] dims=10 lr=0.1                 
           lr_decay=0.9999 dropout=0.5 hidden=100 vocab_size=None               
           log_every=200 seed=42 deterministic=True                             
           data_path: ../data/datasets/irt_dataset.jsonlines          cli.py:111
           output directory: ../data/models/irt_model                 cli.py:112
[20:55:52] amortized: False                                       dataset.py:112
[20:55:52] Vocab size: None                                       training.py:90
           Training Model...                                          cli.py:116
           args: {'device': 'cpu', 'num_items': 17578,           training.py:134
           'num_subjects': 4}                                                   
           Parsed Model Args: {'device': 'cpu', 'num_items':     training.py:147
           17578, 'num_subje

In [5]:
anchor = tinybm.get_anchors(number_item, random_state, clustering= clustering)


INFO:custom_tiny_bench.tiny_benchmark:Points for gqa: [11525   479  7320  1545 12222  1308 11746 12289  1264  7145  9775  1191
 10475  2080 10570  3702  1987  7522   312 10377    41  7739   708  4051
  2901  1436  7115  1946 11551  3867  2240  2728  4701  9602 11073  3277
  6162 10504  3202  1545  9525 11629  8216  5025    95 11664  1184  1658
  7323   309  6283 11845  9590 12439  3766  2339  4641 12053 10447  5008
  6852 10328 10482  8283  1691 10258  6505  4994  1179  9825 10468  2460
  5192  4246  1011 11419  3266  7478 11773  4838  1378   342   547  2892
  5816  5598  6821 11141 10613  3005  3376  8894  8395  3345 11208  5544
  9588  9716  6775  3681]
INFO:custom_tiny_bench.tiny_benchmark:Points for text-vqa: [ 860 3713 4524   67 3751 4075  459 3759 3797  600 4631 3456  845 4006
 1768 4789 2974 2320 4644 4729  576  675 3716 2221 3565 3184 4475 4027
 1434  694 2597  309 2188 2987 4040 1919 4561 2258 1099  393 1269 1660
 3999 3624 1966 1487 3060 3302 3170 4376  100 2201 4400 4585 119

In [6]:
res = tinybm.estimate_performance(p_irt=p_irt, gp_irt=gp_irt)

100%|██████████| 1/1 [00:00<00:00, 53.30it/s]
INFO:custom_tiny_bench.estimator:[Naive accuracy]: 0.610000
INFO:custom_tiny_bench.estimator:[IRT] predicted score for 0_th model in gqa: 0.642471
INFO:custom_tiny_bench.estimator:[Naive accuracy]: 0.660000
INFO:custom_tiny_bench.estimator:[IRT] predicted score for 0_th model in text-vqa: 0.630400
INFO:custom_tiny_bench.estimator:[p-IRT] predicted score for 0_th model in gqa: 0.662615
INFO:custom_tiny_bench.estimator:[p-IRT] predicted score for 0_th model in text-vqa: 0.621258
INFO:custom_tiny_bench.estimator:[gp-IRT] predicted score for 0_th model in gqa: 0.644777
INFO:custom_tiny_bench.estimator:[gp-IRT] predicted score for 0_th model in text-vqa: 0.629734


In [7]:
import numpy as np 

# Random sampling 
qids = np.array(range(5000))
rng = np.random.default_rng(seed=42)

for scenario in tinybm.scenarios_position.keys():
    random_points = rng.choice(qids, size=number_item, replace=False)
    equal_weights = np.array([1/number_item for _ in range(number_item)])

    Y_random = tinybm.test_data[:,tinybm.scenarios_position[scenario]][:,random_points]
    Y_hat = (Y_random*equal_weights).sum(axis=1)
    Balanced_true = (tinybm.balance_weights*tinybm.test_data)[:,tinybm.scenarios_position[scenario]].mean(axis=1) 

    print(f"Result on {scenario}")
    for i in range(Balanced_true.shape[0]):
        print(f"Model {i}")
        print(f"    [Balanced true accruacy] {Balanced_true[i]:.5f}")
        print(f"    [Random] scenario: {scenario}, avg. error: {np.abs(Y_hat[i]-Balanced_true[i]):.5f}")
        print(f"    [IRT] scenario: {scenario}, avg. error: {np.abs(res[0][scenario][i]-Balanced_true[i]):.5f}")
        print(f"    [p-IRT] scenario: {scenario}, avg. error: {np.abs(res[1][scenario][i]-Balanced_true[i]):.5f}")
        print(f"    [gp-IRT] scenario: {scenario}, avg. error: {np.abs(res[2][scenario][i]-Balanced_true[i]):.5f}")


Result on gqa
Model 0
    [Balanced true accruacy] 0.64406
    [Random] scenario: gqa, avg. error: 0.03594
    [IRT] scenario: gqa, avg. error: 0.00159
    [p-IRT] scenario: gqa, avg. error: 0.01855
    [gp-IRT] scenario: gqa, avg. error: 0.00072
Result on text-vqa
Model 0
    [Balanced true accruacy] 0.62520
    [Random] scenario: text-vqa, avg. error: 0.01480
    [IRT] scenario: text-vqa, avg. error: 0.00520
    [p-IRT] scenario: text-vqa, avg. error: 0.00394
    [gp-IRT] scenario: text-vqa, avg. error: 0.00453
