In [1]:
import sys

# In order to import custom tiny bench from example directory
sys.path.append("../")

from pathlib import Path
from typing import Literal
from custom_tiny_bench.processor.benchmark_processor import (
    BenchmarkConfig,
    EvaluationResult,
)
from custom_tiny_bench.tiny_benchmark import TinyBenchmark
import logging
logging.basicConfig(level=logging.DEBUG,
                    force = True)

save_dir = Path("../data")
bm_configs: list[BenchmarkConfig] = [
    BenchmarkConfig(
        name="gqa",
        results=[
            EvaluationResult(
                prediction_file="../data/gqa/instructblip-vicuna-7b/gqa-formatted-predictions.json",
                model="instructblip-vicuna-7b",
            ),
            EvaluationResult(
                prediction_file="../data/gqa/llava-v1.5-7b/gqa-formatted-predictions.json",
                model="llava-v1.5-7b",
            ),
            EvaluationResult(
                prediction_file="../data/gqa/prism-clip+7b/gqa-formatted-predictions.json",
                model="prism-clip+7b",
            ),
            EvaluationResult(
                prediction_file="../data/gqa/prism-dinosiglip+7b/gqa-formatted-predictions.json",
                model="prism-dinosiglip+7b",
            ),
            EvaluationResult(
                prediction_file="../data/gqa/prism-siglip+7b/gqa-formatted-predictions.json",
                model="prism-siglip+7b",
            ),
        ],
        question_file="../data/gqa/questions.json",
        subscenario_keyword="structural_type"
    ),
    BenchmarkConfig(
        name="text-vqa",
        results=[
            EvaluationResult(
                prediction_file="../data/text-vqa/instructblip-vicuna-7b/results+rank-0.json",
                model="instructblip-vicuna-7b",
            ),
            EvaluationResult(
                prediction_file="../data/text-vqa/llava-v1.5-7b/results+rank-0.json",
                model="llava-v1.5-7b",
            ),
            EvaluationResult(
                prediction_file="../data/text-vqa/prism-clip+7b/results+rank-0.json",
                model="prism-clip+7b",
            ),
            EvaluationResult(
                prediction_file="../data/text-vqa/prism-dinosiglip+7b/results+rank-0.json",
                model="prism-dinosiglip+7b",
            ),
            EvaluationResult(
                prediction_file="../data/text-vqa/prism-siglip+7b/results+rank-0.json",
                model="prism-siglip+7b",
            ),
        ],
        question_file="../data/text-vqa/annotations-textvqa-full.json",
    ),
    BenchmarkConfig(
        name="pope",
        results=[
            EvaluationResult(
                prediction_file="../data/pope/instructblip-vicuna-7b/results+rank-0.json",
                model="instructblip-vicuna-7b",
            ),
            EvaluationResult(
                prediction_file="../data/pope/llava-v1.5-7b/results+rank-0.json",
                model="llava-v1.5-7b",
            ),
            EvaluationResult(
                prediction_file="../data/pope/prism-clip+7b/results+rank-0.json",
                model="prism-clip+7b",
            ),
            EvaluationResult(
                prediction_file="../data/pope/prism-dinosiglip+7b/results+rank-0.json",
                model="prism-dinosiglip+7b",
            ),
            EvaluationResult(
                prediction_file="../data/pope/prism-siglip+7b/results+rank-0.json",
                model="prism-siglip+7b",
            ),
        ],
        question_file="../data/pope/questions.json",
        subscenario_keyword="split"
    ),
]
train_size: int | float = 4
device = "cpu"
number_item: int = 100
random_state: int = 42
clustering: Literal["irt", "correct."] = "irt"
p_irt: bool = True
gp_irt = True
epochs = 2000


In [2]:
tinybm = TinyBenchmark(save_dir, balance=False)


In [3]:
tinybm.prepare_data(bm_configs)

INFO:custom_tiny_bench.processor.benchmark_processor:Opening ../data/gqa/questions.json
INFO:custom_tiny_bench.processor.benchmark_processor:Opening ../data/gqa/instructblip-vicuna-7b/gqa-formatted-predictions.json
INFO:custom_tiny_bench.processor.benchmark_processor:Number of predictions: 12578
INFO:custom_tiny_bench.processor.benchmark_processor:Opening ../data/gqa/llava-v1.5-7b/gqa-formatted-predictions.json
INFO:custom_tiny_bench.processor.benchmark_processor:Number of predictions: 12578
INFO:custom_tiny_bench.processor.benchmark_processor:Opening ../data/gqa/prism-clip+7b/gqa-formatted-predictions.json
INFO:custom_tiny_bench.processor.benchmark_processor:Number of predictions: 12578
INFO:custom_tiny_bench.processor.benchmark_processor:Opening ../data/gqa/prism-dinosiglip+7b/gqa-formatted-predictions.json
INFO:custom_tiny_bench.processor.benchmark_processor:Number of predictions: 12578
INFO:custom_tiny_bench.processor.benchmark_processor:Opening ../data/gqa/prism-siglip+7b/gqa-form

Config: ['instructblip-vicuna-7b', 'llava-v1.5-7b', 'prism-clip+7b', 'prism-dinosiglip+7b', 'prism-siglip+7b']


INFO:custom_tiny_bench.processor.benchmark_processor:Opening ../data/text-vqa/llava-v1.5-7b/results+rank-0.json
INFO:custom_tiny_bench.processor.benchmark_processor:Number of predictions: 5000
INFO:custom_tiny_bench.processor.benchmark_processor:Opening ../data/text-vqa/prism-clip+7b/results+rank-0.json
INFO:custom_tiny_bench.processor.benchmark_processor:Number of predictions: 5000
INFO:custom_tiny_bench.processor.benchmark_processor:Opening ../data/text-vqa/prism-dinosiglip+7b/results+rank-0.json
INFO:custom_tiny_bench.processor.benchmark_processor:Number of predictions: 5000
INFO:custom_tiny_bench.processor.benchmark_processor:Opening ../data/text-vqa/prism-siglip+7b/results+rank-0.json
INFO:custom_tiny_bench.processor.benchmark_processor:Number of predictions: 5000
INFO:custom_tiny_bench.processor.benchmark_processor:[create_correctness_array] Shape of correctness array (5, 5000)


Config: ['instructblip-vicuna-7b', 'llava-v1.5-7b', 'prism-clip+7b', 'prism-dinosiglip+7b', 'prism-siglip+7b']


100%|██████████| 100/100 [00:00<00:00, 8880.97it/s]
INFO:custom_tiny_bench.processor.benchmark_processor:After binarize, error is [0.00248 0.0033  0.00224 0.00312 0.00322]
INFO:custom_tiny_bench.processor.benchmark_processor:Opening ../data/pope/questions.json
INFO:custom_tiny_bench.processor.benchmark_processor:Opening ../data/pope/instructblip-vicuna-7b/results+rank-0.json
INFO:custom_tiny_bench.processor.benchmark_processor:Number of predictions: 8910
INFO:custom_tiny_bench.processor.benchmark_processor:Opening ../data/pope/llava-v1.5-7b/results+rank-0.json
INFO:custom_tiny_bench.processor.benchmark_processor:Number of predictions: 8910
INFO:custom_tiny_bench.processor.benchmark_processor:Opening ../data/pope/prism-clip+7b/results+rank-0.json
INFO:custom_tiny_bench.processor.benchmark_processor:Number of predictions: 8910
INFO:custom_tiny_bench.processor.benchmark_processor:Opening ../data/pope/prism-dinosiglip+7b/results+rank-0.json
INFO:custom_tiny_bench.processor.benchmark_proces

Config: ['instructblip-vicuna-7b', 'llava-v1.5-7b', 'prism-clip+7b', 'prism-dinosiglip+7b', 'prism-siglip+7b']


In [4]:
tinybm.train_irt(train_size, device, epochs)

  0%|          | 0/2 [00:00<?, ?it/s]

[20:41:05] config: model_type='multidim_2pl' epochs=2000              cli.py:109
           priors='hierarchical' initializers=[] dims=5 lr=0.1                  
           lr_decay=0.9999 dropout=0.5 hidden=100 vocab_size=None               
           log_every=200 seed=42 deterministic=True                             
           data_path: ../data/datasets/irt_val_dataset.jsonlines      cli.py:111
           output directory: ../data/models/irt_val_model             cli.py:112
[20:41:05] amortized: False                                       dataset.py:112
[20:41:05] Vocab size: None                                       training.py:90
           Training Model...                                          cli.py:116
           args: {'device': 'cpu', 'num_items': 26488,           training.py:134
           'num_subjects': 3}                                                   
           Parsed Model Args: {'device': 'cpu', 'num_items':     training.py:147
           26488, 'num_subje

 50%|█████     | 1/2 [00:35<00:35, 35.43s/it]

[20:41:40] config: model_type='multidim_2pl' epochs=2000              cli.py:109
           priors='hierarchical' initializers=[] dims=10 lr=0.1                 
           lr_decay=0.9999 dropout=0.5 hidden=100 vocab_size=None               
           log_every=200 seed=42 deterministic=True                             
           data_path: ../data/datasets/irt_val_dataset.jsonlines      cli.py:111
           output directory: ../data/models/irt_val_model             cli.py:112
[20:41:40] amortized: False                                       dataset.py:112
[20:41:40] Vocab size: None                                       training.py:90
           Training Model...                                          cli.py:116
           args: {'device': 'cpu', 'num_items': 26488,           training.py:134
           'num_subjects': 3}                                                   
           Parsed Model Args: {'device': 'cpu', 'num_items':     training.py:147
           26488, 'num_subje

100%|██████████| 2/2 [01:22<00:00, 41.27s/it]


[20:42:27] config: model_type='multidim_2pl' epochs=2000              cli.py:109
           priors='hierarchical' initializers=[] dims=10 lr=0.1                 
           lr_decay=0.9999 dropout=0.5 hidden=100 vocab_size=None               
           log_every=200 seed=42 deterministic=True                             
           data_path: ../data/datasets/irt_dataset.jsonlines          cli.py:111
           output directory: ../data/models/irt_model                 cli.py:112
[20:42:27] amortized: False                                       dataset.py:112
[20:42:27] Vocab size: None                                       training.py:90
           Training Model...                                          cli.py:116
           args: {'device': 'cpu', 'num_items': 26488,           training.py:134
           'num_subjects': 4}                                                   
           Parsed Model Args: {'device': 'cpu', 'num_items':     training.py:147
           26488, 'num_subje

In [5]:
anchor = tinybm.get_anchors(number_item, random_state, clustering= clustering)


INFO:custom_tiny_bench.tiny_benchmark:Points for gqa: [ 2976    64  6709  4348  7073  2146  3701  7092  3354  4539  1517  5714
 12342  9029  1666  4085  3024  6027  2054 11973  3205  7967  2144  2761
  9039  3371  9876 11193  5340  1048  3117   234  9131 11282 10794 10040
 11064 10386 10759  4148  7296  9207  6704  9385  3466 11103  8793  3895
  7863 12318  5095  7182   286  2348  5635  7909  8356  3637  8549  7915
  9064  1109  6289  5734 11383  5318  7372 10058  9676  3326 10765  9644
  9684 11109  3695 11045  2217 11029  7441  7573  1042  2146  5541  3975
  8280  1700 12041  9512  1599  1262  9212  2776  9865  6311  4428  4435
  7718  6938  1906 10005]
INFO:custom_tiny_bench.tiny_benchmark:Points for text-vqa: [1492 3747 1605  606 2790 4041 2238  859  903 4561 4236 4631 4316 3351
 2121 1911 4335 1749 3018  272 1063 4716 1509 2068  893 2932 2949  572
 3692 2733 4152  863  138 3409 3345 4871  925 1717 3656  147  643 4211
  162 1792 4308 4233 1557 4104 4777 3448  474  686 4755 4428 259

In [6]:
res = tinybm.estimate_performance(p_irt=p_irt, gp_irt=gp_irt)

100%|██████████| 1/1 [00:00<00:00, 44.83it/s]
INFO:custom_tiny_bench.estimator:[Naive accuracy]: 0.620000
INFO:custom_tiny_bench.estimator:[IRT] predicted score for 0_th model in gqa: 0.624742
INFO:custom_tiny_bench.estimator:[Naive accuracy]: 0.660000
INFO:custom_tiny_bench.estimator:[IRT] predicted score for 0_th model in text-vqa: 0.630800
INFO:custom_tiny_bench.estimator:[Naive accuracy]: 0.800000
INFO:custom_tiny_bench.estimator:[IRT] predicted score for 0_th model in pope: 0.883951
INFO:custom_tiny_bench.estimator:[p-IRT] predicted score for 0_th model in gqa: 0.652167
INFO:custom_tiny_bench.estimator:[p-IRT] predicted score for 0_th model in text-vqa: 0.601193
INFO:custom_tiny_bench.estimator:[p-IRT] predicted score for 0_th model in pope: 0.886838
INFO:custom_tiny_bench.estimator:[gp-IRT] predicted score for 0_th model in gqa: 0.627060
INFO:custom_tiny_bench.estimator:[gp-IRT] predicted score for 0_th model in text-vqa: 0.629147
INFO:custom_tiny_bench.estimator:[gp-IRT] predict

In [7]:
import numpy as np 

# Random sampling 
qids = np.array(range(5000))
rng = np.random.default_rng(seed=42)

for scenario in tinybm.scenarios_position.keys():
    random_points = rng.choice(qids, size=number_item, replace=False)
    equal_weights = np.array([1/number_item for _ in range(number_item)])

    Y_random = tinybm.test_data[:,tinybm.scenarios_position[scenario]][:,random_points]
    Y_hat = (Y_random*equal_weights).sum(axis=1)
    Balanced_true = (tinybm.balance_weights*tinybm.test_data)[:,tinybm.scenarios_position[scenario]].mean(axis=1) 

    print(f"Result on {scenario}")
    for i in range(Balanced_true.shape[0]):
        print(f"Model {i}")
        print(f"    [Balanced true accruacy] {Balanced_true[i]:.5f}")
        print(f"    [Random] scenario: {scenario}, avg. error: {np.abs(Y_hat[i]-Balanced_true[i]):.5f}")
        print(f"    [IRT] scenario: {scenario}, avg. error: {np.abs(res[0][scenario][i]-Balanced_true[i]):.5f}")
        print(f"    [p-IRT] scenario: {scenario}, avg. error: {np.abs(res[1][scenario][i]-Balanced_true[i]):.5f}")
        print(f"    [gp-IRT] scenario: {scenario}, avg. error: {np.abs(res[2][scenario][i]-Balanced_true[i]):.5f}")


Result on gqa
Model 0
    [Balanced true accruacy] 0.64406
    [Random] scenario: gqa, avg. error: 0.03594
    [IRT] scenario: gqa, avg. error: 0.01932
    [p-IRT] scenario: gqa, avg. error: 0.00811
    [gp-IRT] scenario: gqa, avg. error: 0.01700
Result on text-vqa
Model 0
    [Balanced true accruacy] 0.62520
    [Random] scenario: text-vqa, avg. error: 0.01480
    [IRT] scenario: text-vqa, avg. error: 0.00560
    [p-IRT] scenario: text-vqa, avg. error: 0.02401
    [gp-IRT] scenario: text-vqa, avg. error: 0.00395
Result on pope
Model 0
    [Balanced true accruacy] 0.87430
    [Random] scenario: pope, avg. error: 0.04430
    [IRT] scenario: pope, avg. error: 0.00965
    [p-IRT] scenario: pope, avg. error: 0.01254
    [gp-IRT] scenario: pope, avg. error: 0.01035
