In [1]:
from pathlib import Path
from typing import Literal
from custom_tiny_bench.processor.benchmark_processor import (
    BenchmarkConfig,
    EvaluationResult,
)
from custom_tiny_bench.tiny_benchmark import TinyBenchmark
import logging
logging.basicConfig(level=logging.DEBUG,
                    force = True)

save_dir = Path("data")
bm_configs: list[BenchmarkConfig] = [
    BenchmarkConfig(
        name="gqa",
        results=[
            EvaluationResult(
                prediction_file="data/gqa/instructblip-vicuna-7b/gqa-formatted-predictions.json",
                model="instructblip-vicuna-7b",
            ),
            EvaluationResult(
                prediction_file="data/gqa/llava-v1.5-7b/gqa-formatted-predictions.json",
                model="llava-v1.5-7b",
            ),
            EvaluationResult(
                prediction_file="data/gqa/prism-clip+7b/gqa-formatted-predictions.json",
                model="prism-clip+7b",
            ),
            EvaluationResult(
                prediction_file="data/gqa/prism-dinosiglip+7b/gqa-formatted-predictions.json",
                model="prism-dinosiglip+7b",
            ),
            EvaluationResult(
                prediction_file="data/gqa/prism-siglip+7b/gqa-formatted-predictions.json",
                model="prism-siglip+7b",
            ),
        ],
        question_file="data/gqa/questions.json",
        subscenario_keyword="structural_type"
    )
]
train_size: int | float = 0.8
device = "cpu"
number_item: int = 100
random_state: int = 42
clustering: Literal["irt", "correct."] = "irt"
p_irt: bool = True
gp_irt = True
epochs = 2000


In [2]:
tinybm = TinyBenchmark(save_dir)
# print(tinybm.bm_to_proc["gqa"].predictions.predictions_per_model)
# tinybm.prepare_data(bm_configs)
# tinybm.train_irt(train_size, device)
# tinybm.get_anchors(number_item, random_state, clusterting)
# tinybm.estimate_performance(p_irt, gp_irt)


In [3]:
tinybm.prepare_data(bm_configs)
tinybm.bm_to_proc["gqa"].predictions

INFO:custom_tiny_bench.processor.benchmark_processor:Opening data/gqa/questions.json


Config: ['instructblip-vicuna-7b', 'llava-v1.5-7b', 'prism-clip+7b', 'prism-dinosiglip+7b', 'prism-siglip+7b']


INFO:custom_tiny_bench.processor.benchmark_processor:Opening data/gqa/instructblip-vicuna-7b/gqa-formatted-predictions.json
INFO:custom_tiny_bench.processor.benchmark_processor:Number of predictions: 12578
INFO:custom_tiny_bench.processor.benchmark_processor:Opening data/gqa/llava-v1.5-7b/gqa-formatted-predictions.json
INFO:custom_tiny_bench.processor.benchmark_processor:Number of predictions: 12578
INFO:custom_tiny_bench.processor.benchmark_processor:Opening data/gqa/prism-clip+7b/gqa-formatted-predictions.json
INFO:custom_tiny_bench.processor.benchmark_processor:Number of predictions: 12578
INFO:custom_tiny_bench.processor.benchmark_processor:Opening data/gqa/prism-dinosiglip+7b/gqa-formatted-predictions.json
INFO:custom_tiny_bench.processor.benchmark_processor:Number of predictions: 12578
INFO:custom_tiny_bench.processor.benchmark_processor:Opening data/gqa/prism-siglip+7b/gqa-formatted-predictions.json
INFO:custom_tiny_bench.processor.benchmark_processor:Number of predictions: 1257

PredictionDict(predictions_per_model={'instructblip-vicuna-7b': {'20866135': Prediction(question_id='20866135', prediction='yes'), '20508516': Prediction(question_id='20508516', prediction='remote'), '2044579': Prediction(question_id='2044579', prediction='steps'), '201997014': Prediction(question_id='201997014', prediction='yes'), '20679267': Prediction(question_id='20679267', prediction='large'), '201983816': Prediction(question_id='201983816', prediction='no'), '20302888': Prediction(question_id='20302888', prediction='yes'), '202169063': Prediction(question_id='202169063', prediction='none'), '20654958': Prediction(question_id='20654958', prediction='no'), '201713532': Prediction(question_id='201713532', prediction='bathroom'), '20963966': Prediction(question_id='20963966', prediction='sink'), '20381201': Prediction(question_id='20381201', prediction='no'), '2056027': Prediction(question_id='2056027', prediction='yes'), '202179459': Prediction(question_id='202179459', prediction='b

In [4]:
tinybm.train_irt(train_size, device, epochs)

  0%|          | 0/2 [00:00<?, ?it/s]

[18:29:22] config: model_type='multidim_2pl' epochs=10                cli.py:109
           priors='hierarchical' initializers=[] dims=5 lr=0.1                  
           lr_decay=0.9999 dropout=0.5 hidden=100 vocab_size=None               
           log_every=200 seed=42 deterministic=True                             
           data_path: data/irt_val_dataset.jsonlines                  cli.py:111
           output directory: data/irt_val_model                       cli.py:112
[18:29:22] amortized: False                                       dataset.py:112
[18:29:22] Vocab size: None                                       training.py:90
           Training Model...                                          cli.py:116
           args: {'device': 'cpu', 'num_items': 12578,           training.py:134
           'num_subjects': 3}                                                   
           Parsed Model Args: {'device': 'cpu', 'num_items':     training.py:147
           12578, 'num_subje

 50%|█████     | 1/2 [00:02<00:02,  2.50s/it]

[18:29:24] config: model_type='multidim_2pl' epochs=10                cli.py:109
           priors='hierarchical' initializers=[] dims=10 lr=0.1                 
           lr_decay=0.9999 dropout=0.5 hidden=100 vocab_size=None               
           log_every=200 seed=42 deterministic=True                             
           data_path: data/irt_val_dataset.jsonlines                  cli.py:111
           output directory: data/irt_val_model                       cli.py:112
[18:29:24] amortized: False                                       dataset.py:112
[18:29:24] Vocab size: None                                       training.py:90
           Training Model...                                          cli.py:116
           args: {'device': 'cpu', 'num_items': 12578,           training.py:134
           'num_subjects': 3}                                                   
           Parsed Model Args: {'device': 'cpu', 'num_items':     training.py:147
           12578, 'num_subje

100%|██████████| 2/2 [00:04<00:00,  2.45s/it]


[18:29:27] config: model_type='multidim_2pl' epochs=10                cli.py:109
           priors='hierarchical' initializers=[] dims=5 lr=0.1                  
           lr_decay=0.9999 dropout=0.5 hidden=100 vocab_size=None               
           log_every=200 seed=42 deterministic=True                             
           data_path: data/irt_dataset.jsonlines                      cli.py:111
           output directory: data/irt_model                           cli.py:112
[18:29:27] amortized: False                                       dataset.py:112
[18:29:27] Vocab size: None                                       training.py:90
           Training Model...                                          cli.py:116
           args: {'device': 'cpu', 'num_items': 12578,           training.py:134
           'num_subjects': 4}                                                   
           Parsed Model Args: {'device': 'cpu', 'num_items':     training.py:147
           12578, 'num_subje

In [5]:
anchor = tinybm.get_anchors(number_item, random_state, clustering= clustering)


INFO:custom_tiny_bench.tiny_benchmark:[Anchor points] scenario: gqa, avg. error: 0.0255956974


Points [ 6537  3849 11164  5848   198  6870  4008 11397  4996  6468  8970 12186
 11862  9898  1434  3496 11235 12051  4437   659     8 10888  3295  1159
  9766  8475  1752 11819  6231  6696  9043 10056  8006  9738  2931  4242
  8829  5468  2444   738 10280  4910  6139  4593  5421  9601    63  1017
  3559  2871  2254  8039  4855  4548  4139  3815  6835 10951  5012  7271
  7577  9188  2765 12006  9697  3420  2957 12457  4424  9421  3618  6100
  7321  5423  5476  5519  8841   962  4357  9334  5610  8018   447  1997
   674 11205  4136  2412 12094 11012  4348  5823  4895  7771   166  7183
 11568 11152 10800  5934]


In [8]:
res = tinybm.estimate_performance(p_irt=p_irt, gp_irt=gp_irt)

100%|██████████| 5/5 [00:00<00:00, 364.01it/s]


[IRT] predicted score for 0_th model in gqa: 0.586282
[IRT] predicted score for 1_th model in gqa: 0.696034
[IRT] predicted score for 2_th model in gqa: 0.687717
[IRT] predicted score for 3_th model in gqa: 0.722701
[IRT] predicted score for 4_th model in gqa: 0.718156
[p-IRT] predicted score for 0_th model in gqa: 0.592847
[p-IRT] predicted score for 1_th model in gqa: 0.705815
[p-IRT] predicted score for 2_th model in gqa: 0.692823
[p-IRT] predicted score for 3_th model in gqa: 0.697896
[p-IRT] predicted score for 4_th model in gqa: 0.687948
[gp-IRT] predicted score for 0_th model in gqa: 0.586464
[gp-IRT] predicted score for 1_th model in gqa: 0.696304
[gp-IRT] predicted score for 2_th model in gqa: 0.687858
[gp-IRT] predicted score for 3_th model in gqa: 0.722015
[gp-IRT] predicted score for 4_th model in gqa: 0.717320


In [14]:
true_score = tinybm.correctness_array[4,:].mean()

irt_score = res[0]['gqa'][0]
p_irt_score = res[1]['gqa'][0]
gp_irt_score = res[2]['gqa'][0]

print("IRT err", abs(true_score - irt_score))
print("p-IRT err", abs(true_score - p_irt_score))
print("gp-IRT err", abs(true_score - gp_irt_score))

IRT err 0.05777895133912647
p-IRT err 0.05121370539508274
gp-IRT err 0.05759740673229197


In [29]:
import numpy as np 

# Random sampling 
qids = np.array(range(12578))
rng = np.random.default_rng(seed=42)

for scenario in tinybm.scenarios_position.keys():
    random_points = rng.choice(qids, size=number_item, replace=False)
    equal_weights = np.array([1/number_item for _ in range(number_item)])

    Y_random = tinybm.test_data[:,tinybm.scenarios_position[scenario]][:,random_points]
    Y_hat = (Y_random*equal_weights).sum(axis=1)
    true = (tinybm.balance_weights*tinybm.test_data)[:,tinybm.scenarios_position[scenario]].mean(axis=1) 

    print(f"[Random] scenario: {scenario}, avg. error: {np.abs(Y_hat-true).mean():.3f}")


[Random] scenario: gqa, avg. error: 0.145
