In [1]:
from pathlib import Path
import pandas as pd 
import json 
import numpy as np 


qfile = Path("./eval_data/llava-v1.5-7b/annotations-gqa-full.json")
pfile = Path("./eval_data/llava-v1.5-7b/gqa-formatted-predictions.json")

questions = {}
predictions = {}
with open(qfile) as file:
    questions = json.load(file)
with open(pfile) as file:
    predictions = json.load(file)

for k, q in questions.items():
    questions[k]['detailed_type'] = q['types']['detailed']
    questions[k]['semantic_type'] = q['types']['semantic']
    questions[k]['structural_type'] = q['types']['structural']

predictions = {str(p["questionId"]): p["prediction"] for p in predictions}

In [2]:
qdf = pd.DataFrame.from_dict(questions, orient='index')
pdf = pd.DataFrame.from_dict(predictions, orient='index')


In [3]:
print(len(qdf))
print(len(qdf.groupby("structural_type").size()))
questions
qdf.groupby('structural_type').count()

12578
5


Unnamed: 0_level_0,semantic,entailed,equivalent,question,imageId,isBalanced,groups,answer,semanticStr,annotations,types,fullAnswer,detailed_type,semantic_type
structural_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
choose,1129,1129,1129,1129,1129,1129,1129,1129,1129,1129,1129,1129,1129,1129
compare,589,589,589,589,589,589,589,589,589,589,589,589,589,589
logical,1803,1803,1803,1803,1803,1803,1803,1803,1803,1803,1803,1803,1803,1803
query,6805,6805,6805,6805,6805,6805,6805,6805,6805,6805,6805,6805,6805,6805
verify,2252,2252,2252,2252,2252,2252,2252,2252,2252,2252,2252,2252,2252,2252


In [4]:
from collections import defaultdict
from tqdm import tqdm 
import numpy as np 

def get_metric(predicted: str, answer: str) -> float:
    return 1 if predicted == answer else 0

def create_correctness_for_model(type_keyword: str,types: list[str], questions: dict, predictions: dict) -> dict:
    """
    
    Args: 
        type_keyword: name of the type keyword ex) structural_type, detailed_type 
        types: list of types ex) choose, logical etc 
        questions: format should be 
            key = 'question_id' 
            value = {'answer': answer_string, 'type': type_string} should at least include these fields. 
        predictions: format should be
            key = 'question_id' 
            value = answer_string 
    """
    data = {type : defaultdict(list) for type in types}

    for qid, question in tqdm(questions.items()):
        gold = question["answer"]
        predicted = predictions[qid]

        data[question[type_keyword]]['correctness'].append(get_metric(predicted, gold))
    
    return data 

def collect_correctness_per_type(correctness_dict: dict[str, dict[str, list]], models: list[str]) -> dict: 
    """
    Returns: 
        dictionary of 'data' and 'model' 
        'data' contains correctness per each subscenario (type). Shape is (number of prompts, number of models)
        'model' contains the name of all models 
    """
    data = {}
    data['data'] = {}
    data['models'] = models

    for ty in correctness_dict[list(correctness_dict.keys())[0]].keys():
        data['data'][ty] = {}
        data['data'][ty]['correctness'] = []

        for model in models:
            data['data'][ty]['correctness'].append(correctness_dict[model][ty]['correctness'])
                
        data['data'][ty]['correctness'] = np.array(data['data'][ty]['correctness']).T.astype(float)
    
    return data 

def prepare_responses(data, types: list[str]):
    """ Stack all responses of different subscenarios """
    responses = [np.vstack([data['data'][sub]['correctness'] for sub in types]).T]
    return np.hstack(responses)

types = ['choose', 'compare', 'logical', 'query', 'verify']
models = ['llava-v1.5-7b', 'instructblip-vicuna-7b', 'prism-clip+7b', 'prism-dinosiglip+7b', 'prism-siglip+7b']
data = {}
for model in models:
    data[model] = create_correctness_for_model('structural_type',types, questions, predictions)


100%|██████████| 12578/12578 [00:00<00:00, 1172563.03it/s]
100%|██████████| 12578/12578 [00:00<00:00, 1531866.65it/s]
100%|██████████| 12578/12578 [00:00<00:00, 2218034.72it/s]
100%|██████████| 12578/12578 [00:00<00:00, 2686422.02it/s]
100%|██████████| 12578/12578 [00:00<00:00, 3281455.23it/s]


In [5]:
data['llava-v1.5-7b']['choose']['correctness']

res = collect_correctness_per_type(data, models)
final_res = prepare_responses(res, types)
res['data']['choose']
# final_res.shape

(res['data'])

{'choose': {'correctness': array([[1., 1., 1., 1., 1.],
         [1., 1., 1., 1., 1.],
         [1., 1., 1., 1., 1.],
         ...,
         [1., 1., 1., 1., 1.],
         [1., 1., 1., 1., 1.],
         [1., 1., 1., 1., 1.]])},
 'compare': {'correctness': array([[1., 1., 1., 1., 1.],
         [1., 1., 1., 1., 1.],
         [1., 1., 1., 1., 1.],
         ...,
         [1., 1., 1., 1., 1.],
         [0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0.]])},
 'logical': {'correctness': array([[1., 1., 1., 1., 1.],
         [1., 1., 1., 1., 1.],
         [0., 0., 0., 0., 0.],
         ...,
         [0., 0., 0., 0., 0.],
         [1., 1., 1., 1., 1.],
         [1., 1., 1., 1., 1.]])},
 'query': {'correctness': array([[1., 1., 1., 1., 1.],
         [1., 1., 1., 1., 1.],
         [1., 1., 1., 1., 1.],
         ...,
         [0., 0., 0., 0., 0.],
         [1., 1., 1., 1., 1.],
         [1., 1., 1., 1., 1.]])},
 'verify': {'correctness': array([[1., 1., 1., 1., 1.],
         [1., 1., 1., 1., 1.],


In [6]:
from utils import * 

scenarios = {'gqa':types} 
scenarios_position, subscenarios_position = prepare_data(scenarios, res)
subscenarios_position['gqa'].keys()

Y = create_responses(scenarios, res)
Y.shape

(5, 12578)

In [7]:
balance_weights = np.ones(Y.shape[1])

bm = 'gqa'
N = len(scenarios_position[bm])
n_sub = len(scenarios[bm])
for sub in scenarios[bm]:
    n_i = len(subscenarios_position[bm][sub])
    balance_weights[subscenarios_position[bm][sub]] = N/(n_sub*n_i)  

In [8]:
accs1 = np.mean([Y[:,subscenarios_position[bm][sub]].mean(axis=1) for sub in scenarios[bm]], axis=0)
accs2 = (balance_weights*Y)[:,scenarios_position[bm]].mean(axis=1)

np.abs(accs1 - accs2).mean()

8.237854842718662e-14

In [9]:
Y_bin_train = Y[:4]
Y_bin_test = Y[4:]


In [10]:
from irt import * 

scenarios = {'gqa':types} 

Ds = [1] # Dimensions to try
device = 'cpu' # Either 'cuda' or 'cpu' 
epochs = 10  # Number of epochs for IRT model training (py-irt default is 2000)
lr = .1  # Learning rate for IRT model training (py-irt default is .1)

val_ind = list(range(0,Y_bin_train.shape[0],5)) # Validation indices
train_ind = [i for i in range(Y_bin_train.shape[0]) if i not in val_ind]

# Saving the training dataset in the needed format
create_irt_dataset(Y_bin_train[train_ind], 'data/irt_val_dataset.jsonlines')

# Trying different Ds
errors = []  
errors2 = []

for D in tqdm(Ds):
    dataset_name = 'data/irt_val_dataset.jsonlines'
    model_name = 'data/irt_val_model/'
    
    # Load trained IRT model parameters
    train_irt_model(dataset_name, model_name, D, lr, epochs, device)
    A, B, Theta = load_irt_parameters(model_name)
    
    # Determine seen and unseen items for validation
    seen_items = list(range(0, Y_bin_train.shape[1], 2))
    unseen_items = list(range(1, Y_bin_train.shape[1], 2))

    # Estimate ability parameters for the validation set
    thetas = [estimate_ability_parameters(Y_bin_train[val_ind][j][seen_items], A[:, :, seen_items], B[:, :, seen_items]) for j in range(len(val_ind))]

    # Compute validation errors for each scenario and update the errors list (in the end, we give the same weight for all scenarios)
    errors2.append([])
    for scenario in scenarios.keys():
        ind = [u for u in unseen_items if u in scenarios_position[scenario]]
        errors2[-1].append(np.mean([abs((balance_weights*item_curve(thetas[j], A, B))[0,ind].mean()-Y_bin_train[val_ind][j,ind].mean()) for j in range(len(val_ind))]))
    errors.append(np.mean(errors2[-1]))

  0%|          | 0/1 [00:00<?, ?it/s]

[13:30:48] config: model_type='multidim_2pl' epochs=10                cli.py:109
           priors='hierarchical' initializers=[] dims=1 lr=0.1                  
           lr_decay=0.9999 dropout=0.5 hidden=100 vocab_size=None               
           log_every=200 seed=42 deterministic=True                             
           data_path: data/irt_val_dataset.jsonlines                  cli.py:111
           output directory: data/irt_val_model/                      cli.py:112
[13:30:48] amortized: False                                       dataset.py:112
[13:30:48] Vocab size: None                                       training.py:90
           Training Model...                                          cli.py:116
           args: {'device': 'cpu', 'num_items': 12578,           training.py:134
           'num_subjects': 3}                                                   
           Parsed Model Args: {'device': 'cpu', 'num_items':     training.py:147
           12578, 'num_subje

100%|██████████| 1/1 [00:02<00:00,  2.70s/it]


In [11]:
ind_D = np.argmin(np.array(errors))
D = Ds[ind_D]

In [12]:
create_irt_dataset(Y_bin_train, 'data/irt_dataset.jsonlines')

In [13]:
train_irt_model(dataset_name='data/irt_dataset.jsonlines', 
                model_name='data/irt_model', 
                D=D, lr=lr, epochs=epochs, device=device)               

[13:30:50] config: model_type='multidim_2pl' epochs=10                cli.py:109
           priors='hierarchical' initializers=[] dims=1 lr=0.1                  
           lr_decay=0.9999 dropout=0.5 hidden=100 vocab_size=None               
           log_every=200 seed=42 deterministic=True                             
           data_path: data/irt_dataset.jsonlines                      cli.py:111
           output directory: data/irt_model                           cli.py:112
[13:30:50] amortized: False                                       dataset.py:112
[13:30:50] Vocab size: None                                       training.py:90
           Training Model...                                          cli.py:116
           args: {'device': 'cpu', 'num_items': 12578,           training.py:134
           'num_subjects': 4}                                                   
           Parsed Model Args: {'device': 'cpu', 'num_items':     training.py:147
           12578, 'num_subje

In [14]:
import pickle

def get_lambda(b, v):
    return (b**2)/(v+(b**2))

number_item = 100

lambds = {} 

for i,scenario in enumerate(scenarios.keys()):
    v = np.var(Y_bin_train[:,scenarios_position[scenario]], axis=1).mean()
    b = np.mean(errors2[ind_D][i]) 
    lambds[scenario] = get_lambda(b, v/(4*number_item))

with open('data/lambds.pickle', 'wb') as handle:
    pickle.dump(lambds, handle, protocol=pickle.HIGHEST_PROTOCOL)

# Generate Anchor points

In [15]:
clustering = 'irt' # 'correct.' or 'irt'

In [16]:
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import pairwise_distances

random_state = 42

anchor_points = {}
anchor_weights = {}

for scenario in scenarios.keys():

    if clustering=='correct.':
        X = Y_bin_train[:,scenarios_position[scenario]].T
    elif clustering=='irt':
        A, B, _ = load_irt_parameters('data/irt_model/')
        X = np.vstack((A.squeeze(), B.squeeze().reshape((1,-1)))).T
        X = X[scenarios_position[scenario]]
    else:
        raise NotImplementedError 
        
    #Normalizing balance_weights, so their sum is one within each scenario
    norm_balance_weights = balance_weights[scenarios_position[scenario]]
    norm_balance_weights /= norm_balance_weights.sum()

    # Fitting the KMeans model
    kmeans = KMeans(n_clusters=number_item, n_init="auto", random_state=random_state)
    kmeans.fit(X, sample_weight=norm_balance_weights)

    # Calculating anchor points
    anchor_points[scenario] = pairwise_distances(kmeans.cluster_centers_, X, metric='euclidean').argmin(axis=1)

    # Calculating anchor weights
    anchor_weights[scenario] = np.array([np.sum(norm_balance_weights[kmeans.labels_==c]) for c in range(number_item)])

In [18]:
anchor = {'anchor_points':anchor_points,
          'anchor_weights':anchor_weights}

with open('data/anchor.pickle', 'wb') as handle:
    pickle.dump(anchor, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [20]:
for scenario in scenarios.keys():
    Y_anchor = Y_bin_test[:,scenarios_position[scenario]][:,anchor_points[scenario]]
    Y_hat = (Y_anchor*anchor_weights[scenario]).sum(axis=1)
    Y_true = (balance_weights*Y_bin_test)[:,scenarios_position[scenario]].mean(axis=1)

    print(f"scenario: {scenario}, avg. error: {np.abs(Y_hat-Y_true).mean():.3f}")

scenario: gqa, avg. error: 0.022


# Estimate performance

In [21]:
A, B, _ = load_irt_parameters('data/irt_model/')
seen_items = np.hstack([np.array(scenarios_position[scenario])[anchor_points[scenario]] for scenario in scenarios.keys()]).tolist()
unseen_items = [i for i in range(Y_bin_train.shape[1]) if i not in seen_items]

In [23]:
thetas = [estimate_ability_parameters(Y_bin_test[j][seen_items], A[:, :, seen_items], B[:, :, seen_items]) for j in tqdm(range(Y_bin_test.shape[0]))]

100%|██████████| 1/1 [00:00<00:00, 122.83it/s]


In [24]:
pirt_preds = {}
for scenario in scenarios.keys():

    ind_seen = [u for u in seen_items if u in scenarios_position[scenario]]
    ind_unseen = [u for u in unseen_items if u in scenarios_position[scenario]]
    pirt_lambd = Y_anchor.shape[1]/len(scenarios_position[scenario])

    pirt_pred = []
    
    for j in range(Y_bin_test.shape[0]):
        data_part = (balance_weights*Y_bin_test)[j,ind_seen].mean()
        irt_part = (balance_weights*item_curve(thetas[j], A, B))[0,ind_unseen].mean()
        pirt_pred.append(pirt_lambd*data_part + (1-pirt_lambd)*irt_part) 
        
    pirt_preds[scenario] = np.array(pirt_pred) # Predictions
    true = (balance_weights*Y_bin_test)[:,scenarios_position[scenario]].mean(axis=1) # True performance
    
    print(f"scenario: {scenario}, avg. error: {np.abs(pirt_preds[scenario]-true).mean():.3f}")

scenario: gqa, avg. error: 0.174


In [25]:
with open('data/lambds.pickle', 'rb') as handle:
    lambds = pickle.load(handle)

In [26]:
preds = {}
for scenario in scenarios.keys():
    Y_anchor = Y_bin_test[:,scenarios_position[scenario]][:,anchor_points[scenario]]
    preds[scenario] = (Y_anchor*anchor_weights[scenario]).sum(axis=1) # Predictions
    true = (balance_weights*Y_bin_test)[:,scenarios_position[scenario]].mean(axis=1) # True performance

    print(f"scenario: {scenario}, avg. error: {np.abs(preds[scenario]-true).mean():.3f}")

scenario: gqa, avg. error: 0.022


In [28]:
gpirt_preds = {}
for scenario in scenarios.keys():
    gpirt_preds[scenario] = lambds[scenario]*preds[scenario]  + (1-lambds[scenario])*pirt_preds[scenario]
    true = (balance_weights*Y_bin_test)[:,scenarios_position[scenario]].mean(axis=1) # True performance
    
    print(f"Prediction: {gpirt_preds[scenario]} vs True: {true}")
    print(f"scenario: {scenario}, avg. error: {np.abs(gpirt_preds[scenario]-true).mean():.3f}")

Prediction: [0.67841155] vs True: [0.70793229]
scenario: gqa, avg. error: 0.030
