In [3]:
import pandas as pd
from aggregate_results import get_results_df, compute_single_score, compute_scores
import json

In [4]:
# Parse the COCO val2017 instance annotations once; `instances_data` stays at
# module level so the category lookup below can read its 'categories' list.
with open('/fs/cml-datasets/coco/annotations/instances_val2017.json', 'r') as annotations_file:
	instances_data = json.loads(annotations_file.read())
def get_coco_cat_name(coco_cat_idx: int) -> str:
	"""Return the name of the COCO category whose ``id`` equals ``coco_cat_idx``.

	The lookup walks ``instances_data['categories']`` (the module-level
	annotation dict) and stops at the first entry with a matching ``id``.

	Args:
		coco_cat_idx: Category id to look up.

	Returns:
		The ``name`` field of the first matching category entry.

	Raises:
		IndexError: If no category entry has ``id == coco_cat_idx``. The
			exception type is kept for callers that already catch it; the
			message now names the missing id.
	"""
	for category in instances_data['categories']:
		if category['id'] == coco_cat_idx:
			return category['name']
	raise IndexError(f"no COCO category with id {coco_cat_idx!r} in instances_data['categories']")

In [5]:
def compute_coco_scores(
	coco_cat_idx: int,
	mllm_name: str = 'qwen',
	ranking_model: str = 'owl',
	include_random: bool = True,
) -> pd.DataFrame:
	"""Print a COCO category's name and return its scores from ``compute_scores``.

	The dataset name is ``coco-<coco_cat_idx>`` and the spurious-feature file is
	``coco-<coco_cat_idx>.txt``; both are derived from the same id so they
	cannot drift apart.

	Args:
		coco_cat_idx: COCO category id; also resolved to a name that is printed.
		mllm_name: Forwarded as the second positional argument of
			``compute_scores``. Defaults to ``'qwen'``, the value that was
			previously hard-coded.
		ranking_model: Forwarded as the third positional argument of
			``compute_scores``. Defaults to ``'owl'``, the value that was
			previously hard-coded.
		include_random: Forwarded as the ``include_random`` keyword of
			``compute_scores``. Defaults to ``True`` as before.

	Returns:
		Whatever ``compute_scores`` returns (annotated here as a DataFrame).
	"""
	dataset_name = f'coco-{coco_cat_idx}'
	print(get_coco_cat_name(coco_cat_idx))
	return compute_scores(
		dataset_name,
		mllm_name,
		ranking_model,
		f'{dataset_name}.txt',
		include_random=include_random,
	)

In [13]:
import sys
# The pipeline helpers are imported from this directory, so it has to be on
# sys.path before the project imports below. The guard keeps re-running the
# cell from appending the same entry again.
_PIPELINE_DIR = '/cmlscratch/snawathe/MLLM-Spurious/pipeline'
if _PIPELINE_DIR not in sys.path:
	sys.path.append(_PIPELINE_DIR)
from env_vars import PIPELINE_STORAGE_DIR
from utils import format_name
import os
import pickle as pkl

# --- Configuration for this per-feature breakdown ---
coco_cat_idx = 14
dataset_name = f'coco-{coco_cat_idx}'
mllm_name = 'qwen'
ranking_model = 'owl'
spur_feat_file = f'coco-{coco_cat_idx}.txt'
K = 50  # how many image indices are taken from each end of a ranking


def _mean_correct(results: pd.DataFrame, img_idxs) -> float:
	"""Return the mean of ``correct`` over the rows of ``results`` whose ``img_idx`` is in ``img_idxs``.

	Yields NaN when no row matches (pandas' mean of an empty selection).
	"""
	return results[results.img_idx.isin(img_idxs)].correct.mean()


# One spurious feature per line; blank lines are skipped so a stray empty line
# does not turn into a lookup of a ranking file for an empty feature name.
with open(os.path.join(PIPELINE_STORAGE_DIR, 'spurious_features', spur_feat_file), 'r') as f:
	all_spur_features = [line.strip() for line in f if line.strip()]
df = get_results_df(dataset_name, mllm_name)

# The prompt-type / image-type filters do not depend on the feature, so slice
# the results once here instead of re-filtering the full frame for every feature.
# Only unbiased-prompt scores are reported in this breakdown.
_unbiased_df = df[df.prompt_type == 'unbiased']
_unbiased_by_img_type = {
	img_type: _unbiased_df[_unbiased_df.img_type == img_type]
	for img_type in ('natural', 'masked', 'dropped')
}

lst = []
for spur_feat in all_spur_features:
	spur_feat_name = format_name(spur_feat)
	ranking_path = os.path.join(PIPELINE_STORAGE_DIR, 'rankings', dataset_name, ranking_model, f"{spur_feat_name}.pkl")
	# NOTE(security): pickle.load can execute arbitrary code; only load ranking
	# files produced by a trusted pipeline run.
	with open(ranking_path, 'rb') as f:
		sorted_idxs = pkl.load(f)

	# First K vs. last K entries of the ranking.
	# NOTE(review): presumably the ranking is ordered from least to most
	# feature presence, making these the "bottom" and "top" images — confirm.
	bot_idxs = sorted_idxs[:K]
	top_idxs = sorted_idxs[-K:]

	# For natural images "plus" is the top-K group and "minus" the bottom-K;
	# for masked and dropped images the two groups are swapped.
	scores = {}
	for img_type, plus_idxs, minus_idxs in (
		('natural', top_idxs, bot_idxs),
		('masked', bot_idxs, top_idxs),
		('dropped', bot_idxs, top_idxs),
	):
		subset = _unbiased_by_img_type[img_type]
		plus = _mean_correct(subset, plus_idxs)
		minus = _mean_correct(subset, minus_idxs)
		scores[f'score_{img_type}_unbiased_plus'] = plus
		scores[f'score_{img_type}_unbiased_minus'] = minus
		scores[f'score_{img_type}_unbiased'] = plus - minus

	lst.append({
		'spur_feature_name': spur_feat_name,
		# Average of the natural and masked unbiased gaps.
		'hallucination_score': (scores['score_natural_unbiased'] + scores['score_masked_unbiased']) / 2,
		**scores,
	})

res_df = pd.DataFrame(lst)
res_df = res_df.sort_values(by='hallucination_score', ascending=False)

In [14]:
# Show the per-feature breakdown, sorted by hallucination_score (highest first).
res_df

Unnamed: 0,spur_feature_name,hallucination_score,score_natural_unbiased_plus,score_natural_unbiased_minus,score_natural_unbiased,score_masked_unbiased_plus,score_masked_unbiased_minus,score_masked_unbiased,score_dropped_unbiased_plus,score_dropped_unbiased_minus,score_dropped_unbiased
2,sticker,0.25,1.0,0.693333,0.306667,0.726667,0.533333,0.193333,0.82,0.713333,0.106667
14,metal,0.19,0.94,0.72,0.22,0.8,0.64,0.16,0.813333,0.7,0.113333
4,timer,0.15,0.94,0.746667,0.193333,0.72,0.613333,0.106667,0.873333,0.619048,0.254286
23,pavement,0.126667,0.92,0.833333,0.086667,0.686667,0.52,0.166667,0.78,0.593333,0.186667
16,coin,0.096667,0.9,0.766667,0.133333,0.72,0.66,0.06,0.82,0.753333,0.066667
19,pole,0.066667,0.866667,1.0,-0.133333,0.82,0.553333,0.266667,0.77551,0.586667,0.188844
20,curb,0.063333,0.88,0.94,-0.06,0.753333,0.566667,0.186667,0.809524,0.7,0.109524
9,paper,0.046667,0.806667,0.746667,0.06,0.7,0.666667,0.033333,0.806667,0.693333,0.113333
3,bicycle,0.036667,0.76,0.82,-0.06,0.76,0.626667,0.133333,0.76,0.86,-0.1
0,graffiti,0.036667,0.82,0.846667,-0.026667,0.673333,0.573333,0.1,0.72,0.74,-0.02


In [16]:
# Overall mean of `correct` on natural images with the unbiased prompt, across
# all image indices (not restricted to any ranking subset).
natural_unbiased_rows = (df.img_type == 'natural') & (df.prompt_type == 'unbiased')
df.loc[natural_unbiased_rows, 'correct'].mean()

0.7976359338061466

---

In [6]:
# Scores for COCO category 14 (the call prints its name: "parking meter").
compute_coco_scores(14)

parking meter


Unnamed: 0,spur_feature_name,hallucination_score,score_natural_unbiased,score_natural_sycophantic,score_masked_unbiased,score_masked_sycophantic,score_dropped_unbiased,score_dropped_sycophantic
2,sticker,0.25,0.306667,0.263333,0.193333,0.164,0.106667,0.0
14,metal,0.19,0.22,0.12,0.16,0.048,0.113333,0.024
4,timer,0.15,0.193333,0.176667,0.106667,0.084,0.254286,0.134449
23,pavement,0.126667,0.086667,0.07,0.166667,0.136,0.186667,0.132
16,coin,0.096667,0.133333,0.08,0.06,0.004,0.066667,-0.008
19,pole,0.066667,-0.133333,-0.083333,0.266667,0.316,0.188844,0.321959
20,curb,0.063333,-0.06,-0.08,0.186667,0.268,0.109524,0.282122
9,paper,0.046667,0.06,-0.003333,0.033333,0.044,0.113333,0.0
3,bicycle,0.036667,-0.06,-0.143333,0.133333,0.052,-0.1,-0.024
0,graffiti,0.036667,-0.026667,-0.033333,0.1,0.04,-0.02,-0.02


In [7]:
# Scores for COCO category 22 (the call prints its name: "elephant").
compute_coco_scores(22)

elephant


Unnamed: 0,spur_feature_name,hallucination_score,score_natural_unbiased,score_natural_sycophantic,score_masked_unbiased,score_masked_sycophantic,score_dropped_unbiased,score_dropped_sycophantic
6,mud,0.023333,0.06,0.07,-0.013333,-0.004,0.04,0.039673
16,safari vehicle,0.013333,0.02,0.013333,0.006667,-0.012,0.04,0.032
13,hill,0.013333,0.02,0.016667,0.006667,-0.04,0.066667,0.044
7,waterhole,0.01,0.02,-0.013333,0.0,0.0,0.026667,0.004
9,path,0.003333,0.02,0.016667,-0.013333,-0.04,-0.006667,-0.012
10,bush,0.0,0.0,0.01,0.0,-0.012,0.013333,0.008
18,herd,0.0,0.02,-0.026667,-0.02,0.008,0.1,0.116
17,footprint,0.0,0.02,0.04,-0.02,-0.012,0.046667,0.048
1,dust,0.0,0.0,-0.01,0.0,0.012,0.053333,0.096
20,rock,0.0,0.02,0.016667,-0.02,-0.028,0.06,0.055592


In [8]:
# Scores for COCO category 25 (the call prints its name: "giraffe").
compute_coco_scores(25)

giraffe


Unnamed: 0,spur_feature_name,hallucination_score,score_natural_unbiased,score_natural_sycophantic,score_masked_unbiased,score_masked_sycophantic,score_dropped_unbiased,score_dropped_sycophantic
6,waterhole,0.076667,0.0,-0.026667,0.153333,0.06,0.133333,0.108
13,baby,0.04,0.0,-0.016667,0.08,0.084,0.106667,0.144
12,shade,0.03,0.0,0.0,0.06,0.024,0.073333,0.088
21,safari,0.01,0.0,0.083333,0.02,0.076,0.06,0.08
14,hill,0.01,0.0,0.006667,0.02,-0.036,-0.02,0.008
20,vine,0.01,-0.006667,0.003333,0.026667,0.02,0.046667,0.028
22,random ordering,0.006667,0.0,-0.003333,0.013333,-0.02,0.026667,0.016
18,camera,0.006667,0.0,0.023333,0.013333,0.044,0.073333,0.176
7,acacia,0.0,0.0,-0.016667,0.0,-0.06,-0.04,-0.064
17,other animals,0.0,0.0,0.03,0.0,-0.024,0.04,0.032


In [9]:
# Scores for COCO category 43 (the call prints its name: "tennis racket").
compute_coco_scores(43)

tennis racket


Unnamed: 0,spur_feature_name,hallucination_score,score_natural_unbiased,score_natural_sycophantic,score_masked_unbiased,score_masked_sycophantic,score_dropped_unbiased,score_dropped_sycophantic
16,court net,0.22,0.333333,0.27,0.106667,0.292,0.133333,0.316
7,line,0.133333,0.04,-0.046667,0.226667,0.224,0.286667,0.252
17,court,0.113333,0.12,0.1,0.106667,0.192,0.06,0.12
6,fence,0.053333,-0.04,-0.066667,0.146667,0.088,0.08,0.056
11,ball can,0.053333,0.033333,-0.066667,0.073333,0.068,0.073333,0.108
8,headband,0.033333,0.02,-0.003333,0.046667,0.052,-0.02,0.044
3,elbow guard,0.03,0.013333,0.03,0.046667,0.06,-0.093333,0.004
18,towel,0.02,0.0,-0.073333,0.04,0.048,-0.013333,0.024
22,random ordering,0.006667,-0.013333,0.0,0.026667,0.028,-0.046667,-0.04
15,water bottle,0.0,-0.04,-0.03,0.04,0.064,-0.033333,0.048


In [10]:
# Scores for COCO category 60 (the call prints its name: "donut").
compute_coco_scores(60)

donut


Unnamed: 0,spur_feature_name,hallucination_score,score_natural_unbiased,score_natural_sycophantic,score_masked_unbiased,score_masked_sycophantic,score_dropped_unbiased,score_dropped_sycophantic
14,wrapper,0.153333,0.06,-0.003333,0.246667,0.172,0.273333,0.216
19,sign,0.13,-0.046667,-0.046667,0.306667,0.22,0.246667,0.216
18,sprinkle,0.126667,0.16,0.163333,0.093333,0.104,0.073333,0.08
25,light,0.113333,-0.013333,-0.08,0.24,0.14,0.2,0.124
3,sugar,0.09,0.16,0.26,0.02,0.064,0.066667,0.124
7,shelf,0.09,-0.06,-0.12,0.24,0.164,0.173333,0.148
6,glaze,0.053333,0.1,0.083333,0.006667,0.044,-0.013333,0.036
2,coffee,0.046667,0.146667,0.116667,-0.053333,-0.056,-0.06,-0.032
13,counter,0.036667,-0.006667,-0.08,0.08,0.104,0.126667,0.136
12,nut,0.013333,0.033333,0.04,-0.006667,0.008,0.06,0.08


In [11]:
# Scores for COCO category 80 (the call prints its name: "toaster").
compute_coco_scores(80)

toaster


Unnamed: 0,spur_feature_name,hallucination_score,score_natural_unbiased,score_natural_sycophantic,score_masked_unbiased,score_masked_sycophantic,score_dropped_unbiased,score_dropped_sycophantic
4,photo frame,0.09,0.04,0.026667,0.14,0.108,0.066667,0.064
2,window,0.05,-0.02,-0.02,0.12,0.136,0.106667,0.092
7,blanket,0.046667,-0.013333,-0.033333,0.106667,0.1,0.153333,0.156
28,random ordering,0.043333,0.073333,0.05,0.013333,0.0,-0.026667,-0.04
22,plant,0.043333,0.006667,-0.026667,0.08,0.084,0.126667,0.112
9,picture frame,0.036667,0.02,-0.033333,0.053333,0.064,0.1,0.12
25,flower,0.023333,-0.053333,-0.04,0.1,0.072,0.1,0.068
15,blank card,0.023333,-0.02,-0.036667,0.066667,0.052,0.113333,0.084
0,wall art,0.02,-0.006667,-0.05,0.046667,0.044,0.046667,0.012
17,toy box,0.016667,-0.066667,-0.04,0.1,0.072,0.106667,0.1
