In [1]:
import sys
sys.path.append('/cmlscratch/snawathe/MLLM-Spurious/pipeline')

In [2]:
import pandas as pd
from aggregate_results import get_results_df, compute_single_score, compute_scores
import json

In [3]:
# Load the COCO val2017 instance annotations. Among the cells visible here,
# only the 'categories' list is consulted (by get_coco_cat_name).
# Explicit UTF-8 so parsing does not depend on the locale's default encoding.
with open('/fs/cml-datasets/coco/annotations/instances_val2017.json', 'r', encoding='utf-8') as f:
	instances_data = json.load(f)
def get_coco_cat_name(coco_cat_idx: int) -> str:
	"""Return the name of the COCO category whose ``id`` equals ``coco_cat_idx``.

	The lookup scans ``instances_data['categories']`` (module-level annotation
	data) and returns the ``name`` of the first entry with a matching ``id``.

	Args:
		coco_cat_idx: Category id to look up.

	Returns:
		The ``name`` field of the first matching category entry.

	Raises:
		IndexError: If no category has the requested id. The exception type is
			kept as ``IndexError`` for existing callers; the message names the
			missing id.
	"""
	for category in instances_data['categories']:
		if category['id'] == coco_cat_idx:
			return category['name']
	raise IndexError(f'no COCO category with id {coco_cat_idx!r}')

In [4]:
def compute_coco_scores(
	coco_cat_idx: int,
	mllm_name: str = 'qwen',
	ranking_model: str = 'dino',
	include_random: bool = True,
) -> pd.DataFrame:
	"""Print a COCO category's name and return its aggregated scores.

	The dataset is named ``coco-<id>`` and the spurious-feature file
	``coco-<id>.txt``; both are forwarded to ``compute_scores`` together with
	the remaining arguments.

	Args:
		coco_cat_idx: COCO category id.
		mllm_name: Second positional argument passed to ``compute_scores``
			(default ``'qwen'``). Presumably the MLLM whose results are scored;
			confirm against ``aggregate_results.compute_scores``.
		ranking_model: Third positional argument passed to ``compute_scores``
			(default ``'dino'``). Presumably the model that produced the image
			rankings; confirm against ``aggregate_results.compute_scores``.
		include_random: Forwarded as the ``include_random`` keyword
			(default ``True``).

	Returns:
		Whatever ``compute_scores`` returns (annotated as a DataFrame).
	"""
	dataset_name = f'coco-{coco_cat_idx}'
	# Printed so each output table is labelled with a human-readable category.
	print(get_coco_cat_name(coco_cat_idx))
	return compute_scores(
		dataset_name,
		mllm_name,
		ranking_model,
		f'{dataset_name}.txt',
		include_random=include_random,
	)

In [5]:
import sys
sys.path.append('/cmlscratch/snawathe/MLLM-Spurious/pipeline')
from env_vars import PIPELINE_STORAGE_DIR
from utils import format_name
import os
import pickle as pkl
# Configuration for the manual per-feature breakdown below.
coco_cat_idx = 14
dataset_name = f'coco-{coco_cat_idx}'
mllm_name = 'qwen'
ranking_model = 'owl'
spur_feat_file = f'coco-{coco_cat_idx}.txt'
K = 50  # number of image indices taken from each end of a feature's ranking


def _mean_correct(results, img_idxs, img_type, prompt_type='unbiased'):
	"""Mean of ``correct`` over rows matching the image indices, image type and prompt type."""
	selected = (
		results.img_idx.isin(img_idxs)
		& (results.img_type == img_type)
		& (results.prompt_type == prompt_type)
	)
	return results[selected].correct.mean()


def _plus_minus_gap(results, plus_idxs, minus_idxs, img_type, prompt_type='unbiased'):
	"""Return ``(plus, minus, plus - minus)`` mean-``correct`` scores for the two index groups."""
	plus = _mean_correct(results, plus_idxs, img_type, prompt_type)
	minus = _mean_correct(results, minus_idxs, img_type, prompt_type)
	return plus, minus, plus - minus


# One spurious-feature name per line of the feature file.
with open(os.path.join(PIPELINE_STORAGE_DIR, 'spurious_features', spur_feat_file), 'r') as f:
	all_spur_features = [line.strip() for line in f]
df = get_results_df(dataset_name, mllm_name)

lst = []
for spur_feat in all_spur_features:
	spur_feat_name = format_name(spur_feat)
	ranking_path = os.path.join(PIPELINE_STORAGE_DIR, 'rankings', dataset_name, ranking_model, f"{spur_feat_name}.pkl")
	# NOTE(review): pickle.load can execute arbitrary code; this assumes the
	# ranking files were written by this pipeline — confirm before pointing
	# it at files from elsewhere.
	with open(ranking_path, 'rb') as f:
		sorted_idxs = pkl.load(f)

	# First K and last K entries of the ranking. Presumably the ranking is
	# ascending in feature presence, so "top" = feature most present — TODO confirm.
	# If the ranking holds fewer than 2*K entries the two groups overlap.
	bot_idxs = sorted_idxs[:K]
	top_idxs = sorted_idxs[-K:]

	# Natural images: "plus" is the top-K group, "minus" the bottom-K group.
	score_natural_unbiased_plus, score_natural_unbiased_minus, score_natural_unbiased = _plus_minus_gap(
		df, top_idxs, bot_idxs, 'natural')
	# Masked and dropped images: the roles are swapped (plus = bottom-K, minus = top-K).
	score_masked_unbiased_plus, score_masked_unbiased_minus, score_masked_unbiased = _plus_minus_gap(
		df, bot_idxs, top_idxs, 'masked')
	score_dropped_unbiased_plus, score_dropped_unbiased_minus, score_dropped_unbiased = _plus_minus_gap(
		df, bot_idxs, top_idxs, 'dropped')

	# Only the unbiased prompt is reported in this breakdown.
	lst.append({
		'spur_feature_name': spur_feat_name,
		'hallucination_score': (score_natural_unbiased + score_masked_unbiased) / 2,
		'score_natural_unbiased_plus': score_natural_unbiased_plus,
		'score_natural_unbiased_minus': score_natural_unbiased_minus,
		'score_natural_unbiased': score_natural_unbiased,
		'score_masked_unbiased_plus': score_masked_unbiased_plus,
		'score_masked_unbiased_minus': score_masked_unbiased_minus,
		'score_masked_unbiased': score_masked_unbiased,
		'score_dropped_unbiased_plus': score_dropped_unbiased_plus,
		'score_dropped_unbiased_minus': score_dropped_unbiased_minus,
		'score_dropped_unbiased': score_dropped_unbiased,
	})

# One row per spurious feature, highest hallucination_score first.
res_df = pd.DataFrame(lst).sort_values(by='hallucination_score', ascending=False)

In [6]:
# Per-feature breakdown built in the previous cell, sorted by hallucination_score (highest first).
res_df

Unnamed: 0,spur_feature_name,hallucination_score,score_natural_unbiased_plus,score_natural_unbiased_minus,score_natural_unbiased,score_masked_unbiased_plus,score_masked_unbiased_minus,score_masked_unbiased,score_dropped_unbiased_plus,score_dropped_unbiased_minus,score_dropped_unbiased
2,sticker,0.25,1.0,0.693333,0.306667,0.726667,0.533333,0.193333,0.82,0.713333,0.106667
14,metal,0.19,0.94,0.72,0.22,0.8,0.64,0.16,0.813333,0.7,0.113333
4,timer,0.15,0.94,0.746667,0.193333,0.72,0.613333,0.106667,0.873333,0.619048,0.254286
23,pavement,0.126667,0.92,0.833333,0.086667,0.686667,0.52,0.166667,0.78,0.593333,0.186667
16,coin,0.096667,0.9,0.766667,0.133333,0.72,0.66,0.06,0.82,0.753333,0.066667
19,pole,0.066667,0.866667,1.0,-0.133333,0.82,0.553333,0.266667,0.77551,0.586667,0.188844
20,curb,0.063333,0.88,0.94,-0.06,0.753333,0.566667,0.186667,0.809524,0.7,0.109524
9,paper,0.046667,0.806667,0.746667,0.06,0.7,0.666667,0.033333,0.806667,0.693333,0.113333
3,bicycle,0.036667,0.76,0.82,-0.06,0.76,0.626667,0.133333,0.76,0.86,-0.1
0,graffiti,0.036667,0.82,0.846667,-0.026667,0.673333,0.573333,0.1,0.72,0.74,-0.02


In [7]:
# Baseline: mean of `correct` over every natural-image row under the unbiased
# prompt, with no top/bottom-K restriction.
is_natural_unbiased = (df.prompt_type == 'unbiased') & (df.img_type == 'natural')
df.loc[is_natural_unbiased, 'correct'].mean()

0.7976359338061466

---

In [8]:
# COCO category 14 ("parking meter"): the same category as the manual breakdown
# above, but scored via compute_scores with the 'dino' ranking instead of 'owl'.
compute_coco_scores(14)

parking meter


Unnamed: 0,spur_feature_name,hallucination_score,score_natural_unbiased,score_natural_sycophantic,score_masked_unbiased,score_masked_sycophantic,score_dropped_unbiased,score_dropped_sycophantic
20,curb,0.206667,0.246667,0.12,0.166667,0.132,0.1,0.092
14,metal,0.206667,0.313333,0.166667,0.1,0.072,0.002993,-0.028653
6,bicycle rack,0.206667,0.16,0.06,0.253333,0.176,0.04,0.04
19,pole,0.193333,0.22,0.24,0.166667,0.104,0.064354,0.04849
7,trash can,0.193333,0.36,0.183333,0.026667,-0.036,0.06966,0.027673
9,paper,0.18,0.18,0.116667,0.18,0.108,0.157823,0.064245
17,sidewalk,0.166667,0.106667,-0.01,0.226667,0.152,0.220272,0.211429
5,sign,0.12,0.193333,0.16,0.046667,-0.052,-0.026667,-0.092
3,bicycle,0.093333,0.106667,0.02,0.08,0.02,-0.097551,-0.064653
23,pavement,0.083333,0.046667,0.023333,0.12,0.064,0.100408,0.102531


In [9]:
# COCO category 22 ("elephant").
compute_coco_scores(22)

elephant


Unnamed: 0,spur_feature_name,hallucination_score,score_natural_unbiased,score_natural_sycophantic,score_masked_unbiased,score_masked_sycophantic,score_dropped_unbiased,score_dropped_sycophantic
21,sunlight,0.036667,0.06,0.083333,0.013333,-0.004,-0.006667,0.02
1,dust,0.036667,0.08,0.073333,-0.006667,-0.012,-0.02,-0.032
7,waterhole,0.03,0.04,0.023333,0.02,0.02,-0.012245,0.005224
17,footprint,0.026667,0.06,0.03,-0.006667,-0.02,-0.02,-0.04
18,herd,0.02,0.106667,0.09,-0.066667,-0.024,-0.073333,-0.064
22,flower,0.02,0.02,-0.013333,0.02,0.064,0.085532,0.066383
4,sun,0.02,0.02,0.006667,0.02,0.016,-0.006667,-0.02
6,mud,0.02,0.06,0.016667,-0.02,0.04,-0.059167,-0.054833
24,random ordering,0.01,0.0,0.016667,0.02,0.012,0.08,0.072
9,path,0.006667,0.02,0.016667,-0.006667,0.016,-0.00539,-0.025447


In [10]:
# COCO category 25 ("giraffe").
compute_coco_scores(25)

giraffe


Unnamed: 0,spur_feature_name,hallucination_score,score_natural_unbiased,score_natural_sycophantic,score_masked_unbiased,score_masked_sycophantic,score_dropped_unbiased,score_dropped_sycophantic
13,baby,0.046667,0.02,-0.036667,0.073333,0.068,0.053333,0.04
22,random ordering,0.036667,0.0,-0.003333,0.073333,0.048,0.046667,0.036
6,waterhole,0.016667,0.02,-0.026667,0.013333,-0.016,0.073333,0.004
10,sunset,0.013333,0.0,-0.02,0.026667,0.044,-0.033333,-0.036
18,camera,0.013333,0.0,-0.026667,0.026667,0.02,0.02,0.072
21,safari,0.01,0.02,-0.033333,0.0,0.068,-0.04,-0.084
4,sky,0.006667,-0.02,-0.026667,0.033333,-0.012,-0.04,-0.064
7,acacia,0.0,0.0,-0.006667,0.0,0.012,0.0,-0.004
8,fence,-0.003333,0.006667,-0.033333,-0.013333,-0.004,0.013333,0.0
5,savannah,-0.006667,0.0,-0.03,-0.013333,0.04,-0.04,-0.032


In [11]:
# COCO category 43 ("tennis racket").
compute_coco_scores(43)

tennis racket


Unnamed: 0,spur_feature_name,hallucination_score,score_natural_unbiased,score_natural_sycophantic,score_masked_unbiased,score_masked_sycophantic,score_dropped_unbiased,score_dropped_sycophantic
19,player,0.21,0.133333,-0.043333,0.286667,0.42,0.193333,0.296
5,sunscreen,0.186667,0.22,0.2,0.153333,0.076,0.026667,-0.092
8,headband,0.166667,0.153333,-0.03,0.18,0.244,0.026667,-0.004
12,spectator,0.156667,0.153333,0.016667,0.16,0.228,0.153333,0.2
10,sunglass,0.14,0.14,0.006667,0.14,0.156,0.013333,-0.02
14,ball,0.13,0.146667,0.023333,0.113333,0.112,0.073333,0.052
21,referee,0.13,0.073333,0.066667,0.186667,0.14,0.113333,0.104
9,shoe,0.09,-0.006667,-0.09,0.186667,0.2,0.14,0.152
11,ball can,0.09,0.126667,0.086667,0.053333,0.02,0.02,-0.076
15,water bottle,0.08,0.093333,0.003333,0.066667,-0.024,-0.053333,-0.14


In [12]:
# COCO category 60 ("donut").
compute_coco_scores(60)

donut


Unnamed: 0,spur_feature_name,hallucination_score,score_natural_unbiased,score_natural_sycophantic,score_masked_unbiased,score_masked_sycophantic,score_dropped_unbiased,score_dropped_sycophantic
24,box,0.076667,0.12,0.136667,0.033333,0.068,0.2,0.188
19,sign,0.076667,0.073333,0.113333,0.08,0.02,-0.026667,0.008
1,window,0.023333,0.013333,-0.043333,0.033333,0.052,-0.013333,0.06
7,shelf,0.02,0.14,0.113333,-0.1,-0.036,-0.04,-0.056
11,syrup,0.006667,0.12,0.036667,-0.106667,-0.096,-0.226667,-0.168
26,random ordering,0.003333,-0.006667,0.02,0.013333,-0.02,-0.086667,-0.116
17,milk,-0.01,0.06,0.073333,-0.08,-0.02,-0.12,-0.04
12,nut,-0.01,0.133333,0.103333,-0.153333,-0.172,-0.193333,-0.228
14,wrapper,-0.013333,0.066667,0.06,-0.093333,-0.128,-0.141905,-0.117714
25,light,-0.026667,0.013333,-0.003333,-0.066667,-0.076,-0.053333,-0.024


In [13]:
# COCO category 80 ("toaster").
compute_coco_scores(80)

toaster


Unnamed: 0,spur_feature_name,hallucination_score,score_natural_unbiased,score_natural_sycophantic,score_masked_unbiased,score_masked_sycophantic,score_dropped_unbiased,score_dropped_sycophantic
1,gift box,0.113333,0.226667,0.196667,0.0,-0.068,-0.02,-0.06
17,toy box,0.106667,0.2,0.176667,0.013333,-0.072,0.02,-0.048
26,lamp,0.08,0.006667,-0.06,0.153333,0.06,0.133333,0.064
0,wall art,0.073333,0.046667,0.04,0.1,0.008,0.04,-0.024
23,rug,0.073333,0.086667,0.023333,0.06,-0.02,-0.02,-0.072
7,blanket,0.043333,0.08,0.04,0.006667,0.016,-0.053333,-0.036
22,plant,0.04,0.013333,0.003333,0.066667,0.056,0.026667,0.028
21,cushion,0.033333,0.086667,0.016667,-0.02,-0.024,-0.033333,-0.008
16,bowl,0.033333,0.066667,0.056667,0.0,-0.008,0.006667,0.016
5,bookshelf,0.023333,-0.02,-0.013333,0.066667,0.088,0.073333,0.044
