In [89]:
import pandas as pd

### Get distance metrics dataframe

In [90]:
# Load the pickled distance-metrics dataframe and list its columns.
# NOTE(review): unpickling can execute arbitrary code -- only load files from a trusted source.
distance_metrics_path = '../comparison_metrics/2_xai_with_distances.pickle'
distance_metrics = pd.read_pickle(distance_metrics_path)
distance_metrics.columns

Index(['filename', 'index', 'img_path', 'caption', 'foil',
       'linguistic_phenomena', 'model_shap_map', 'n_image_variants', 'model',
       'model_shap_positive_normalized', 'human_map',
       'shuffled_column_human_map', 'EMD_to_human', 'RC_to_human',
       'EMD_to_scrambled_human', 'RC_to_scrambled_human',
       'EMD_to_shuffled_column_human', 'RC_to_shuffled_column_human'],
      dtype='object')

In [91]:
# Number of rows in the distance-metrics table.
distance_metrics.shape[0]

368

### Get each model's output on each of 99 stimuli into a single dataframe

Then filter down to 92 stimuli

In [92]:
import os

models = ["LXMERT", "CLIP", "FLAVA", "SigLip"]

# Read each model's scores on the experiment stimuli, label every table with
# the model that produced it, then stack the tables into one long dataframe.
per_model_frames = []
for model_name in models:
    scores = pd.read_pickle(f"../xai_maps/xai_output/{model_name}_stimuli_output_scores.pickle")
    scores['model'] = model_name
    per_model_frames.append(scores)

stimuli_outputs = pd.concat(per_model_frames)

# image file name without its directory part
stimuli_outputs['filename'] = list(map(os.path.basename, stimuli_outputs['img_path']))

# keep only the stimuli that also appear in distance_metrics, i.e. the 92 stimuli
# for which a human map (and therefore a human distance) could be generated
has_human_distance = stimuli_outputs['filename'].isin(distance_metrics['filename'].tolist())
stimuli_outputs = stimuli_outputs[has_human_distance]

len(stimuli_outputs)

368

In [93]:
# Display the column labels of the stimuli_outputs dataframe.
stimuli_outputs.columns

Index(['img_path', 'caption', 'foil', 'linguistic_phenomena',
       'model_pred_caption', 'model_pred_foil', 'model_pred_diff', 'model',
       'filename'],
      dtype='object')

### Get all VALSE outputs for each model into a single dataframe

This includes ALL outputs, including those for items that are not stimuli in our experiment.

In [94]:
models = ["LXMERT", "CLIP", "FLAVA", "SigLip"]

# Read each model's scores over all of VALSE (not only the experiment stimuli),
# label every table with the model that produced it, then stack the tables.
all_valse_frames = []
for model_name in models:
    scores = pd.read_pickle(f"../xai_maps/xai_output/{model_name}_all_of_valse_output_scores.pickle")
    scores['model'] = model_name
    all_valse_frames.append(scores)

# reset_index() without drop=True renumbers the rows and keeps the previous
# row labels as an 'index' column
valse_outputs = pd.concat(all_valse_frames).reset_index()

# image file name without its directory part
valse_outputs['filename'] = list(map(os.path.basename, valse_outputs['img_path']))

len(valse_outputs)


10548

In [95]:
# Peek at the last row of the combined VALSE table.
valse_outputs.iloc[-1:]

Unnamed: 0,index,img_path,caption,foil,linguistic_phenomena,model_pred_caption,model_pred_foil,model_pred_diff,model,filename
10547,2636,../../VALSE_data/images/swig/rotting_182.jpg,A hamburger rots the table.,A table rots like a hamburger.,actions,-8.456455,-6.422833,-2.033622,SigLip,rotting_182.jpg


Export

# 1. Create table of distance metrics and model outputs for main 92 stimuli

### Define which columns to keep from each DF

In [96]:
# Columns carried over from each source table. Both selections keep the merge
# keys ('filename', 'model'); caption, foil and phenomenon are taken from the
# distance table only, so the merge produces no suffixed duplicate columns.
distance_columns = ['filename', 'caption', 'foil', 'linguistic_phenomena', 'model', 'model_shap_positive_normalized', 'human_map', 'RC_to_human', 'EMD_to_human']
output_columns = ['filename', 'model', 'model_pred_caption', 'model_pred_foil', 'model_pred_diff']

distance_metrics_filtered = distance_metrics[distance_columns]
stimuli_outputs_filtered = stimuli_outputs[output_columns]

# Inner join on (filename, model): only key pairs present in both tables are kept.
# validate='one_to_one' makes pandas raise a MergeError if a key pair is
# duplicated on either side, instead of silently multiplying rows.
merged_df = pd.merge(
    distance_metrics_filtered,
    stimuli_outputs_filtered,
    on=['filename', 'model'],
    how='inner',
    validate='one_to_one',
)
merged_df

Unnamed: 0,filename,caption,foil,linguistic_phenomena,model,model_shap_positive_normalized,human_map,RC_to_human,EMD_to_human,model_pred_caption,model_pred_foil,model_pred_diff
0,licking_291.jpg,A cow licks its fur.,A cow wrinkles its fur.,actions,LXMERT,"[[0.07359886674465907, 0.02998819329604594, 0....","[[0.0014743265527438847, 0.01573516504970489, ...",-0.058824,0.983263,0.004516,0.018551,-0.014034
1,counting_103.jpg,A person counts money.,A person pays money.,actions,LXMERT,"[[0.10524416414966785, 0.13240805706235903, 0....","[[0.004482754295119378, 0.029023927486055416, ...",-0.464706,1.118272,0.001854,0.001183,0.000671
2,v7w_2411632.jpg,There are no people in the photo.,There is at least one person in the photo.,existence,LXMERT,"[[0.005573892832708572, 0.0672021870488928, 0....","[[7.25101021142701e-07, 0.006881846083901585, ...",0.300000,0.897321,0.516299,0.030346,0.485952
3,urinating_251.jpg,A man urinates against a wall.,A man skates against a wall.,actions,LXMERT,"[[0.0298257918219853, 0.13789591026754916, 0.0...","[[0.012884411289709949, 0.0253810418266228, 0....",0.338235,0.714441,0.602664,0.011472,0.591192
4,v7w_2390850.jpg,There are no people in the water.,There is at least one person in the water.,existence,LXMERT,"[[0.034739605778902866, 0.05373696695722698, 0...","[[0.04304823942671234, 0.3141493061042169, 0.2...",0.522446,1.023692,0.011590,0.001473,0.010117
...,...,...,...,...,...,...,...,...,...,...,...,...
363,v7w_2363335.jpg,There is a mast on the closest boat.,There is no mast on the closest boat.,existence,FLAVA,"[[0.10762498903891778, 0.10600056876510631, 0....","[[0.0, 0.0, 0.0, 0.0], [0.0038761918414842236,...",-0.265192,1.613585,14.154673,14.714719,-0.560046
364,v7w_2367084.jpg,There is a giraffe.,There is no giraffe.,existence,FLAVA,"[[0.11606520796315897, 0.0032524290430023346, ...","[[0.058031671761254025, 0.011923543607465443, ...",0.382132,0.811894,15.711895,15.363859,0.348036
365,v7w_2373121.jpg,There are no clouds in the sky.,There is at least one cloud in the sky.,existence,FLAVA,"[[0.1333684719184068, 0.02332404943837514, 0.0...","[[0.06944438863843619, 0.3078465814086114, 0.2...",0.185431,1.103381,9.280195,9.946142,-0.665947
366,v7w_2390850.jpg,There are no people in the water.,There is at least one person in the water.,existence,FLAVA,"[[0.010906337368241143, 0.021492206313915848, ...","[[0.04304823942671234, 0.3141493061042169, 0.2...",-0.176139,1.478958,21.434097,17.754848,3.679249


Check that it worked with an example

In [97]:
# First merged row for one example stimulus, to compare against the source tables.
merged_df.loc[merged_df['filename'].eq('counting_103.jpg')].iloc[:1]

Unnamed: 0,filename,caption,foil,linguistic_phenomena,model,model_shap_positive_normalized,human_map,RC_to_human,EMD_to_human,model_pred_caption,model_pred_foil,model_pred_diff
1,counting_103.jpg,A person counts money.,A person pays money.,actions,LXMERT,"[[0.10524416414966785, 0.13240805706235903, 0....","[[0.004482754295119378, 0.029023927486055416, ...",-0.464706,1.118272,0.001854,0.001183,0.000671


In [98]:
# First row for the same example stimulus in the distance-metrics source table.
distance_metrics.loc[distance_metrics['filename'].eq('counting_103.jpg')].iloc[:1]

Unnamed: 0,filename,index,img_path,caption,foil,linguistic_phenomena,model_shap_map,n_image_variants,model,model_shap_positive_normalized,human_map,shuffled_column_human_map,EMD_to_human,RC_to_human,EMD_to_scrambled_human,RC_to_scrambled_human,EMD_to_shuffled_column_human,RC_to_shuffled_column_human
2,counting_103.jpg,2,../../VALSE_data/images/swig/counting_103.jpg,A person counts money.,A person pays money.,actions,"[[-0.0010253147265757434, 0.001289952102524694...",172,LXMERT,"[[0.10524416414966785, 0.13240805706235903, 0....","[[0.004482754295119378, 0.029023927486055416, ...","[[0.058031671761254025, 0.011923543607465443, ...",1.118272,-0.464706,0.750563,0.170588,1.144368,0.238833


In [99]:
# First row for the same example stimulus in the model-output source table.
stimuli_outputs.loc[stimuli_outputs['filename'].eq('counting_103.jpg')].iloc[:1]

Unnamed: 0,img_path,caption,foil,linguistic_phenomena,model_pred_caption,model_pred_foil,model_pred_diff,model,filename
20,../../VALSE_data/images/swig/counting_103.jpg,A person counts money.,A person pays money.,actions,0.001854,0.001183,0.000671,LXMERT,counting_103.jpg


### Export

In [100]:
# Write the merged distances + model-outputs table to the working directory.
merged_table_path = "distances_outputs_table.pickle"
merged_df.to_pickle(merged_table_path)

# 2. Get VALSE summary stats for RQ2

In [101]:
# Peek at the first row of the combined VALSE table.
valse_outputs.iloc[:1]

Unnamed: 0,index,img_path,caption,foil,linguistic_phenomena,model_pred_caption,model_pred_foil,model_pred_diff,model,filename
0,0,../../VALSE_data/images/coco2017/000000462576.jpg,Breakfast items including juice are on the table.,Breakfast items including juice are off the ta...,relations,0.056552,0.087249,-0.030697,LXMERT,000000462576.jpg


In [102]:
from collections import defaultdict
import numpy as np

# Phenomena that get their own accuracy row. The row labels below are generated
# from this list so that labels and values cannot drift out of step.
phenomena = ['existence', 'relations', 'actions']


def _accuracy(frame):
    """Return the fraction of rows in `frame` whose 'model_pred_diff' is >= 0.

    Returns NaN for an empty frame instead of raising ZeroDivisionError.
    NOTE(review): a difference of exactly 0 counts as correct here -- confirm
    that ties are meant to be scored this way.
    """
    if len(frame) == 0:
        return float('nan')
    n_correct = int((frame['model_pred_diff'] >= 0).sum())
    return n_correct / len(frame)


# One column of row labels plus one column of values per model.
model_stats_valse = defaultdict(list)
model_stats_valse['metric (VALSE wide model output)'] = (
    ['mean', 'std', 'acc_overall'] + [f'acc_{phenomenon}' for phenomenon in phenomena]
)

for model in models:
    filtered_df = valse_outputs[valse_outputs['model'] == model]

    # mean and spread of the score difference for this model
    # (np.std is called with its default ddof=0, i.e. the population standard deviation)
    diffs = list(filtered_df['model_pred_diff'])
    model_stats_valse[model].append(np.mean(diffs))
    model_stats_valse[model].append(np.std(diffs))

    # accuracy over every row belonging to this model
    model_stats_valse[model].append(_accuracy(filtered_df))

    # accuracy restricted to each linguistic phenomenon, in label order
    for phenomenon in phenomena:
        sub_filtered_df = filtered_df[filtered_df['linguistic_phenomena'] == phenomenon]
        model_stats_valse[model].append(_accuracy(sub_filtered_df))

valse_stats_df = pd.DataFrame(model_stats_valse)

In [103]:
# Write the VALSE summary statistics table to the working directory.
valse_stats_path = "valse_statistics.pickle"
valse_stats_df.to_pickle(valse_stats_path)

In [104]:
# Display the summary table: one row per metric, one column per model.
valse_stats_df

Unnamed: 0,metric (VALSE wide model output),LXMERT,CLIP,FLAVA,SigLip
0,mean,0.010805,0.953458,1.30002,1.02405
1,std,0.164757,1.709227,2.900839,2.333226
2,acc_overall,0.532802,0.715965,0.672355,0.667804
3,acc_existence,0.768317,0.691089,0.623762,0.69505
4,acc_relations,0.605607,0.659813,0.628037,0.605607
5,acc_actions,0.433939,0.742642,0.702567,0.680025
