# Import all 15 original + 2 additional (stimuli99) user submissions and generate a final dataframe of validated attention maps

In [1]:
import os
import pandas as pd
from collections import Counter

# Load the merged submissions dataframe from the pickle file and preview
# its first two rows.
merged_path = "1_merged_prolific_data.pickle"
df = pd.read_pickle(merged_path)
df.head(n=2)

Unnamed: 0,instance_id,mask,points_clicked,filename,answer,time_to_submit,prolific_session_id,nr_clicks,caption,foil,linguistic_phenomena,Approval,clip_pred_diff,img_path
0,20240209102845833682,"[[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,...","[(177.5, 226), (65.5, 275), (67.5, 283), (367....",000000057027.jpg,caption,76.445115,65c5fe51fd295062720c7d34,6,A baby elephant walks with an older elephant,A baby elephant walks past an older elephant,relations,Approve,0.524429,../VALSE_data/images/coco2017/000000057027.jpg
1,20240209102845833682,"[[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,...",[],000000089880.jpg,caption,22.89888,65c5fe51fd295062720c7d34,0,Two dogs are playing together on the grass,Two dogs are playing together under the grass,relations,Approve,0.743427,../VALSE_data/images/coco2017/000000089880.jpg


### Also create a version without the training stimulus

In [2]:
# Exclude every row of the training stimulus (the cow image), then show
# how many submissions remain.
keep_rows = df['filename'].ne('v7w_2391321.jpg')
df_without_train_stim = df.loc[keep_rows]
len(df_without_train_stim)

529

In [3]:
# Number of distinct filenames left once the training stimulus is removed.
remaining_stimuli = set(df_without_train_stim['filename'])
len(remaining_stimuli)

99

### Validation of user-stimulus interactions

1. Filter to only the rows with clicked points


In [4]:
# Keep only the rows whose `points_clicked` value is truthy; casting the
# column to bool turns an empty list of points into False.
has_clicks = df_without_train_stim['points_clicked'].astype(bool)
filter_df = df_without_train_stim.loc[has_clicks]
len(filter_df)

414

2. Filter those to only rows with answer == caption

In [5]:
# Of the rows that have clicks, retain those whose answer is "caption".
chose_caption = filter_df['answer'].eq("caption")
filter_df = filter_df.loc[chose_caption]
len(filter_df)

357

In [6]:
# Number of filenames that have at least one validated submission.
validated_stimuli = set(filter_df['filename'])
len(validated_stimuli)

98

3. Run stats on the filtered rows to see how many validated submissions remain for each file

In [7]:
# Count the validated submissions per filename, then tabulate how many
# filenames share each count.
filename_counts = filter_df.groupby(['filename']).size()
headline = "The following lists how many filenames have each count of submissions that are correct and have points clicked"
print(headline)
# The tallies sum to 98, not 99: one file evidently has NO submission
# that combines the correct caption with clicked points.
filename_counts.value_counts()

The following lists how many filenames have each count of submissions that are correct and have points clicked


4    38
3    37
5    17
2     3
1     3
Name: count, dtype: int64

In [8]:
# Cross-check: number of filename groups among the validated submissions.
groups_by_file = filter_df.groupby(['filename'])
len(groups_by_file)

98

In [9]:
# One file is MISSING from the validated submissions -- which one?
#
# A set difference finds every filename that is absent from `filter_df`.
# The previous loop kept only the last match (silently hiding any others),
# left `missing` undefined when nothing was absent, and rebuilt the set of
# validated filenames on every iteration.
all_filenames = set(df_without_train_stim.filename)
validated_filenames = set(filter_df.filename)
missing_filenames = sorted(all_filenames - validated_filenames)

# The analysis expects exactly one such file; make that invariant explicit
# so a different count is reported clearly instead of going unnoticed.
assert len(missing_filenames) == 1, (
    "expected exactly one stimulus without validated submissions, "
    f"found {len(missing_filenames)}: {missing_filenames}"
)
missing = missing_filenames[0]
missing

'000000389684.jpg'

4. Generate the list of filenames that have 3+ correct answers with points clicked

In [10]:
# Minimum number of validated submissions (correct answer + clicked points)
# a stimulus needs in order to be kept.
MIN_VALIDATED_SUBMISSIONS = 3

# Derive the stimulus total from the data instead of hard-coding it, so the
# printed summary cannot drift out of sync with the dataframe.
n_stimuli_total = len(set(df_without_train_stim.filename))

# Filenames whose validated-submission count reaches the threshold.
filtered_filenames = filename_counts[filename_counts >= MIN_VALIDATED_SUBMISSIONS].index
print(f"After filtering the dataframe we found that {len(filtered_filenames)} out of {n_stimuli_total} stimuli had at least {MIN_VALIDATED_SUBMISSIONS}/5 answers that were both correct and contained points")

After filtering the dataframe we found that 92 out of 99 stimuli had at least 3/5 answers that were both correct and contained points


5. Generate dataframe containing only filenames **which have 3+ validated submissions**

In [11]:
# Restrict the validated submissions to the filenames listed in
# `filtered_filenames`, then preview the first row.
on_keep_list = filter_df['filename'].isin(filtered_filenames)
filter_df = filter_df.loc[on_keep_list]
filter_df.head(n=1)

Unnamed: 0,instance_id,mask,points_clicked,filename,answer,time_to_submit,prolific_session_id,nr_clicks,caption,foil,linguistic_phenomena,Approval,clip_pred_diff,img_path
0,20240209102845833682,"[[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,...","[(177.5, 226), (65.5, 275), (67.5, 283), (367....",000000057027.jpg,caption,76.445115,65c5fe51fd295062720c7d34,6,A baby elephant walks with an older elephant,A baby elephant walks past an older elephant,relations,Approve,0.524429,../VALSE_data/images/coco2017/000000057027.jpg


### How many stimuli have 3+ validated submissions?

Out of 99 original stimuli

In [12]:
# Report how many distinct filenames remain in the filtered dataframe.
n_kept_stimuli = len(set(filter_df.filename))
print("Number of stimuli for which there exist 3+ submissions with 1) correct caption and 2) at least one clic:", n_kept_stimuli)

Number of stimuli for which there exist 3+ submissions with 1) correct caption and 2) at least one clic: 92


Some other stats

In [13]:
# Sanity checks on the final dataframe: the distribution of answers and
# the smallest number of clicked points in any remaining row.
answer_tally = filter_df.answer.value_counts()
print("Value counts for 'answer' in the final dataframe:\n", answer_tally)
fewest_clicks = min(map(len, filter_df['points_clicked']))
print("Minimum number of points clicked in final dataframe:", fewest_clicks)

Value counts for 'answer' in the final dataframe:
 answer
caption    348
Name: count, dtype: int64
Minimum number of points clicked in final dataframe: 1


### Export final dataframe - submissions to keep

In [14]:
# Write the filtered dataframe to disk as a pickle file.
filtered_path = "2_merged_prolific_data_filtered.pickle"
filter_df.to_pickle(filtered_path)