In [1]:
import os
import pandas as pd
import pickle

The original data behind this data processing pipeline is backuped up on Google Drive: https://drive.google.com/drive/folders/1FY8oO7P_BEuL8dkFaGep8dHq3yi3KgZi

### Import all .pickle files for each user and create one huge dataframe

This dataframe contains every user submitted answer and mask. One row corresponds to one user-stimulus combination

In [2]:
folder = "study_raw_output"
files = os.listdir(folder)
files = [file for file in files if ".pickle" in file ]

In [3]:
files

['20240209102845833682_stimuli99.pickle',
 '20240202153131339291_stimuli1.pickle',
 '20240203150500009862_stimuli2.pickle',
 '20240204142931678609_stimuli3.pickle',
 '20240203150423202294_stimuli2.pickle',
 '20240204143836677916_stimuli3.pickle',
 '20240203142645126357_stimuli2.pickle',
 '20240202153133853542_stimuli1.pickle',
 '20240209135843972721_stimuli99.pickle',
 '20240203150425019658_stimuli2.pickle',
 '20240201125402294500_stimuli1.pickle',
 '20240201125949942734_stimuli1.pickle',
 '20240204142942472359_stimuli3.pickle',
 '20240203162227680482_stimuli3.pickle',
 '20240203083058032730_stimuli1.pickle',
 '20240203150741846867_stimuli2.pickle',
 '20240204143913560556_stimuli3.pickle']

In [4]:
dfs = [pickle.load(open(os.path.join(folder, file), "rb")).sort_values(by='filename') for file in files]
final_df = pd.concat(dfs)
final_df["nr_clicks"] = [len(points) for points in final_df['points_clicked']]
final_df

Unnamed: 0,instance_id,mask,points_clicked,filename,answer,time_to_submit,img_path,prolific_session_id,nr_clicks
6,20240209102845833682,"[[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,...","[(177.5, 226), (65.5, 275), (67.5, 283), (367....",000000057027.jpg,caption,76.445115,app/img/000000057027_blur_0.jpg,65c5fe51fd295062720c7d34,6
5,20240209102845833682,"[[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,...",[],000000089880.jpg,caption,22.898880,app/img/000000089880_blur_0.jpg,65c5fe51fd295062720c7d34,0
1,20240209102845833682,"[[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,...","[(92.5, 190), (158.5, 242), (201.5, 279)]",000000121506.jpg,caption,56.860392,app/img/000000121506_blur_0.jpg,65c5fe51fd295062720c7d34,3
8,20240209102845833682,"[[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,...","[(241.5, 311), (335.5, 328), (322.5, 334), (34...",000000121586.jpg,foil,77.049413,app/img/000000121586_blur_0.jpg,65c5fe51fd295062720c7d34,7
9,20240209102845833682,"[[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,...","[(335.5, 221), (271.5, 147), (138.5, 195), (77...",000000164885.jpg,Cannot answer,110.092230,app/img/000000164885_blur_0.jpg,65c5fe51fd295062720c7d34,8
...,...,...,...,...,...,...,...,...,...
11,20240204143913560556,"[[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,...","[(124, 307), (54, 99), (69, 174)]",v7w_2367084.jpg,caption,23.476451,app/img/v7w_2367084_blur_0.jpg,65bf9f4014e422cb67a1ad0e,3
25,20240204143913560556,"[[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,...","[(206, 268), (91, 95), (317, 92), (201, 83), (...",v7w_2373121.jpg,caption,47.739632,app/img/v7w_2373121_blur_0.jpg,65bf9f4014e422cb67a1ad0e,7
7,20240204143913560556,"[[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,...","[(178, 83), (160, 95), (252, 102), (322, 71), ...",v7w_2390850.jpg,caption,91.203454,app/img/v7w_2390850_blur_0.jpg,65bf9f4014e422cb67a1ad0e,7
0,20240204143913560556,"[[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,...","[(232, 243), (231, 236)]",v7w_2391321.jpg,caption,31.462687,app/img/v7w_2391321_blur_0.jpg,65bf9f4014e422cb67a1ad0e,2


### Add more data like whether it was approved/rejected and the caption and foil, linguistic phenomenon

In [5]:
# load the full stimuli excel file
stimuli_df = pd.read_excel("../../data_prep/269_stimuli_edited_labeled.xlsx")
stimuli_df['filename'] = [os.path.basename(path) for path in stimuli_df['img_path']]
stimuli_df.head(1)

Unnamed: 0,Selection,img_path,caption,foil,linguistic_phenomena,Approval,Notes,clip_pred_caption,clip_pred_foil,clip_pred_diff,performance_group,"% can't tell (google form, existence only)","% can tell (google form, existence only)","% unsure (google form, existence only)",Caption edited,Foil edited,filename
0,First set (Jan 16),../VALSE_data/images/coco2017/000000001584.jpg,"The red, double decker bus is driving past oth...","The red, double decker bus is driving towards ...",relations,Approve,,30.26965,30.40103,-0.13138,low_perf,,,,,,000000001584.jpg


Merge operation

In [6]:
cols_to_merge = ["caption", "foil", "linguistic_phenomena", "Approval", "clip_pred_diff", "img_path"]
stimuli_subset = stimuli_df[['filename']+cols_to_merge]
stimuli_subset.head(1)

Unnamed: 0,filename,caption,foil,linguistic_phenomena,Approval,clip_pred_diff,img_path
0,000000001584.jpg,"The red, double decker bus is driving past oth...","The red, double decker bus is driving towards ...",relations,Approve,-0.13138,../VALSE_data/images/coco2017/000000001584.jpg


In [7]:
final_df = final_df.drop('img_path', axis = 1)
merged_df = final_df.merge(stimuli_subset, on='filename', how = 'left')
merged_df[merged_df['filename']=="000000005529.jpg"]

Unnamed: 0,instance_id,mask,points_clicked,filename,answer,time_to_submit,prolific_session_id,nr_clicks,caption,foil,linguistic_phenomena,Approval,clip_pred_diff,img_path
18,20240202153131339291,"[[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,...",[],000000005529.jpg,caption,17.695961,65bd0acc6b7a578310acc40b,0,A person on skis is skiing down a snowy hill.,A person on skis is skiing up a snowy hill.,relations,Approve,0.083982,../VALSE_data/images/coco2017/000000005529.jpg
222,20240202153133853542,"[[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,...","[(180, 246), (187, 201), (106, 258), (182, 107)]",000000005529.jpg,caption,174.211705,65bd0acd2c3df3a5127f29b9,4,A person on skis is skiing down a snowy hill.,A person on skis is skiing up a snowy hill.,relations,Approve,0.083982,../VALSE_data/images/coco2017/000000005529.jpg
308,20240201125402294500,"[[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,...","[(137, 147.3333282470703), (186, 322.333328247...",000000005529.jpg,caption,15.327968,65bb945f8c0f55f083ec04ce,2,A person on skis is skiing down a snowy hill.,A person on skis is skiing up a snowy hill.,relations,Approve,0.083982,../VALSE_data/images/coco2017/000000005529.jpg
342,20240201125949942734,"[[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,...","[(233.5, 101), (173.5, 101)]",000000005529.jpg,caption,24.951499,65bb956eea9ea39b17c1cee2,2,A person on skis is skiing down a snowy hill.,A person on skis is skiing up a snowy hill.,relations,Approve,0.083982,../VALSE_data/images/coco2017/000000005529.jpg
444,20240203083058032730,"[[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,...","[(221, 184), (206, 251), (234, 227)]",000000005529.jpg,caption,33.625626,65bdf9bb8c2782e981fe9320,3,A person on skis is skiing down a snowy hill.,A person on skis is skiing up a snowy hill.,relations,Approve,0.083982,../VALSE_data/images/coco2017/000000005529.jpg


In [8]:
len(merged_df)

546

It should contain 556 rows i.e. 15 users times 34 stimuli for each + 2 users (supplemental collection) with 18 stimuli each

### Export the combined dataframe

In [9]:
merged_df.to_pickle("1_merged_prolific_data.pickle")