### image locations outcome for datasets:
- images/coco2014 (part of VisDial with COCO in name, all of foil)
- images/coco2017
- images/v7w
- images/swig
- images/visdial/VisualDialog_val2018 (part of visdial w/o COCO in name)




In [150]:
# From VALSE github
DATA_LOCATIONS = {
            "existence": ["data/visual7w/images/",
                          'data/existence.json'],
            "plurals": ["data/plurals/test_images/",
                        'data/plurals.json'],
            "counting_hard": ["data/visual7w/images/",
                              'data/counting_hard.json'],
            "counting_small": ['data/visual7w/images/',
                               'data/counting-small-quant.json'],
            "counting_adversarial": ["data/visual7w/images/",
                                     'data/counting_adversarial.json'],
            "relations": ["data/relations/test_images/",
                          'data/relation.json'],
            "action replace": ['data/actions/images_512/',
                               'data/action-replacement.json'],
            "actant swap": ['data/actions/images_512/',
                            'data/actions/actant-swap.json'],
            "coref": ["data/coref/release_too_many_is_this_in_color/images/",
                      'data/coreference-standard.json'],
            "coref_hard": ["data/coref/release_v18/test_images/",
                           'data/coreference-hard.json'],
            "foil_it": ["data/scratch/COCO/val2014/",
                        "data/foil-it.json"],
        }

# Downloading data

In [42]:
# Create a pandas dataframe of all the files

import os
json_filenames = os.listdir("data")
json_paths = [os.path.join("data", filename) for filename in json_filenames]
json_paths

['data/counting-small-quant.json',
 'data/relations.json',
 'data/counting-adversarial.json',
 'data/existence.json',
 'data/action-replacement.json',
 'data/counting-hard.json',
 'data/plurals.json',
 'data/coreference-standard.json',
 'data/actant-swap.json',
 'data/coreference-hard.json',
 'data/foil-it.json']

# Create a dataframe with all captions, foils filenames

In [43]:
import pandas as pd
concat_df = pd.DataFrame()

# add each json to the concatenated_df dataframe
for json_path in json_paths:
    df = pd.read_json(json_path).transpose()

    # create a new column which identifies with set of challenges this particular caption-foil combo came from
    df["challenge_set"] = [os.path.basename(json_path) for i in range(len(df))]
    concat_df = pd.concat([concat_df, df])

# get rid of index
concat_df.reset_index(drop=True, inplace=True)

concat_df['dataset'] = list(map(lambda x: x.replace('coco_2017','coco2017'),list(concat_df['dataset'])))

concat_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8782 entries, 0 to 8781
Data columns (total 23 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   dataset                      8782 non-null   object
 1   dataset_idx                  7725 non-null   object
 2   image_file                   8782 non-null   object
 3   linguistic_phenomena         8782 non-null   object
 4   caption                      8782 non-null   object
 5   answer                       3290 non-null   object
 6   classes                      8782 non-null   object
 7   split                        3290 non-null   object
 8   original_split               8782 non-null   object
 9   classes_foil                 8782 non-null   object
 10  mturk                        8782 non-null   object
 11  foil                         8782 non-null   object
 12  challenge_set                8782 non-null   object
 13  foil_method                  1614

In [44]:
# debugginga this, why are some not fully there e.g. dataset_idx
import numpy as np

concat_df[concat_df['dataset_idx'].isnull()].tail(2) #seems ok

Unnamed: 0,dataset,dataset_idx,image_file,linguistic_phenomena,caption,answer,classes,split,original_split,classes_foil,mturk,foil,challenge_set,foil_method,grammaticality_scores,nli_scores,key,declarative_statement,declarative_statement_foils,provenance_of_foils,foiling_method,orig_caption,only_caption
7780,VisDial_v1.0,,VisualDialog_val2018_000000321392.jpg,coreference,a girl and boy sitting in the living room and ...,,no,,val,yes,"{'foil': 0, 'caption': 3, 'other': 0}",a girl and boy sitting in the living room and ...,coreference-hard.json,,,,,,,,,,a girl and boy sitting in the living room and ...
7781,VisDial_v1.0,,VisualDialog_val2018_000000366853.jpg,coreference,2 zebras with their heads down are walking alo...,,yes,,val,no,"{'foil': 1, 'caption': 2, 'other': 0}",2 zebras with their heads down are walking alo...,coreference-hard.json,,,,,,,,,,2 zebras with their heads down are walking alo...


In [45]:
# these are the datasets we need to download images from
set(concat_df.dataset)

{'FOIL dataset', 'SWiG', 'VisDial_v1.0', 'coco2017', 'visual7w'}

### Q: Is any filename duplicated in the VALSE dataset?

YES

In [46]:
all_filenames = list(concat_df.image_file)

# list duplicated ones
duplicated_filenames = {filename for filename in all_filenames if all_filenames.count(filename) > 1}

# find those rows which contian duplicates
pd.set_option('display.max_columns', None)
concat_df[concat_df['image_file'].isin(duplicated_filenames)].sort_values(by=['image_file']).head(4)


Unnamed: 0,dataset,dataset_idx,image_file,linguistic_phenomena,caption,answer,classes,split,original_split,classes_foil,mturk,foil,challenge_set,foil_method,grammaticality_scores,nli_scores,key,declarative_statement,declarative_statement_foils,provenance_of_foils,foiling_method,orig_caption,only_caption
5193,coco2017,742000,000000001584.jpg,plurals,A double decker bus rolls along a number of st...,,streets,,val,street,"{'foil': 3, 'caption': 0, 'other': 0}",A double decker bus rolls along a single street.,plurals.json,"checklist method, change NP to SG",[0.888248921679157],"{'entail': [0.012125653214752001], 'neutral': ...",,,,,,A double decker bus rolls along the streets.,
1580,coco2017,732292,000000001584.jpg,relations,"The red, double decker bus is driving past oth...",,past,,val,towards,"{'foil': 0, 'caption': 2, 'other': 1}","The red, double decker bus is driving towards ...",relations.json,"checklist spanbert prediction (max span=3, top...",[0.8537795061080561],"{'entail': [0.23726837337017], 'contradict': [...",relations:coco_2017:732292,,,,,,
5545,coco2017,577428,000000002685.jpg,plurals,A couple of people are standing in front of so...,,bottles,,val,bottle,"{'foil': 0, 'caption': 3, 'other': 0}",A couple of people are standing in front of a ...,plurals.json,"checklist method, change NP to SG",[0.8292018788494581],"{'entail': [0.011446055956184], 'neutral': [0....",,,,,,a couple of people are standing in front of so...,
5407,coco2017,572793,000000002685.jpg,plurals,A number of people line up to taste some wine.,,people,,val,person,"{'foil': 0, 'caption': 3, 'other': 0}",A single person lines up to taste some wine.,plurals.json,"checklist method, change NP to SG",[0.877675205469531],"{'entail': [0.003204374806955], 'neutral': [0....",,,,,,Many people line up to taste some wine.,


Check that all duplicated file names are from SAME dataset as the duplicate files. This means they are the same image

In [47]:
filenames_of_concern = []

for filename in duplicated_filenames:
    dataset_values = list(concat_df[concat_df["image_file"]==filename]["dataset"])
    unique_dataset_values = set(dataset_values)
    if len(unique_dataset_values)>1:
        print("MULTIPLE DATASETS for same file!", filename, dataset_values)
        filenames_of_concern.append(filename)
    else:
        pass
        #print("NOT MULTIPLE", dataset_values)

MULTIPLE DATASETS for same file! COCO_val2014_000000175151.jpg ['VisDial_v1.0', 'FOIL dataset']
MULTIPLE DATASETS for same file! COCO_val2014_000000287959.jpg ['VisDial_v1.0', 'FOIL dataset']
MULTIPLE DATASETS for same file! COCO_val2014_000000386257.jpg ['VisDial_v1.0', 'FOIL dataset']
MULTIPLE DATASETS for same file! COCO_val2014_000000255950.jpg ['VisDial_v1.0', 'FOIL dataset']
MULTIPLE DATASETS for same file! COCO_val2014_000000273321.jpg ['VisDial_v1.0', 'FOIL dataset']
MULTIPLE DATASETS for same file! COCO_val2014_000000581317.jpg ['VisDial_v1.0', 'FOIL dataset']


In [48]:
filenames_of_concern

['COCO_val2014_000000175151.jpg',
 'COCO_val2014_000000287959.jpg',
 'COCO_val2014_000000386257.jpg',
 'COCO_val2014_000000255950.jpg',
 'COCO_val2014_000000273321.jpg',
 'COCO_val2014_000000581317.jpg']

We found some files which seem to be in "multiple" datasets. We are going to check now (and assume) that they are indeed the same picture

In [49]:
concat_df[concat_df["image_file"].isin(filenames_of_concern)].sort_values(by="image_file")

Unnamed: 0,dataset,dataset_idx,image_file,linguistic_phenomena,caption,answer,classes,split,original_split,classes_foil,mturk,foil,challenge_set,foil_method,grammaticality_scores,nli_scores,key,declarative_statement,declarative_statement_foils,provenance_of_foils,foiling_method,orig_caption,only_caption
6357,VisDial_v1.0,,COCO_val2014_000000175151.jpg,coreference,there is a bus moving down a street. is this a...,,no,,train,yes,"{'foil': 0, 'caption': 3, 'other': 0}",there is a bus moving down a street. is this a...,coreference-standard.json,,,,,,,,,,there is a bus moving down a street
7999,FOIL dataset,1249407.0,COCO_val2014_000000175151.jpg,noun phrases,a city bus is riding down the empty street.,,bus,,test,airplane,"{'foil': 0, 'caption': 3, 'other': 0}",a city airplane is riding down the empty street.,foil-it.json,,,,,,,,,,
5794,VisDial_v1.0,,COCO_val2014_000000255950.jpg,coreference,looking down on a stony surface shows a bowl w...,,no,,train,yes,"{'foil': 1, 'caption': 2, 'other': 0}",looking down on a stony surface shows a bowl w...,coreference-standard.json,,,,,,,,,,looking down on a stony surface shows a bowl w...
8374,FOIL dataset,1106662.0,COCO_val2014_000000255950.jpg,noun phrases,cement ledge with orange in bowl and red plast...,,orange,,test,cake,"{'foil': 0, 'caption': 3, 'other': 0}",cement ledge with cake in bowl and red plastic...,foil-it.json,,,,,,,,,,
5819,VisDial_v1.0,,COCO_val2014_000000273321.jpg,coreference,guy jumps up to catch the frisbee in the gym. ...,,no,,train,yes,"{'foil': 1, 'caption': 2, 'other': 0}",guy jumps up to catch the frisbee in the gym. ...,coreference-standard.json,,,,,,,,,,guy jumps up to catch the frisbee in the gym
8588,FOIL dataset,1040688.0,COCO_val2014_000000273321.jpg,noun phrases,a person off to the side is wearing an orange ...,,orange,,test,pizza,"{'foil': 0, 'caption': 3, 'other': 0}",a person off to the side is wearing an pizza t...,foil-it.json,,,,,,,,,,
5909,VisDial_v1.0,,COCO_val2014_000000287959.jpg,coreference,this cross section of a sandwich is still wrap...,,yes,,train,no,"{'foil': 0, 'caption': 2, 'other': 1}",this cross section of a sandwich is still wrap...,coreference-standard.json,,,,,,,,,,this cross section of a sandwich is still wrapped
8666,FOIL dataset,1162397.0,COCO_val2014_000000287959.jpg,noun phrases,the sandwich had been cut and was ready to eat.,,sandwich,,test,pizza,"{'foil': 0, 'caption': 3, 'other': 0}",the pizza had been cut and was ready to eat.,foil-it.json,,,,,,,,,,
6555,VisDial_v1.0,,COCO_val2014_000000386257.jpg,coreference,large circular shaped clock tower in black and...,,no,,train,yes,"{'foil': 0, 'caption': 3, 'other': 0}",large circular shaped clock tower in black and...,coreference-standard.json,,,,,,,,,,large circular shaped clock tower in black and...
8530,FOIL dataset,1209901.0,COCO_val2014_000000386257.jpg,noun phrases,a clock is sitting below an american flag.,,clock,,test,scissors,"{'foil': 0, 'caption': 3, 'other': 0}",a scissors is sitting below an american flag.,foil-it.json,,,,,,,,,,


### Download challenges

Check which dataset each challenge draws from:

In [50]:
#FOIL

challenge_names = set(concat_df['challenge_set'])

for challenge_name in challenge_names:
    print(challenge_name, "datasets:", set(concat_df[concat_df['challenge_set']==challenge_name]['dataset']) )

action-replacement.json datasets: {'SWiG'}
counting-hard.json datasets: {'visual7w'}
coreference-standard.json datasets: {'VisDial_v1.0'}
counting-adversarial.json datasets: {'visual7w'}
relations.json datasets: {'coco2017'}
existence.json datasets: {'visual7w'}
foil-it.json datasets: {'FOIL dataset'}
coreference-hard.json datasets: {'VisDial_v1.0'}
plurals.json datasets: {'coco2017'}
actant-swap.json datasets: {'SWiG'}
counting-small-quant.json datasets: {'visual7w'}


# Download files and map filenames to downloaded file path in dictionary

In [83]:
import json

filename2path = {}
set(concat_df.dataset)



{'FOIL dataset', 'SWiG', 'VisDial_v1.0', 'coco2017', 'visual7w'}

## FOIL IT

Since all files in this are actually included in COCO 2014, **we will download the FOIL IT files along with the VisDial files and place them in "images/coco2014"**

See VisDial section for more details. This is all done in COLAB.

Here we just create a list of desired filenames for FOIL IT data points.

In [82]:
foil_filenames = list(concat_df[concat_df['challenge_set']=='foil-it.json']['image_file'])

# export list of desired files
file = open('dataset_info/FOIL_coco2014_filenames.txt','w')
for filename in foil_filenames:
	file.write(filename+"\n")
file.close()

## SWiG

Outcome: Directly download all relevant files through Google Colab

First see what the filenames for this dataset are like in the VALSE dataset:

In [54]:
concat_df[concat_df['dataset']=='SWiG']["image_file"]

2904    exercising_255.jpg
2905       turning_172.jpg
2906     preaching_116.jpg
2907    displaying_251.jpg
2908     unveiling_139.jpg
               ...        
7636       driving_253.jpg
7637        cooking_66.jpg
7638       sitting_228.jpg
7639       gnawing_178.jpg
7640       rotting_182.jpg
Name: image_file, Length: 1821, dtype: object

Now compare to the SWiG json:

In [55]:
json_file_path = "dataset_info/swig-test.json"
with open(json_file_path, "r") as json_file:
    # Load the JSON data
    data = json.load(json_file) 
    print(data["rotting_182.jpg"])


{'bb': {'place': [-1, -1, -1, -1], 'container': [1, 106, 973, 509], 'agent': [95, 85, 875, 493]}, 'height': 512, 'width': 975, 'verb': 'rotting', 'frames': [{'container': 'n04379243', 'place': 'n04105893', 'agent': 'n00021265'}, {'container': 'n08266235', 'place': 'n04105893', 'agent': 'n07697100'}, {'container': '', 'place': 'n03179701', 'agent': 'n07697100'}]}


We can see there are no URLs in the JSON. Instead, we will have to load the entire dataset and copy the relevant files.

I am going to try to do this in Colab because it should be faster there. First we export a list of the relevant filenames and then we will download only those files from the ZIP onto the desktop, running it through Colab.

Colab link: https://colab.research.google.com/drive/1PtJMQxhuLMQvQUm5thEGazcte0FtUZgF#scrollTo=NzLcCT2rvpR_

In [56]:
# export list of desired files
file = open('dataset_info/desired-swig-filenames.txt','w')
for filename in list(concat_df[concat_df['dataset']=='SWiG']["image_file"]):
	file.write(filename+"\n")
file.close()



In [57]:
# Now extract the downloaded zip file to the relevant directory -- directly as files which have same name as they have in VALSE dataset

import zipfile
zip_file_path = 'downloads/swig-download.zip'
target_directory = 'images/swig'

os.makedirs(target_directory, exist_ok = True)

# Extract the ZIP file to the target directory
with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
    zip_ref.extractall(target_directory)

print(f"Extracted '{zip_file_path}' to '{target_directory}'")

Extracted 'downloads/swig-download.zip' to 'images/swig'


Now assign relevant entries in the file2path dictionary to the extracted paths

In [59]:
#filename2path

## VisDial v1

NOTE: This dataset| has some overlap with COCO 2014 (but **NOT** COCO 2017)

In [60]:
visdial_images = list(concat_df[concat_df['dataset']=='VisDial_v1.0'].image_file)
visdial_images

['COCO_train2014_000000254009.jpg',
 'COCO_train2014_000000327694.jpg',
 'COCO_train2014_000000502884.jpg',
 'COCO_train2014_000000358965.jpg',
 'COCO_val2014_000000410632.jpg',
 'COCO_train2014_000000072382.jpg',
 'COCO_train2014_000000390183.jpg',
 'COCO_val2014_000000450695.jpg',
 'COCO_val2014_000000536369.jpg',
 'COCO_train2014_000000089540.jpg',
 'COCO_train2014_000000075254.jpg',
 'COCO_train2014_000000499451.jpg',
 'COCO_val2014_000000133386.jpg',
 'COCO_train2014_000000147371.jpg',
 'COCO_val2014_000000030548.jpg',
 'COCO_train2014_000000130295.jpg',
 'COCO_train2014_000000035211.jpg',
 'COCO_train2014_000000414577.jpg',
 'COCO_train2014_000000153938.jpg',
 'COCO_val2014_000000136941.jpg',
 'COCO_val2014_000000307668.jpg',
 'COCO_train2014_000000232279.jpg',
 'COCO_train2014_000000467537.jpg',
 'COCO_train2014_000000128297.jpg',
 'COCO_train2014_000000286439.jpg',
 'COCO_train2014_000000562192.jpg',
 'COCO_train2014_000000279616.jpg',
 'COCO_train2014_000000507438.jpg',
 'COCO

In [61]:
# there are some missing pieces here, 
# but I am going to download the ones I have in the VisualDialog val 2018 dataset.
# Any missing ones will be looked for separately later :)

### Get the VisDial validation set

In [88]:
destination_folder = "images/visdial"

import zipfile

# open zip file 
with zipfile.ZipFile("downloads/VisualDialog_val2018.zip", 'r') as zip_file:

    # this is all the filenames in the VisDial 2018 val set
    zip_basenames = [os.path.basename(path) for path in zip_file.namelist()]

    # loop through desired filenames
    for desired_filename in visdial_images:

      # check: is this filename in the zip?
      if desired_filename in zip_basenames:
        # if so, extract it into the destination folder
        zip_file.extract("VisualDialog_val2018/"+desired_filename, destination_folder)
        # this is the path of the new file
        new_path = os.path.join(destination_folder, "VisualDialog_val2018", desired_filename)
        filename2path[desired_filename] = new_path
        print(f"Copied {desired_filename} to {new_path}")
        
      else:
        print(f"Could not find {desired_filename}")




Could not find COCO_train2014_000000254009.jpg
Could not find COCO_train2014_000000327694.jpg
Could not find COCO_train2014_000000502884.jpg
Could not find COCO_train2014_000000358965.jpg
Could not find COCO_val2014_000000410632.jpg
Could not find COCO_train2014_000000072382.jpg
Could not find COCO_train2014_000000390183.jpg
Could not find COCO_val2014_000000450695.jpg
Could not find COCO_val2014_000000536369.jpg
Could not find COCO_train2014_000000089540.jpg
Could not find COCO_train2014_000000075254.jpg
Could not find COCO_train2014_000000499451.jpg
Could not find COCO_val2014_000000133386.jpg
Could not find COCO_train2014_000000147371.jpg
Could not find COCO_val2014_000000030548.jpg
Could not find COCO_train2014_000000130295.jpg
Could not find COCO_train2014_000000035211.jpg
Could not find COCO_train2014_000000414577.jpg
Could not find COCO_train2014_000000153938.jpg
Could not find COCO_val2014_000000136941.jpg
Could not find COCO_val2014_000000307668.jpg
Could not find COCO_train20

In [63]:
filename2path["VisualDialog_val2018_000000366853.jpg"]

'images/visdial/VisualDialog_val2018/VisualDialog_val2018_000000366853.jpg'

### Get the VisDial files from the COCO 2014 train and val sets

Download with COLAB and then manually move all files to following folder: images/coco2014 (FOIL set is also downloaded here)

(In the downloaded folder there will be two subfolders, but **throw all image files together into coco2014 as a flat directory regardless!**)

In [64]:
# get list of VisDial images which are 
coco2014_filenames = [filename for filename in visdial_images if "COCO" in filename]
#coco2014_filenames

In [65]:
# export list of desired files
file = open('dataset_info/visdial_coco2014_filenames.txt','w')
for filename in coco2014_filenames:
	file.write(filename+"\n")
file.close()

In [66]:
len(coco2014_filenames)

916

In [67]:
len(set(coco2014_filenames))

916

Above code confirms there are no duplicates in the list of filenames

## COCO 2017

Download with COLAB and manually move all image files to **following folder: "images/coco2017"**

In [68]:
coco2017_filenames = list(concat_df[concat_df["dataset"]=="coco2017"].image_file)

In [69]:
len(coco2017_filenames)

1614

In [70]:
len(set(coco2017_filenames))

1351

This means some filenames are duplicated but that is not an issue. Just need to only download each file once

In [40]:
# export list of desired files
file = open('dataset_info/coco2017_filenames.txt','w')
for filename in set(coco2017_filenames): # loop through SET to avoid duplication
	file.write(filename+"\n")
file.close()

## Visual7w

Download with COLAB and manually move all image files to **"images/v7w"** (as flat directory)

In [135]:
visual7w_filenames = list(concat_df[concat_df["dataset"]=="visual7w"].image_file)


In [136]:
len(visual7w_filenames)

3290

In [137]:
len(set(visual7w_filenames))

2246

In [138]:
# export list of desired files
file = open('dataset_info/v7w_filenames.txt','w')
for filename in set(visual7w_filenames): # loop through SET to avoid duplication
	file.write(filename+"\n")
file.close()

# Map files to paths in dictionary

We do this for the files we downloaded through Colab

### Map COCO 2017 filenames to downloaded file paths in dictionary

In [87]:
for filename in coco2017_filenames:
    file_path = os.path.join("images/coco2017", filename)
    if os.path.exists(file_path):
        #print(f"Put {file_path} in dictionary for {filename}")
        filename2path[filename] = file_path
    else:
        print(f"{file_path} does not exist!")


In [90]:
len(filename2path)

1492

### Map SWiG filenames

In [124]:
for filename in list(concat_df[concat_df['dataset']=='SWiG']["image_file"]):
    file_path = os.path.join('images/swig', filename)
    if os.path.exists(file_path):
        #print(f"Put {file_path} in dictionary for {filename}")
        filename2path[filename] = file_path
    else:
        print(f"{file_path} does not exist!")

In [125]:
len(filename2path)

2665

### Map v7w

In [141]:
for filename in list(concat_df[concat_df['dataset']=='visual7w']["image_file"]):
    file_path = os.path.join('images/v7w', filename)
    if os.path.exists(file_path):
        #print(f"Put {file_path} in dictionary for {filename}")
        filename2path[filename] = file_path
    else:
        print(f"{file_path} does not exist!")

### Map COCO 2014 filenames (VisDial subset + FOIL IT)

In [93]:
# get all vis dialogue file names
visdial_files = list(concat_df[concat_df['dataset']=='VisDial_v1.0']["image_file"])

In [119]:
# only keep those which contain the word "COCO"
visdial_coco_files = [filename for filename in visdial_files if "COCO" in filename]

In [120]:
#now FOIL dataset
foil_files = list(concat_df[concat_df['dataset']=="FOIL dataset"]["image_file"])

In [118]:
#all should be COCO 2014. let's check:
for filename in foil_files:
    if not "COCO" in filename or "2014_" not in filename:
        print(filename)

In [123]:
coco2014_filenames = visdial_coco_files+foil_files
len(coco2014_filenames)

1916

In [127]:
for filename in coco2014_filenames:
    file_path = os.path.join("images/coco2014", filename)
    if os.path.exists(file_path):
        #print(f"Put {file_path} in dictionary for {filename}")
        filename2path[filename] = file_path
    else:
        print(f"{file_path} does not exist!")

4567

In [142]:
# check which image files are still missing and from which dataset they are
set(concat_df[~concat_df['image_file'].isin(list(filename2path.keys()))].dataset)

set()

In [143]:
len(filename2path)

6813

# Add new column to dataset with the IMAGE PATH

In [145]:
file_paths = []
for filename in concat_df.image_file:
    file_paths.append(filename2path[filename] )
concat_df["local_img_path"]=file_paths

In [146]:
concat_df

Unnamed: 0,dataset,dataset_idx,image_file,linguistic_phenomena,caption,answer,classes,split,original_split,classes_foil,mturk,foil,challenge_set,foil_method,grammaticality_scores,nli_scores,key,declarative_statement,declarative_statement_foils,provenance_of_foils,foiling_method,orig_caption,only_caption,local_img_path
0,visual7w,2349294,v7w_2349294.jpg,counting,Cars are allowed to park for exactly 2 hours f...,2,2,test,test,1,"{'foil': 0, 'caption': 3, 'other': 0}",Cars are allowed to park for exactly 1 hour fr...,counting-small-quant.json,,,,,,,,,,,images/v7w/v7w_2349294.jpg
1,visual7w,2344761,v7w_2344761.jpg,counting,There are exactly 0 people in the photo.,0,0,test,test,2,"{'foil': 0, 'caption': 3, 'other': 0}",There are exactly 2 people in the photo.,counting-small-quant.json,,,,,,,,,,,images/v7w/v7w_2344761.jpg
2,visual7w,2382764,v7w_2382764.jpg,counting,There are exactly 3 of the bus' wheels visible.,3,3,test,test,2,"{'foil': 2, 'caption': 1, 'other': 0}",There are exactly 2 of the bus' wheels visible.,counting-small-quant.json,,,,,,,,,,,images/v7w/v7w_2382764.jpg
3,visual7w,2362253,v7w_2362253.jpg,counting,There is exactly 1 plane.,1,1,test,test,2,"{'foil': 0, 'caption': 2, 'other': 1}",There are exactly 2 planes.,counting-small-quant.json,,,,,,,,,,,images/v7w/v7w_2362253.jpg
4,visual7w,2394354,v7w_2394354.jpg,counting,There are exactly 2 birds.,2,2,test,test,3,"{'foil': 0, 'caption': 3, 'other': 0}",There are exactly 3 birds.,counting-small-quant.json,,,,,,,,,,,images/v7w/v7w_2394354.jpg
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8777,FOIL dataset,1243143,COCO_val2014_000000428554.jpg,noun phrases,a man in a leans forward and ties a bow tie.,,tie,,test,suitcase,"{'foil': 0, 'caption': 3, 'other': 0}",a man in a leans forward and ties a bow suitcase.,foil-it.json,,,,,,,,,,,images/coco2014/COCO_val2014_000000428554.jpg
8778,FOIL dataset,1160521,COCO_val2014_000000370478.jpg,noun phrases,corner of a bedroom with a laptop and luggage ...,,laptop,,test,tv,"{'foil': 0, 'caption': 3, 'other': 0}",corner of a bedroom with a tv and luggage on a...,foil-it.json,,,,,,,,,,,images/coco2014/COCO_val2014_000000370478.jpg
8779,FOIL dataset,1043134,COCO_val2014_000000446881.jpg,noun phrases,a living room with a couch tables a table and ...,,table,,test,chair,"{'foil': 0, 'caption': 0, 'other': 3}",a living room with a couch chairs a chair and ...,foil-it.json,,,,,,,,,,,images/coco2014/COCO_val2014_000000446881.jpg
8780,FOIL dataset,1143235,COCO_val2014_000000567863.jpg,noun phrases,a tall giraffe standing next to a baby giraffe.,,giraffe,,test,zebra,"{'foil': 0, 'caption': 3, 'other': 0}",a tall giraffe standing next to a baby zebra.,foil-it.json,,,,,,,,,,,images/coco2014/COCO_val2014_000000567863.jpg


### Export dataframe

In [148]:
concat_df.to_csv("valse_challenges_with_img_paths.csv", index = False)