In [5]:
# read in the Karpathy-split COCO annotation json files from annotation/data/coco/full/

import json
import os
import random

# Load the three annotation splits; each JSON file is parsed into its own
# module-level variable.
def _read_json(path):
    """Parse the JSON document stored at ``path`` and return the result."""
    with open(path, 'r') as handle:
        return json.load(handle)


train_data = _read_json('annotation/data/coco/full/coco_karpathy_train.json')
val_data = _read_json('annotation/data/coco/full/coco_karpathy_val.json')
test_data = _read_json('annotation/data/coco/full/coco_karpathy_test.json')

In [6]:
# Unique image paths referenced by the entries of each annotation split.
train_images = {entry['image'] for entry in train_data}
val_images = {entry['image'] for entry in val_data}
test_images = {entry['image'] for entry in test_data}

# Collect every file found in the split directories under ./coco/ as
# "<split>/<filename>", e.g. train2014/COCO_train2014_000000401251.jpg
all_images = set()
for split in ('train2014', 'val2014', 'test2014'):
    split_dir = os.path.join('./coco/', split)
    for filename in os.listdir(split_dir):
        all_images.add(os.path.join(split, filename))

# Report the size of each set, one "<name> <count>" line per set.
for label, images in (
    ('all_images', all_images),
    ('train_images', train_images),
    ('val_images', val_images),
    ('test_images', test_images),
):
    print(label, len(images))


all_images 164062
train_images 113287
val_images 5000
test_images 5000


In [7]:
# For each split, compute the image paths referenced by the annotations that
# are not present in all_images, then print how many there are per split
# (train, val, test - in that order).
missing_train_images = train_images.difference(all_images)
missing_val_images = val_images.difference(all_images)
missing_test_images = test_images.difference(all_images)

for missing in (missing_train_images, missing_val_images, missing_test_images):
    print(len(missing))


0
0
0


In [17]:
sample_ratio = 0.01
# Fixed seed so that re-running the notebook draws the same subsets.
sample_seed = 42
# A dedicated generator is used so the module-level `random` state that other
# cells may rely on is left untouched.
sample_rng = random.Random(sample_seed)

# Draw, without replacement, a `sample_ratio` fraction of the entries of
# train_data, val_data and test_data. Note that the unit being sampled is an
# annotation entry, not an image.
sample_train_data = sample_rng.sample(train_data, int(len(train_data) * sample_ratio))
sample_val_data = sample_rng.sample(val_data, int(len(val_data) * sample_ratio))
sample_test_data = sample_rng.sample(test_data, int(len(test_data) * sample_ratio))
len(sample_train_data)

5667

In [19]:
# Number of entries in the full training annotation list.
len(train_data)

566747

In [32]:
# Group the entries of train_data by image path, then print the first image
# path (in image_data order) that has more than one entry.
# One empty bucket per known image, created in train_images iteration order.
image_data = {image: [] for image in train_images}
for item in train_data:
    image_data[item['image']].append(item)

first_duplicated = next(
    (image for image, data in image_data.items() if len(data) > 1),
    None,
)
if first_duplicated is not None:
    print(first_duplicated)



val2014/COCO_val2014_000000068352.jpg


In [36]:
# Keep exactly one entry per image (the first one collected for it in
# image_data) and write the resulting list to a new JSON file.
cleaned_train_data = [entries[0] for entries in image_data.values()]

with open('annotation/data/coco/full/coco_karpathy_train_cleaned.json', 'w') as out_file:
    json.dump(cleaned_train_data, out_file)

In [38]:
# Read the cleaned file back in and show how many entries it contains.
with open('annotation/data/coco/full/coco_karpathy_train_cleaned.json', 'r') as cleaned_file:
    new_train_data = json.load(cleaned_file)

len(new_train_data)

113287

In [1]:
import json

def deduplicate_jsonl(input_file, output_file):
    """Copy a JSON Lines file, keeping only the first record per ``clip_name``.

    Each line of ``input_file`` is parsed as JSON. Lines that are not valid
    JSON (including blank lines) are skipped. For every parsed record, the
    original line text is written to ``output_file`` unchanged if its
    ``clip_name`` has not been seen before; later records with the same
    ``clip_name`` are dropped. Relative order of the kept lines is preserved.

    Two summary lines are printed: the number of successfully parsed records
    in the input, and the number of records kept.

    Args:
        input_file: Path of the JSON Lines file to read (UTF-8).
        output_file: Path of the file to write (UTF-8, overwritten).

    Returns:
        None.

    Raises:
        KeyError: If a parsed record has no ``clip_name`` key.
    """
    # clip_name values already written to the output
    seen_clips = set()

    # Raw line text of the records that are kept
    unique_entries = []

    # Number of lines that parsed as JSON. This has to be counted separately:
    # len(seen_clips) is the number of *distinct* clip names and therefore
    # always equals len(unique_entries).
    total_entries = 0

    # Read input file
    with open(input_file, 'r', encoding='utf-8') as f:
        for line in f:
            # Only the parse itself is guarded, so unrelated errors are not
            # accidentally attributed to malformed JSON.
            try:
                entry = json.loads(line.strip())
            except json.JSONDecodeError:
                # Skip lines that aren't valid JSON
                continue

            total_entries += 1
            clip_name = entry['clip_name']

            # Only keep first occurrence of each clip_name
            if clip_name not in seen_clips:
                seen_clips.add(clip_name)
                unique_entries.append(line)

    # Write unique entries to output file
    with open(output_file, 'w', encoding='utf-8') as f:
        f.writelines(unique_entries)

    print(f"Original entries: {total_entries}")
    print(f"Unique entries: {len(unique_entries)}")

# Run the deduplication on the train_msrvtt_ret training file, writing the
# result next to it as train_deduped.jsonl.
input_file = "annotation/data/train_msrvtt_ret/train.jsonl"
output_file = "annotation/data/train_msrvtt_ret/train_deduped.jsonl"
deduplicate_jsonl(input_file=input_file, output_file=output_file)

Original entries: 7010
Unique entries: 7010
