In [1]:
import os
import sys
import json

import numpy as np

import pandas as pd

## Load and process data 

In [2]:
# Load the raw COCO val2017 instance annotations from disk.
# The file location can be overridden through the COCO_VAL_ANNOTATIONS environment variable so
# the notebook also runs on machines other than the author's; the original path stays the default.
annotations_path = os.environ.get(
    'COCO_VAL_ANNOTATIONS',
    '/home/david/datasets/coco/val2017/annotations/instances_val2017.json',
)

# JSON text is UTF-8 by specification; being explicit avoids depending on the locale encoding.
with open(annotations_path, 'r', encoding='utf-8') as fp:
    val_instances = json.load(fp)

In [3]:
# Annotation table: one row per annotated instance.
# The 'bbox' entries are [x, y, width, height] lists, measured from the top left image corner
# and 0-indexed; each component is additionally exposed as its own column.
df_ann = (
    pd.DataFrame(val_instances['annotations'])
    .drop(columns=['segmentation', 'area'])
    .rename(columns={'id': 'instance_id'})
    .assign(
        bbox_x=lambda ann: ann['bbox'].str[0],
        bbox_y=lambda ann: ann['bbox'].str[1],
        bbox_width=lambda ann: ann['bbox'].str[2],
        bbox_height=lambda ann: ann['bbox'].str[3],
    )
)

# Category table: maps each category id to its human-readable name.
df_cat = (
    pd.DataFrame(val_instances['categories'])
    .rename(columns={'id': 'category_id', 'name': 'category_name'})
    .drop(columns=['supercategory'])
)

# Attach the category name to every annotation row (left join keeps all annotations).
df_comb = df_ann.merge(df_cat, on='category_id', how='left')

## Create datasets 

In [4]:
# Categories annotated in each half of the partially annotated dataset.
# A category present in one set but not the other is, by construction, missing from the
# annotations of the corresponding half. Dataset 3 uses the union, i.e. complete annotations.
dataset1_cats = {'person', 'car', 'book', 'cup', 'dining table'}
dataset2_cats = {'person', 'chair', 'bottle', 'cup', 'dining table'}
dataset3_cats = dataset1_cats.union(dataset2_cats)

# Assign images to different halves of the dataset with missing annotations
# (a seeded coin flip per unique image id).
np.random.seed(0)
image_ids = df_comb['image_id'].unique()
image_id_dataset_mask = np.random.randint(0, 2, len(image_ids)).astype(bool)
image_ids_dataset1 = set(image_ids[image_id_dataset_mask].tolist())
image_ids_dataset2 = set(image_ids[np.logical_not(image_id_dataset_mask)].tolist())

# Row-level masks over df_comb: which half the row's image belongs to ...
instance_image_ids = df_comb['image_id']
is_dataset1_image = instance_image_ids.isin(image_ids_dataset1)
is_dataset2_image = instance_image_ids.isin(image_ids_dataset2)

# ... and whether the row's category is kept by each dataset.
instance_cat_names = df_comb['category_name']
is_dataset1_cat = instance_cat_names.isin(dataset1_cats)
is_dataset2_cat = instance_cat_names.isin(dataset2_cats)
is_dataset3_cat = instance_cat_names.isin(dataset3_cats)

# Partial annotations: an instance is kept only if its category is annotated in its image's half.
is_dataset1_instance = is_dataset1_cat & is_dataset1_image
is_dataset2_instance = is_dataset2_cat & is_dataset2_image
is_dataset12_instance = is_dataset1_instance | is_dataset2_instance

# Complete annotations: restrict to images that appear in the partial dataset, but keep every
# instance from the union of categories in those images.
image_ids_dataset12 = set(instance_image_ids[is_dataset12_instance].tolist())
is_dataset3_image = instance_image_ids.isin(image_ids_dataset12)
is_dataset3_instance = is_dataset3_cat & is_dataset3_image

# Materialize the partially and completely annotated datasets.
df_dataset12 = df_comb.loc[is_dataset12_instance]
df_dataset3 = df_comb.loc[is_dataset3_instance]

In [5]:
# Total image counts: number of distinct images contributing at least one kept instance.
all_image_ids = df_comb['image_id']
num_images1 = all_image_ids[is_dataset1_instance].nunique()
num_images2 = all_image_ids[is_dataset2_instance].nunique()
num_images3 = all_image_ids[is_dataset3_instance].nunique()

# Report image and instance totals, one line per dataset.
for dataset_label, image_total, instance_mask in (
    (1, num_images1, is_dataset1_instance),
    (2, num_images2, is_dataset2_instance),
    (3, num_images3, is_dataset3_instance),
):
    print(f'dataset {dataset_label} - num images: {image_total}, num instances: {instance_mask.sum()}')

# Per-category instance counts, one column per dataset. Categories absent from a dataset show
# up as missing values, hence the nullable integer dtype.
per_category_counts = [
    df_comb.loc[instance_mask, 'category_name'].value_counts().rename(column_name)
    for column_name, instance_mask in (
        ('dataset1', is_dataset1_instance),
        ('dataset2', is_dataset2_instance),
        ('dataset3', is_dataset3_instance),
    )
]
(
    pd.concat(per_category_counts, axis=1)
    .sort_values(by='dataset3', ascending=False)
    .astype(pd.Int64Dtype())
)

dataset 1 - num images: 1681, num instances: 7986
dataset 2 - num images: 1590, num instances: 7503
dataset 3 - num images: 3271, num instances: 17919


Unnamed: 0,dataset1,dataset2,dataset3
person,5630.0,5374.0,11004
chair,,802.0,1712
car,915.0,,1650
book,583.0,,1020
bottle,,589.0,937
cup,465.0,434.0,899
dining table,393.0,304.0,697


## Split datasets into train/val/test

In [6]:
# Split parameters: fractions of the image ids that go to train and val (the rest is test).
train_frac = 0.7
val_frac = 0.1
np.random.seed(0)

# Shuffle the image ids once and cut the shuffled array at the train/val and val/test boundaries.
num_train = round(len(image_ids) * train_frac)
num_val = round(len(image_ids) * val_frac)
image_ids_shuffle = np.random.permutation(image_ids)
train_ids, val_ids, test_ids = np.split(image_ids_shuffle, [num_train, num_train + num_val])
image_ids_train = set(train_ids.tolist())
image_ids_val = set(val_ids.tolist())
image_ids_test = set(test_ids.tolist())


def _rows_for_images(df, split_image_ids):
    """Return the rows of ``df`` whose 'image_id' is contained in ``split_image_ids``."""
    return df[df['image_id'].isin(split_image_ids)]


# The same image-level split is applied to both datasets, so an image always lands in the same
# split regardless of whether its partial or complete annotations are used.
df_dataset12_train = _rows_for_images(df_dataset12, image_ids_train)
df_dataset12_val = _rows_for_images(df_dataset12, image_ids_val)
df_dataset12_test = _rows_for_images(df_dataset12, image_ids_test)
df_dataset3_train = _rows_for_images(df_dataset3, image_ids_train)
df_dataset3_val = _rows_for_images(df_dataset3, image_ids_val)
df_dataset3_test = _rows_for_images(df_dataset3, image_ids_test)

In [7]:
# Sanity check: image and instance counts per split for the partial (12) and complete (3)
# datasets, plus an 'all' column holding the row-wise total over the three splits.
partial_splits = {'train': df_dataset12_train, 'val': df_dataset12_val, 'test': df_dataset12_test}
complete_splits = {'train': df_dataset3_train, 'val': df_dataset3_val, 'test': df_dataset3_test}

df_split_count = pd.DataFrame(
    {
        split_name: [
            partial_splits[split_name]['image_id'].nunique(),
            complete_splits[split_name]['image_id'].nunique(),
            len(partial_splits[split_name]),
            len(complete_splits[split_name]),
        ]
        for split_name in ('train', 'val', 'test')
    },
    index=['num_images_12', 'num_images_3', 'num_instances_12', 'num_instances_3'],
).assign(all=lambda counts: counts.sum(axis=1))
df_split_count

Unnamed: 0,train,val,test,all
num_images_12,2315,321,635,3271
num_images_3,2315,321,635,3271
num_instances_12,10952,1539,2998,15489
num_instances_3,12616,1768,3535,17919


## Save annotation datasets to disk 

In [8]:
# Destination directory and file-name prefixes for the exported annotation tables.
output_dir = './annotations'
dataset12_prefix = 'annotations_partial_'
dataset3_prefix = 'annotations_complete_'

# DataFrame.to_csv does not create missing parent directories, so make sure the output
# directory exists before writing (no-op when it is already there).
os.makedirs(output_dir, exist_ok=True)

# Write one CSV per (dataset, split) pair as '<prefix><split>.csv'.
# The DataFrame index is written as well (to_csv default), matching the original export format.
for file_prefix, split_frames in (
    (dataset12_prefix, {'train': df_dataset12_train, 'val': df_dataset12_val, 'test': df_dataset12_test}),
    (dataset3_prefix, {'train': df_dataset3_train, 'val': df_dataset3_val, 'test': df_dataset3_test}),
):
    for split_name, df_split in split_frames.items():
        df_split.to_csv(os.path.join(output_dir, file_prefix + split_name + '.csv'))