## Vessel Detection Dataset

#### How to Create Query/Archive Sets

This notebook creates a query/archive split for the vessel detection dataset. Images are grouped into subsets depending on the number of vessels they contain (here we use {0}, {1, ..., 14}, {15, ...}). Each subset is then split randomly into an archive part (about one third) and a query part (about two thirds), so that both sets are balanced with respect to the number of vessels per image.

In [None]:
%load_ext autoreload
%matplotlib inline
%autoreload 2

import numpy as np

from pathlib import Path
import json

In [None]:
# Directory containing 'vessels_dataset_annotations.json' as well as a folder
# called 'tif' which holds all dataset images as *.tif.
# An empty string means "relative to the current working directory".
root_dir = ""

# Join via pathlib: with plain string formatting an empty root_dir would yield
# the absolute paths "/tif" and "/vessels_dataset_annotations.json".
# Both values are kept as plain strings.
tif_dir = str(Path(root_dir) / "tif")
json_file = str(Path(root_dir) / "vessels_dataset_annotations.json")

# Parsed content of the annotation file; it is looked up by patch name further
# below in this notebook.
with open(json_file, encoding="utf-8") as f:
    annotations_dict = json.load(f)

In [None]:
# Directory holding zero_patches.json (the file is also included in this repo).
# Adjust it if the file is not located in the current working directory.
zero_patches_dir = "."
zero_patches_json = str(Path(zero_patches_dir) / "zero_patches.json")

# The `with` statement closes the file on exit; no explicit close() is needed.
with open(zero_patches_json, encoding="utf-8") as json_data:
    zero_patches = json.load(json_data)['zero_patches']

num_zero_patches = len(zero_patches)

# A set gives constant-time membership tests while filtering the *.tif files.
excluded_stems = set(zero_patches)

# glob() does not guarantee any particular order, so sort the names: the
# positions in `patches` are used as indices later on and should be stable
# across runs and machines.
patches = sorted(
    tif_path.stem
    for tif_path in Path(tif_dir).glob("*.tif")
    if tif_path.stem not in excluded_stems
)

num_patches = len(patches)
if num_patches == 0:
    # Fail with a readable message instead of an IndexError on patches[0].
    raise FileNotFoundError(
        f"No usable *.tif files found in '{tif_dir}' - check root_dir."
    )
print(f"{num_patches}/{num_patches+num_zero_patches}, e.g.'{patches[0]}'")

In [None]:
def get_num_vessels(patch_name):
    """Return how many annotations are stored for the given patch.

    Looks up `patch_name` in the notebook-level `annotations_dict` and counts
    the entries found under that record's 'annotations' key. The patch name
    must be present in `annotations_dict`.
    """
    return len(annotations_dict[patch_name]['annotations'])

In [None]:
# Spot check: display the annotation count of one hard-coded example patch.
# The patch name must exist in annotations_dict for this cell to run.
get_num_vessels('T10SEG_20190808T184921_TCI_crop_x-1408_y-1152')

In [None]:
# Build a (num_patches, 2) object table with one row per image:
# column 0 holds the patch name, column 1 its number of vessels.
counts_per_patch = [get_num_vessels(name) for name in patches]

ll = np.empty((num_patches, 2), dtype=object)
ll[:, 0] = patches
ll[:, 1] = counts_per_patch

In [None]:
# Partition the dataset into three groups of row indices, depending on how
# many vessels an image contains: none (s1), 1 to 14 (s2), 15 or more (s3).
num_vessels = np.asarray(ll)[:, 1]

has_none = num_vessels == 0
has_some = (num_vessels > 0) & (num_vessels < 15)
has_many = num_vessels >= 15

s1 = np.flatnonzero(has_none)
s2 = np.flatnonzero(has_some)
s3 = np.flatnonzero(has_many)

# Group sizes, followed by their sum.
print(s1.shape, s2.shape, s3.shape, s1.size + s2.size + s3.size)

In [None]:
# Split every vessel-count group (s1, s2, s3) separately into an archive part
# (~one third) and a query part (~two thirds), so that both final sets contain
# the three groups in the same proportions.
ARCHIVE_FRACTION = 0.33
SPLIT_SEED = 0  # fixed seed so that re-running the notebook gives the same split


def split_group(group, num_archive, rng):
    """Randomly split one group of patch indices into (archive, query).

    Parameters
    ----------
    group : 1-D np.ndarray
        Patch indices that belong to one vessel-count group.
    num_archive : int
        Number of indices to draw, without replacement, for the archive.
    rng : np.random.Generator
        Random generator used for the draw.

    Returns
    -------
    tuple of (np.ndarray, np.ndarray)
        The archive indices (in random order) and the query indices (all
        remaining indices, in their original order).
    """
    # A random permutation also works for an empty group.
    picked = rng.permutation(group.shape[0])[:num_archive]
    keep_for_query = np.ones(group.shape[0], dtype=bool)
    keep_for_query[picked] = False
    return group[picked], group[keep_for_query]


num_total = s1.shape[0] + s2.shape[0] + s3.shape[0]
num_archive_s1 = int(s1.shape[0] * ARCHIVE_FRACTION)
num_archive_s2 = int(s2.shape[0] * ARCHIVE_FRACTION)
num_archive_s3 = int(s3.shape[0] * ARCHIVE_FRACTION)

print(f"Respective archive sizes (~one third): {num_archive_s1}/{s1.shape[0]}, {num_archive_s2}/{s2.shape[0]}, {num_archive_s3}/{s3.shape[0]}, {num_total}")

rng = np.random.default_rng(SPLIT_SEED)
s1_a, s1_q = split_group(s1, num_archive_s1, rng)
s2_a, s2_q = split_group(s2, num_archive_s2, rng)
s3_a, s3_q = split_group(s3, num_archive_s3, rng)

print("Archive", s1_a.shape, s2_a.shape, s3_a.shape, s1_a.shape[0] + s2_a.shape[0] + s3_a.shape[0])
print("Query", s1_q.shape, s2_q.shape, s3_q.shape, s1_q.shape[0] + s2_q.shape[0] + s3_q.shape[0])

query = np.concatenate((s1_q, s2_q, s3_q))
archive = np.concatenate((s1_a, s2_a, s3_a))

# Every patch index must end up in exactly one of the two sets.
assert query.shape[0] + archive.shape[0] == num_total, "split lost or duplicated patches"
assert np.intersect1d(query, archive).size == 0, "query and archive overlap"

print()
print('Final distribution query/archive', query.shape, archive.shape)
print('Sanity check: First 10 query indices:', query[:10])

In [None]:
# Translate the query and archive indices back into patch names, so that the
# two sets can be stored in two separate json files.
patches = np.asarray(patches)
query_patches = patches.take(query)
archive_patches = patches.take(archive)

In [None]:
# Safety switch: set to True to write the two json files into the current
# working directory. With False, the cell runs through without touching disk.
WRITE_TO_DISK = False

d_query = {'query_patches': query_patches.tolist()}
d_archive = {'archive_patches': archive_patches.tolist()}

if WRITE_TO_DISK:
    with open('./query_patches.json', 'w', encoding='utf-8') as f:
        json.dump(d_query, f, ensure_ascii=False, indent=2)
    # Number of patch names written to the query file.
    print(len(d_query['query_patches']))

    with open('./archive_patches.json', 'w', encoding='utf-8') as f:
        json.dump(d_archive, f, ensure_ascii=False, indent=2)
    # Number of patch names written to the archive file.
    print(len(d_archive['archive_patches']))
else:
    print("Nothing written: set WRITE_TO_DISK = True to write to disk.")
