In [1]:
import numpy as np
from datasets import YFCC100MDataset

In [2]:
# Instantiate the YFCC100M dataset wrapper; `ds` is used by the cells below.
ds = YFCC100MDataset()
# Run the dataset's preparation step. In the captured output it only reports
# files under data/yfcc100M/ that already exist.
ds.prepare()

file data/yfcc100M/query.public.100K.u8bin already exists
file data/yfcc100M/GT.public.ibin already exists
file data/yfcc100M/query.private.2727415019.100K.u8bin already exists
file data/yfcc100M/GT.private.2727415019.ibin already exists
file data/yfcc100M/base.10M.u8bin.crop_nb_10000000 already exists
file data/yfcc100M/base.metadata.10M.spmat already exists
file data/yfcc100M/query.metadata.public.100K.spmat already exists
file data/yfcc100M/query.metadata.private.2727415019.100K.spmat already exists


In [3]:
# Show the dataset's one-line summary. Use the built-in str() rather than
# calling the __str__ dunder directly; the displayed value is the same string.
str(ds)

'Dataset YFCC100MDataset in dimension 192, with distance euclidean, search_type knn_filtered, size: Q 100000 B 10000000'

In [4]:
# Display the shapes of the base-set and query-set metadata matrices as a pair.
tuple(
    metadata.shape
    for metadata in (ds.get_dataset_metadata(), ds.get_queries_metadata())
)

((10000000, 200386), (100000, 200386))

In [5]:
# Keep a handle on the base-set metadata matrix; later cells read it.
dataset_metadata = ds.get_dataset_metadata()  # (10000000, 200386)
# Length of the matrix's `.data` array (presumably one stored value per
# (point, label) entry of a sparse matrix - TODO confirm the matrix type).
dataset_metadata.data.shape

(108210476,)

In [6]:
# Keep a handle on the query-set metadata matrix; later cells read it.
query_metadata = ds.get_queries_metadata()
# Length of the matrix's `.indices` array (presumably one column index per
# stored (query, label) entry of a sparse matrix - TODO confirm the matrix type).
query_metadata.indices.shape

(138374,)

In [7]:
# Coordinates of the non-zero entries of the base metadata matrix:
# rows[i] is the point id and cols[i] the label id of the i-th entry.
rows, cols = dataset_metadata.nonzero()
# Display the shape of each coordinate array; the two should match.
tuple(coords.shape for coords in (rows, cols))

((108210476,), (108210476,))

In [8]:
from collections import defaultdict

# Group the label ids by base point id: filter_dict[point] -> [label, ...].
# Points without any non-zero entry never become a key.
filter_dict = defaultdict(list)
for point_id, label_id in zip(rows, cols):
    labels_of_point = filter_dict[point_id]
    labels_of_point.append(label_id)

In [9]:
# Number of distinct base points that carry at least one label.
len(filter_dict)

9998928

In [10]:
# Ids of the base points that never appear as a key of filter_dict, i.e. points
# without any label. The number of points comes from the metadata matrix itself
# instead of a hard-coded 10000000, so the cell stays correct for other subsets.
num_base_points = int(dataset_metadata.shape[0])
not_in_list = [i for i in range(num_base_points) if i not in filter_dict]
len(not_in_list)

1072

In [11]:
# Display the ids of the base points that have no labels.
not_in_list

[4226,
 6721,
 8179,
 14742,
 16553,
 36788,
 42393,
 49373,
 51419,
 52794,
 66351,
 75228,
 93513,
 93920,
 96670,
 100971,
 103064,
 107201,
 109326,
 121544,
 122805,
 124021,
 126633,
 126967,
 133137,
 136088,
 136465,
 149946,
 151072,
 153694,
 171150,
 180124,
 185883,
 189138,
 193323,
 201331,
 201406,
 203543,
 214500,
 216431,
 223213,
 223214,
 224812,
 233664,
 237987,
 244277,
 256521,
 265424,
 270323,
 275563,
 275564,
 282189,
 285938,
 307827,
 316545,
 327649,
 333647,
 338345,
 345790,
 351837,
 354057,
 355827,
 360914,
 366118,
 368502,
 368503,
 380344,
 384752,
 402655,
 407072,
 412411,
 416662,
 417833,
 443524,
 445148,
 454580,
 469269,
 476881,
 476891,
 478425,
 481776,
 485373,
 485387,
 489697,
 491360,
 492433,
 492606,
 504715,
 505056,
 515563,
 517082,
 517083,
 523183,
 530219,
 531873,
 543741,
 544246,
 550055,
 556221,
 558112,
 560036,
 564995,
 567143,
 571972,
 577387,
 588123,
 588637,
 599651,
 599778,
 618801,
 629308,
 630450,
 636010,
 

In [12]:
import os

# Serialize the per-point label lists of the base set to a flat binary file.
# File layout (every value is a little-endian uint32):
#   [number of points]
#   then, for each point in ascending id order: [label ids ...] [separator]
# The separator is the largest uint32 value, which terminates each record.
labels_dir = "labels/"
file_path = os.path.join(labels_dir, "yfcc_base_10M.ubin")
# Pin little-endian explicitly: the header and separator below are written
# little-endian, so the label payload must not depend on the host byte order.
dtype = np.dtype("<u4")

# open() does not create missing directories, so make sure the target exists.
os.makedirs(labels_dir, exist_ok=True)

# One entry per base point, indexed by point id. Points without labels are not
# keys of filter_dict and get an empty list. Looking every id up explicitly does
# not depend on the iteration order of filter_dict and also covers unlabeled
# points that come after the last labeled one.
num_points = int(dataset_metadata.shape[0])
data = [filter_dict.get(point_id, []) for point_id in range(num_points)]

assert len(data) == 10000000
# Every labeled point must have been found by its integer id ...
assert sum(map(bool, data)) == len(filter_dict)
# ... and the remaining points must be exactly the unlabeled ones found earlier.
# This also catches a filter_dict that was rebuilt for a different matrix.
assert len(data) - len(filter_dict) == len(not_in_list)

total_labels = 0
separator = np.iinfo(np.uint32).max
separator_bytes = separator.to_bytes(4, byteorder='little')

with open(file_path, "wb") as f:
    # total points
    shape = len(data)
    f.write(shape.to_bytes(4, byteorder='little'))

    # one record per point: its labels followed by the separator
    for labels in data:
        label_array = np.array(labels, dtype=dtype)
        f.write(label_array.tobytes())
        f.write(separator_bytes)
        total_labels += label_array.shape[0]

print(f"{len(data)} points with {total_labels} labels are written to {file_path}")

10000000 points with 108210476 labels are written to labels/yfcc_base_10M.ubin


In [13]:
# Collect the label of every point that has exactly one label, then display
# how many such points exist and whether label 0 is among those labels.
single = [
    point_labels[0]
    for point_labels in filter_dict.values()
    if len(point_labels) == 1
]
len(single), 0 in single

(13122, False)

In [14]:
# Coordinates of the non-zero entries of the query metadata matrix.
# NOTE: this rebinds `rows`/`cols`, which previously held the base-set coordinates.
rows, cols = query_metadata.nonzero()

In [15]:
# Rebuild filter_dict for the queries: filter_dict[query] -> [label, ...].
# NOTE: this replaces the mapping that was built from the base metadata.
filter_dict = defaultdict(list)
for query_id, label_id in zip(rows, cols):
    labels_of_query = filter_dict[query_id]
    labels_of_query.append(label_id)

In [16]:
# Number of distinct queries that carry at least one label
# (filter_dict was rebuilt from the query metadata in the previous cell).
len(filter_dict)

100000

In [17]:
import os

# Serialize the per-query label lists to a flat binary file.
# File layout (every value is a little-endian uint32):
#   [number of queries]
#   then, for each query in ascending id order: [label ids ...] [separator]
# The separator is the largest uint32 value, which terminates each record.
labels_dir = "labels/"
file_path = os.path.join(labels_dir, "yfcc_query_100k.ubin")
# Pin little-endian explicitly: the header and separator below are written
# little-endian, so the label payload must not depend on the host byte order.
dtype = np.dtype("<u4")

# open() does not create missing directories, so make sure the target exists.
os.makedirs(labels_dir, exist_ok=True)

# One entry per query, indexed by query id. Looking every id up explicitly keeps
# record i aligned with query i even if some query had no labels or filter_dict
# were not iterated in ascending id order.
num_queries = int(query_metadata.shape[0])
data = [filter_dict.get(query_id, []) for query_id in range(num_queries)]

assert len(data) == 100000
# Every key of filter_dict must map to a query id in range; this fails if
# filter_dict still holds the mapping of a different (larger) matrix.
assert sum(map(bool, data)) == len(filter_dict)

total_labels = 0
separator = np.iinfo(np.uint32).max
separator_bytes = separator.to_bytes(4, byteorder='little')

with open(file_path, "wb") as f:
    # total points
    shape = len(data)
    f.write(shape.to_bytes(4, byteorder='little'))

    # one record per query: its labels followed by the separator
    for labels in data:
        label_array = np.array(labels, dtype=dtype)
        f.write(label_array.tobytes())
        f.write(separator_bytes)
        total_labels += label_array.shape[0]

print(f"{len(data)} points with {total_labels} labels are written to {file_path}")

100000 points with 138374 labels are written to labels/yfcc_query_100k.ubin
