# YFCC-100M

In [1]:
from pathlib import Path
# Root directory under which the dataset folders used below ("yfcc", "marco") live.
# NOTE(review): machine-specific absolute path — change it before running elsewhere.
data_dir = Path('/localdata/jsu068')

In [2]:
import numpy as np
from datasets import YFCC100MDataset

In [3]:
# Instantiate the dataset wrapper and make sure its files are available locally;
# on the recorded run prepare() downloaded the query and ground-truth files.
ds = YFCC100MDataset()
ds.prepare()

downloading https://dl.fbaipublicfiles.com/billion-scale-ann-benchmarks/yfcc100M/query.public.100K.u8bin -> /localdata/jsu068/yfcc/query.public.100K.u8bin...
  [0.84 s] downloaded 18.31 MiB / 18.31 MiB at 21.77 MiB/s   
download finished in 0.84 s, total size 19200008 bytes
downloading https://dl.fbaipublicfiles.com/billion-scale-ann-benchmarks/yfcc100M/GT.public.ibin -> /localdata/jsu068/yfcc/GT.public.ibin...
  [0.47 s] downloaded 7.63 MiB / 7.63 MiB at 16.18 MiB/s   
download finished in 0.47 s, total size 8000008 bytes
downloading https://dl.fbaipublicfiles.com/billion-scale-ann-benchmarks/yfcc100M/query.private.2727415019.100K.u8bin -> /localdata/jsu068/yfcc/query.private.2727415019.100K.u8bin...
  [0.76 s] downloaded 18.31 MiB / 18.31 MiB at 24.15 MiB/s   
download finished in 0.76 s, total size 19200008 bytes
downloading https://dl.fbaipublicfiles.com/billion-scale-ann-benchmarks/yfcc100M/GT.private.2727415019.ibin -> /localdata/jsu068/yfcc/GT.private.2727415019.ibin...
  [1.54 

In [4]:
# Human-readable summary of the dataset (dimension, distance, search type, sizes).
str(ds)

'Dataset YFCC100MDataset in dimension 192, with distance euclidean, search_type knn_filtered, size: Q 100000 B 10000000'

In [5]:
# Shapes of the base-point and query metadata matrices.
# Presumably (number of points, size of the label vocabulary) — TODO confirm.
ds.get_dataset_metadata().shape, ds.get_queries_metadata().shape

((10000000, 200386), (100000, 200386))

In [18]:
# Metadata matrix of the base points.
dataset_metadata = ds.get_dataset_metadata()  # (10000000, 200386)
# Length of the stored-values array. Presumably this is a scipy.sparse matrix, in which
# case the number equals the count of stored entries — TODO confirm.
dataset_metadata.data.shape

(108210476,)

In [19]:
# Metadata matrix of the queries.
query_metadata = ds.get_queries_metadata()
# Length of the index array (one entry per stored element, assuming a CSR-style
# sparse layout — TODO confirm).
query_metadata.indices.shape

(138374,)

In [20]:
# Coordinates of every nonzero entry of the base metadata matrix.
# The cells below treat rows[k] as a point id and cols[k] as a label id.
rows, cols = dataset_metadata.nonzero()
rows.shape, cols.shape

((108210476,), (108210476,))

In [21]:
from collections import defaultdict

# Group the nonzero coordinates by row: point id -> list of label ids attached to it.
# Rows without any nonzero entry never receive a key.
filter_dict = defaultdict(list)
for point_id, label_id in zip(rows, cols):
    labels_of_point = filter_dict[point_id]
    labels_of_point.append(label_id)

In [22]:
# Number of base points that carry at least one label (one dict key per such point).
len(filter_dict)

9998928

In [23]:
# Ids of the base points that carry no label at all, in ascending order.
# The point count comes from the metadata matrix instead of a hardcoded 10000000 so the
# cell keeps working if a different subset of the dataset is loaded.
not_in_list = [i for i in range(dataset_metadata.shape[0]) if i not in filter_dict]
len(not_in_list)

1072

In [24]:
import os

labels_dir = data_dir / "yfcc"
file_path = os.path.join(labels_dir, "yfcc.filter.base.bin")
label_type = np.uint32

# One label list per base point, indexed by point id. Looking every id up directly
# (instead of merging the dict with `not_in_list`) does not depend on the dict's
# insertion order and also emits the unlabeled points that come after the last
# labeled one, which the merge loop silently dropped.
num_points = dataset_metadata.shape[0]
data = [filter_dict.get(point_id, []) for point_id in range(num_points)]

assert len(data) == num_points
total_labels = 0
# NOTE(review): 0 is written after every point as the record separator. If label id 0
# can occur in the metadata it is indistinguishable from the separator — confirm with
# the reader of this file before changing it.
separator = 0

with open(file_path, "wb") as f:
    # Header: number of points as a little-endian 4-byte integer.
    shape = len(data)
    f.write(shape.to_bytes(4, byteorder='little'))

    # Body: for every point its labels (possibly none) followed by the separator.
    for labels in data:
        label_array = np.array(labels, dtype=label_type)
        f.write(label_array.tobytes())
        f.write(separator.to_bytes(4, byteorder='little'))
        total_labels += label_array.shape[0]

print(f"{len(data)} points with {total_labels} labels are written to {file_path}")

10000000 points with 108210476 labels are written to /localdata/jsu068/yfcc/yfcc.filter.base.bin


In [25]:
# Collect the label of every point that carries exactly one label, then report how many
# such points exist and whether label id 0 occurs among them.
single = [labels[0] for labels in filter_dict.values() if len(labels) == 1]
len(single), 0 in single

(13122, False)

In [26]:
# Coordinates of every nonzero entry of the query metadata matrix.
# This rebinds `rows` / `cols`, which previously held the base-point coordinates.
rows, cols = query_metadata.nonzero()

In [27]:
# Regroup by row for the queries: query id -> list of label ids.
# This replaces the base-point mapping that was stored under the same name.
filter_dict = defaultdict(list)
for query_id, label_id in zip(rows, cols):
    labels_of_query = filter_dict[query_id]
    labels_of_query.append(label_id)

In [28]:
# Number of queries that carry at least one label (one dict key per such query).
len(filter_dict)

100000

In [29]:
import os

file_path = os.path.join(labels_dir, "yfcc.filter.query.bin")
label_type = np.uint32

# One label list per query, indexed by query id. Direct lookup keeps record k aligned
# with query k even if the dict was filled out of order, and writes an empty record for
# a query without labels instead of skipping it.
num_queries = query_metadata.shape[0]
data = [filter_dict.get(query_id, []) for query_id in range(num_queries)]

assert len(data) == num_queries
total_labels = 0
# NOTE(review): 0 is written after every query as the record separator. If label id 0
# can occur in the metadata it is indistinguishable from the separator — confirm with
# the reader of this file before changing it.
separator = 0

with open(file_path, "wb") as f:
    # Header: number of queries as a little-endian 4-byte integer.
    shape = len(data)
    f.write(shape.to_bytes(4, byteorder='little'))

    # Body: for every query its labels (possibly none) followed by the separator.
    for labels in data:
        label_array = np.array(labels, dtype=label_type)
        f.write(label_array.tobytes())
        f.write(separator.to_bytes(4, byteorder='little'))
        total_labels += label_array.shape[0]

print(f"{len(data)} points with {total_labels} labels are written to {file_path}")

100000 points with 138374 labels are written to /localdata/jsu068/yfcc/yfcc.filter.query.bin


# MS MARCO

In [20]:
import numpy as np
import struct

In [21]:
from pathlib import Path
# Root data directory (machine-specific absolute path).
data_dir = Path('/localdata/jsu068')

# NOTE(review): the variable is spelled "macro" while the on-disk directory is "marco".
macro_dir = data_dir / "marco"
# Sub-directories read below for the base embeddings and for the queries.
embed_dir = macro_dir / "embedding"
query_dir = macro_dir / "query"

In [22]:
# embedding
# vectors.bin starts with two little-endian 4-byte unsigned ints: the number of vectors
# and their dimension.
with open(embed_dir / "vectors.bin", "rb") as f:
    n, dim = struct.unpack('<II', f.read(8))

# Map the float32 payload read-only. The 8-byte header must be skipped with `offset`;
# mapping from byte 0 would reinterpret the header as the first two floats and shift
# every vector by two components.
vectors = np.memmap(embed_dir / "vectors.bin", dtype=np.float32, mode='r', offset=8, shape=(n, dim))

n, dim, vectors.shape

(101070374, 768, (101070374, 768))

In [23]:
# Raw metadata bytes and the int32 index array that refers into them.
# NOTE(review): the layout of both files is not documented here; the next cell treats
# each nonzero metaidx value v (from the third element on) as selecting meta[v - 1] —
# confirm against whatever wrote these files.
meta = np.fromfile(embed_dir / "meta.bin", dtype=np.uint8)
metaidx = np.fromfile(embed_dir / "metaidx.bin", dtype=np.int32)
meta.shape, metaidx.shape

((909486248,), (202140751,))

In [24]:
# Derive one label per point from the metadata index.
# metaidx is scanned from its third element on; zero entries are skipped and every
# remaining value v contributes the byte meta[v - 1] as the label of the next point.
# Done with array operations: converting metaidx to a Python list and looping over it
# allocates one Python object per element, which is prohibitive at this size.
# NOTE(review): if fewer than n nonzero entries exist, the trailing labels stay 0.
offsets = metaidx[2:]
offsets = offsets[offsets != 0]

data = np.zeros(n, dtype=np.uint32)
data[:offsets.size] = meta[offsets - 1]
i = int(offsets.size)  # number of labels filled in, as the loop counter used to hold

data.shape

(101070374,)

In [25]:
import os

file_path = os.path.join(embed_dir, "macro.filter.base.bin")
label_type = np.uint32
# Every point's single label is followed by this separator value (uint32 max).
separator = np.iinfo(label_type).max

# Interleave labels and separators in one little-endian uint32 array so the payload is
# written in a single call instead of two tiny writes per point. The on-disk layout is
# unchanged: [count][label_0][sep][label_1][sep]...
records = np.empty((len(data), 2), dtype='<u4')
records[:, 0] = data
records[:, 1] = separator
total_labels = len(data)  # exactly one label per point

with open(file_path, "wb") as f:
    # Header: number of points as a little-endian 4-byte integer.
    shape = len(data)
    f.write(shape.to_bytes(4, byteorder='little'))

    # Body: row-major dump of the (label, separator) pairs.
    f.write(records.tobytes())

print(f"{len(data)} points with {total_labels} labels are written to {file_path}")

101070374 points with 101070374 labels are written to /localdata/jsu068/marco/embedding/macro.filter.base.bin


In [26]:
# query
# vectors.bin starts with two little-endian 4-byte unsigned ints: the number of vectors
# and their dimension.
with open(query_dir / "vectors.bin", "rb") as f:
    n, dim = struct.unpack('<II', f.read(8))

# Map the float32 payload read-only. The 8-byte header must be skipped with `offset`;
# mapping from byte 0 would reinterpret the header as the first two floats and shift
# every vector by two components.
vectors = np.memmap(query_dir / "vectors.bin", dtype=np.float32, mode='r', offset=8, shape=(n, dim))

n, dim, vectors.shape

(9376, 768, (9376, 768))

In [27]:
# Raw metadata bytes and the int32 index array that refers into them, for the queries.
# NOTE(review): the layout of both files is not documented here; the next cell treats
# each nonzero metaidx value v (from the third element on) as selecting meta[v - 1] —
# confirm against whatever wrote these files.
meta = np.fromfile(query_dir / "meta.bin", dtype=np.uint8)
metaidx = np.fromfile(query_dir / "metaidx.bin", dtype=np.int32)
meta.shape, metaidx.shape

((36397,), (18755,))

In [28]:
# Derive one label per query from the metadata index.
# metaidx is scanned from its third element on; zero entries are skipped and every
# remaining value v contributes the byte meta[v - 1] as the label of the next query.
# Done with array operations rather than a per-element Python loop over a list copy.
# NOTE(review): if fewer than n nonzero entries exist, the trailing labels stay 0.
offsets = metaidx[2:]
offsets = offsets[offsets != 0]

data = np.zeros(n, dtype=np.uint32)
data[:offsets.size] = meta[offsets - 1]
i = int(offsets.size)  # number of labels filled in, as the loop counter used to hold

data.shape

(9376,)

In [29]:
import os

file_path = os.path.join(query_dir, "macro.filter.query.bin")
label_type = np.uint32
# Every query's single label is followed by this separator value (uint32 max).
separator = np.iinfo(label_type).max

# Interleave labels and separators in one little-endian uint32 array so the payload is
# written in a single call instead of two tiny writes per query. The on-disk layout is
# unchanged: [count][label_0][sep][label_1][sep]...
records = np.empty((len(data), 2), dtype='<u4')
records[:, 0] = data
records[:, 1] = separator
total_labels = len(data)  # exactly one label per query

with open(file_path, "wb") as f:
    # Header: number of queries as a little-endian 4-byte integer.
    shape = len(data)
    f.write(shape.to_bytes(4, byteorder='little'))

    # Body: row-major dump of the (label, separator) pairs.
    f.write(records.tobytes())

print(f"{len(data)} points with {total_labels} labels are written to {file_path}")

9376 points with 9376 labels are written to /localdata/jsu068/marco/query/macro.filter.query.bin
