In [1]:
import numpy as np
import struct
import random

def read_fvecs(filename, tp='<f'):
    """Load every record of an fvecs-style binary file.

    Each record is a little-endian unsigned 32-bit element count ``d``
    followed by ``d * 4`` bytes of payload. The payload is decoded with the
    struct format built from ``tp``: ``tp[0]`` is the byte-order character
    and ``tp[1]`` the element type code (``'<f'`` by default).

    Args:
        filename: Path of the file to read.
        tp: Two-character struct format, byte order then type code.

    Returns:
        A list with one ``np.float32`` array per record, in file order.
        Values are converted to float32 whatever ``tp`` is.

    Raises:
        struct.error: If a header or payload is shorter than expected
            (for example when the file is truncated).
    """
    records = []
    with open(filename, 'rb') as stream:
        # iter() with a sentinel stops at the first empty read, i.e. at EOF.
        for header in iter(lambda: stream.read(4), b''):
            (count,) = struct.unpack('<I', header)
            payload = stream.read(count * 4)
            fmt = f'{tp[0]}{count}{tp[1]}'
            records.append(np.array(struct.unpack(fmt, payload), dtype=np.float32))
    return records


# Read the base vectors and stack the per-record float32 arrays into one array.
dataset = np.array(read_fvecs("./data/sift1m/sift_base.fvecs"))
num_vectors = len(dataset)
num_queries = 10000  # size of the query set built further down
print("Num vectors:", num_vectors)
print("Num queries:", num_queries)

Num vectors: 1000000
Num queries: 10000


In [None]:
import struct

def write_fvecs(filename, tensor_data, tp='<f'):
    """Write a sequence of vectors to ``filename`` in fvecs-style layout.

    Every vector is stored as a little-endian unsigned 32-bit element count
    followed by its elements, each packed with the struct format ``tp``.

    Args:
        filename: Destination path; an existing file is overwritten.
        tensor_data: Iterable of vectors. Each vector must support ``len()``
            and iteration. Elements may be objects exposing ``.item()``
            (e.g. NumPy scalars) or plain Python numbers.
        tp: struct format for a single element (``'<f'`` by default; the
            notebook passes ``'<i'`` for integer data).

    Raises:
        struct.error: If ``tp`` is not a valid format or an element cannot
            be packed with it.
    """
    # Compile the element format once instead of re-parsing it per value.
    pack_value = struct.Struct(tp).pack
    with open(filename, 'wb') as f:
        for vector in tensor_data:
            f.write(struct.pack('<I', len(vector)))
            # Unwrap array scalars via .item(); plain Python numbers, which
            # have no .item(), are packed as they are. One write per vector.
            f.write(b''.join(
                pack_value(value.item() if hasattr(value, 'item') else value)
                for value in vector
            ))

In [6]:
def power_law_sample(n, size, alpha, rng=None):
    """Draw indices from ``[0, n)`` with a power-law bias towards low indices.

    Index ``i`` is drawn with probability proportional to
    ``(i + 1) ** -alpha``. Sampling is with replacement.

    Args:
        n: Number of candidate indices.
        size: Number of samples to draw (passed through as ``size=``).
        alpha: Power-law exponent; ``0`` gives a uniform distribution.
        rng: Optional random source exposing ``choice(n, size=, p=)``, such
            as ``np.random.default_rng(seed)``, for reproducible samples.
            When ``None`` the global ``np.random`` state is used.

    Returns:
        Array of sampled integer indices.
    """
    ranks = np.arange(1, n + 1)
    weights = ranks.astype(float) ** (-alpha)
    probs = weights / weights.sum()
    # Fall back to the module-level (global state) sampler when no rng is given.
    sampler = np.random if rng is None else rng
    return sampler.choice(n, size=size, p=probs)


# Build the query set: one half uniform draws, the other half power-law-skewed
# draws, both taken from the base vectors.
# NOTE: no random seed is set here, so every run yields a different query set.
n_uniform = num_queries // 2
n_skewed = num_queries - n_uniform  # total stays num_queries even when it is odd

# Uniform half: distinct base vectors (sampled without replacement).
u_indices = np.random.choice(num_vectors, size=n_uniform, replace=False)
u_vectors = dataset[u_indices]

# Skewed half: indices weighted by (index + 1) ** -alpha, so repeats of the
# lowest indices are expected.
alpha = 3
pl_indices = power_law_sample(num_vectors, n_skewed, alpha)
pl_vectors = dataset[pl_indices]

query_vectors = np.concatenate([u_vectors, pl_vectors], axis=0)

# Same 'sift1m-skew-{alpha}-' prefix as the distance/index files written later
# (the previous 'sifmt1' spelling was a typo).
write_fvecs(f'sift1m-skew-{alpha}-queries.fvecs', query_vectors)

In [7]:
import faiss

# Ground truth: for every query, look up its K nearest base vectors with a
# flat L2 index built over the whole dataset.
# NOTE(review): faiss IndexFlatL2 is documented to return squared L2
# distances — confirm that consumers of the distances file expect that.
K = 100
dimension = query_vectors.shape[1]

index = faiss.IndexFlatL2(dimension)
index.add(dataset)
distances, indices = index.search(query_vectors, K)

# Quick sanity check: neighbour ids found for the first query.
print(indices[0])

[ 13434  15773 667989 393526 644406 410462 306753 141077 516242  58051
 392534  62636 397013 998023 486114  58064 514730 486731 338616 859408
 555848 946992 632838 832009 541317 594692 671766 823870 141329 668074
 187625 958207 486669 711332  49373 195832 274648 656977 221442 311834
 922634 307940 632339 100773 195054 273870  13375 961244 486445 772332
  60180 990004 475882 704721 101052 968342 335811 668047  60314 481039
 553789 378676  76502 670157 558437 905083  15860 135242 720726 991208
 485003 158757 397183 536492 900136 773150 670494 416898 947095 951650
 632590 281344 103248 505118 859443 552736 990267 990109 922805 448402
 859624  13578 505476 822346 522283 442176 346684 958143 886123 880216]


In [8]:
# Persist the search results: distances with the float element format,
# neighbour ids with the integer element format.
write_fvecs(f'sift1m-skew-{alpha}-distances-100.fvecs', distances, tp='<f')
write_fvecs(f'sift1m-skew-{alpha}-indices-100.fvecs', indices, tp="<i")