### For Exact Search

In [1]:
import faiss
import numpy as np

# Problem size: N database vectors of dimensionality D
D = 128
N = 10000
# faiss only accepts float32 input, hence the explicit cast
X = np.random.random((N, D)).astype(np.float32)

# Build a flat L2 index and store every vector in it
index = faiss.IndexFlatL2(D)
index.add(X)

# Query with the first three database vectors, keeping topk neighbours for each
topk = 4
queries = X[:3]
dists, ids = index.search(x=queries, k=topk)
# Expected: distances are float32 with shape (3, 4), ids are int64 with shape (3, 4)
for result in (dists, ids):
    print(type(result), result.dtype, result.shape)

# Report the number of stored vectors and their dimensionality
print("N:", index.ntotal)
print("D:", index.d)

<class 'numpy.ndarray'> float32 (3, 4)
<class 'numpy.ndarray'> int64 (3, 4)
N: 10000
D: 128


In [None]:
import faiss
import numpy as np
import os

D = 128
N = 10000
X = np.random.random((N, D)).astype(np.float32)  # inputs of faiss must be float32

# GPU config
# CUDA_VISIBLE_DEVICES takes a comma-separated list of device ids, e.g. "3,4"
# for multiple GPUs.
# NOTE(review): presumably this has to be set before the first GPU call below
# for it to take effect - confirm.
gpu_ids = "0"
os.environ['CUDA_VISIBLE_DEVICES'] = gpu_ids

# Setup
# Build the flat L2 index on the CPU, convert it with index_cpu_to_all_gpus,
# then add the vectors to the resulting GPU index.
cpu_index = faiss.IndexFlatL2(D)
gpu_index = faiss.index_cpu_to_all_gpus(cpu_index)
gpu_index.add(X)

# Search
# Query with the first three vectors, keeping topk neighbours for each.
topk = 4
dists, ids = gpu_index.search(x=X[:3], k=topk)

### For Approximate Search

In [2]:
import faiss
import numpy as np

D = 128        # dimensionality of every vector
N = 1_000_000  # number of vectors stored in the index
# Random float32 data: 10000 vectors to train on, and the N vectors to index
Xt = np.random.random((10_000, D)).astype(np.float32)
X = np.random.random((N, D)).astype(np.float32)

# Product quantization (PQ) parameters
M = 16     # number of sub-vectors; typically 8, 16, 32, etc.
nbits = 8  # bits per sub-vector; typically 8, so each sub-vector is encoded by 1 byte
# Inverted file (IVF) parameter
nlist = 1000  # number of cells (space partitions); a typical value is sqrt(N)
# HNSW parameter
hnsw_m = 32  # number of neighbors for HNSW; typically 32

# The HNSW index is handed to the IVF-PQ index as its quantizer
quantizer = faiss.IndexHNSWFlat(D, hnsw_m)
index = faiss.IndexIVFPQ(quantizer, D, nlist, M, nbits)

# Train on the dedicated training set first ...
index.train(Xt)

# ... then add the full dataset
index.add(X)

In [3]:
# nprobe is a runtime parameter: the number of cells visited during a search
index.nprobe = 8
topk = 4
dists, ids = index.search(x=X[:3], k=topk)

# Print the index parameters, one "name: value" line each
index_params = (
    ("D", index.d),
    ("N", index.ntotal),
    ("M", index.pq.M),
    ("nbits", index.pq.nbits),
    ("nlist", index.nlist),
    ("nprobe", index.nprobe),
)
for label, value in index_params:
    print(label + ":", value)

D: 128
N: 1000000
M: 16
nbits: 8
nlist: 1000
nprobe: 8


In [4]:
%%timeit
# Time the same 3-query search as in the previous cell (index, X and topk come from there)
dists, ids = index.search(x=X[:3], k=topk)

998 µs ± 440 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [5]:
# Save the index built above to a file in the current working directory
faiss.write_index(index, "superfastindex.bin")

In [6]:
# Reload the index written by the previous cell. The original path pointed at
# "faiss_50K_update3.index", a file this notebook never creates, so the cell
# failed on a fresh Restart & Run All.
index2 = faiss.read_index("superfastindex.bin")

In [5]:
import faiss
import numpy as np

# k-means clustering of N random D-dimensional vectors into K clusters
D = 128
N = 10000
K = 10  # number of clusters
X = np.random.random((N, D)).astype(np.float32)

# Configure the clustering (20 iterations, verbose logging)
kmeans = faiss.Kmeans(d=D, k=K, niter=20, verbose=True)
# For GPU(s), run the following line instead. This will use all GPUs
# kmeans = faiss.Kmeans(d=D, k=K, niter=20, verbose=True, gpu=True)

# Run the clustering
kmeans.train(X)

# Error recorded for each iteration: an array with 20 elements
print(kmeans.obj)

# Centroids after clustering: shape (10, 128)
print(kmeans.centroids.shape)

# The cluster assignment of each vector needs a separate nearest-neighbour
# search against the centroids
dists, ids = kmeans.index.search(X, 1)
print(ids.shape)  # (10000, 1)

# Print the clustering parameters, one "name: value" line each
kmeans_params = (
    ("D", kmeans.d),
    ("K", kmeans.k),
    ("niter", kmeans.cp.niter),
)
for label, value in kmeans_params:
    print(label + ":", value)

[47306.57421875 26601.49804688 26521.53125    26477.765625
 26449.046875   26431.0546875  26420.98632812 26413.05273438
 26406.87304688 26401.67773438 26397.45898438 26394.38671875
 26392.54296875 26390.69726562 26389.109375   26387.72265625
 26386.38476562 26385.0546875  26384.19726562 26383.65429688]
(10, 128)
(10000, 1)
D: 128
K: 10
niter: 20


## Set the number of threads to use (default: all threads)

In [8]:
# Uncomment the next line to restrict faiss to a single thread:
# faiss.omp_set_num_threads(1)

## Hamming Distance

In [9]:
def pairwise_hamming_dis(a, b):
    """Compute the pairwise Hamming distances between the rows of two matrices.

    ``a`` and ``b`` are 2-D arrays with the same number of columns (enforced by
    an assertion); the example in this notebook passes uint8 byte codes.
    Returns an int32 array of shape ``(a.shape[0], b.shape[0])`` that
    ``faiss.hammings`` fills in.
    """
    rows_a, width_a = a.shape
    rows_b, width_b = b.shape
    assert width_a == width_b

    # Output buffer; faiss writes the distances into it through a raw pointer
    dis = np.empty(shape=(rows_a, rows_b), dtype=np.int32)

    ptr_a, ptr_b, ptr_dis = (faiss.swig_ptr(arr) for arr in (a, b, dis))
    faiss.hammings(ptr_a, ptr_b, rows_a, rows_b, width_a, ptr_dis)
    return dis

# Each vector must be of the form "uint8 * ncodes", where ncodes % 8 == 0.
# Here ncodes=8, i.e. 64 bits per vector; the trailing comments show the last bits.
xq = np.array(
    [
        [0, 0, 0, 0, 0, 0, 0, 2],  # [0, 0, ..., 1, 0]
        [0, 0, 0, 0, 0, 0, 0, 3],  # [0, 0, ..., 1, 1]
    ],
    dtype=np.uint8,
)
xb = np.array(
    [
        [0, 0, 0, 0, 0, 0, 0, 2],  # [0, 0, ..., 1, 0]
        [0, 0, 0, 0, 0, 0, 0, 0],  # [0, 0, ..., 0, 0]
        [0, 0, 0, 0, 0, 0, 0, 1],  # [0, 0, ..., 0, 1]
    ],
    dtype=np.uint8,
)

# One row per query vector (xq), one column per database vector (xb)
dis = pairwise_hamming_dis(xq, xb)
print(dis)

[[0 1 2]
 [1 2 1]]


## Merge Results from Several Indexes

In [47]:
import faiss
import numpy as np

D = 128
N = 10000
Nq = 1  # make it 3 for fun
X = np.random.random((N, D)).astype(np.float32)
Xq = np.random.random((Nq, D)).astype(np.float32)

# Reference result: a single index holding the whole dataset
index = faiss.IndexFlatL2(D)
index.add(X)

topk = 10
dists, ids = index.search(x=Xq, k=topk)
print("dists:", dists)
print("ids:", ids)


# Same data split across two indices
offset = 2000  # size of the first part, and therefore the id offset of the second
index1 = faiss.IndexFlatL2(D)
index1.add(X[:offset])   # the first 2000 vectors
index2 = faiss.IndexFlatL2(D)
index2.add(X[offset:])   # the remaining vectors

# Query each index separately
dists1, ids1 = index1.search(x=Xq, k=topk)
dists2, ids2 = index2.search(x=Xq, k=topk)

# Merge the partial results. index2 numbers its vectors from 0, so its ids are
# shifted by the offset to match the numbering of the full dataset.
result_heap = faiss.ResultHeap(nq=Nq, k=topk)
result_heap.add_result(D=dists1, I=ids1)
result_heap.add_result(D=dists2, I=ids2 + offset)
result_heap.finalize()
print("dists:", result_heap.D)
print("ids:", result_heap.I)

# The merged result must equal the single-index result
assert np.array_equal(dists, result_heap.D)
assert np.array_equal(ids, result_heap.I)

dists: [[13.268892  13.48959   14.0531435 14.133966  14.182329  14.212965
  14.229703  14.2562    14.32106   14.3277445]]
ids: [[4666 2341 5347 7811 5073 5141 7212 3491 7631 9764]]
dists: [[13.268892  13.48959   14.0531435 14.133966  14.182329  14.212965
  14.229703  14.2562    14.32106   14.3277445]]
ids: [[4666 2341 5347 7811 5073 5141 7212 3491 7631 9764]]


In [48]:
# Check that every id from the single-index search appears in the merged result.
# np.in1d is deprecated in NumPy 2.0; np.isin(...).ravel() returns the same flat
# boolean array.
np.isin(ids, result_heap.I).ravel()

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True])

In [49]:
# Map id -> distance for the first query only (row 0) of each partial result.
# The ids from index2 are shifted by 2000 to match the numbering of the full dataset.
dict1 = {idx: dist for idx, dist in zip(ids1[0], dists1[0])}
dict2 = {idx: dist for idx, dist in zip(ids2[0] + 2000, dists2[0])}

In [50]:
# Candidates found in index1 (id -> distance)
dict1

{913: 14.479799,
 1410: 14.926694,
 1058: 15.330411,
 1011: 15.484995,
 888: 15.526463,
 231: 15.604352,
 1113: 15.649547,
 1360: 15.684461,
 43: 15.744425,
 1987: 15.778402}

In [51]:
# Candidates found in index2 (id -> distance), ids already shifted by 2000
dict2

{4666: 13.268892,
 2341: 13.48959,
 5347: 14.0531435,
 7811: 14.133966,
 5073: 14.182329,
 5141: 14.212965,
 7212: 14.229703,
 3491: 14.2562,
 7631: 14.32106,
 9764: 14.3277445}

In [52]:
# Pool both candidate sets into dict1 (in place). The two id ranges are disjoint
# (below 2000 vs. 2000 and above), so no entry is overwritten.
dict1.update(dict2)

In [53]:
# dict1 now holds the candidates of both indices, still unsorted
dict1

{913: 14.479799,
 1410: 14.926694,
 1058: 15.330411,
 1011: 15.484995,
 888: 15.526463,
 231: 15.604352,
 1113: 15.649547,
 1360: 15.684461,
 43: 15.744425,
 1987: 15.778402,
 4666: 13.268892,
 2341: 13.48959,
 5347: 14.0531435,
 7811: 14.133966,
 5073: 14.182329,
 5141: 14.212965,
 7212: 14.229703,
 3491: 14.2562,
 7631: 14.32106,
 9764: 14.3277445}

In [54]:
# Re-order the pooled candidates by ascending distance (dicts keep insertion order)
my_results = dict(sorted(dict1.items(), key=lambda pair: pair[1]))

In [55]:
# All pooled candidates, nearest first
my_results

{4666: 13.268892,
 2341: 13.48959,
 5347: 14.0531435,
 7811: 14.133966,
 5073: 14.182329,
 5141: 14.212965,
 7212: 14.229703,
 3491: 14.2562,
 7631: 14.32106,
 9764: 14.3277445,
 913: 14.479799,
 1410: 14.926694,
 1058: 15.330411,
 1011: 15.484995,
 888: 15.526463,
 231: 15.604352,
 1113: 15.649547,
 1360: 15.684461,
 43: 15.744425,
 1987: 15.778402}

In [56]:
# Keep the ids of the topk nearest candidates (my_results is sorted by ascending
# distance). topk replaces the hard-coded 10 so this stays in step with the
# searches above if topk is changed.
myresults_index = np.array(list(my_results.keys())[:topk])

In [57]:
# Ids obtained by the manual dict-based merge
myresults_index

array([4666, 2341, 5347, 7811, 5073, 5141, 7212, 3491, 7631, 9764],
      dtype=int64)

In [58]:
## My way
# Check that every id from the single-index search is among the manually merged ids.
# np.in1d is deprecated in NumPy 2.0; for the 1-D input here np.isin returns the
# same array.
np.isin(ids[0], myresults_index)

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True])

In [59]:
# Manual merge result, shown again for side-by-side comparison with result_heap.I below
myresults_index

array([4666, 2341, 5347, 7811, 5073, 5141, 7212, 3491, 7631, 9764],
      dtype=int64)

In [60]:
# Ids produced by faiss.ResultHeap, for comparison with the manual merge above
result_heap.I

array([[4666, 2341, 5347, 7811, 5073, 5141, 7212, 3491, 7631, 9764]],
      dtype=int64)