In [26]:
# Build the database ("training") vectors and the query ("test") vectors.

import numpy as np


def _random_vectors(count, dim):
    """Draw ``count`` random float32 vectors of length ``dim``.

    Values are taken from the global NumPy RNG via ``np.random.random``;
    the first component of row ``i`` is then shifted by ``i / 1000``.
    """
    vectors = np.random.random((count, dim)).astype('float32')
    vectors[:, 0] += np.arange(count) / 1000
    return vectors


d = 64          # dimension
nb = 100000     # database size
nq = 1          # number of queries

np.random.seed(1234)            # make reproducible
xb = _random_vectors(nb, d)     # database vectors (drawn first)
xq = _random_vectors(nq, d)     # query vectors (drawn second, same RNG stream)

In [27]:
import faiss

# Flat L2 index over d-dimensional vectors.
index = faiss.IndexFlatL2(d)
print(index.is_trained)        # trained flag, checked before any vectors are added

# Store every database vector, then report how many the index now holds.
index.add(xb)
print(index.ntotal)

True
100000


In [28]:
k = 4   # number of nearest neighbours to retrieve per query

# search() returns two arrays, unpacked here as D and I. I is printed below as
# the neighbour ids; D is presumably the matching distances, one row per query
# (confirm against the faiss documentation).
D, I = index.search(xq, k)

# Show the queries, then the first five rows of neighbour ids and of D.
for shown in (xq, I[:5], D[:5]):
    print(shown)

[[0.81432974 0.7409969  0.8915324  0.02642949 0.24954738 0.75948536
  0.33756447 0.0388501  0.06253924 0.04496585 0.6500265  0.14300306
  0.10555115 0.7554373  0.8733019  0.91065574 0.949595   0.4678057
  0.7957018  0.06088004 0.5086471  0.7798314  0.42084002 0.49393055
  0.51475203 0.59598726 0.8164803  0.6895009  0.42959374 0.8051452
  0.08729313 0.0663529  0.0201008  0.677681   0.8932952  0.41095483
  0.16123782 0.14441694 0.1827087  0.6027973  0.48050675 0.9779244
  0.6363663  0.18064964 0.13591066 0.8322959  0.5291623  0.31196308
  0.45191374 0.5528666  0.06121221 0.83773404 0.44387    0.1658945
  0.01169583 0.33254945 0.22699533 0.46597633 0.06443579 0.9420383
  0.36051401 0.72459674 0.893881   0.6574571 ]]
[[381 207 210 477]]
[[6.8155117 6.889466  7.395678  7.4290204]]


In [29]:
import faiss
from faiss import normalize_L2
import numpy as np
import time

d = 64          # dimension
nb = 100005     # database size
nlist = 1000    # number of cluster centres
k = 50          # number of neighbours to retrieve

np.random.seed(1234)            # make reproducible
training_vectors = 10 * np.random.random((nb, d)).astype('float32')

# Normalise the vectors; the call's return value is not used, the array
# itself is what gets trained on and added below.
normalize_L2(training_vectors)

# The IVF index is built on top of another index, which is passed in as the quantizer.
quantizer = faiss.IndexFlatIP(d)

# METRIC_INNER_PRODUCT is passed explicitly as the metric for this index.
index = faiss.IndexIVFFlat(quantizer, d, nlist, faiss.METRIC_INNER_PRODUCT)

# The index must be trained before vectors are added.
assert not index.is_trained
index.train(training_vectors)
assert index.is_trained

index.nprobe = 300              # default nprobe is 1, try a few more
index.add(training_vectors)     # add may be a bit slower as well
print("done!")

done!


In [30]:
# Time a batch search that uses the first 100 stored vectors as queries.
# time.perf_counter() replaces time.time(): it is a monotonic,
# high-resolution clock meant for measuring short durations, whereas the
# wall clock can be coarse or jump if the system time is adjusted.
t1 = time.perf_counter()
D, I = index.search(training_vectors[:100], k)  # actual search
t2 = time.perf_counter()
print('faiss kmeans result times {}'.format(t2-t1))

# D holds the scores and I the matching ids; only the first 5 query rows
# are shown. The queries are themselves stored in the index, which is why
# the recorded output starts each row with a score of ~1.0.
print(D[:5])
print(I[:5])

faiss kmeans result times 0.04693150520324707
[[0.9999999  0.8984151  0.89619327 0.8937164  0.8911887  0.88934976
  0.8856708  0.8839439  0.883367   0.8825256  0.8813614  0.8810687
  0.8809454  0.88023424 0.87951064 0.8779743  0.87793237 0.87743706
  0.8770516  0.87704283 0.87662053 0.8765849  0.8763714  0.8759678
  0.87589985 0.87579435 0.87551665 0.8750682  0.8745278  0.874143
  0.87397784 0.8734885  0.87322885 0.8728647  0.8728574  0.8724403
  0.87180114 0.871781   0.8717619  0.8716021  0.8709686  0.8706926
  0.8706677  0.8703424  0.86983097 0.8698109  0.8697542  0.869646
  0.8696394  0.8690887 ]
 [0.9999999  0.87635607 0.87428653 0.8733563  0.8729164  0.87241495
  0.87225837 0.8715584  0.87107486 0.87058187 0.86958843 0.86856234
  0.8685419  0.86817956 0.86762565 0.8674512  0.86723524 0.8658652
  0.8654198  0.8650643  0.86472183 0.864476   0.86428887 0.8642784
  0.86426514 0.86414313 0.86399287 0.8637527  0.863446   0.86335266
  0.8632122  0.86297935 0.86289334 0.8627401  0.8626386