In [2]:
!pip install annoy faiss



### Prepare the dataset

In [3]:
import numpy as np 

# Read the vectors from the local .npy file; the trailing expression
# displays the shape of the loaded array.
sift = np.load(file="dataset/sift-128-euclidean.npy")
sift.shape

(1000000, 128)

### Test the index

In [4]:
import time
from annoy import AnnoyIndex
import faiss

# Limit FAISS to a single OpenMP thread.
# NOTE(review): presumably so FAISS and Annoy are compared on equal
# (single-threaded) footing — confirm.
faiss.omp_set_num_threads(1)

In [6]:
# Annoy index over every row of `sift`, keyed by row number.
index = AnnoyIndex(f=sift.shape[1], metric='euclidean')
for i, row in enumerate(sift):
    index.add_item(i, vector=row)
index.build(n_trees=100)

# 100 neighbours of the first vector according to Annoy.
I = index.get_nns_by_vector(vector=sift[0], n=100)
print(I)

# FAISS flat L2 index over the same data; its result for the same query
# serves as the reference in the recall computation.
flat = faiss.IndexFlatL2(sift.shape[1])
flat.add(sift)

# FAISS expects a 2-D batch of queries, hence the (1, dim) reshape.
D, FLAT_I = flat.search(sift[0].reshape(1, -1), k=100)
print(FLAT_I)

[0, 2, 6, 83606, 631203, 677834, 246710, 677793, 480592, 10336, 658180, 799350, 738996, 516942, 451321, 725637, 480903, 719046, 799488, 500141, 466880, 529593, 688749, 558961, 686828, 183625, 432221, 678008, 772144, 89757, 432521, 633385, 596413, 679499, 769701, 236210, 528709, 216605, 738730, 559065, 134358, 631964, 206873, 271151, 851764, 261934, 225014, 404206, 632106, 256176, 547359, 514307, 630017, 705267, 216395, 419350, 204933, 269211, 197644, 276460, 876717, 95134, 719140, 407157, 79225, 808018, 559250, 525531, 162637, 764500, 547845, 724103, 547004, 219183, 832018, 533417, 42705, 197990, 276806, 720756, 116581, 729207, 780424, 432005, 81324, 529193, 183256, 735092, 558981, 256786, 637470, 20862, 181079, 748847, 965764, 42033, 21982, 770114, 95266, 419340]
[[     0      2      6  83606 631203 677834 246710 677793 480592  10336
  658180 799350 738996 516942 965310 451321 725637 480903 719046 248230
  799488 500141 799404 466880 529593 688749 558961 686828 183625 432221
  532473 

In [7]:
# Recall: number of Annoy result ids that also appear in the FAISS result,
# divided by the number of FAISS result ids.
sum(i in FLAT_I for i in I) / FLAT_I.size

0.82

### Benchmark

In [1]:
import time 

# Tree counts to benchmark; one index is built per entry.
tree_nums = [1, 10, 30, 50, 70, 90, 110, 130]
# Built indexes and their build durations, filled in the same order as tree_nums.
indexes, build_time = [], []

def build_index(data, n_trees, metric='euclidean'):
    """Build an Annoy index over the rows of ``data``.

    Args:
        data: 2-D array exposing ``shape`` and ``data[i, :]`` row access.
            Row ``i`` is stored under item id ``i``.
        n_trees: Number of trees, forwarded to ``AnnoyIndex.build``.
        metric: Distance metric name forwarded to ``AnnoyIndex``.

    Returns:
        The built ``AnnoyIndex``.
    """
    # Honour the caller-supplied metric instead of a fixed 'euclidean'.
    index = AnnoyIndex(f=data.shape[1], metric=metric)
    for i in range(data.shape[0]):
        index.add_item(i, vector=data[i, :])

    index.build(n_trees)
    return index

# Time one full index build per tree count.
# NOTE: this cell depends on `sift` (and the Annoy import) from the earlier
# cells; on a fresh kernel run those first, otherwise it fails with the
# NameError recorded below.
for t_num in tree_nums:
    # perf_counter() is a monotonic, high-resolution clock intended for
    # measuring intervals; time.time() can jump if the system clock changes.
    start = time.perf_counter()
    index = build_index(sift, n_trees=t_num)
    btime = time.perf_counter() - start
    build_time.append(btime)
    indexes.append(index)

print(build_time)

NameError: name 'sift' is not defined