# ScaNN Demo with the Wikipedia-100K Dataset

In [11]:
import numpy as np
import h5py
import os
import requests
import tempfile
import time
import struct
import itertools

import scann

from core_utils import read_float_binary_file,read_i8_binary_file,read_u8_binary_file,write_float_binary_file,read_int32_binary_file,compute_recall

### Load dataset and set search parameters

In [21]:

# Base vectors, query vectors and ground-truth neighbor ids.
# Each reader's result is unpacked into three values; presumably
# (row count, row width, data) -- confirm against core_utils.
nb, nd, dataset = read_float_binary_file('/home/rakri/arxiv/wikipedia_large/wikipedia_base_100K.bin')
nq, ndq, queries = read_float_binary_file('/home/rakri/arxiv/wikipedia_large/wikipedia_query.bin')
nqgt, nkgt, gt = read_int32_binary_file('/home/rakri/wiki100k_gs100.bin')

# Names and build parameters; L and qtau are encoded in the on-disk index path.
dataset_name = 'wiki100'
L = 2000
qtau = 0.2
index_name = f'/home/rakri/indices/scann_wiki_{dataset_name}l={L}_qtau={qtau}'

# ScaNN search-time knobs: number of leaves to probe and number of
# candidates kept for reordering.
leaf_list = list(range(50, 251, 50))
reorder_list = list(range(200, 501, 100))

# Full grid of (leaves, reorder) pairs, leaves varying slowest.
search_params = [(leaves, reorder) for leaves in leaf_list for reorder in reorder_list]

print(search_params)

[(50, 200), (50, 300), (50, 400), (50, 500), (100, 200), (100, 300), (100, 400), (100, 500), (150, 200), (150, 300), (150, 400), (150, 500), (200, 200), (200, 300), (200, 400), (200, 500), (250, 200), (250, 300), (250, 400), (250, 500)]


### Create ScaNN searcher

In [3]:
# Build the ScaNN searcher once and cache it under index_name; later runs
# reload the serialized index instead of rebuilding.
#
# Configuration: partitioning tree + asymmetric hashing with anisotropic
# quantization (see the ScaNN README) + exact reordering, squared-L2 distance,
# 10 neighbors returned by default.
# The scann.scann_ops module can be used instead of scann_ops_pybind to create
# a TensorFlow-compatible searcher.
#
# Dot-product variant from the upstream demo, run on row-normalized vectors:
#   normalized_dataset = dataset / np.linalg.norm(dataset, axis=1)[:, np.newaxis]
#   scann.scann_ops_pybind.builder(normalized_dataset, 10, "dot_product") ...

if os.path.exists(index_name):
    print("Index exists, so loading")
    searcher = scann.scann_ops_pybind.load_searcher(index_name)
else:
    # L and qtau come from the parameter cell and are also embedded in
    # index_name, so the cached index always matches the values used here.
    # NOTE(review): training_sample_size (250000) exceeds the 100K base set
    # named in the input path -- confirm how ScaNN handles that.
    searcher = (
        scann.scann_ops_pybind.builder(dataset, 10, "squared_l2")
        .tree(num_leaves=L, num_leaves_to_search=100, training_sample_size=250000)
        .score_ah(2, anisotropic_quantization_threshold=qtau)
        .reorder(100)
        .build()
    )
    # Create the directory only after the build succeeded; creating it first
    # would leave an empty directory behind on failure, and the next run
    # would then wrongly take the "load" branch.
    os.makedirs(index_name, exist_ok=True)
    searcher.serialize(index_name)



Index exists, so loading


### Recall and latency sweep over ScaNN search parameters

In [26]:
# Sweep every (leaves_to_search, pre_reorder_num_neighbors) pair, issuing the
# queries one at a time, and report recall@nk together with the mean
# per-query latency in microseconds (elapsed seconds * 1e6 / nq).
print("leaves\treorder\tRecall\t\ttime")
nk = 10
for lc, rc in search_params:
    # Neighbor ids are compared against the int32-read ground truth, so keep
    # them in an integer array rather than the float64 default of np.zeros.
    neighbors = np.zeros((nq, nk), dtype=np.int64)
    # perf_counter is the monotonic, high-resolution clock meant for timing
    # intervals; time.time() can jump if the system clock is adjusted.
    start = time.perf_counter()
    for i in range(nq):
        # The returned distances are not needed for recall.
        neighbors[i, :], _ = searcher.search(
            queries[i],
            final_num_neighbors=nk,
            pre_reorder_num_neighbors=rc,
            leaves_to_search=lc,
        )
    end = time.perf_counter()
    # Slice the ground truth with nk so recall stays correct if nk changes.
    recall = compute_recall(neighbors, gt[:, :nk])
    print(lc,"\t",rc,"\t",recall,"\t", 1000000*(end-start)/nq)

leaves	reorder	Recall		time
50 	 200 	 0.2209 	 459.31763648986816
50 	 300 	 0.2209 	 471.69079780578613
50 	 400 	 0.2209 	 491.87893867492676
50 	 500 	 0.2209 	 516.4154529571533
100 	 200 	 0.34906 	 519.577693939209
100 	 300 	 0.34912 	 546.4325428009033
100 	 400 	 0.34912 	 577.4599075317383
100 	 500 	 0.34912 	 596.8708038330078
150 	 200 	 0.44786 	 594.1460132598877
150 	 300 	 0.44808 	 618.553352355957
150 	 400 	 0.44808 	 648.908805847168
150 	 500 	 0.44808 	 682.8195571899414
200 	 200 	 0.4527 	 664.1870498657227
200 	 300 	 0.45328 	 692.0827388763428
200 	 400 	 0.45328 	 721.359395980835
200 	 500 	 0.45328 	 753.5184383392334
250 	 200 	 0.47038 	 738.4162425994873
250 	 300 	 0.47112 	 772.3684310913086
250 	 400 	 0.47114 	 807.6549530029297
250 	 500 	 0.47114 	 832.7040195465088


In [25]:
# Disabled reference code: the whole cell is a single triple-quoted string
# literal, so none of the statements inside it execute.
# It holds the upstream GloVe download + search_batched examples for comparison
# with the single-query sweep used in this notebook.
# NOTE(review): the final "Latency (ms)" label inside the string scales seconds
# by 1000000, i.e. the value is microseconds per query -- fix if re-enabled.
'''
# sample codes from google repo

with tempfile.TemporaryDirectory() as tmp:
    response = requests.get("http://ann-benchmarks.com/glove-100-angular.hdf5")
    loc = os.path.join(tmp, "glove.hdf5")
    with open(loc, 'wb') as f:
        f.write(response.content)
    
    glove_h5py = h5py.File(loc, "r")
list(glove_h5py.keys())    

#dataset = glove_h5py['train']
#queries = glove_h5py['test']
#gt = glove_h5py['neighbors']

# this will search the top 100 of the 2000 leaves, and compute
# the exact dot products of the top 100 candidates from asymmetric
# hashing to get the final top 10 candidates.
start = time.time()
neighbors, distances = searcher.search_batched(queries)
end = time.time()
# we are given top 100 neighbors in the ground truth, so select top 10
print("Recall:", compute_recall(neighbors, gt[:, :10]))
print("Latency:", (1000000*(end - start))/nq)

# increasing the leaves to search increases recall at the cost of speed
start = time.time()
neighbors, distances = searcher.search_batched(queries, leaves_to_search=250)
end = time.time()

#print("Recall:", compute_recall(neighbors, gt[:, :10]))
print("Recall:", compute_recall(neighbors, gt[:, :10]))
print("Latency:", (1000000*(end - start))/nq)

# increasing reordering (the exact scoring of top AH candidates) has a similar effect.
start = time.time()
neighbors, distances = searcher.search_batched(queries, leaves_to_search=250, pre_reorder_num_neighbors=500)
end = time.time()

print("Recall:", compute_recall(neighbors, gt[:, :10]))
print("Latency:", (1000000*(end - start))/nq)

# we can also dynamically configure the number of neighbors returned
# currently returns 10 as configued in ScannBuilder()
neighbors, distances = searcher.search_batched(queries)
print(neighbors.shape, distances.shape)

# now returns 20
neighbors, distances = searcher.search_batched(queries, final_num_neighbors=20)
print(neighbors.shape, distances.shape)

# we have been exclusively calling batch search so far; the single-query call has the same API
start = time.time()
for i in range(0,nq):
    neighbors, distances = searcher.search(queries[i], final_num_neighbors=10, leaves_to_search=200, pre_reorder_num_neighbors = 250)
end = time.time()

print(neighbors)
print(distances)
print("Latency (ms):", 1000000*(end - start)/nq)
'''

Recall: 0.46818
Time: 2.6787519454956055
