In [1]:
import numpy as np

def read_fvecs(fp):
    """Load an ``.fvecs`` file into a 2-D ``float32`` array.

    The file is read as a flat ``int32`` stream in which each record is a
    dimension header ``d`` followed by ``d`` 4-byte values. The header column
    is dropped and the remaining values are reinterpreted as ``float32``.

    Args:
        fp: Path (or open binary file) handed straight to ``np.fromfile``.

    Returns:
        ``float32`` array of shape ``(n_vectors, d)``. An empty file yields an
        array of shape ``(0, 0)``.

    Raises:
        ValueError: If the leading dimension is negative, the file length is
            not a whole number of ``d + 1``-word records, or some record
            carries a dimension header different from the first one.
    """
    raw = np.fromfile(fp, dtype='int32')
    if raw.size == 0:
        # Nothing to decode; avoid the IndexError that raw[0] would raise.
        return np.empty((0, 0), dtype='float32')

    # Convert to a Python int so that d + 1 cannot overflow int32.
    d = int(raw[0])
    if d < 0:
        raise ValueError(f"invalid fvecs dimension header: {d}")
    if raw.size % (d + 1) != 0:
        raise ValueError(
            f"fvecs payload of {raw.size} words is not a multiple of "
            f"the record length {d + 1}"
        )

    records = raw.reshape(-1, d + 1)
    if not np.all(records[:, 0] == d):
        # A mismatching header means the rows are misaligned and the floats
        # decoded from them would be garbage.
        raise ValueError("fvecs records do not all share the same dimension")

    # copy() makes the sliced block contiguous so it can be viewed as float32.
    return records[:, 1:].copy().view('float32')

In [2]:
# Read the sift_base vectors into X and the sift_query vectors into Q.
X, Q = map(
    read_fvecs,
    ('data/sift/sift_base.fvecs', 'data/sift/sift_query.fvecs'),
)

In [3]:
# Report the (rows, columns) shape of both loaded arrays.
for label, arr in (("Data shape: ", X), ("Query shape: ", Q)):
    print(label, arr.shape)

Data shape:  (1000000, 128)
Query shape:  (10000, 128)


In [4]:
from src.python.random_projection_lsh import RandomProjectionLSH

# Keep the two tunable constructor settings together in one place.
lsh_params = dict(n_hash_tables=32, n_projections=8)
lsh = RandomProjectionLSH(**lsh_params)

In [5]:
# Evaluate on a leading subset of the queries only.
N_TEST_QUERIES = 1000
Q_test = Q[:N_TEST_QUERIES]

In [6]:
%%time
# Fit the LSH object on the full base set X; the returned `model` is what the
# query cell below calls .query() on.
model = lsh.fit(X)

CPU times: user 29.7 s, sys: 2.62 s, total: 32.3 s
Wall time: 22.2 s


In [7]:
%%time
# Query the fitted model with all of Q_test in one call. The result is indexed
# per query further down (all_neighbors[0] is paired with Q_test[0]).
all_neighbors = model.query(Q_test)

CPU times: user 40.8 s, sys: 6.5 s, total: 47.3 s
Wall time: 44.9 s


In [None]:
def get_gt_top_k_indices(q, X, k=1000):
    """Return the indices of the ``k`` rows of ``X`` most cosine-similar to ``q``.

    Args:
        q: 1-D query vector.
        X: 2-D array with one candidate vector per row.
        k: Number of indices to return. ``0`` yields an empty result and a
            value larger than ``len(X)`` yields every row.

    Returns:
        1-D integer array of row indices ordered from most to least similar.
        All-zero rows have no defined cosine similarity and are ranked last.

    Raises:
        ValueError: If ``k`` is negative.
    """
    if k < 0:
        raise ValueError(f"k must be non-negative, got {k}")

    # A zero query cannot be normalised; leave it as-is so every similarity is
    # 0 rather than NaN.
    q_len = np.linalg.norm(q)
    q_norm = q / q_len if q_len > 0 else q

    row_len = np.linalg.norm(X, axis=1, keepdims=True)
    zero_rows = row_len[:, 0] == 0
    # Divide zero rows by 1 instead of 0: argsort places NaN last, which the
    # descending read-out below would otherwise report as the best matches.
    row_len[zero_rows] = 1
    X_norm = X / row_len

    cos_sims = np.dot(X_norm, q_norm)
    cos_sims[zero_rows] = -np.inf

    # Reverse first, then truncate: slicing with [-k:] returns the whole array
    # when k == 0.
    return np.argsort(cos_sims)[::-1][:k]

(763027,)


In [13]:
def recall(lsh_indices, gt_top_k_indices):
    """Measure how much of the ground truth the LSH candidates recovered.

    Both inputs are reduced to sets, so duplicate indices count once.

    Args:
        lsh_indices: Iterable of candidate indices returned by the LSH query.
        gt_top_k_indices: Iterable of ground-truth indices.

    Returns:
        Tuple ``(recall, n_hits, n_ground_truth)`` where ``recall`` is
        ``n_hits / n_ground_truth``. When the ground truth is empty the ratio
        is undefined and ``0.0`` is reported.
    """
    lsh_set = set(lsh_indices)
    gt_set = set(gt_top_k_indices)

    n_hits = len(lsh_set & gt_set)
    n_gt = len(gt_set)
    # Guard the division: an empty ground truth would otherwise raise
    # ZeroDivisionError.
    score = n_hits / n_gt if n_gt else 0.0

    return score, n_hits, n_gt

In [15]:
# Pair the first test query with the candidate list returned for it, and show
# how many candidates that list holds.
first_query, lsh_indices = Q_test[0], all_neighbors[0]
print(len(lsh_indices))

763027


In [16]:
# Ask for exactly as many ground-truth neighbours as the LSH returned
# candidates, then score the overlap between the two lists.
k = len(lsh_indices)
top_k_indices = get_gt_top_k_indices(q=first_query, X=X, k=k)

recall(lsh_indices=lsh_indices, gt_top_k_indices=top_k_indices)

(0.8073633043129536, 616040, 763027)