In [228]:
import numpy as np
import h5py
import os
import requests
import tempfile
import time

import scann

In [3]:
with tempfile.TemporaryDirectory() as tmp:
    response = requests.get("http://ann-benchmarks.com/glove-100-angular.hdf5")
    loc = os.path.join(tmp, "glove.hdf5")
    with open(loc, 'wb') as f:
        f.write(response.content)
    
    glove_h5py = h5py.File(loc, "r")

In [4]:
list(glove_h5py.keys())

['distances', 'neighbors', 'test', 'train']

In [5]:
dataset = glove_h5py['train']
queries = glove_h5py['test']
print(dataset.shape)
print(queries.shape)

(1183514, 100)
(10000, 100)


In [6]:
normalized_dataset = dataset / np.linalg.norm(dataset, axis=1)[:, np.newaxis]

In [7]:
searcher = scann.scann_ops_pybind.builder(normalized_dataset, 10, "dot_product").tree(
    num_leaves=2000, num_leaves_to_search=100, training_sample_size=250000).score_ah(
    2, anisotropic_quantization_threshold=0.2).reorder(100).build()

2023-05-02 16:04:17.226730: I scann/partitioning/partitioner_factory_base.cc:59] Size of sampled dataset for training partition: 249797
2023-05-02 16:04:23.961219: I ./scann/partitioning/kmeans_tree_partitioner_utils.h:88] PartitionerFactory ran in 6.734385881s.


In [8]:
def compute_recall(neighbors, true_neighbors):
    total = 0
    for gt_row, row in zip(true_neighbors, neighbors):
        total += np.intersect1d(gt_row, row).shape[0]
    return total / true_neighbors.size

In [9]:
start = time.time()
neighbors, distances = searcher.search_batched(queries)
end = time.time()

# we are given top 100 neighbors in the ground truth, so select top 10
print("Recall:", compute_recall(neighbors, glove_h5py['neighbors'][:, :10]))
print("Time:", end - start)

Recall: 0.90015
Time: 1.807617425918579


In [15]:
distances.shape

(10000, 10)

In [16]:
start = time.time()
neighbors, distances = searcher.search(queries[0], final_num_neighbors=5)
end = time.time()

print(neighbors)
print(distances)
print("Latency (ms):", 1000*(end - start))

[ 97478 846101 671078 727732 544474]
[2.5518737 2.539792  2.5383418 2.5097368 2.4656374]
Latency (ms): 0.9369850158691406


In [46]:
normalized_dataset.shape,queries[0].shape

((1183514, 100), (100,))

## For graph embeddings

In [137]:
from gensim.models import KeyedVectors

In [138]:
#loading model
model = KeyedVectors.load_word2vec_format('GGvec_model_v1.bin')

In [139]:
emb = model.vectors
normalized_dataset = emb / np.linalg.norm(emb, axis=1)[:, np.newaxis]

In [162]:
searcher = scann.scann_ops_pybind.builder(normalized_dataset, 100, "dot_product").score_ah(
    2, anisotropic_quantization_threshold=0.2).reorder(100000).build()

2023-05-03 14:01:31.590085: I scann/base/single_machine_factory_scann.cc:113] Single-machine AH training with dataset size = 24523, 8 thread(s).


In [216]:
selections_dict = {}
positive_list = ['City:London']
negative_list = ['Gender:Male']
selections_dict['positive'] = positive_list
selections_dict['negative'] = negative_list

In [217]:
def get_vector(word,model):
    return model.wv[word]

In [218]:
def get_single_vector(selections_dict):
    positive = get_vector(selections_dict['positive'],model).sum(axis = 0)
    negative = get_vector(selections_dict['negative'],model).sum(axis = 0)
    return positive-negative

In [227]:
def get_most_similar(selections_dict,model):
    query = get_single_vector(selections_dict)
    neighbors, distances = searcher.search(query, final_num_neighbors=2900000)
    return np.array(list(model.wv.vocab))[neighbors]

## Checking for speed

In [220]:
start = time.time()
ids = get_most_similar(selections_dict,model)
end = time.time()
print("Latency (ms):", 1000*(end - start))

Latency (ms): 12.449026107788086


  
  after removing the cwd from sys.path.


In [None]:
start = time.time()
p = model.most_similar_cosmul(**selections_dict, topn=1000000)
end = time.time()
print("Latency (ms):", 1000*(end - start))

Latency (ms): 44.86966133117676


## Checking for memory

In [235]:
import os
import psutil
 
# inner psutil function
def process_memory():
    process = psutil.Process(os.getpid())
    mem_info = process.memory_info()
    return mem_info.rss
 
# decorator function
def profile(func):
    def wrapper(*args, **kwargs):
 
        mem_before = process_memory()
        result = func(*args, **kwargs)
        mem_after = process_memory()
        print("{}:consumed memory: {:,}".format(
            func.__name__,
            mem_before, mem_after, mem_after - mem_before))
 
        return result
    return wrapper

In [236]:
@profile
def get_most_similar(selections_dict,model):
    query = get_single_vector(selections_dict)
    neighbors, distances = searcher.search(query, final_num_neighbors=2900000)
    return np.array(list(model.wv.vocab))[neighbors]

In [242]:
ids = get_most_similar(selections_dict,model)

get_most_similar:consumed memory: 2,220,281,856


  
  """


In [238]:
@profile
def cosmul_similar(selections_dict,model):
    return model.most_similar_cosmul(**selections_dict, topn=1000000)

In [243]:
ids = cosmul_similar(selections_dict,model)

cosmul_similar:consumed memory: 2,219,188,224
