In [None]:
import hnswlib
import numpy as np
#https://zilliz.com/learn/learn-hnswlib-graph-based-library-for-fast-anns

In [2]:
# Build the base HNSW index over a synthetic embedding set
dim, num_elements = 128, 1000  # embedding dimensionality, number of vectors to index

# Random vectors standing in for table embeddings; the fixed seed makes
# the generated data identical on every run of this cell
np.random.seed(42)
data_embeddings = np.random.rand(num_elements, dim).astype(np.float32)

# Label each vector with its row position so search results can be mapped
# straight back into data_embeddings
ids = np.arange(num_elements)

# Cosine-distance index sized to hold exactly the generated vectors
p = hnswlib.Index(space='cosine', dim=dim)
p.init_index(max_elements=num_elements, ef_construction=200, M=16)
p.add_items(data_embeddings, ids)

# ef controls the recall of subsequent queries
p.set_ef(50)

In [5]:
# First-pass search: fetch the nearest neighbours of a simulated query
initial_k = 10  # how many closest elements the first pass returns

# Random vector standing in for a real query embedding
query_embedding = np.random.rand(dim).astype(np.float32)

# knn_query hands back a (labels, distances) pair
initial_hits = p.knn_query(query_embedding, k=initial_k)
initial_labels, distances = initial_hits

for caption, values in (
    ("Initial query results (IDs):", initial_labels),
    ("Initial distances:", distances),
):
    print(caption, values)

Initial query results (IDs): [[701 838  39  35 227 608 386 829  46 143]]
Initial distances: [[0.18405348 0.19224834 0.19564259 0.1965757  0.19673234 0.19941866
  0.20007324 0.20098096 0.2027269  0.20346034]]


In [6]:
# Second-pass search: build a small index over the first-pass hits only and
# query it again with the same query embedding.

REFINED_K = 5    # neighbours wanted from the follow-up query
REFINED_EF = 10  # baseline ef for the follow-up query

# Flatten the label matrix and de-duplicate it, so the same ID is not added
# twice when initial_labels holds results for more than one query.
# np.unique also returns the IDs sorted.
unique_initial_ids = np.unique(initial_labels.flatten())
num_elements_refined = len(unique_initial_ids)  # size of the refined subset

# Rows of the original embedding matrix for the selected IDs (same order as
# unique_initial_ids, so row i belongs to unique_initial_ids[i])
refined_embeddings = data_embeddings[unique_initial_ids]

# Separate cosine-distance index sized for the (much smaller) subset
p_refined = hnswlib.Index(space='cosine', dim=dim)
p_refined.init_index(max_elements=num_elements_refined, ef_construction=100, M=16)

# Reuse the original IDs as labels so the follow-up results still refer to
# rows of data_embeddings
p_refined.add_items(refined_embeddings, unique_initial_ids)

# The subset size is driven by initial_k in the earlier cell, so never ask
# for more neighbours than the refined index actually contains.
refined_k = min(REFINED_K, num_elements_refined)

# hnswlib's guidance is that ef should not be lower than k; keep the
# baseline value but raise it if k ever exceeds it.
p_refined.set_ef(max(REFINED_EF, refined_k))

# Perform the follow-up query against the refined index
labels_refined, distances_refined = p_refined.knn_query(query_embedding, k=refined_k)

print("Follow-up query results (IDs):", labels_refined)


Follow-up query results (IDs): [[701 838  39  35 227]]
