# Build and Search on Large Dataset with f16 Memory Optimization

This notebook demonstrates:
1. Loading a large f32 dense dataset in batches using mmap (memory mapping) to avoid loading it all into memory at once.
2. Converting to f16/u16 format (halves the used memory).
3. Building a dense f16 HNSW index.
4. Single query search and batch search.

In [None]:
import numpy as np
import os
from kannolo import DensePlainHNSWf16

## Step 1: Dataset Configuration

In [None]:
# Dataset configuration
data_folder = ""  # Path to your dataset folder
# 2D array of f32 of shape (num_vectors, dim)
dataset_path = os.path.join(data_folder, "dataset.npy")
# 2D array of f32 of shape (num_queries, dim)
queries_path = os.path.join(data_folder, "queries.npy")

# Memory-map the dataset just to read its shape, without pulling the
# vectors into memory.
dataset_mmap_info = np.load(dataset_path, mmap_mode='r')
# The shape is unpacked into (num_vectors, dim) below, so the file must hold
# a 2D array; fail early with a clear message otherwise.
if dataset_mmap_info.ndim != 2:
    raise ValueError(
        "Expected a 2D dataset of shape (num_vectors, dim), "
        f"got shape {dataset_mmap_info.shape}"
    )
num_vectors, dim = dataset_mmap_info.shape
del dataset_mmap_info

# Same for the queries: only the number of rows is needed here.
queries_info = np.load(queries_path, mmap_mode='r')
num_queries = queries_info.shape[0]
del queries_info

print(f"Dataset: {num_vectors:,} vectors of dimension {dim}")
print(f"Queries: {num_queries:,} vectors")
print(f"Dataset path: {dataset_path}")
print(f"Queries path: {queries_path}")

## Step 2: Load Dataset in Batches and Convert to f16/u16

In [None]:
# Open the f32 dataset as a read-only memory map so that only the slices
# touched below are read from disk.
dataset_mmap = np.load(dataset_path, mmap_mode='r')
print(f"Memory-mapped dataset shape: {dataset_mmap.shape}")

# Batching parameters for the f32 -> f16 -> u16 conversion
batch_size = 10000
num_batches = (num_vectors + batch_size - 1) // batch_size

# Destination buffer holding the f16 bit patterns of the whole dataset
dataset_u16 = np.empty((num_vectors, dim), dtype=np.uint16)

print(f"Processing {num_batches} batches of size {batch_size}")

for batch_no, lo in enumerate(range(0, num_vectors, batch_size), start=1):
    hi = min(lo + batch_size, num_vectors)

    # Cast the f32 rows to f16, then reinterpret the same bytes as u16
    # (view() changes the dtype without converting the values).
    dataset_u16[lo:hi] = dataset_mmap[lo:hi].astype(np.float16).view(np.uint16)

    # Report progress every second batch and on the final one
    if batch_no % 2 == 0 or batch_no == num_batches:
        print(f"Processed batch {batch_no}/{num_batches}")

f32_bytes = dataset_mmap.nbytes
u16_bytes = dataset_u16.nbytes
print(f"\nOriginal f32 dataset: {f32_bytes / 1024 / 1024:.1f} MB")
print(f"Converted u16 dataset: {u16_bytes / 1024 / 1024:.1f} MB")
print(f"Memory savings: {((f32_bytes - u16_bytes) / f32_bytes) * 100:.1f}%")

# Drop the reference to the memory map now that the conversion is done
del dataset_mmap

## Step 3: Build Dense f16 HNSW Index

In [None]:
# Build the f16 index from u16 data
print("Building HNSW index from f16 data...")

# The index builder takes a flat 1D array. ravel() returns a view when the
# array is contiguous (dataset_u16 is freshly allocated, so it is), whereas
# flatten() always makes a full copy and would temporarily double the memory
# used by the dataset.
data_flat = dataset_u16.ravel()

# Build index
index_f16 = DensePlainHNSWf16.build_from_array(
    data_flat, # Flattened u16 data array
    dim, # Dimension of the vectors
    m=32, # Number of neighbors per node (doubled at ground layer). Higher values improve accuracy but increase memory and build time.
    ef_construction=200, # Controls index quality/speed trade-off during construction. Higher values improve accuracy but increase build time and memory usage.
    metric="ip" # "l2" for Euclidean, "ip" for Inner Product 
)

print("✅ HNSW f16 index built successfully")
print(f"Index contains {num_vectors} vectors")

## Step 4: Load Queries and Perform Single Search

In [None]:
# Search parameters (also reused by the batch search cell)
k = 10  # Number of results to retrieve
ef_search = 100  # Accuracy/speed trade-off during search: higher values improve accuracy but increase search time.

# Read the full query matrix into memory
queries_f32 = np.load(queries_path)
print(f"Loaded {queries_f32.shape[0]} queries of dimension {queries_f32.shape[1]}")

# Pick one query row to search with. The index is an arbitrary fixed choice
# and must be smaller than the number of loaded queries.
query_id = 15
single_query = queries_f32[query_id]

print(f"\nPerforming single search for query ID {query_id}")

# Run the search for that single query vector
distances, ids = index_f16.search(single_query, k, ef_search)

print(f"\nSingle query results (k={k}):")
print(f"Distances: {distances}")
print(f"Document IDs: {ids}")

## Step 5: Batch Search with All Queries

In [None]:
# Run every query through the index in one call
print(f"Performing batch search with {queries_f32.shape[0]} queries")

batch_distances, batch_ids = index_f16.search_batch(queries_path, k, ef_search)

print(f"Result shape - distances: {batch_distances.shape}, ids: {batch_ids.shape}")

# Compare the batch results for the query used in the single search above.
# NOTE(review): the slice assumes batch_ids is flat with k consecutive
# entries per query - confirm against the search_batch return layout.
offset = k * query_id
batch_results_for_single_query = batch_ids[offset:offset + k]

ids_match = np.array_equal(ids, batch_results_for_single_query)

print(f"\nConsistency check for query {query_id}:")
print(f"IDs match: {ids_match}")

print(
    "✅ Single and batch search results are consistent!"
    if ids_match
    else "❌ Results differ between single and batch search"
)