# Installation

In [1]:
!pip install hnswlib
!pip install "pymongo[srv]"

Collecting hnswlib
  Downloading hnswlib-0.8.0.tar.gz (36 kB)
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: hnswlib
  Building wheel for hnswlib (pyproject.toml) ... [?25l[?25hdone
  Created wheel for hnswlib: filename=hnswlib-0.8.0-cp311-cp311-linux_x86_64.whl size=2383978 sha256=c8594c09da712481783794864a13f2d0d81b6dfe8c9ad9a733fcea76b18e613b
  Stored in directory: /root/.cache/pip/wheels/ea/4e/27/39aebca9958719776e36fada290845a7ef10f053ad70e22ceb
Successfully built hnswlib
Installing collected packages: hnswlib
Successfully installed hnswlib-0.8.0
Collecting pymongo[srv]
  Downloading pymongo-4.13.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (22 kB)
[0mCollecting dnspython<3.0.0,>=1.16.0 (from pymongo[srv])
  Downloading dnspython-2.7.0-py3-none-any.whl.metadata (5.8 kB)
Downloading dnspython-

In [2]:
import hnswlib
import numpy as np
from sklearn.preprocessing import normalize
from numpy import dot
from numpy.linalg import norm
from sklearn.metrics.pairwise import cosine_similarity

# Testing with dummy values

In [None]:
# generate dummy users based on parameter template
def generate_user_vectors(num_users: int) -> np.ndarray:
    """Generate random, L2-normalised 20-dimensional dummy user vectors.

    Every vector is the concatenation of three segments, drawn in this order:
      * 5 "about" values, uniform in [0, 1) and scaled by 2 (up-weighted),
      * 8 binary "interests" flags (0.0 or 1.0),
      * 7 "personality" values, uniform in [0, 1).

    Args:
        num_users: Number of vectors to generate; must be >= 0.

    Returns:
        A float32 array of shape ``(num_users, 20)`` whose rows have been
        L2-normalised with ``sklearn.preprocessing.normalize``. For
        ``num_users == 0`` an empty ``(0, 20)`` array is returned.

    Raises:
        ValueError: If ``num_users`` is negative.
    """
    if num_users < 0:
        raise ValueError(f"num_users must be non-negative, got {num_users}")
    if num_users == 0:
        # np.array([]) would be 1-D, which normalize() cannot process; return a
        # correctly shaped empty matrix instead.
        return np.empty((0, 20), dtype=np.float32)

    vectors = []
    for _ in range(num_users):
        about = np.random.rand(5) * 2   # Upweight if needed
        interests = np.random.randint(0, 2, size=8) * 1.0
        personality = np.random.rand(7)
        vectors.append(np.concatenate([about, interests, personality]))

    vectors = np.array(vectors, dtype=np.float32)
    return normalize(vectors, norm='l2')  # Only if using cosine space


# Vector dimensionality: 5 about + 8 interests + 7 personality values,
# matching what generate_user_vectors produces.
n_dim = 20
# Number of dummy users generated and indexed below.
n_indexes = 5000
# Passed to init_index as max_elements (the capacity reserved for the index).
max_capacity = 1000000

user_vectors = generate_user_vectors(n_indexes)
# Labels are 0..n_indexes-1, so a label doubles as a row position in user_vectors.
user_ids = np.arange(n_indexes)

# create the hnsw index
index = hnswlib.Index(space='cosine', dim = n_dim)
index.init_index(max_elements=max_capacity, ef_construction=350, M=32)
index.add_items(user_vectors, user_ids)

# setting the index efficiency
# (ef is the query-time search parameter applied to subsequent knn_query calls)
index.set_ef(100)

[0.09613387 0.288657   0.2008019  0.5224634  0.42147306 0.27466187
 0.27466187 0.         0.         0.27466187 0.27466187 0.
 0.         0.21690525 0.00996336 0.04336625 0.10323887 0.07502989
 0.22226481 0.00572302]


Testing with a dummy user

In [None]:
def calculate_match_percentage(vec1, vec2):
    """Return the cosine similarity of two 1-D vectors expressed as a percentage.

    Both arguments must provide ``reshape`` (i.e. be NumPy arrays). The result
    is ``cosine_similarity * 100``.
    """
    # cosine_similarity operates on matrices, so present each vector as a
    # single-row matrix and read the only entry of the 1x1 result.
    row_a = vec1.reshape(1, -1)
    row_b = vec2.reshape(1, -1)
    similarity_matrix = cosine_similarity(row_a, row_b)
    return similarity_matrix[0][0] * 100

def interactive_loop():
    """Run a console REPL against the module-level HNSW ``index``.

    Commands (case-insensitive, surrounding whitespace is ignored):
      * ``add``    - generate N random users with ``generate_user_vectors`` and
                     add them under the next sequential IDs.
      * ``delete`` - mark a single user ID as deleted in the index.
      * ``query``  - read an ``n_dim``-dimensional vector and print its k
                     nearest neighbours with distance and match percentage.
      * ``count``  - print ``index.get_current_count()``.
      * ``exit``   - leave the loop.

    Any exception raised while handling a command is printed and the loop
    continues. Relies on the globals ``index``, ``n_dim``,
    ``generate_user_vectors`` and ``calculate_match_percentage``.
    """
    while True:
        cmd = input("\nEnter command (add / delete / query / count / exit): ").strip().lower()

        if cmd == "add":
            try:
                num_to_add = int(input("How many users to add? "))
                if num_to_add <= 0:
                    print("Number of users to add must be a positive integer.")
                    continue
                new_vectors = generate_user_vectors(num_to_add)
                # New IDs continue from the current element count, which keeps
                # them contiguous with the 0..N-1 labels used at index creation.
                start_id = index.get_current_count()
                new_ids = np.arange(start_id, start_id + num_to_add)
                index.add_items(new_vectors, new_ids)
                print(f"Added {num_to_add} users with IDs from {start_id} to {start_id + num_to_add - 1}")
            except Exception as e:
                print("Error adding users:", e)

        elif cmd == "delete":
            try:
                del_id = int(input("Enter user ID to delete: "))
                index.mark_deleted(del_id)
                print(f"Marked user ID {del_id} as deleted.")
            except Exception as e:
                print("Error deleting user:", e)

        elif cmd == "query":
            try:
                print(f"Enter {n_dim}-dimensional vector values separated by spaces:")
                vec_str = input()
                vec = np.array([float(x) for x in vec_str.split()], dtype=np.float32)
                if len(vec) != n_dim:
                    print(f"Vector must be of dimension {n_dim}.")
                    continue
                k = int(input("How many nearest neighbors to find? "))
                labels, distances = index.knn_query(vec, k=k)
                print(f"Top {k} matching user IDs, distances, and cosine similarity:")
                for rank, (label, dist) in enumerate(zip(labels[0], distances[0]), start=1):
                    # Read the stored vector back from the index by label.
                    # Users added through the "add" command are never appended
                    # to the global user_vectors array, so indexing that array
                    # by label would fail for them.
                    matched_vector = np.asarray(
                        index.get_items([int(label)])[0], dtype=np.float32
                    )
                    percentage = calculate_match_percentage(vec, matched_vector)
                    print(f"{rank}. ID: {label}, Distance: {dist:.4f}, Match %: {percentage:.2f}%")

            except Exception as e:
                print("Error querying index:", e)

        elif cmd == "count":
            print(f"Current number of elements in index (including deleted): {index.get_current_count()}")

        elif cmd == "exit":
            print("Exiting interactive session.")
            break

        else:
            print("Unknown command. Please enter add / delete / query / count / exit.")

# Run the interactive session (blocks on input() until the user enters "exit")
interactive_loop()


# Testing with dummy values in database

Import libraries

In [3]:
import os
from getpass import getpass
from urllib.parse import quote_plus

from pymongo.mongo_client import MongoClient
from pymongo.server_api import ServerApi

Connect to database

In [4]:
# SECURITY: credentials must never be committed in the notebook. They are read
# from the environment, falling back to an interactive prompt (getpass does not
# echo the password). Any password that was ever stored in this cell should be
# treated as compromised and rotated.
username = os.environ.get("MONGODB_USERNAME") or input("MongoDB username: ")
password = os.environ.get("MONGODB_PASSWORD") or getpass("MongoDB password: ")

# Percent-encode both parts so characters such as '@' or ':' cannot break the
# userinfo section of the connection string.
username_enc = quote_plus(username)
password_enc = quote_plus(password)

uri = f"mongodb+srv://{username_enc}:{password_enc}@main.hf8tqhh.mongodb.net/?retryWrites=true&w=majority&appName=main"

In [5]:
# Create a new client and connect to the server
client = MongoClient(uri, server_api=ServerApi('1'))
# Send a ping to confirm a successful connection
try:
    client.admin.command('ping')
    print("Pinged your deployment. You successfully connected to MongoDB!")
except Exception as e:
    # The error is only printed, not re-raised, so execution continues past
    # this cell even when the connection check failed.
    print(e)

Pinged your deployment. You successfully connected to MongoDB!


Insert dummy users

In [6]:
# Handles to the database and collection used by the insert/fetch cells below.
db = client["hnsw_db"]
collection = db["users"]

In [None]:
def generate_vector():
    """Build one random 20-element user vector as a plain Python list.

    Layout of the result:
      * positions 0-4   ("About"):       floats drawn uniformly from [0, 1)
      * positions 5-12  ("Interests"):   binary flags, stored as 0.0 / 1.0
      * positions 13-19 ("Personality"): floats drawn uniformly from [0, 1)

    A list is returned (rather than an ndarray) so the value can be stored in
    a MongoDB document.
    """
    about_part = np.random.rand(5)
    interest_flags = np.random.randint(0, 2, size=8)
    personality_part = np.random.rand(7)
    segments = (about_part, interest_flags, personality_part)
    return np.concatenate(segments).tolist()

def insert_dummy_users(n=100):
    """Insert ``n`` dummy user documents into the module-level ``collection``.

    Each document has the form ``{"user_id": <1..n>, "vector": generate_vector()}``
    and all documents are written with a single ``insert_many`` call.

    Args:
        n: Number of users to insert. Values <= 0 insert nothing.

    Note:
        This function is not idempotent: every call submits ``user_id`` values
        1..n again, so re-running the cell issues a second batch with the same
        IDs. NOTE(review): whether that yields duplicates or an error depends
        on the collection's indexes - confirm before re-running.
    """
    if n <= 0:
        # insert_many() rejects an empty document list, so skip the call.
        print("Inserted 0 dummy users.")
        return

    documents = [
        {"user_id": user_id, "vector": generate_vector()}
        for user_id in range(1, n + 1)
    ]

    result = collection.insert_many(documents)
    print(f"Inserted {len(result.inserted_ids)} dummy users.")

# Writes 10000 documents to the database each time this cell is executed.
insert_dummy_users(10000)

Inserted 10000 dummy users.


Fetch the users and load them into the index

In [8]:
def fetch_all_users():
    """Load every document from the module-level ``collection``.

    Returns:
        A tuple ``(ids, vectors)`` where ``ids`` is an array of the documents'
        ``user_id`` values and ``vectors`` is a float32 array built from their
        ``vector`` fields, both in the order the documents were returned.
    """
    # Materialise the full result first, then convert each document; the
    # user_id is read before the vector is parsed.
    records = [
        (doc['user_id'], np.array([float(value) for value in doc['vector']], dtype=np.float32))
        for doc in list(collection.find({}))
    ]
    ids = [user_id for user_id, _ in records]
    rows = [row for _, row in records]
    return np.array(ids), np.array(rows, dtype=np.float32)

# NOTE: this cell rebinds user_ids, user_vectors, n_dim, index and max_capacity,
# replacing the values created in the earlier dummy-data section.
user_ids, user_vectors = fetch_all_users()
# Dimensionality is taken from the fetched data; this requires user_vectors to
# be 2-D, i.e. at least one user must have been fetched.
n_dim = user_vectors.shape[1]

index = hnswlib.Index(space='cosine', dim=n_dim)
max_capacity = 10**6  # or higher if you expect more users

index.init_index(max_elements=max_capacity, ef_construction=200, M=16)
# The user_id values from the database are used directly as index labels.
index.add_items(user_vectors, user_ids)

index.set_ef(100)  # ef query parameter

In [22]:
def interactive_loop():
    """Run a console REPL against the module-level HNSW ``index``.

    Commands (case-insensitive, surrounding whitespace is ignored):
      * ``add``    - read an ``n_dim``-dimensional vector and an integer ID and
                     add that single item to the index.
      * ``delete`` - mark a single ID as deleted in the index.
      * ``query``  - read an ``n_dim``-dimensional vector and print the IDs and
                     distances of its k nearest neighbours.
      * ``count``  - print ``index.get_current_count()``.
      * ``exit``   - leave the loop.

    Any exception raised while handling a command is printed and the loop
    continues. Only the in-memory index is modified; nothing is written to the
    database. Relies on the globals ``index`` and ``n_dim``.
    """
    while True:
        cmd = input("\nEnter command (add / delete / query / count / exit): ").strip().lower()

        if cmd == "add":
            try:
                print(f"Enter {n_dim}-dimensional vector values separated by spaces:")
                vec_str = input()
                vec = np.array([float(x) for x in vec_str.split()], dtype=np.float32)
                if len(vec) != n_dim:
                    print(f"Vector must be of dimension {n_dim}. User not added.")
                    continue

                new_user_id = int(input("\nEnter ID: "))

                # add_items expects a 2-D batch, so wrap the single vector.
                # NOTE(review): behaviour when the ID already exists in the
                # index is decided by hnswlib - confirm it is the one wanted.
                index.add_items(vec.reshape(1, -1), np.array([new_user_id]))
                # Confirm success, as the delete and query commands do.
                print(f"\nUser ID {new_user_id} added to the index")
            except Exception as e:
                print("Error adding users:", e)

        elif cmd == "delete":
            try:
                id_to_delete = int(input("\nEnter the ID: "))
                index.mark_deleted(id_to_delete)
                print(f"\nUser ID {id_to_delete} marked as inactive")
            except Exception as e:
                print("Error deleting user:", e)

        elif cmd == "query":
            try:
                print(f"Enter {n_dim}-dimensional vector values separated by spaces:")
                vec_str = input()
                vec = np.array([float(x) for x in vec_str.split()], dtype=np.float32)
                if len(vec) != n_dim:
                    print(f"Vector must be of dimension {n_dim}.")
                    continue

                k = int(input("How many nearest neighbors to find? "))
                labels, distances = index.knn_query(vec, k=k)

                print(f"Top {k} matching user IDs, distances:")
                for rank, (label, dist) in enumerate(zip(labels[0], distances[0]), start=1):
                    print(f"{rank}. ID: {label}, Distance: {dist:.4f}")

            except Exception as e:
                print("Error querying index:", e)

        elif cmd == "count":
            print(f"Current number of elements in index (including deleted): {index.get_current_count()}")

        elif cmd == "exit":
            print("Exiting interactive session.")
            break

        else:
            print("Unknown command. Please enter add / delete / query / count / exit.")

# Start the interactive session (blocks on input() until the user enters "exit")
interactive_loop()


Enter command (add / delete / query / count / exit): count
Current number of elements in index (including deleted): 21

Enter command (add / delete / query / count / exit): delete

Enter the ID:1234

User ID 1234 marked as inactive

Enter command (add / delete / query / count / exit): query
Enter 20-dimensional vector values separated by spaces:
0.39 0.10 0.74 0.32 0.58 0 0 0 1 1 1 0 1 0 0.24 0.66 0.82 0.45 0.35 0.12  
How many nearest neighbors to find? 5
Top 5 matching user IDs, distances:
1. ID: 14, Distance: 0.0981
2. ID: 12, Distance: 0.1793
3. ID: 8, Distance: 0.2310
4. ID: 16, Distance: 0.2313
5. ID: 18, Distance: 0.3204

Enter command (add / delete / query / count / exit): add
Enter 20-dimensional vector values separated by spaces:
0.39 0.10 0.74 0.32 0.58 0 0 0 1 1 1 0 1 0 0.24 0.66 0.82 0.45 0.35 0.12  

Enter ID:1234

Enter command (add / delete / query / count / exit): count
Current number of elements in index (including deleted): 21

Enter command (add / delete / query / 

In [13]:
# Scratch check of list.index(): looks up the position of 7 and reads the
# element back from that position. Does not use anything defined elsewhere in
# the notebook.
arr = [1, 4, 2, 7]
print(arr[arr.index(7)])

7
