### Ingesting data into index

In [1]:
import os
import numpy as np
import faiss
import glob
from tqdm import tqdm

In [3]:
# Build a flat L2 Faiss index from the chunked embedding files on disk and
# persist it to `index_path`.

# Directory setup
embeddings_dir = "embeddings"
index_directory = "faiss_indices"
index_file = "PM_index.index"
index_path = os.path.join(index_directory, index_file)

# Ensure the index directory exists (no-op if it is already there)
os.makedirs(index_directory, exist_ok=True)

# Dimensions of the embeddings
d = 768

# Initialize the Faiss index (Flat L2-Index)
index = faiss.IndexFlatL2(d)

# Load file names of embeddings and IDs.
# sorted() orders names lexicographically (e.g. "_10" sorts before "_2"); both
# lists share the same suffix scheme, so the pairing stays aligned, and the
# suffix check inside the loop verifies that explicitly.
embeddings_prefix = 'embeddings_'
ids_prefix = 'pubmed_ids_'
embeddings_files = sorted(glob.glob(os.path.join(embeddings_dir, embeddings_prefix + '*.npy')))
id_files = sorted(glob.glob(os.path.join(embeddings_dir, ids_prefix + '*.npy')))

# Refuse to write an empty index when the input directory is wrong or empty
assert embeddings_files, f"No embedding files found in: {embeddings_dir}"

# Check if the number of embeddings and ID files match
assert len(embeddings_files) == len(id_files), "The number of embeddings and ID files must match"

# Iterate through the files and add them to the index.
# `total` is passed because zip() has no length, so tqdm could not otherwise
# render a progress bar or an ETA.
for emb_file, id_file in tqdm(zip(embeddings_files, id_files),
                              total=len(embeddings_files),
                              desc="Processing files"):
    # The two files of a pair must carry the same chunk suffix
    emb_suffix = os.path.basename(emb_file)[len(embeddings_prefix):]
    id_suffix = os.path.basename(id_file)[len(ids_prefix):]
    assert emb_suffix == id_suffix, f"Mismatched file pair: {emb_file} / {id_file}"

    # Load embeddings and IDs
    embeddings = np.load(emb_file).astype('float32')
    ids = np.load(id_file).astype('int64')

    # Ensure the embeddings are a (n, d) matrix matching the index dimension
    assert embeddings.ndim == 2 and embeddings.shape[1] == d, \
        f"Expected embeddings of shape (n, {d}) in {emb_file}, got {embeddings.shape}"

    # Ensure the number of embeddings matches the number of IDs
    assert embeddings.shape[0] == len(ids), "The number of embeddings and IDs in the files must match"

    # Add to the index.
    # NOTE(review): `ids` is only used for the length check above; the IDs are
    # not stored in the index, so search results are positions in insertion
    # order. Confirm the consumer maps those positions back to PubMed IDs.
    index.add(embeddings)

# Write the index to a file
faiss.write_index(index, index_path)

print(f"Index successfully written to: {index_path}")

Processing files: 10it [04:05, 24.59s/it]


Index successfully written to: faiss_indices/PM_index.index


In [4]:
# Load the serialized Faiss index from disk
saved_index_path = 'faiss_indices/PM_index.index'
index = faiss.read_index(saved_index_path)

In [None]:
k = 10  # Number of nearest neighbors

# Faiss expects the queries as a 2-D float32 array of shape (n_queries, d),
# not a flat Python list, so build a single-row matrix whose width matches
# the dimension of the loaded index.
query = np.random.rand(1, index.d).astype('float32')

# Both results have shape (n_queries, k)
distances, indices = index.search(query, k)

In [1]:
import requests
import numpy as np
import json

# Smoke-test the search endpoint with one random query vector.

# URL of the Flask endpoint
url = 'http://localhost:5000/search'

# Generate a random vector of length 768
random_vector = np.random.rand(768).tolist()  # Convert numpy array directly to list

# Data for the POST request
data = {
    'queries': [random_vector]  # Ensure this is a list of lists
}

# Send the POST request. `json=` serializes the payload and sets the
# Content-Type header; the timeout (seconds) keeps the cell from hanging
# forever when the server is down or stuck.
response = requests.post(url, json=data, timeout=30)

# Output the response
print('Status Code:', response.status_code)
try:
    print('Response:', response.json())
except ValueError:
    # Error pages are often not JSON; show the raw body instead of a traceback
    print('Response (not JSON):', response.text)

Status Code: 200
Response: {'distances': [[348.77490234375, 348.9889221191406, 349.5247497558594, 349.7203369140625, 349.90228271484375, 349.9190979003906, 350.23382568359375, 350.36578369140625, 350.47930908203125, 350.5979309082031]], 'indices': [[470115, 1932016, 473742, 469270, 1405245, 670332, 1715754, 2382674, 1707872, 2141577]]}
