# Serverless vector-db pipeline

## Initializations

Import dependencies.

In [1]:
import argparse
import numpy as np
from vectordb.benchmarks import calculate_mult_recall
from vectordb.serverless_vectordb import ServerlessVectorDB
import csv
import json
from lithops import Storage
from io import StringIO
import time
import boto3
from pathlib import Path

Initialize the serverless vector DB.

In [None]:
# Handle to the serverless vector database used by the cells below.
sv_vectordb = ServerlessVectorDB(
    # --- General arguments ---
    dataset="deep_100k",
    features=96,            # dimension of vectors
    k_search=10,            # count of vectors returned per lambda query
    k_result=10,            # count of returned vectors in the query call

    # --- Custom algorithm arguments ---
    num_index=32,           # the number of indices the dataset is split into
    k=512,                  # n_list
    n_probe=32,
    query_batch_size=8,     # number of indices accessed per lambda query

    # --- Storage ---
    storage_bucket="acanadilla-vectordb-datasets",

    # --- Runtime ---
    index_mem=10240,
    search_map_mem=8192,
    search_reduce_mem=2048,
)

# Lithops storage client, used below to download files and to clean up the bucket.
storage = Storage()

## Vector Indexing

In this case, only full-dataset indexing is supported. Indexing vectors individually or in small batches is not available.

Each vector in the dataset must have the following format:
```vector_id,vector```

Where __vector_id__ is an integer, and __vector__ is a space-separated list of floats (v1 v2 v3 v4 v5 ...).

__Example__: 0,-0.13469987 0.10494248 0.034127206 -0.07105535 0.051401354 0.013269722 -0.08894723 0.07330574 ...

__Important__: the indexed dataset is located in the bucket at the path: `indexes/{dataset}/blocks/{num_index}/`

In [None]:
times = sv_vectordb.indexing(
    "vectors_deep_100k.csv",  # Name of file with vectors in the bucket
    128,  # NOTE(review): meaning of this positional argument is not documented here — confirm against ServerlessVectorDB.indexing
)

## Querying

Both batch and individual queries are supported. However, individual queries are less efficient, so __batch querying is recommended__ - the larger the batch, the better the performance.

Each query returns a list of `vector_id` values of size `k_result`. The __distance__ for each vector_id is __not returned__.

Each query must have the following format:
```vector```

Where __vector__ is a space-separated list of floats (v1 v2 v3 v4 v5 ...).

__Example__: 0.14236236 -0.06880325 -0.12708192 0.0344065 -0.061641872 0.0716707 -0.024206704 -0.026985524 -0.021729523 ...

***
Load queries from a file located in the bucket.

In [4]:
QUERIES_FILE = "queries_deep_100k.csv"

# Fetch the query file from the bucket into the working directory.
storage.download_file(sv_vectordb.params.storage_bucket, QUERIES_FILE, 'queries.csv')

# Parse one query vector per CSV row: the first field of the row holds the
# space-separated floats.
query_vectors = []
with open("queries.csv", "r") as f:
    for row in csv.reader(f):
        if not row:
            # csv.reader yields [] for a blank line; indexing row[0] there
            # would raise IndexError, so blank lines are skipped.
            continue
        # Consecutive spaces produce empty tokens, which are dropped before parsing.
        query_vectors.append([float(token) for token in row[0].split(" ") if token != ''])

***
Group queries in different batches.

In [5]:
def divide_chunks(l, n):
    """Lazily yield consecutive slices of ``l`` holding at most ``n`` items each.

    The final slice is shorter when ``len(l)`` is not a multiple of ``n``.
    """
    starts = range(0, len(l), n)
    yield from (l[start:start + n] for start in starts)

# Number of query vectors grouped into each batch.
BLOCK_QUERY_SIZE = 100

# Materialise the chunk generator into a list of batches.
mult_query = [*divide_chunks(query_vectors, BLOCK_QUERY_SIZE)]

***
Carry out batch queries.

In [None]:
# How many batches of mult_query to execute. Increase it (up to
# len(mult_query)) to run more of the prepared batches.
MAX_QUERY_BATCHES = 1  # Only one batch

calculated_neighbors = []
# enumerate() supplies the batch number and the slice bounds the run, so no
# hand-maintained counter or mid-loop break is needed.
for i, query in enumerate(mult_query[:MAX_QUERY_BATCHES]):
    # search() returns a pair; the second element (querying_times) is not
    # used further in this notebook.
    smart_neighbors, querying_times = sv_vectordb.search(i, np.array(query))
    calculated_neighbors.append(smart_neighbors)
    print(f'Query block {i}: {smart_neighbors}')

## Query recall

Estimation of the average recall for the executed queries. To calculate it, a file containing the true neighbors for each query is used.

***
Load the true neighbors from a file located in the bucket.

Each line has the following format:
```vector_ids```

Where __vector_ids__ is a comma-separated list of vector_id (123,345,10,4576,...). Therefore, it is necessary to transform the data into the correct format.

In [7]:
TRUE_FILE = "true_neighbours_deep_100k.csv"     # This file only has the true neighbors of the first 1000 queries

# Fetch the ground-truth file from the bucket into the working directory.
storage.download_file(sv_vectordb.params.storage_bucket, TRUE_FILE, 'true_neighbors.csv')

# One CSV row per query: every non-empty field is parsed as an integer vector_id.
with open("true_neighbors.csv", "r") as f:
    true = [
        [int(field) for field in row if field != '']
        for row in csv.reader(f)
    ]

***
Calculate the recall of the queries.

In [8]:
def calculate_mean_mult(data):
    """Return the mean of all values contained in a collection of groups.

    Args:
        data: iterable of sized iterables of numbers (e.g. one list of
            per-query recall values per executed batch).

    Returns:
        The sum of every value divided by the total number of values, or
        ``float("nan")`` when there are no values at all (``data`` is empty
        or every group is empty) instead of raising ZeroDivisionError.
    """
    total = 0
    count = 0
    for group in data:
        count += len(group)
        # Accumulate value by value, in order, into a single running total.
        for value in group:
            total += value

    if count == 0:
        # Nothing to average: report "undefined" rather than dividing by zero.
        return float("nan")

    return total / count

# Score each executed batch against the slice of ground-truth rows that
# covers the same query positions.
recalls = [
    calculate_mult_recall(
        true[batch_no * BLOCK_QUERY_SIZE:(batch_no + 1) * BLOCK_QUERY_SIZE],
        batch_neighbors,
    )
    for batch_no, batch_neighbors in enumerate(calculated_neighbors)
]

# Average over every value returned for every batch.
recall_mean = calculate_mean_mult(recalls)
print(f'Average recall: {recall_mean}%')

Average recall: 99.5%


## Clean environment

Delete indexed vectors (dataset)

In [9]:
# The trailing "/" limits the listing to this exact index directory. Without it
# the prefix for num_index=32 would also match sibling key paths such as
# ".../320/..." and the delete below would remove them too.
prefix = f'indexes/{sv_vectordb.params.dataset}/{sv_vectordb.params.implementation}/{sv_vectordb.params.num_index}/'
keys = storage.list_keys(sv_vectordb.params.storage_bucket, prefix=prefix)

# Remove every object found under the prefix from the bucket.
response = storage.delete_objects(sv_vectordb.params.storage_bucket, keys)

Delete local temporary files

In [10]:
# Local copies downloaded by the query and recall cells.
files = [Path("queries.csv"), Path("true_neighbors.csv")]

for tmp_path in files:
    if not tmp_path.exists():
        print(f"File not exist: {tmp_path}")
        continue
    tmp_path.unlink()
    print(f"Deleted: {tmp_path}")

Deleted: queries.csv
Deleted: true_neighbors.csv
