In [None]:
import lancedb
from sentence_transformers import SentenceTransformer

In [None]:
# Name of the sentence-transformers model used to embed text.
SENTS_EMBEDDER_MODEL = "all-mpnet-base-v2"

# Filesystem location of the LanceDB database.
LANCE_DB_LOC = "/code/rag-gain/.lancedb"

# Table to query: the embedder model name followed by a "_384" suffix.
# NOTE(review): the meaning of the "_384" suffix is not visible here — confirm.
LANCE_DB_TABLENAME = f"{SENTS_EMBEDDER_MODEL}_384"

In [None]:
# Connect to the LanceDB database at LANCE_DB_LOC and open the table that
# the `search` helper below queries.
db = lancedb.connect(LANCE_DB_LOC)
table = db.open_table(LANCE_DB_TABLENAME)

### Search Vector DB, similarity search

First, run a coarse vector search that retrieves the 20 vectors closest to the query embedding;

Then rerank using cross-encoder.

In [None]:
# Sentence-embedding model used by `search` to encode queries; created once
# at cell scope and pinned to the CPU.
# NOTE(review): presumably the same model that produced the vectors stored in
# the table — confirm.
query_model = SentenceTransformer(SENTS_EMBEDDER_MODEL, device="cpu")
def search(query, top_k=20):
    """
    Build a vector search over the LanceDB table for a text query.

    The query is embedded with the module-level `query_model` and handed to
    `table.search(...)`, limited to `top_k` hits.

    args:
        query : str - text to embed and search for
        top_k : int - maximum number of hits to request

    return : the limited LanceDB search object (not a DataFrame);
             call `.to_pandas()` on it to materialise the hits as a pd.DataFrame
    """
    embedded_query = query_model.encode(query, convert_to_numpy=True)
    return table.search(embedded_query).limit(top_k)

In [None]:
# Sample queries printed as a hint for the user. The leading " * " and the
# trailing newline are part of the printed text.
query_examples_for_test = [
    " * Rules on artificial intelligence in healthcare\n",
    " * Artificial intelligence in agriculture\n",
    " * Policies in data privacy\n",
    " * AI and labour market\n",
]

print("[EXAMPLES]:\n")
# Dedicated loop variable: the examples must never leak into `query`, which
# later cells read as the user's actual query.
for example in query_examples_for_test:
    print(example)

print("[QUERY]: Enter query to vector DB ->\n")
query = input()  # input() already returns a str
print(f"[USER QUERY]:\n{query}\n")

# Coarse retrieval: fetch up to 20 hits, drop rows without text and renumber
# the rows from 0.
search_results = (
    search(query, top_k=20)
    .to_pandas()
    .dropna(subset=["text"])
    .reset_index(drop=True)
)
print("[Vector DB search]:\n")
for passage in search_results["text"]:
    print(passage)
    

In [None]:
# Repeat the coarse retrieval for the current `query`: up to 20 hits as a
# DataFrame, rows without text dropped, index renumbered from 0.
search_results = (
    search(query, top_k=20)
    .to_pandas()
    .dropna(subset=["text"])
    .reset_index(drop=True)
)

In [None]:
# Position of each hit in the original vector search, 1 = closest.
# `_distance` is a distance (smaller means more similar), so rank ascending;
# method="first" breaks ties by row order so ranks stay unique integers.
search_results["old_similarity_rank"] = (
    search_results["_distance"].rank(method="first", ascending=True).astype(int)
)
search_results

### Rerank
Take the rough search results from the previous step and score the query against each of the top-K results with a heavier cross-encoder (more precise pairwise reranking), then return the top K.


In [None]:
import torch
from sentence_transformers import CrossEncoder


# Cross-encoder checkpoint used to score (query, passage) pairs.
reranker_model_name = "cross-encoder/ms-marco-MiniLM-L-6-v2"

# Use the GPU when torch can see one; otherwise fall back to the CPU so the
# notebook still runs on machines without CUDA.
reranker_device = "cuda" if torch.cuda.is_available() else "cpu"
cross_encoder_model = CrossEncoder(reranker_model_name, device=reranker_device)

def rerank(query, search_results, K: int = 5):
    """
    Re-score vector-search hits against the query with the cross-encoder and
    keep the K best.

    args:
        query : str - query
        search_results : pd.DataFrame - vector search results; must contain a
            "text" column
        K : int - number of reranked results

    return : pd.DataFrame - a new frame with at most K rows, best first, with
        an added "_distance_reranked" column. Despite its name, that column
        holds the sigmoid-activated cross-encoder score, so higher values
        rank first. The input frame is left unmodified.
    """
    query_passage_pairs = [[query, passage] for passage in search_results["text"]]

    if not query_passage_pairs:
        # Nothing to score: skip the model call (no empty batch is sent to
        # predict) but keep the output schema stable for callers.
        return search_results.assign(_distance_reranked=[]).astype(
            {"_distance_reranked": float}
        )

    scores = cross_encoder_model.predict(
        query_passage_pairs, activation_fct=torch.nn.Sigmoid()
    )
    # assign() returns a new frame, so the caller's DataFrame is not mutated.
    reranked = search_results.assign(_distance_reranked=scores)
    return reranked.sort_values("_distance_reranked", ascending=False).head(K)

# Rerank the coarse hits for the current query (default K) and print the
# text of each surviving hit, best first.
new_df = rerank(query, search_results)

print("[Reranked Vector DB search]:\n")

for passage in new_df["text"]:
    print(passage)