{DSPy.RM Migration - TBD}

In [None]:
# Querying ColBERTv2

import requests
import os
from typing import Any, Dict, List, Optional, Union
from dspy import RM, Retrieve, Embedder
from dspy.primitives.prediction import Prediction

def colbert_search_function(query: str, k: int, url: str, post_requests: bool = False) -> List[Dict[str, Any]]:
    """Fetch the top-``k`` documents for ``query`` from a ColBERTv2 endpoint.

    Args:
        query: Text sent to the server under the ``"query"`` field.
        k: Number of results requested; also the cap applied to the
            returned ``"topk"`` list.
        url: Endpoint the request is sent to.
        post_requests: When true, the query is sent as a JSON POST body;
            otherwise it is sent as GET query parameters.

    Returns:
        At most ``k`` dicts taken from the response's ``"topk"`` list, each
        copied and extended with a ``"long_text"`` key holding the document's
        ``"text"`` value (or ``""`` when ``"text"`` is absent).

    Raises:
        Whatever ``raise_for_status`` raises for an unsuccessful response.
    """
    # Both transports carry the same two fields; only the encoding differs.
    request_body = {"query": query, "k": k}

    if not post_requests:
        response = requests.get(url, params=request_body, timeout=10)
    else:
        response = requests.post(
            url,
            json=request_body,
            headers={"Content-Type": "application/json; charset=utf-8"},
            timeout=10,
        )

    response.raise_for_status()

    enriched = []
    for hit in response.json()["topk"][:k]:
        # Copy so the decoded response entries are not modified in place.
        record = dict(hit)
        record["long_text"] = hit.get("text", "")
        enriched.append(record)
    return enriched

def colbert_result_formatter(results: List[Dict[str, Any]]) -> Prediction:
    """Package the ``"long_text"`` of each retrieved document as passages.

    Args:
        results: Documents that each carry a ``"long_text"`` key.

    Returns:
        A ``Prediction`` whose ``passages`` is the list of ``"long_text"``
        values, in the same order as ``results``.
    """
    texts = []
    for hit in results:
        texts.append(hit["long_text"])
    return Prediction(passages=texts)

# Address of the ColBERTv2 endpoint handed to the retriever below.
colbert_url = "http://20.102.90.50:2017/wiki17_abstracts"

# Assemble the retriever from the search and formatting callables defined above.
# NOTE(review): url and post_requests match parameters of colbert_search_function,
# so RM presumably forwards extra keyword arguments to search_function on each
# call — confirm against the RM implementation.
colbert_rm = RM(
    search_function=colbert_search_function,
    result_formatter=colbert_result_formatter,
    url=colbert_url,
    post_requests=False
)

# Run a single example query with k=10 and print the resulting passages.
retrieve = Retrieve(rm=colbert_rm, k=10)
query_text = "Example query text"
results = retrieve(query_text)
print(results.passages)

In [None]:
# Querying Databricks Mosaic AI Vector Search

# Client setup: the token and workspace host are read from the environment.
# os.environ.get returns None for an unset variable, so a missing
# DATABRICKS_TOKEN or DATABRICKS_HOST is passed to WorkspaceClient as None.
# NOTE(review): WorkspaceClient is not imported in any cell shown here, so this
# cell raises NameError on a fresh kernel — presumably it needs
# `from databricks.sdk import WorkspaceClient`; confirm and add the import.
databricks_token = os.environ.get("DATABRICKS_TOKEN")
databricks_endpoint = os.environ.get("DATABRICKS_HOST")
databricks_client = WorkspaceClient(host=databricks_endpoint, token=databricks_token)

#custom logic for querying and sorting the docs
def databricks_search_function(
    query,
    k,
    index_name,
    columns,
    query_type='ANN',
    filters_json=None,
    client=None
):
    """Query a Databricks Vector Search index and return rows sorted by score.

    Args:
        query: Text sent to the index as ``query_text``.
        k: Number of results requested (``num_results``).
        index_name: Name of the vector search index to query.
        columns: Column names to request from the index.
        query_type: Query type forwarded to the index; defaults to ``'ANN'``.
        filters_json: Optional filter expression forwarded unchanged.
        client: Object exposing ``vector_search_indexes.query_index(...)``
            whose result has ``as_dict()``. Required despite the ``None``
            default, which exists only so it can be passed by keyword.

    Returns:
        A list of dicts, one per returned row, mapping each manifest column
        name to the row's value, ordered by ``"score"`` descending. An empty
        list is returned when the response contains no rows.

    Raises:
        ValueError: If ``client`` is ``None``.
        KeyError: If rows are returned without a ``"score"`` column.
    """
    if client is None:
        # Fail with an actionable message instead of an AttributeError on None.
        raise ValueError(
            "databricks_search_function requires a Databricks `client`; got None."
        )

    # The Databricks SDK names this method `query_index`; there is no `query`.
    response = client.vector_search_indexes.query_index(
        index_name=index_name,
        query_type=query_type,
        query_text=query,
        num_results=k,
        columns=columns,
        filters_json=filters_json,
    ).as_dict()

    # A response with no matches may omit "data_array" (or "result") entirely.
    rows = (response.get("result") or {}).get("data_array") or []
    if not rows:
        return []

    col_names = [column["name"] for column in response["manifest"]["columns"]]
    items = [dict(zip(col_names, data_row)) for data_row in rows]
    return sorted(items, key=lambda item: item["score"], reverse=True)

def databricks_result_formatter(results) -> Prediction:
    """Collect the ``'some_text_column'`` value of every row as passages.

    Args:
        results: Rows (dicts) that each contain a ``'some_text_column'`` key.

    Returns:
        A ``Prediction`` whose ``passages`` lists those values in row order.
    """
    text_column = 'some_text_column'
    passages = []
    for row in results:
        passages.append(row[text_column])
    return Prediction(passages=passages)

# Assemble the Databricks retriever. 'some_text_column' is requested here because
# databricks_result_formatter reads that key from every row.
# NOTE(review): databricks_search_function sorts rows by a "score" key that is not
# in `columns` — presumably the service adds it to the response; confirm.
# NOTE(review): client, index_name, columns and filters_json match parameters of
# databricks_search_function, so RM presumably forwards them on each call — confirm.
databricks_rm = RM(
    search_function=databricks_search_function,
    result_formatter=databricks_result_formatter,
    client=databricks_client,
    index_name='your_index_name',
    columns=['id', 'some_text_column'],
    filters_json=None
)

# Run a single example query with k=3 and print the resulting passages.
# This rebinds the notebook-level names `retrieve` and `results`.
retrieve = Retrieve(rm=databricks_rm, k=3)
results = retrieve("Example query text")
print(results.passages)

In [None]:
# Querying Deeplake Vector Store

# NOTE(review): Embedder() is constructed with no arguments — confirm that the
# constructor has usable defaults (e.g. a default embedding model).
embedder = Embedder()

# Open the vector store at the given path, using `embedder` as its embedding function.
# NOTE(review): `deeplake` is not imported in any cell shown here, so this cell
# raises NameError on a fresh kernel — presumably it needs `import deeplake`;
# confirm and add the import.
deeplake_vectorstore_name = 'vectorstore_name'
deeplake_client = deeplake.VectorStore(
    path=deeplake_vectorstore_name,
    embedding_function=embedder
)

def deeplake_search_function(query, k, client=None):
    """Delegate the search to ``client`` and hand back its result untouched.

    Args:
        query: Passed to ``client.search`` as the first positional argument.
        k: Passed to ``client.search`` as the ``k`` keyword argument.
        client: Object exposing ``search(query, k=...)``.

    Returns:
        Whatever ``client.search`` returns.
    """
    return client.search(query, k=k)

def deeplake_result_formatter(results) -> Prediction:
    """Collect the ``'text'`` of every entry under ``results['documents']``.

    Args:
        results: Mapping with a ``'documents'`` entry that iterates over
            items each containing a ``'text'`` key.

    Returns:
        A ``Prediction`` whose ``passages`` lists those texts in order.
    """
    passages = []
    for entry in results['documents']:
        passages.append(entry['text'])
    return Prediction(passages=passages)


# Assemble the Deep Lake retriever. The same `embedder` object is given both to RM
# here and to the vector store as its embedding_function.
# NOTE(review): deeplake_result_formatter reads results['documents'][i]['text'];
# confirm that this matches the structure client.search actually returns.
deeplake_rm = RM(
    embedder=embedder,
    search_function=deeplake_search_function,
    result_formatter=deeplake_result_formatter,
    client=deeplake_client
)

# Run a single example query with k=3 and print the resulting passages.
# This rebinds the notebook-level names `retrieve` and `results`.
retrieve = Retrieve(rm=deeplake_rm, k=3)
results = retrieve("some text")
print(results.passages)

TBD...