# Search study

Let's say you have an existing Redis database with a search index already seeded. You may wish to quickly test different search methods against the existing index without having to recreate the data and/or the index. This demo will walk you through how to set this up and get going.

# Installation

In [None]:
%pip install redis-retrieval-optimizer

In [2]:
import redis_retrieval_optimizer

redis_retrieval_optimizer.__version__

'0.4.1'

# Load data

We will load our custom car dataset for this example. 

In [7]:
import json

# JSON files are UTF-8 by specification, so the encoding is passed explicitly
# rather than relying on the platform's default locale encoding.

# documents to be indexed and searched
with open('../resources/cars/car_corpus.json', 'r', encoding='utf-8') as f:
    corpus = json.load(f)

# queries to run against the index
with open('../resources/cars/car_queries.json', 'r', encoding='utf-8') as f:
    queries = json.load(f)

# relevance labels for the queries
with open('../resources/cars/car_qrels.json', 'r', encoding='utf-8') as f:
    qrels = json.load(f)

# Create the index with redisvl

For the search study, we assume that the search index already exists. The cell below creates a Redis search index and populates it with our test data for example purposes; in a real search study, the index is assumed to already be populated and running with your own data.

Note: the demo assumes you have an instance of Redis running on localhost:6379. If this is not the case, update `redis_url` to point to your running instance, or start a local instance with the following command.

`docker run -d --name redis-stack-server -p 6379:6379 redis/redis-stack-server:latest`

In [8]:
# Connection URL passed to the search index below.
# Assumes a Redis instance is running on localhost:6379; change this value
# if your instance lives elsewhere.
redis_url = "redis://localhost:6379"

In [9]:
from redisvl.index import SearchIndex
from redisvl.utils.vectorize import HFTextVectorizer

# embedding model used to vectorize the corpus text
emb_model = HFTextVectorizer()

# define schema: three tag fields, one full-text field and one vector field
vector_field = {
    "name": "vector",
    "type": "vector",
    "attrs": {
        "dims": 768,
        "distance_metric": "cosine",
        "algorithm": "FLAT",
        "datatype": "float32"
    },
}

car_schema = {
    "index": {"name": "cars", "prefix": "cars"},
    "fields": [
        {"name": "item_id", "type": "tag"},
        {"name": "text", "type": "text"},
        {"name": "make", "type": "tag"},
        {"name": "model", "type": "tag"},
        vector_field,
    ],
}

# create index (overwrite=True replaces an index that already has this name)
index = SearchIndex.from_dict(car_schema, redis_url=redis_url)
index.create(overwrite=True)

# vectorize corpus data: one embedding per corpus document, in corpus order
corpus_texts = [doc["text"] for doc in corpus]
embeddings = emb_model.embed_many(corpus_texts, as_buffer=True)

# pair every document with its embedding to form the records to load
corpus_data = [
    {
        "text": doc["text"],
        "item_id": doc["item_id"],
        "make": doc["query_metadata"]["make"],
        "model": doc["query_metadata"]["model"],
        "vector": embedding,
    }
    for doc, embedding in zip(corpus, embeddings)
]

# last expression of the cell, so the value returned by load() is displayed
index.load(corpus_data)


15:37:06 sentence_transformers.SentenceTransformer INFO   Use pytorch device_name: mps
15:37:06 sentence_transformers.SentenceTransformer INFO   Load pretrained SentenceTransformer: sentence-transformers/all-mpnet-base-v2


['cars:01K3KY4XKBMD7B5VWDYMBPT8CV',
 'cars:01K3KY4XKBJBADHWYSSVKYXFA3',
 'cars:01K3KY4XKBAFQXKJA8D24XTN4V',
 'cars:01K3KY4XKB4YK5J8M2BJ3RG2ZZ',
 'cars:01K3KY4XKBJ9BEN9KA5MY5B17F',
 'cars:01K3KY4XKB1YN12H0E8RYF908V',
 'cars:01K3KY4XKBBCQRKZF2WKN35N1M',
 'cars:01K3KY4XKBCFNP86PB1X20BKP6',
 'cars:01K3KY4XKBB1TW18MW7FXQHPGV',
 'cars:01K3KY4XKBNSAD3KQKWHY8ZEZV',
 'cars:01K3KY4XKBZASFPS1QAFT8AB9M',
 'cars:01K3KY4XKB6WS6EXC6D29HGB2B',
 'cars:01K3KY4XKBG7BDE2W5096JCPV1',
 'cars:01K3KY4XKBTJPHE8DJJDZ0F25N',
 'cars:01K3KY4XKB5G1J1HVVZNEHMD1G',
 'cars:01K3KY4XKB0G2NJSFABD1C0V2V',
 'cars:01K3KY4XKBE15Y724Q1MFRT1JD',
 'cars:01K3KY4XKB9BET7ZPS9PWS9FAM',
 'cars:01K3KY4XKBKK7N32WVSACBRDAH',
 'cars:01K3KY4XKB8WN4FZQHYN7N5M3Y',
 'cars:01K3KY4XKB3NFV4KCC3ZR9S1W3',
 'cars:01K3KY4XKBS0HT7SQPHKCZJ7K1',
 'cars:01K3KY4XKB9YAVWSK2EHPKCMJW',
 'cars:01K3KY4XKB26NXA6V8NY5DFS28',
 'cars:01K3KY4XKBKRQFSYB64XT263TZ',
 'cars:01K3KY4XKBA1G1N5GV81TQBG4V',
 'cars:01K3KY4XKBAZ9F12ANSMDMXY89',
 'cars:01K3KY4XKBHR8EYYS9S8K

# Check index created successfully

In [10]:
# Show the "num_docs" value reported by the index; a non-zero count
# indicates the corpus records were loaded.
index.info()["num_docs"]

464

# Review search study config

- index_name should point to index created above
- qrels and queries should point to the queries and set of labeled queries under test
- search methods should match with the custom methods defined below
- embedding_model should match with the one used to create the index

In [11]:
from redis_retrieval_optimizer.utils import load_search_study_config

# Load the study configuration from the YAML file (path is relative to the
# notebook's working directory).
search_study_config = load_search_study_config("search_study_config.yaml")
# Display the parsed config so it can be reviewed against the checklist above.
search_study_config

SearchStudyConfig(study_id='test-search-study', index_name='cars', qrels='../resources/cars/car_qrels.json', queries='../resources/cars/car_queries.json', search_methods=['base_vector', 'pre_filter_vector'], ret_k=3, id_field_name='_id', vector_field_name='vector', text_field_name='text', embedding_model=EmbeddingModel(type='hf', model='sentence-transformers/all-mpnet-base-v2', dim=768, embedding_cache_name='vec-cache', dtype='float32'))

# Define search methods for search study

A search method can be anything as long as it takes a `SearchMethodInput` and returns a `SearchMethodOutput`. Below we will compare a basic vector search to a vector search with a pre-filter. 

In [None]:
from ranx import Run
from redis_retrieval_optimizer.search_methods.base import run_search_w_time
from redisvl.query import VectorQuery
from redisvl.query.filter import Tag

from redis_retrieval_optimizer.schema import SearchMethodInput, SearchMethodOutput
from redis_retrieval_optimizer.search_methods.vector import make_score_dict_vec

def vector_query(query_info, num_results: int, emb_model) -> VectorQuery:
    """Build an unfiltered vector query for a single query record.

    Args:
        query_info: Mapping for one query; its "query" entry is the text
            that gets embedded.
        num_results: Number of results the query should request.
        emb_model: Object whose ``embed(text, as_buffer=True)`` produces the
            query embedding.

    Returns:
        A ``VectorQuery`` against the "vector" field that returns the
        item_id, make, model and text fields.
    """
    query_text = query_info["query"]
    embedded_query = emb_model.embed(query_text, as_buffer=True)

    query = VectorQuery(
        vector=embedded_query,
        vector_field_name="vector",
        num_results=num_results,
        return_fields=["item_id", "make", "model", "text"],
    )
    return query

def pre_filter_query(query_info, num_results: int, emb_model) -> VectorQuery:
    """Build a vector query restricted to the query's make and model.

    Args:
        query_info: Mapping for one query. Its "query" entry is the text
            that gets embedded; its "query_metadata" entry supplies the
            "make" and "model" values used for filtering.
        num_results: Number of results the query should request.
        emb_model: Object whose ``embed(text, as_buffer=True)`` produces the
            query embedding.

    Returns:
        A ``VectorQuery`` against the "vector" field whose filter expression
        requires both the "make" and "model" tags to match, returning the
        item_id, make, model and text fields.
    """
    # Embed as a buffer, the same way vector_query does, so both search
    # methods hand the query vector to VectorQuery in the same form.
    vector = emb_model.embed(query_info["query"], as_buffer=True)

    metadata = query_info["query_metadata"]

    # Named tag_filter rather than `filter` so the builtin is not shadowed.
    tag_filter = (Tag("make") == metadata["make"]) & (Tag("model") == metadata["model"])

    # Create a vector query
    return VectorQuery(
        vector=vector,
        vector_field_name="vector",
        num_results=num_results,
        filter_expression=tag_filter,
        return_fields=["item_id", "make", "model", "text"],
    )

def gather_pre_filter_results(search_method_input: SearchMethodInput) -> SearchMethodOutput:
    """Run the pre-filtered vector search for every query in the study input.

    Args:
        search_method_input: Study input providing ``raw_queries``,
            ``ret_k``, ``emb_model``, ``index`` and ``query_metrics``.

    Returns:
        A ``SearchMethodOutput`` holding a ranx ``Run`` built from the
        per-query score dicts, plus the input's ``query_metrics`` object.
    """
    scores_by_query = {}

    for query_id, query_info in search_method_input.raw_queries.items():
        # build the filtered query for this record
        filtered_query = pre_filter_query(
            query_info,
            search_method_input.ret_k,
            search_method_input.emb_model,
        )

        # execute through the timing helper, which is handed query_metrics
        results = run_search_w_time(
            search_method_input.index,
            filtered_query,
            search_method_input.query_metrics,
        )

        # convert the results into a scores dict using item_id as the id field
        scores_by_query[query_id] = make_score_dict_vec(results, id_field_name="item_id")

    # wrap everything up as the search method output
    run = Run(scores_by_query)
    return SearchMethodOutput(
        run=run,
        query_metrics=search_method_input.query_metrics,
    )


def gather_vector_results(search_method_input: SearchMethodInput) -> SearchMethodOutput:
    """Run the plain vector search for every query in the study input.

    Args:
        search_method_input: Study input providing ``raw_queries``,
            ``ret_k``, ``emb_model``, ``index`` and ``query_metrics``.

    Returns:
        A ``SearchMethodOutput`` holding a ranx ``Run`` built from the
        per-query score dicts, plus the input's ``query_metrics`` object.
    """
    top_k = search_method_input.ret_k
    embedder = search_method_input.emb_model
    per_query_scores = {}

    for query_id, query_info in search_method_input.raw_queries.items():
        # build the unfiltered vector query for this record
        query = vector_query(query_info, top_k, embedder)

        # execute through the timing helper, which is handed query_metrics
        hits = run_search_w_time(
            search_method_input.index,
            query,
            search_method_input.query_metrics,
        )

        # convert the hits into a scores dict using item_id as the id field
        per_query_scores[query_id] = make_score_dict_vec(hits, id_field_name="item_id")

    # wrap everything up as the search method output
    run = Run(per_query_scores)
    return SearchMethodOutput(
        run=run,
        query_metrics=search_method_input.query_metrics,
    )


# Run the search study

In [None]:
from redis_retrieval_optimizer.search_study import run_search_study
