# Search study

Let's say you have an existing Redis database with a search index already seeded. You may wish to quickly test different search methods against the existing index without having to recreate the data and/or the index. This demo will walk you through how to set this up and get going.

# Installation

In [None]:
%pip install redis-retrieval-optimizer

# Load data

We will load our custom car dataset for this example. 

In [1]:
import json

# Load the three JSON files that make up the car dataset.
# The encoding is given explicitly: without it, open() falls back to the
# platform's locale encoding, which can mis-decode non-ASCII JSON text.
with open('../resources/cars/car_corpus.json', 'r', encoding='utf-8') as f:
    corpus = json.load(f)  # documents to be indexed

with open('../resources/cars/car_queries.json', 'r', encoding='utf-8') as f:
    queries = json.load(f)  # presumably the evaluation queries -- confirm against the file

with open('../resources/cars/car_qrels.json', 'r', encoding='utf-8') as f:
    qrels = json.load(f)  # presumably query relevance judgments -- confirm against the file

# Create the index with redisvl

A search study assumes that the search index already exists. The cell below creates a Redis search index and populates it with our test data for example purposes; in a real search study, the index is assumed to already be populated and running with your own data.

Note: the demo assumes you have an instance of Redis running on localhost:6379. If this is not the case, update the redis_url to point to your running instance, or start a local instance with the following command.

`docker run -d --name redis-stack-server -p 6379:6379 redis/redis-stack-server:latest`

In [8]:
from redisvl.index import SearchIndex
from redisvl.utils.vectorize import HFTextVectorizer

# Embedding model used to vectorize the corpus text.
emb_model = HFTextVectorizer()

# assuming you have a redis instance running on localhost:6379
redis_url = "redis://localhost:6379"

# Index definition: three tag/text metadata fields plus one vector field.
# NOTE(review): "dims" presumably has to match the embedding size produced
# by emb_model -- confirm if the model is changed.
car_schema = {
    "index": {"name": "cars", "prefix": "cars"},
    "fields": [
        {"name": "item_id", "type": "tag"},
        {"name": "text", "type": "text"},
        {"name": "make", "type": "tag"},
        {"name": "model", "type": "tag"},
        {
            "name": "vector",
            "type": "vector",
            "attrs": {
                "dims": 768,
                "distance_metric": "cosine",
                "algorithm": "FLAT",
                "datatype": "float32",
            },
        },
    ],
}

# Build the index object from the schema and create it in Redis.
index = SearchIndex.from_dict(car_schema, redis_url=redis_url)
index.create(overwrite=True)

# Embed every document's text in one call; as_buffer=True is requested so the
# vectors can be stored directly in the "vector" field.
embeddings = emb_model.embed_many(
    [doc["text"] for doc in corpus],
    as_buffer=True,
)

# One record per document: its text, id, make/model metadata and its vector.
corpus_data = [
    {
        "text": doc["text"],
        "item_id": doc["item_id"],
        "make": doc["query_metadata"]["make"],
        "model": doc["query_metadata"]["model"],
        "vector": embeddings[position],
    }
    for position, doc in enumerate(corpus)
]

index.load(corpus_data)


15:15:56 sentence_transformers.SentenceTransformer INFO   Use pytorch device_name: mps
15:15:56 sentence_transformers.SentenceTransformer INFO   Load pretrained SentenceTransformer: sentence-transformers/all-mpnet-base-v2


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

['cars:01K1KH06CQXD7SQP42Y3APYJSD',
 'cars:01K1KH06CQ2WZYCE9HG8NQ6ND1',
 'cars:01K1KH06CQNBFN1RHJ6K76QJF5',
 'cars:01K1KH06CQPWG7BD239XEAH6EA',
 'cars:01K1KH06CQXAW3N3B5XA01HJA0',
 'cars:01K1KH06CQRDPRAPJ9954HCGCD',
 'cars:01K1KH06CQZSHNXGXBCYY87K7D',
 'cars:01K1KH06CQQVJ3PX0BE1NTWK1F',
 'cars:01K1KH06CQZYQRXHATGJW75XH5',
 'cars:01K1KH06CQXMCBGJQ6BDM2D047',
 'cars:01K1KH06CQCB466XYVGNYSKHFQ',
 'cars:01K1KH06CQYG9MKB67RXSAVZ50',
 'cars:01K1KH06CQNA1B1EVPF16E54HK',
 'cars:01K1KH06CQMZV2YP5B7N9SYB1G',
 'cars:01K1KH06CQCDTDY38KWP0PBXG4',
 'cars:01K1KH06CQY6JMBA01E0GD5WAV',
 'cars:01K1KH06CQ09JN44KGA0VAEFTC',
 'cars:01K1KH06CQBZXGNRQR5R9B018F',
 'cars:01K1KH06CQ6VHVJGP0YZMR314V',
 'cars:01K1KH06CQ7E1TA7VTKNSCFB8Z',
 'cars:01K1KH06CQQ0C5GSMZ705585W3',
 'cars:01K1KH06CQCWZ6Z9GYTW1W6EC0',
 'cars:01K1KH06CQN93WTY4FYVJ901RJ',
 'cars:01K1KH06CQ620X975V0RN0F1EA',
 'cars:01K1KH06CRH64WKQA18VAT53WV',
 'cars:01K1KH06CRJX6SDB0HPDS9T3MB',
 'cars:01K1KH06CRWEQJ8T5BWTJRWQ1C',
 'cars:01K1KH06CRB5FZ9GF95CT

# Check index created successfully

In [9]:
# Read the "num_docs" entry of the index info to check that documents were loaded.
index.info()["num_docs"]

464

# Define search methods for search study

A search method can be anything as long as it takes a `SearchMethodInput` and returns a `SearchMethodOutput`. Below we will compare a basic vector search to a vector search with a pre-filter. 

In [10]:
from ranx import Run
from redis_retrieval_optimizer.search_methods.base import run_search_w_time
from redisvl.query import VectorQuery
from redisvl.query.filter import Tag

from redis_retrieval_optimizer.schema import SearchMethodInput, SearchMethodOutput
from redis_retrieval_optimizer.search_methods.vector import make_score_dict_vec

def vector_query(query_info, num_results: int, emb_model) -> VectorQuery:
    """Build an unfiltered vector query for a single query record.

    Args:
        query_info: Mapping that holds the query text under the "query" key.
        num_results: Number of results to request from the index.
        emb_model: Object whose ``embed(text, as_buffer=True)`` produces the
            query embedding.

    Returns:
        A ``VectorQuery`` over the "vector" field that returns the "_id",
        "make", "model" and "text" fields.
    """
    query_embedding = emb_model.embed(query_info["query"], as_buffer=True)

    # update to read from env maybe?
    fields_to_return = ["_id", "make", "model", "text"]

    return VectorQuery(
        vector=query_embedding,
        vector_field_name="vector",
        num_results=num_results,
        return_fields=fields_to_return,
    )

def pre_filter_query(query_info, num_results: int, emb_model) -> VectorQuery:
    """Build a vector query restricted to the query's own make and model.

    Args:
        query_info: Mapping that holds the query text under "query" and the
            "make" and "model" values under "query_metadata".
        num_results: Number of results to request from the index.
        emb_model: Object whose ``embed(text)`` produces the query embedding.

    Returns:
        A ``VectorQuery`` over the "vector" field whose filter expression
        requires both the "make" and the "model" tag to match, returning the
        "_id", "make", "model" and "text" fields.
    """
    vec = emb_model.embed(query_info["query"])
    metadata = query_info["query_metadata"]

    # Named tag_filter (not `filter`) so the builtin filter() is not shadowed.
    tag_filter = (Tag("make") == metadata["make"]) & (Tag("model") == metadata["model"])

    return VectorQuery(
        vector=vec,
        vector_field_name="vector",
        num_results=num_results,
        filter_expression=tag_filter,
        return_fields=["_id", "make", "model", "text"],
    )

def gather_pre_filter_results(search_method_input: SearchMethodInput) -> SearchMethodOutput:
    """Run the make/model pre-filtered vector search for every raw query.

    Args:
        search_method_input: Supplies ``raw_queries``, ``emb_model``,
            ``index`` and ``query_metrics`` for the study.

    Returns:
        A ``SearchMethodOutput`` whose ``run`` is built from the per-query
        score dicts and whose ``query_metrics`` is the object taken from the
        input.
    """
    raw_queries = search_method_input.raw_queries
    scores_by_query = {}

    for query_id in raw_queries:
        # Each query asks for the top 10 results.
        filtered_query = pre_filter_query(
            raw_queries[query_id], 10, search_method_input.emb_model
        )
        # query_metrics is handed to the search helper on every call.
        results = run_search_w_time(
            search_method_input.index,
            filtered_query,
            search_method_input.query_metrics,
        )
        scores_by_query[query_id] = make_score_dict_vec(results, id_field_name="_id")

    return SearchMethodOutput(
        run=Run(scores_by_query),
        query_metrics=search_method_input.query_metrics,
    )


def gather_vector_results(search_method_input: SearchMethodInput) -> SearchMethodOutput:
    """Run the plain (unfiltered) vector search for every raw query.

    Args:
        search_method_input: Supplies ``raw_queries``, ``emb_model``,
            ``index`` and ``query_metrics`` for the study.

    Returns:
        A ``SearchMethodOutput`` whose ``run`` is built from the per-query
        score dicts and whose ``query_metrics`` is the object taken from the
        input.
    """
    raw_queries = search_method_input.raw_queries
    scores_by_query = {}

    for query_id in raw_queries:
        # Each query asks for the top 10 results.
        unfiltered_query = vector_query(
            raw_queries[query_id], 10, search_method_input.emb_model
        )
        # query_metrics is handed to the search helper on every call.
        results = run_search_w_time(
            search_method_input.index,
            unfiltered_query,
            search_method_input.query_metrics,
        )
        scores_by_query[query_id] = make_score_dict_vec(results, id_field_name="_id")

    return SearchMethodOutput(
        run=Run(scores_by_query),
        query_metrics=search_method_input.query_metrics,
    )


# Run the search study

In [None]:
from redis_retrieval_optimizer.search_study import run_search_study

# Run the search study against the Redis instance at redis_url and keep the
# result in `metrics`.
# NOTE(review): search_config_path is not assigned in any cell visible here --
# make sure it is defined (presumably the path to the study config file)
# before running this cell.
# NOTE(review): the custom search methods defined earlier are not passed in
# this call; presumably they are registered through the config or another
# argument -- confirm against the run_search_study documentation.
metrics = run_search_study(
    config_path=search_config_path,
    redis_url=redis_url,
)