# Search study

Let's say you have an existing Redis database with a search index seeded. You may wish to quickly test different search method against the existing index without having to recreate data and/or recreate the index. This demo will walk you though how to set this up and get going.

# Installation

In [None]:
%pip install redis-retrieval-optimizer

# Load data

We will load our custom car dataset for this example. 

In [17]:
import json

with open('../resources/cars/car_corpus.json', 'r') as f:
    corpus = json.load(f)

with open('../resources/cars/car_queries.json', 'r') as f:
    queries = json.load(f)

with open('../resources/cars/car_qrels.json', 'r') as f:
    qrels = json.load(f)

# Create the index with redisvl

For the search_study we are assuming that the search index already exists. The cell below will create a Redis search index and populate it with our test data for example purposes but is assumed with a search study is populated and running within your data. 

Note: the demo assumes you have a instance of redis running on localhost:6379. If this is not the case, update the redis_url to direct to your running instance or start a local instance with the following command. 

`docker run -d --name redis-stack-server -p 6379:6379 redis/redis-stack-server:latest`

In [18]:
# assuming you have a redis instance running on localhost:6379
redis_url = "redis://localhost:6379"

In [19]:
from redisvl.index import SearchIndex
from redisvl.utils.vectorize import HFTextVectorizer

emb_model = HFTextVectorizer()

# define schema
car_schema = {
    "index": {
        "name": "cars",
        "prefix": "cars"
    },
    "fields": [
        {"name": "item_id", "type": "tag"},
        {"name": "text", "type": "text"},
        {"name": "make", "type": "tag"},
        {"name": "model", "type": "tag"},
        {
            "name": "vector",
            "type": "vector",
            "attrs": {
                "dims": 768,
                "distance_metric": "cosine",
                "algorithm": "FLAT",
                "datatype": "float32"
            },
        },
    ]
}

# create index
index = SearchIndex.from_dict(car_schema, redis_url=redis_url)
index.create(overwrite=True)

embeddings = emb_model.embed_many([c["text"] for c in corpus], as_buffer=True)

# vectorize corpus data
corpus_data = [
    {
        "text": c["text"],
        "item_id": c["item_id"],
        "make": c["query_metadata"]["make"],
        "model": c["query_metadata"]["model"],
        "vector": embeddings[i]
    }
    for i, c in enumerate(corpus)
]

index.load(corpus_data)


10:03:02 sentence_transformers.SentenceTransformer INFO   Use pytorch device_name: mps
10:03:02 sentence_transformers.SentenceTransformer INFO   Load pretrained SentenceTransformer: sentence-transformers/all-mpnet-base-v2


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

['cars:01K1TP9E8PXNVB5GBA84WJWBH5',
 'cars:01K1TP9E8QRZQGPB15EHRZDV59',
 'cars:01K1TP9E8Q6C2059RKP7ZDJF44',
 'cars:01K1TP9E8Q382R8AXYMSZ906RR',
 'cars:01K1TP9E8QSPRJNR1C95CTH8WQ',
 'cars:01K1TP9E8Q6GGWJPY85G458452',
 'cars:01K1TP9E8QXCJP7AXK4FEXDT6W',
 'cars:01K1TP9E8QGKKPDXGBW70GQN8D',
 'cars:01K1TP9E8Q8RVS7M8ZYHK7RZT1',
 'cars:01K1TP9E8QZ0XB9R3RG7GZMS2R',
 'cars:01K1TP9E8QZJV6T122HKFBC78F',
 'cars:01K1TP9E8QCN2BR3ZAVF0TG73Z',
 'cars:01K1TP9E8QQZE9VTFKF44VXDQY',
 'cars:01K1TP9E8Q2B76V0BBAJC6N02C',
 'cars:01K1TP9E8QMMFVNC8ZYEFY6K66',
 'cars:01K1TP9E8QD0FEGCZ16H4WRPGZ',
 'cars:01K1TP9E8QTBYDKA72S6KV4G04',
 'cars:01K1TP9E8Q61Z5EFDQ9VQNW1KP',
 'cars:01K1TP9E8QQX20W7HYF71SVZ51',
 'cars:01K1TP9E8QB3NC71A5W3WMHYRR',
 'cars:01K1TP9E8QDWBGBHS7XC6PGZPY',
 'cars:01K1TP9E8QCAGGMEMQ623PYDGK',
 'cars:01K1TP9E8QPWA2HB915ER3NKHB',
 'cars:01K1TP9E8QWJCTWS94G0VYRQS3',
 'cars:01K1TP9E8QE2DY8WP2MYENVWVD',
 'cars:01K1TP9E8Q2SH8Y749DX60ETD6',
 'cars:01K1TP9E8Q201V4JCJA28JWS1E',
 'cars:01K1TP9E8QPVS9PS7Q6MP

# Check index created successfully

In [20]:
index.info()["num_docs"]

464

# Review search study config

- index_name should point to index created above
- qrels and queries should point to the queries and set of labeled queries under test
- search methods should match with the custom methods defined below
- embedding_model should match with the one used to create the index

In [22]:
from redis_retrieval_optimizer.utils import load_search_study_config

search_study_config = load_search_study_config("search_study_config.yaml")
search_study_config

SearchStudyConfig(study_id='test-search-study', existing_index_name='cars', qrels='../resources/cars/car_qrels.json', queries='../resources/cars/car_queries.json', search_methods=['base_vector', 'pre_filter_vector'], ret_k=6, embedding_model=EmbeddingModel(type='hf', model='sentence-transformers/all-mpnet-base-v2', dim=768, embedding_cache_name='vec-cache', dtype='float32'))

# Define search methods for search study

A search method can be anything as long as it takes a `SearchMethodInput` and returns a `SearchMethodOutput`. Below we will compare a basic vector search to a vector search with a pre-filter. 

In [25]:
from ranx import Run
from redis_retrieval_optimizer.search_methods.base import run_search_w_time
from redisvl.query import VectorQuery
from redisvl.query.filter import Tag

from redis_retrieval_optimizer.schema import SearchMethodInput, SearchMethodOutput
from redis_retrieval_optimizer.search_methods.vector import make_score_dict_vec

def vector_query(query_info, num_results: int, emb_model) -> VectorQuery:
    vector = emb_model.embed(query_info["query"], as_buffer=True)

    return VectorQuery(
        vector=vector,
        vector_field_name="vector",
        num_results=num_results,
        return_fields=["_id", "make", "model", "text"],  # update to read from env maybe?
    )

def pre_filter_query(query_info, num_results, emb_model) -> VectorQuery:
    vec = emb_model.embed(query_info["query"])
    make = query_info["query_metadata"]["make"]
    model = query_info["query_metadata"]["model"]

    filter = (Tag("make") == make) & (Tag("model") == model)

    # Create a vector query
    query = VectorQuery(
        vector=vec,
        vector_field_name="vector",
        num_results=num_results,
        filter_expression=filter,
        return_fields=["_id", "make", "model", "text"]
    )

    return query

def gather_pre_filter_results(search_method_input: SearchMethodInput) -> SearchMethodOutput:
    redis_res_vector = {}

    for key in search_method_input.raw_queries:
        query_info = search_method_input.raw_queries[key]
        query = pre_filter_query(query_info, 10, search_method_input.emb_model)
        res = run_search_w_time(
            search_method_input.index, query, search_method_input.query_metrics
        )
        score_dict = make_score_dict_vec(res, id_field_name="_id")

        redis_res_vector[key] = score_dict

    return SearchMethodOutput(
        run=Run(redis_res_vector),
        query_metrics=search_method_input.query_metrics,
    )


def gather_vector_results(search_method_input: SearchMethodInput) -> SearchMethodOutput:
    redis_res_vector = {}

    for key in search_method_input.raw_queries:
        query_info = search_method_input.raw_queries[key]
        vec_query = vector_query(query_info, 10, search_method_input.emb_model)
        res = run_search_w_time(
            search_method_input.index, vec_query, search_method_input.query_metrics
        )
        score_dict = make_score_dict_vec(res, id_field_name="_id")
        redis_res_vector[key] = score_dict
        
    return SearchMethodOutput(
        run=Run(redis_res_vector),
        query_metrics=search_method_input.query_metrics,
    )


# Run the search study

In [26]:
from redis_retrieval_optimizer.search_study import run_search_study

# Note: must match with what's in the search_study_config.
SEARCH_METHOD_MAP = {
    "base_vector": gather_vector_results,
    "pre_filter_vector": gather_pre_filter_results
}

metrics = run_search_study(
    config_path="search_study_config.yaml",
    redis_url=redis_url,
    search_method_map=SEARCH_METHOD_MAP
)

Connecting to existing index: cars
Connected to index: cars with 464 objects
10:06:40 sentence_transformers.SentenceTransformer INFO   Use pytorch device_name: mps
10:06:40 sentence_transformers.SentenceTransformer INFO   Load pretrained SentenceTransformer: sentence-transformers/all-mpnet-base-v2


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Running search method: base_vector
Running search method: pre_filter_vector


In [27]:
metrics

Unnamed: 0,search_method,total_indexing_time,total_index_memory_sz_mb,total_object_memory_mb,avg_query_time,recall,ndcg,f1,precision,ret_k,algorithm,ef_construction,ef_runtime,m,distance_metric,vector_data_type
0,base_vector,129.71400451660156,4.008788,0.0,0.001404,0.0,0.0,0.0,0.0,6,unknown,0,0,0,unknown,unknown
1,pre_filter_vector,129.71400451660156,4.008788,0.0,0.001072,0.0,0.0,0.0,0.0,6,unknown,0,0,0,unknown,unknown
