# Grid study from existing index

In this example we will perform a grid study based on an existing redis index.

For a study to evaluate retrieval metrics we need the following data:
- corpus: the total data we wish to search against.
    - the corpus can be an existing index where the objects are the records stored in the database
- queries: the queries we wish to execute against the index
- qrels: the labels that match a query to the expected records to be returned from the corpus.

Once this data is gathered we can define a study configuration. This example uses the `grid_study_config.yaml` file. A grid study evaluates all options as opposed to a bayesian optimization study which will use a heuristic method to select trial configurations to evaluate. A grid study is appropriate when there are a small number of settings you want to test. A bayesian study is better when the number of permutations you wish to test would exceed a reasonable amount of time. 

In [1]:
from uuid import uuid4
import yaml

from pydantic import BaseModel

class AdditionalField(BaseModel):
    name: str
    type: str

class IndexSettings(BaseModel):
    name: str = "ret-opt"
    from_existing: bool = False
    algorithm: str = "flat"
    distance_metric: str = "cosine"
    vector_data_type: str = "float32"
    vector_dim: int = 368
    ef_construction: int = 0
    ef_runtime: int = 0
    m: int = 0
    additional_fields: list[AdditionalField] = []

class EmbeddingModel(BaseModel):
    type: str
    model: str
    dim: int
    embedding_cache_name: str = ""
    embedding_cache_redis_url: str = "redis://localhost:6379/0"

class GridStudyConfig(BaseModel):
    study_id: str = str(uuid4())
    
    # index settings
    index_settings: IndexSettings

    # data
    corpus: str = ""
    qrels: str
    queries: str

    vector_field_name: str = "vector"
    text_field_name: str = "text"
    primary_id_field_name: str = "_id" # this is what links corpus to qrels

    index_settings: IndexSettings

    embedding_models: list[EmbeddingModel]
    search_methods: list[str]
    ret_k: int = 6


def load_grid_study_config(config_path: str) -> GridStudyConfig:
    with open(config_path, "r") as f:
        config = yaml.safe_load(f)

    return GridStudyConfig(**config)

def schema_from_settings(index_settings: IndexSettings, additional_schema_fields=None):
    schema = {
        "index": {"name": "optimize", "prefix": "ret-opt"},
        "fields": [
            {"name": "_id", "type": "tag"},
            {"name": "text", "type": "text"},
            {"name": "title", "type": "text"},
            {
                "name": "vector",
                "type": "vector",
                "attrs": {
                    "dims": index_settings.vector_dim,
                    "distance_metric": index_settings.distance_metric,
                    "algorithm": index_settings.algorithm,
                    "datatype": index_settings.vector_data_type,
                    "ef_construction": index_settings.ef_construction,
                    "ef_runtime": index_settings.ef_runtime,
                    "m": index_settings.m,
                },
            },
        ],
    }

    # define a custom search method to do pre-filtering etc.
    if additional_schema_fields:
        for field in additional_schema_fields:
            schema["fields"].append(field)  # type: ignore

    return schema

In [2]:
study_config = load_grid_study_config("grid_study_config.yaml")

In [11]:
study_config

GridStudyConfig(study_id='b3e57584-b925-4ee4-8c5f-34d334578345', index_settings=IndexSettings(name='optimize', from_existing=True, algorithm='flat', distance_metric='cosine', vector_data_type='float32', vector_dim=368, ef_construction=0, ef_runtime=0, m=0, additional_fields=[]), corpus='data/nfcorpus_corpus.json', qrels='data/nfcorpus_qrels.json', queries='data/nfcorpus_queries.json', vector_field_name='vector', text_field_name='text', primary_id_field_name='_id', embedding_models=[EmbeddingModel(type='hf', model='sentence-transformers/all-MiniLM-L6-v2', dim=384, embedding_cache_name='vec-cache', embedding_cache_redis_url='redis://localhost:6379')], search_methods=['bm25', 'vector'], ret_k=6)

In [3]:
import os
import utils
from redisvl.index import SearchIndex
from dotenv import load_dotenv

from redis import Redis
from redis.commands.json.path import Path
import eval_beir

def get_last_index_settings(redis_url):
    client = Redis.from_url(redis_url)
    return client.json().get("ret-opt:last_schema")


def set_last_index_settings(redis_url, index_settings):
    client = Redis.from_url(redis_url)
    client.json().set("ret-opt:last_schema", Path.root_path(), index_settings)


def check_recreate_schema(index_settings, last_index_settings):
    if not last_index_settings:
        return True
    if last_index_settings and index_settings != last_index_settings:
        return True
    return False

load_dotenv()


def init_index_from_grid_settings(grid_study_config: GridStudyConfig) -> SearchIndex:
    redis_url = os.environ.get("REDIS_URL")

    index_settings = grid_study_config.index_settings.model_dump()
    embed_settings = grid_study_config.embedding_models[0]
    index_settings["embedding"] = embed_settings.model_dump()

    if grid_study_config.index_settings.from_existing:
        print(f"Connecting to existing index: {grid_study_config.index_settings.name}")

        index = SearchIndex.from_existing(
            name=grid_study_config.index_settings.name,
            redis_url=redis_url,
        )
        print(f"Connected to index: {index.name} with {index.info()['num_docs']} objects")
        print(f"From existing, assuming {grid_study_config.embedding_models[0].model} embedding model")
        if embed_settings.dim != index.schema.fields[study_config.vector_field_name].attrs.dims:
            raise ValueError(
                f"Embedding model dimension {emb_model.dims} does not match index dimension {index.schema.fields[study_config.vector_field_name].attrs['dims']}"
            )
        set_last_index_settings(redis_url, index_settings)
    else:
        last_index_settings = get_last_index_settings(study_config.redis_url)
        recreate = check_recreate_schema(index_settings, last_index_settings)

        schema = schema_from_settings(
            grid_study_config.index_settings,
            additional_schema_fields=grid_study_config.index_settings.additional_fields,
        )

        index = SearchIndex.from_dict(schema, redis_url=redis_url)

        if recreate:
            emb_model = utils.get_embedding_model(grid_study_config.embedding_models[0])
            print("Recreating: loading corpus from file")
            corpus = utils.load_json(grid_study_config.corpus)
            # corpus processing functions should be user defined
            corpus_data = eval_beir.process_corpus(corpus, emb_model)

            index.load(corpus_data)

        return index

  from tqdm.autonotebook import tqdm


10:33:28 sentence_transformers.SentenceTransformer INFO   Load pretrained SentenceTransformer: sentence-transformers/all-MiniLM-L6-v2




10:33:29 sentence_transformers.SentenceTransformer INFO   Use pytorch device_name: mps




Batches:   0%|          | 0/1 [00:00<?, ?it/s]

In [4]:
import search_methods
import utils
from ranx import Qrels, Run, evaluate

SEARCH_METHOD_MAP = {
    "bm25": search_methods.bm25.gather_bm25_results,
    "rerank": search_methods.rerank.gather_rerank_results,
    "lin_combo": search_methods.lin_combo.gather_lin_combo_results,
    "vector": search_methods.vector.gather_vector_results,
    "weighted_rrf": search_methods.weighted_rrf.gather_weighted_rrf,
}

config_path = "grid_study_config.yaml"

# def run_grid_study(config_path: str):
grid_study_config = load_grid_study_config(config_path)
# if from_exiting don't load corpus

# load queries and qrels
queries = utils.load_json(grid_study_config.queries)
qrels = Qrels(utils.load_json(grid_study_config.qrels))

# get or create index
index = init_index_from_grid_settings(grid_study_config)

metrics = []

for i, embedding_model in enumerate(grid_study_config.embedding_models):
    if i > 0:
        # assuming that you didn't pass the same embedding model twice like a fool
        print("Recreating index with new embedding model")
        index_settings = grid_study_config.index_settings.model_dump()
        index_settings["embedding"] = embedding_model.model_dump()

        # TODO: be able to dump existing index corpus to file automatically which shouldn't be too hard
        print("If using multiple embedding models assuming there is a json version of corpus available.")
        print("Recreating: loading corpus from file")
        emb_model = utils.get_embedding_model(embedding_model)
        corpus = utils.load_json(grid_study_config.corpus)
        # corpus processing functions should be user defined
        corpus_data = eval_beir.process_corpus(corpus, emb_model)
        index.load(corpus_data)

    # check if matches with last index settings
    emb_model = utils.get_embedding_model(embedding_model)

    for search_method in study_config.search_methods:
        print(f"Running search method: {search_method}")
        # get search method to try
        search_fn = SEARCH_METHOD_MAP[search_method]
        trial_results = search_fn(queries, index, emb_model)

        run = Run(trial_results)

        ndcg = evaluate(qrels, run, metrics=["ndcg"])
        recall = evaluate(qrels, run, metrics=["recall"])
        f1 = evaluate(qrels, run, metrics=["f1"])
        precision = evaluate(qrels, run, metrics=["precision"])

        trial_metrics = {
            "ndcg": ndcg,
            "recall": recall,
            "f1": f1,
            "precision": precision,
            "total_indexing_time": 0,
        }

        metrics.append(trial_metrics)

10:33:33 sentence_transformers.cross_encoder.CrossEncoder INFO   Use pytorch device: mps
Connecting to existing index: optimize
Connected to index: optimize with 3633 objects
From existing, assuming sentence-transformers/all-MiniLM-L6-v2 embedding model
10:33:34 sentence_transformers.SentenceTransformer INFO   Load pretrained SentenceTransformer: sentence-transformers/all-MiniLM-L6-v2
10:33:35 sentence_transformers.SentenceTransformer INFO   Use pytorch device_name: mps




Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Running search method: bm25
failed for PLAIN-2, Do Cholesterol Statin Drugs Cause Breast Cancer?: error: 'NoneType' object has no attribute 'query'
failed for PLAIN-12, Exploiting Autophagy to Live Longer: error: 'NoneType' object has no attribute 'query'
failed for PLAIN-23, How to Reduce Exposure to Alkylphenols Through Your Diet: error: 'NoneType' object has no attribute 'query'
failed for PLAIN-33, What’s Driving America’s Obesity Problem?: error: 'NoneType' object has no attribute 'query'
failed for PLAIN-44, Who Should be Careful About Curcumin?: error: 'NoneType' object has no attribute 'query'
failed for PLAIN-56, Foods for Glaucoma: error: 'NoneType' object has no attribute 'query'
failed for PLAIN-68, What is Actually in Chicken Nuggets?: error: 'NoneType' object has no attribute 'query'
failed for PLAIN-78, What Do Meat Purge and Cola Have in Common?: error: 'NoneType' object has no attribute 'query'
failed for PLAIN-91, Chronic Headaches and Pork Parasites: error: 'NoneType

ValueError: max() arg is an empty sequence

In [26]:
index.schema.fields['vector'].attrs.dims

384