In [36]:
from dotenv import load_dotenv

# Load variables from a .env file into the process environment; override=True
# lets values from the file replace variables that are already set.
load_dotenv(override=True)

True

# Context

In `make_synthetic_questions.ipynb`, we generated synthetic questions to bootstrap evaluation of the retrieval step in our hardware store's Q&A system.

This notebook shows the first step in calculating precision and recall with different retrieval parameters. We will run more advanced experiments in future notebooks after we have these baseline scores.

## Data

Here is a brief review of the data.

In [2]:
import json
import lancedb
import os
import pandas as pd
from typing import List, Dict
from concurrent.futures import ThreadPoolExecutor

# Widen text columns so long descriptions and reviews stay readable in previews.
pd.options.display.max_colwidth = 160

# Connect to the on-disk LanceDB database and open the table of product reviews.
db = lancedb.connect("./lancedb")
reviews_table = db.open_table("reviews")

# Preview the first few rows of the table.
reviews_table.to_pandas().head()

Unnamed: 0,id,product_title,product_description,review,vector
0,0,Cordless Drill,"This lightweight cordless drill offers a powerful performance with adjustable speed settings, perfect for various drilling tasks. Its ergonomic design provi...",I've been using this cordless drill for over a year now on various home improvement projects and it hasn't let me down once. The adjustable speed settings m...,"[0.010204377, 0.020264247, 0.028945748, -0.016862787, -0.0016243078, -0.00039148735, 0.008103476, 0.0564242, 0.030168494, 0.006058154, 0.022832016, 0.002653..."
1,1,Cordless Drill,"This lightweight cordless drill offers a powerful performance with adjustable speed settings, perfect for various drilling tasks. Its ergonomic design provi...","This cordless drill is a gem! It's extremely lightweight, which is a big plus when working on overhead projects. The ergonomic grip is comfortable, even dur...","[0.02122662, 0.019065969, -0.025120541, -0.013474393, 0.022568123, 0.016881574, -0.023482246, 0.07916057, 0.04140853, -0.016703498, 0.005677646, 0.000275460..."
2,2,Cordless Drill,"This lightweight cordless drill offers a powerful performance with adjustable speed settings, perfect for various drilling tasks. Its ergonomic design provi...","I bought this drill for my DIY projects around the house, and it was one of the best investments. It's got a nice weight balance that's neither too heavy no...","[-0.01990977, 0.026872814, -0.010510256, -0.03597374, 0.01943203, -0.031029142, -0.014833793, 0.08374763, 0.018440722, -0.0010569973, 0.03501826, -0.0019079..."
3,3,Cordless Drill,"This lightweight cordless drill offers a powerful performance with adjustable speed settings, perfect for various drilling tasks. Its ergonomic design provi...","As a professional contractor, I rely on durable and efficient tools. This cordless drill exceeds my expectations. It's powerful enough to drive screws into ...","[0.020162972, -0.00025912662, 0.013526903, -0.0020457166, 0.014242676, 0.0125442315, -0.029237522, 0.05240917, 0.013551166, -0.012641286, 0.048818167, -0.00..."
4,4,Cordless Drill,"This lightweight cordless drill offers a powerful performance with adjustable speed settings, perfect for various drilling tasks. Its ergonomic design provi...",This cordless drill packs a punch for its size. I've used it for everything from installing shelves to building a deck. The torque settings are easy to adju...,"[-0.029100286, 0.00055905356, -0.024061428, -0.004820546, 0.024710461, 0.020910667, -0.007528784, 0.0724557, 0.009393279, 0.00724557, 0.019506395, 0.0220081..."


In [3]:
# Load the synthetic question/answer pairs used for evaluation. The file is
# read explicitly as UTF-8 so the result does not depend on the platform's
# default text encoding.
with open("synthetic_eval_dataset.json", "r", encoding="utf-8") as f:
    synthetic_questions = json.load(f)

# Show a small sample rather than dumping the whole dataset into the output.
synthetic_questions[:5]

[{'question': 'How long does the battery last on this cordless drill before needing a recharge?',
  'answer': 'The battery lasts a good 4-5 hours of continuous use before needing a recharge.',
  'chunk_id': '0'},
 {'question': 'Can this cordless drill handle tough materials?',
  'answer': 'Yes, it is versatile enough to handle tough materials like concrete.',
  'chunk_id': '0'},
 {'question': 'Is this cordless drill easy to handle for overhead work?',
  'answer': "Yes, it's extremely lightweight, which is a big plus when working on overhead projects.",
  'chunk_id': '1'},
 {'question': 'What features come with this drill?',
  'answer': 'The set includes two batteries, a charger, and a handy carrying case.',
  'chunk_id': '1'},
 {'question': "How's the weight balance on this cordless drill?",
  'answer': "The cordless drill has a nice weight balance that's neither too heavy nor too light.",
  'chunk_id': '2'},
 {'question': 'What about the speed control on this drill?',
  'answer': 'The

## Set Up Evaluation

Load the evaluation questions into a structured format.

In [9]:
from pydantic import BaseModel


class EvalQuestion(BaseModel):
    """One synthetic evaluation example: a question, its answer, and a chunk id."""
    # Question text; used as the search query during retrieval.
    question: str
    # Answer text paired with the question in the evaluation dataset.
    answer: str
    # Identifier compared (as a string) against the `id` of retrieved review
    # rows to decide whether a result counts as a hit.
    chunk_id: str

In [10]:
from pydantic import TypeAdapter

# Validate the raw dicts loaded from JSON and convert them into EvalQuestion objects.
question_list_adapter = TypeAdapter(list[EvalQuestion])
eval_questions = question_list_adapter.validate_python(synthetic_questions)

Build a simple search function

In [11]:
def run_simple_request(q: EvalQuestion, n_return_vals=5):
    """Search the reviews table for a question and flag which results are hits.

    Args:
        q: Evaluation question; `q.question` is the search query and
            `q.chunk_id` identifies the row that should be retrieved.
        n_return_vals: Maximum number of rows to request from the search.

    Returns:
        A list of booleans, one per returned row in result order. An entry is
        True when that row's `id` equals `q.chunk_id` (compared as strings).
    """
    query = reviews_table.search(q.question).select(["id"]).limit(n_return_vals)
    hit_flags = []
    for row in query.to_list():
        # Compare as strings because the two ids may be stored with different types.
        hit_flags.append(str(q.chunk_id) == str(row["id"]))
    return hit_flags

Now run the benchmark. For simplicity, this cell only compares different retrieval sizes (the number of results returned per question) using a simple semantic search.

In [14]:
def score(hits):
    """Compute aggregate precision and recall from per-request hit flags.

    This implementation assumes each request has exactly one relevant chunk
    that appears at most once in its results, so recall is simply the number
    of hits divided by the number of requests.

    Args:
        hits: One list per retrieval request; each inner list holds one
            boolean per retrieved result, True where the result was a hit.

    Returns:
        A dict with "precision" (hits / total results retrieved) and "recall"
        (hits / number of requests). A value is 0 when its denominator is 0.
    """
    request_count = len(hits)
    retrieved_count = 0
    hit_count = 0
    for flags in hits:
        retrieved_count += len(flags)
        hit_count += sum(flags)

    if retrieved_count > 0:
        precision = hit_count / retrieved_count
    else:
        precision = 0

    if request_count > 0:
        recall = hit_count / request_count
    else:
        recall = 0

    return {"precision": precision, "recall": recall}


def score_simple_search(n_to_retrieve: int) -> Dict[str, float]:
    """Run the simple semantic search for every evaluation question and score it.

    Args:
        n_to_retrieve: Number of results to request per question. This is a
            single integer (it is forwarded to `run_simple_request` as the
            result limit), not a list of sizes.

    Returns:
        The dict produced by `score`, with "precision" and "recall" keys.
    """
    # Parallelize to speed this up (5-10X per the original author's note).
    # executor.map yields results in the same order as eval_questions.
    with ThreadPoolExecutor() as executor:
        hits = list(
            executor.map(lambda q: run_simple_request(q, n_to_retrieve), eval_questions)
        )
    return score(hits)


# Benchmark the simple semantic search at several retrieval sizes and label
# each row of scores with the size that produced it.
k_to_retrieve = [5, 10, 20]
scores = pd.DataFrame(
    [score_simple_search(k) for k in k_to_retrieve]
).assign(n_retrieved=k_to_retrieve)
scores

Unnamed: 0,precision,recall,n_retrieved
0,0.099206,0.496029,5
1,0.068785,0.687847,10
2,0.044003,0.880064,20


If you have Cohere set up, you can compare these results to the results with a reranker.

If you aren't familiar with rerankers, don't worry. We will cover them in upcoming lessons.

In [38]:
import cohere
# Read the Cohere API key from the environment; this raises KeyError if
# COHERE_API_KEY is not set.
cohere_api_key = os.environ["COHERE_API_KEY"]

In [30]:
def run_reranked_request(q: EvalQuestion, n_return_vals=5, n_to_rerank=50) -> List[bool]:
    """Retrieve candidates by semantic search, rerank them with Cohere, and flag hits.

    Args:
        q: Evaluation question; `q.question` is used as both the search query
            and the rerank query, and `q.chunk_id` identifies the expected row.
        n_return_vals: Number of reranked results to keep (passed as `top_n`).
        n_to_rerank: Number of candidates fetched from the reviews table and
            sent to the reranker.

    Returns:
        A list of booleans, one per reranked result in reranked order. An entry
        is True when that result's `id` equals `q.chunk_id` (compared as
        strings). Returns an empty list when the search yields no candidates.
    """
    # First, get more results than we need
    initial_results = (
        reviews_table.search(q.question)
        .select(["id", "review"])
        .limit(n_to_rerank)
        .to_list()
    )

    # Nothing to rerank: skip the API call instead of sending an empty document list.
    if not initial_results:
        return []

    # Prepare texts for reranking
    texts = [r["review"] for r in initial_results]

    # Rerank using Cohere
    # NOTE(review): no `model` is passed to rerank; confirm the installed SDK
    # version provides a default, otherwise pass one explicitly.
    co = cohere.Client(cohere_api_key)
    reranked = co.rerank(
        query=q.question,
        documents=texts,
        top_n=n_return_vals,
    )

    # Map reranked results back to original IDs; `r.index` is the position of
    # the document in `texts`, which lines up with `initial_results`.
    reranked_ids = [initial_results[r.index]["id"] for r in reranked.results]
    return [str(q.chunk_id) == str(r) for r in reranked_ids]

def score_reranked_search(n_to_retrieve: int, n_to_rerank: int = 50) -> Dict[str, float]:
    """Run the reranked search for every evaluation question and score it.

    Args:
        n_to_retrieve: Number of reranked results to keep per question. This
            is a single integer (it is forwarded to `run_reranked_request`),
            not a list of sizes.
        n_to_rerank: Number of initial candidates per question that are sent
            to the reranker.

    Returns:
        The dict produced by `score`, with "precision" and "recall" keys.
    """
    # Run the requests on a thread pool; executor.map yields results in the
    # same order as eval_questions.
    with ThreadPoolExecutor() as executor:
        hits = list(executor.map(
            lambda q: run_reranked_request(q, n_to_retrieve, n_to_rerank),
            eval_questions
        ))
    return score(hits)

In [39]:

# Benchmark the reranked search at the same retrieval sizes used for the
# simple search, labelling each row with the size that produced it.
k_to_retrieve = [5, 10, 20]
reranked_scores = pd.DataFrame([score_reranked_search(n) for n in k_to_retrieve])
reranked_scores["n_retrieved"] = k_to_retrieve
# End the cell with the frame itself (not print) so Jupyter renders it as a table.
reranked_scores


InternalServerError: status_code: 500, body: {'message': 'internal server error, this has been reported to our developers. id 847f40c3-73e7-4006-bed3-adde87c6a905'}

In [41]:
# Display the reranked scores. `reranked_scores` only exists if the cell that
# builds it finished without raising.
reranked_scores

NameError: name 'reranked_scores' is not defined