In [23]:
# --- 1. Setup and Imports ---

import pandas as pd
import numpy as np
import json
from tqdm.auto import tqdm
import dotenv
import minsearch
from elasticsearch import Elasticsearch
from openai import OpenAI

# Load environment variables from the project-level .env file before creating the client.
# NOTE(review): presumably this supplies the OpenAI API key that OpenAI() picks up — confirm.
dotenv.load_dotenv("../.env")
# Shared OpenAI client used by generate_questions() and llm() further down.
client = OpenAI()

In [24]:
# --- 2. Data Loading and Preprocessing ---

# Load the cleaned recipe table; each row becomes one searchable document.
df = pd.read_csv('../data/recipes_clean.csv')

# Ensure that every recipe/document can be uniquely identified to track which recipe a question or answer belongs to.
# When the CSV has no 'id' column, the row position (0..n-1) is used as the id.
if 'id' not in df.columns:
    df['id'] = range(len(df))
# List of per-recipe dicts (one per row); this is what gets indexed and evaluated below.
documents = df.to_dict(orient='records')

In [25]:
# --- 3. Ground Truth Generation (if not already present) ---

# Prompt used by generate_questions(): it is filled with str.format(**doc), so every
# {placeholder} below must be a key of the recipe record. The doubled braces around the
# JSON example are format-escapes that render as literal { and }.
prompt_template = """
You emulate a user of our recipe assistant application.
Formulate 5 questions this user might ask based on a provided recipe.
Make the questions specific to ingredients, cooking methods, 
cooking duration (prep/cook time), or dietary information in this recipe.
Do NOT mention the recipe name in the question.
The record should contain the answer to the questions, 
and the questions should be complete and not too short.
Use as few words as possible from the record.

The record:

Recipe: {recipe_name}
Cuisine: {cuisine_type}
Main Ingredients: {main_ingredients}
Instructions: {instructions}
Dietary Info: {dietary_restrictions}

Provide the output in parsable JSON without using code blocks:

{{"questions": ["question1", "question2", ..., "question5"]}}
""".strip()

def generate_questions(doc):
    """Ask the chat model for user-style questions about one recipe record.

    The module-level ``prompt_template`` is filled from the keys of ``doc`` and
    sent as a single user message to ``gpt-4o-mini``.

    Args:
        doc: Recipe record (dict) providing every placeholder of ``prompt_template``.

    Returns:
        The text content of the first completion choice, returned unparsed.
    """
    user_message = {"role": "user", "content": prompt_template.format(**doc)}
    completion = client.chat.completions.create(
        model='gpt-4o-mini',
        messages=[user_message],
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

# Ground truth is cached on disk: questions are generated (one LLM call per recipe)
# only when the CSV is missing. Delete the file to force regeneration.
ground_truth_path = '../data/ground-truth-retrieval.csv'
try:
    df_gt = pd.read_csv(ground_truth_path)
except FileNotFoundError:
    results = {}
    skipped_ids = []
    for i, doc in enumerate(tqdm(documents[:100])):
        doc_id = doc.get('id', i)
        if doc_id in results:
            continue
        try:
            questions_raw = generate_questions(doc)
            questions = json.loads(questions_raw)
            results[doc_id] = questions['questions']
        except (json.JSONDecodeError, KeyError):
            # Best effort: a recipe whose response cannot be parsed is left out of the
            # ground truth, but it is recorded so the gap is visible.
            skipped_ids.append(doc_id)
    if skipped_ids:
        print(f"Skipped {len(skipped_ids)} recipes with unparsable LLM output: {skipped_ids}")

    # One row per (recipe id, question) pair.
    final_results = [
        (doc_id, question)
        for doc_id, questions in results.items()
        for question in questions
    ]
    df_results = pd.DataFrame(final_results, columns=['id', 'question'])
    df_results.to_csv(ground_truth_path, index=False)
    # Read the file back so df_gt has the same dtypes as on a cached run.
    df_gt = pd.read_csv(ground_truth_path)

ground_truth = df_gt.to_dict(orient='records')

  0%|          | 0/100 [00:00<?, ?it/s]

If the Elasticsearch container already exists, start it from a terminal:

`docker start elasticsearch`

Otherwise, create and start a new container:

`docker run -d --name elasticsearch -p 9200:9200 -e "discovery.type=single-node" docker.elastic.co/elasticsearch/elasticsearch:8.13.4`

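Or, with security disabled and the JVM heap bounded (Elasticsearch 8.x enables security by default, while this notebook connects over plain `http://localhost:9200` without credentials):

```bash
docker run -d --name elasticsearch \
  -p 9200:9200 \
  -e "discovery.type=single-node" \
  -e "xpack.security.enabled=false" \
  -e "ES_JAVA_OPTS=-Xms512m -Xmx1g" \
  docker.elastic.co/elasticsearch/elasticsearch:8.13.4
```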

And check if Elasticsearch is up:

`curl http://localhost:9200`

In [26]:
# --- 4. Minsearch and Elasticsearch Setup (from rag-flow.ipynb) ---

# Minsearch: build the index over the recipe records loaded above.
index = minsearch.Index(
    keyword_fields=['meal_type', 'difficulty_level'],
    text_fields=[
        'recipe_name',
        'main_ingredients',
        'all_ingredients',
        'instructions',
        'cuisine_type',
        'dietary_restrictions',
    ],
)
index.fit(documents)

def minsearch_search(query, boost=None, num_results=10):
    """Search the module-level Minsearch ``index`` for ``query``.

    Args:
        query: Free-text search string.
        boost: Optional mapping of field name to weight. When ``None``, the
            default weights defined in this function are used.
        num_results: Maximum number of results requested from the index.

    Returns:
        Whatever ``index.search`` returns for the query.
    """
    default_boost = {
        'main_ingredients': 4.0,
        'all_ingredients': 5.0,
        'instructions': 3.0,
        'cuisine_type': 1.0,
        'dietary_restrictions': 2.0,
    }
    boost_dict = default_boost if boost is None else boost
    return index.search(query=query, boost_dict=boost_dict, num_results=num_results)

# Elasticsearch client; connects over plain HTTP to a node on localhost:9200.
es_client = Elasticsearch('http://localhost:9200')
# Name of the index queried by elasticsearch_search (the recipe documents are indexed into "recipes").
index_name = "recipes"
# 10 results for more robust retrieval evaluation
def elasticsearch_search(query, num_results=10):
    """Run a boosted ``multi_match`` query against the Elasticsearch recipe index.

    Uses the module-level ``es_client`` and ``index_name``.

    Args:
        query: Free-text search string.
        num_results: Value sent as the ``size`` of the search request.

    Returns:
        List with the ``_source`` document of every hit, in the order returned.
    """
    # The ^N suffix is the per-field boost applied by Elasticsearch.
    boosted_fields = [
        "recipe_name",
        "main_ingredients^4",
        "all_ingredients^5",
        "instructions^3",
        "cuisine_type",
        "dietary_restrictions^2",
    ]
    multi_match = {"query": query, "fields": boosted_fields, "type": "best_fields"}
    request_body = {"size": num_results, "query": {"multi_match": multi_match}}
    response = es_client.search(index=index_name, body=request_body)
    return [hit['_source'] for hit in response['hits']['hits']]

In [None]:
# --- 5. Retrieval Evaluation: Hit Rate and MRR ---

def hit_rate(relevance_total):
    """Fraction of queries whose result list contains the expected document.

    Args:
        relevance_total: One list per query; each inner list holds a boolean per
            retrieved result, ``True`` where the result is the expected document.

    Returns:
        Number of queries with at least one ``True`` divided by the number of
        queries, or ``0.0`` when there are no queries at all.
    """
    # Guard against an empty evaluation set instead of raising ZeroDivisionError.
    if len(relevance_total) == 0:
        return 0.0
    hits = sum(1 for line in relevance_total if True in line)
    return hits / len(relevance_total)

def mrr(relevance_total):
    """Mean reciprocal rank of the expected document across queries.

    Args:
        relevance_total: One list per query; each inner list holds a boolean per
            retrieved result (in rank order), ``True`` where the result is the
            expected document.

    Returns:
        Average over all queries of ``1 / rank`` of the first ``True`` entry
        (rank is 1-based). A query with no ``True`` entry contributes 0.
        Returns ``0.0`` when there are no queries at all.
    """
    # Guard against an empty evaluation set instead of raising ZeroDivisionError.
    if len(relevance_total) == 0:
        return 0.0
    total_score = 0.0
    for line in relevance_total:
        for position, is_relevant in enumerate(line, start=1):
            if is_relevant:
                total_score += 1 / position
                break  # only the first relevant result counts
    return total_score / len(relevance_total)

def evaluate_retrieval(ground_truth, search_function):
    """Score a search function against the ground-truth questions.

    Args:
        ground_truth: Iterable of records, each with an ``'id'`` key naming the
            expected document.
        search_function: Callable that receives the whole ground-truth record
            and returns a list of result dicts.

    Returns:
        Dict with the ``'hit_rate'`` and ``'mrr'`` computed over all records.
    """
    relevance_total = []
    for record in tqdm(ground_truth):
        # Ids are compared as strings so int and str ids match each other.
        expected_id = str(record['id'])
        retrieved = search_function(record)
        relevance_total.append([str(hit.get('id')) == expected_id for hit in retrieved])
    return dict(hit_rate=hit_rate(relevance_total), mrr=mrr(relevance_total))


In [29]:
# Hit rate and MRR for Minsearch over all ground-truth questions, using the top 10 results
# per question. The lambda unpacks the question text because evaluate_retrieval passes the
# whole ground-truth record to the search function.
print("Evaluating Minsearch retrieval...")
metrics_minsearch = evaluate_retrieval(ground_truth, lambda q: minsearch_search(q['question'], num_results=10))
print("Minsearch:", metrics_minsearch)

Evaluating Minsearch retrieval...


  0%|          | 0/500 [00:00<?, ?it/s]

Minsearch: {'hit_rate': 0.206, 'mrr': 0.10825555555555563}


In [30]:
# Re-index: send each document with 'id' to Elasticsearch.
# The recipe id is used as the Elasticsearch document id so this cell is idempotent:
# re-running it overwrites the same documents instead of appending duplicates, which
# would crowd the top-10 results and distort hit rate / MRR.
# NOTE(review): documents indexed earlier without an explicit id (e.g. by another
# notebook) are not removed by this loop — confirm the index does not contain any.
for doc in documents:
    es_client.index(index=index_name, id=str(doc['id']), document=doc)

# Make the freshly indexed documents visible to search before querying.
es_client.indices.refresh(index=index_name)

# Look for 'id' field in the indexed documents
print(es_client.search(index=index_name, body={"size": 1, "query": {"match_all": {}}})['hits']['hits'][0]['_source'])

{'recipe_name': 'Spaghetti Carbonara', 'cuisine_type': 'Italian', 'meal_type': 'Dinner', 'difficulty_level': 'Medium', 'prep_time_minutes': 15, 'cook_time_minutes': 20, 'servings': 4, 'main_ingredients': 'Spaghetti, Eggs, Pancetta, Parmesan', 'all_ingredients': 'Spaghetti, Eggs, Pancetta, Parmesan, Black Pepper, Olive Oil, Salt', 'dietary_restrictions': 'Contains gluten, dairy, pork', 'instructions': 'Boil spaghetti until al dente. Fry pancetta until crispy. Whisk eggs with grated Parmesan. Toss hot spaghetti with pancetta and egg mixture off heat. Serve immediately with black pepper.', 'nutritional_info': 'Calories: 520, Protein: 22g, Carbs: 60g, Fat: 22g', 'id': 0}


In [32]:
# Hit rate and MRR for Elasticsearch over the same ground-truth questions and the same
# top-10 cutoff as the Minsearch evaluation, so the two result dicts are comparable.
print("Evaluating Elasticsearch retrieval...")
metrics_es = evaluate_retrieval(ground_truth, lambda q: elasticsearch_search(q['question'], num_results=10))
print("Elasticsearch:", metrics_es)

Evaluating Elasticsearch retrieval...


  0%|          | 0/500 [00:00<?, ?it/s]

Elasticsearch: {'hit_rate': 0.232, 'mrr': 0.10885634920634935}


**Minsearch:**

* hit_rate = 0.206: For 20.6% of the questions, the exact ground truth recipe was found in the top 10 results.
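* mrr = 0.108: The mean reciprocal rank is low mainly because the ~79% of questions with no hit contribute 0 to the average. Over the hits alone, the mean reciprocal rank is about 0.108 / 0.206 ≈ 0.53, so when the ground truth recipe is found it usually sits near the top (higher is better; 1.0 means always ranked #1, 0.0 means never found).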

**Elasticsearch:**

* hit_rate = 0.232: For 23.2% of the questions, the exact ground truth recipe was found in the top 10 results.
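* mrr = 0.109: Nearly identical to Minsearch (0.109 vs. 0.108). Since Elasticsearch has more hits but the same MRR, its additional hits tend to appear lower in the top 10 (about 0.109 / 0.232 ≈ 0.47 mean reciprocal rank over the hits alone).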

---

**Interpretation:**

These low hit rate and MRR values are expected because the user queries focus on ingredients, cooking methods, and dietary information, not on recipe names. In this setting, many recipes may be plausible answers for a given query, and the "ground truth" recipe is just one of several valid possibilities. Therefore, these metrics are a strict measure and may underestimate the true usefulness of the retrieval system for users.

For ingredient-based or attribute-based search, user satisfaction often depends on retrieving any relevant recipe, not necessarily the exact ground truth. To better assess real-world performance, we will consider supplementing these metrics with relevance-based evaluation (e.g., LLM-as-judge) to capture how well the retrieved recipes actually match the user's query.

In [37]:
# --- 6. Parameter Optimization for Minsearch ---

import random

# Search space for the boost optimization: every field's weight is drawn from
# (0.0, upper bound), with one upper bound per field.
param_ranges = {
    field: (0.0, upper_bound)
    for field, upper_bound in (
        ('recipe_name', 1.0),
        ('main_ingredients', 4.0),
        ('all_ingredients', 5.0),
        ('instructions', 3.0),
        ('cuisine_type', 1.0),
        ('dietary_restrictions', 2.0),
    )
}

def simple_optimize(param_ranges, objective_function, n_iterations=10, seed=None):
    """Random search: sample parameter sets uniformly and keep the best-scoring one.

    Args:
        param_ranges: Mapping of parameter name to a ``(min_val, max_val)`` pair.
        objective_function: Callable taking a ``{name: value}`` dict and returning
            a score; higher is better.
        n_iterations: Number of random parameter sets to try.
        seed: Optional seed. When given, sampling uses a dedicated
            ``random.Random(seed)`` so the search is reproducible. When ``None``
            (default), the module-level ``random`` generator is used as before.

    Returns:
        ``(best_params, best_score)``. If ``n_iterations`` is 0 this is
        ``(None, float('-inf'))``. On ties the earliest sampled set is kept.
    """
    rng = random if seed is None else random.Random(seed)
    best_params = None
    best_score = float('-inf')
    for _ in range(n_iterations):
        current_params = {
            param: rng.uniform(min_val, max_val)
            for param, (min_val, max_val) in param_ranges.items()
        }
        current_score = objective_function(current_params)
        if current_score > best_score:
            best_score = current_score
            best_params = current_params
    return best_params, best_score

# Fixed 50-question validation subset of the ground truth (random_state pins the sample),
# so every optimization iteration is scored on the same questions.
gt_val = df_gt.sample(n=50, random_state=42).to_dict(orient='records')

def objective(boost_params):
    """Score one set of Minsearch boost weights on the validation questions.

    Args:
        boost_params: Mapping of field name to boost weight, passed to
            ``minsearch_search`` as ``boost``.

    Returns:
        The ``'mrr'`` value reported by ``evaluate_retrieval`` on ``gt_val``.
    """
    # Each ground-truth record is a dict with keys 'id' and 'question'.
    metrics = evaluate_retrieval(
        gt_val,
        lambda record: minsearch_search(record['question'], boost=boost_params, num_results=10),
    )
    return metrics['mrr']

# Seed the global RNG that the random search draws from, so re-running this cell
# (or Restart & Run All) yields the same boost parameters and validation MRR.
random.seed(42)
best_boost, best_score = simple_optimize(param_ranges, objective, n_iterations=20)

print("Best Minsearch boost params:", best_boost)
print("Best validation MRR:", best_score)

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

Best Minsearch boost params: {'recipe_name': 0.04466476234708505, 'main_ingredients': 1.8510064727585926, 'all_ingredients': 2.3127319210696817, 'instructions': 1.6169840103279864, 'cuisine_type': 0.9784550595609097, 'dietary_restrictions': 1.3399640700511697}
Best validation MRR: 0.09688888888888889


**How to interpret the parameter optimization output:**

The "Best Minsearch boost params" are the weights for each field that gave the highest mean reciprocal rank (MRR) on the validation set. A higher weight means that field contributed more to matching queries to recipes. The "Best validation MRR" shows how well the retrieval ranked the ground truth recipe on average. 

A low MRR is expected for ingredient-based queries, since many recipes may be plausible answers and the metric only rewards finding the exact ground truth. The optimal parameters found may not always outperform your defaults due to randomness, small validation size, or the strictness of the metric. For a fuller picture, we will supplement with relevance-based evaluation.

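The next cell defines the RAG pipeline from the rag-flow notebook: prompt construction, the LLM call, and one RAG function per retrieval backend. Answers are generated later, when the evaluation cell calls these functions.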

In [38]:
# --- 7. RAG Pipeline Evaluation (LLM answer quality) ---

def build_prompt(query, search_results):
    """Assemble the chef-assistant prompt from a question and retrieved recipes.

    Args:
        query: The user's question, inserted after ``QUESTION:``.
        search_results: Iterable of recipe dicts. Each must provide every key
            referenced in the per-recipe entry below; extra keys are ignored.

    Returns:
        The full prompt string: instructions, a ``CONTEXT:`` section with one
        entry per recipe (entries separated by a blank line), and the question.
    """
    entry_format = "\n".join((
        "Recipe: {recipe_name}",
        "Cuisine: {cuisine_type}",
        "Meal Type: {meal_type}",
        "Difficulty: {difficulty_level}",
        "Prep Time: {prep_time_minutes} minutes",
        "Cook Time: {cook_time_minutes} minutes",
        "Main Ingredients: {main_ingredients}",
        "Instructions: {instructions}",
        "Dietary Info: {dietary_restrictions}",
    ))
    rendered_entries = [entry_format.format(**recipe) for recipe in search_results]
    context = "\n\n".join(rendered_entries)

    # The trailing space on the first line is part of the prompt text.
    prompt_format = "\n".join((
        "You are an expert chef and culinary assistant. Answer the question based on the content from ",
        "our recipe database. Use only the facts from the context when answering the question.",
        "",
        "CONTEXT:",
        "{context}",
        "",
        "QUESTION: {question}",
        "",
        "Provide recipe recommendations with brief explanations of why they match the requested ingredients.",
        "If exact ingredients aren't available, suggest the closest matches and mention any substitutions needed.",
    ))
    return prompt_format.format(context=context, question=query)

def llm(prompt, model='gpt-4o-mini'):
    """Send ``prompt`` as a single user message and return the model's reply text.

    Args:
        prompt: Full prompt text.
        model: Chat model name passed to the OpenAI client.

    Returns:
        The message content of the first completion choice.
    """
    user_message = {"role": "user", "content": prompt}
    completion = client.chat.completions.create(model=model, messages=[user_message])
    first_choice = completion.choices[0]
    return first_choice.message.content

def rag_minsearch(question):
    """Answer ``question`` with RAG backed by Minsearch.

    Retrieves 5 recipes using the optimized ``best_boost`` weights, builds the
    prompt from them, and returns the LLM's answer text.
    """
    retrieved = minsearch_search(question, boost=best_boost, num_results=5)
    return llm(build_prompt(question, retrieved))

def rag_elasticsearch(question):
    """Answer ``question`` with RAG backed by Elasticsearch.

    Retrieves 5 recipes, builds the prompt from them, and returns the LLM's
    answer text.
    """
    retrieved = elasticsearch_search(question, num_results=5)
    return llm(build_prompt(question, retrieved))

This cell evaluates the quality of the answers generated by the RAG pipeline. It samples questions, generates answers using the RAG functions, and then asks the LLM to judge the relevance of each answer.
The LLM then classifies each answer as "RELEVANT", "PARTLY_RELEVANT", or "NON_RELEVANT" and gives a brief explanation.

In [39]:
# --- 8. LLM-as-Judge Evaluation (RAG answer quality) ---

# Judge prompt for the LLM-as-judge evaluation. It is filled with
# str.format(question=..., answer_llm=...); the doubled braces are format-escapes that
# render as the literal { and } of the JSON object the judge is asked to return.
# Note that the judge sees only the question and the generated answer, not the retrieved recipes.
prompt2_template = """
You are an expert evaluator for a RAG system.
Your task is to analyze the relevance of the generated answer to the given question.
Based on the relevance of the generated answer, you will classify it
as "NON_RELEVANT", "PARTLY_RELEVANT", or "RELEVANT".

Here is the data for evaluation:

Question: {question}
Generated Answer: {answer_llm}

Please analyze the content and context of the generated answer in relation to the question
and provide your evaluation in parsable JSON without using code blocks:

{{
  "Relevance": "NON_RELEVANT" | "PARTLY_RELEVANT" | "RELEVANT",
  "Explanation": "[Provide a brief explanation for your evaluation]"
}}
""".strip()

def judge_rag_answers(rag_function, records):
    """Generate an answer per question and have the LLM judge its relevance.

    Shared by both retrieval backends so they are evaluated by identical code.

    Args:
        rag_function: Callable taking a question string and returning the
            generated answer text.
        records: Iterable of ground-truth records with ``'id'`` and ``'question'``.

    Returns:
        List of dicts with keys ``id``, ``question``, ``answer``, ``relevance``
        and ``explanation``. When the judge's reply is not a JSON object,
        ``relevance`` is ``"ERROR"`` and ``explanation`` holds the raw reply.
    """
    evaluations = []
    for record in tqdm(records):
        question = record['question']
        answer_llm = rag_function(question)
        prompt = prompt2_template.format(question=question, answer_llm=answer_llm)
        raw_evaluation = llm(prompt)
        try:
            evaluation = json.loads(raw_evaluation)
        except (json.JSONDecodeError, TypeError):
            evaluation = None
        # Valid JSON that is not an object (e.g. a list or a bare string) has no
        # .get(); treat it like an unparsable reply instead of crashing the loop.
        if not isinstance(evaluation, dict):
            evaluation = {"Relevance": "ERROR", "Explanation": raw_evaluation}
        evaluations.append({
            "id": record['id'],
            "question": question,
            "answer": answer_llm,
            "relevance": evaluation.get("Relevance"),
            "explanation": evaluation.get("Explanation")
        })
    return evaluations


# Same fixed 50-question sample for both backends (random_state pins the sample).
sample = df_gt.sample(n=50, random_state=1).to_dict(orient='records')

print("Evaluating RAG (Minsearch) with LLM-as-judge...")
evaluations_minsearch = judge_rag_answers(rag_minsearch, sample)

print("Evaluating RAG (Elasticsearch) with LLM-as-judge...")
evaluations_es = judge_rag_answers(rag_elasticsearch, sample)

df_eval_minsearch = pd.DataFrame(evaluations_minsearch)
df_eval_es = pd.DataFrame(evaluations_es)
df_eval_minsearch.to_csv('../data/rag-eval-minsearch.csv', index=False)
df_eval_es.to_csv('../data/rag-eval-elasticsearch.csv', index=False)

print("Minsearch RAG relevance proportions:")
print(df_eval_minsearch['relevance'].value_counts(normalize=True))
print("Elasticsearch RAG relevance proportions:")
print(df_eval_es['relevance'].value_counts(normalize=True))

Evaluating RAG (Minsearch) with LLM-as-judge...


  0%|          | 0/50 [00:00<?, ?it/s]

Evaluating RAG (Elasticsearch) with LLM-as-judge...


  0%|          | 0/50 [00:00<?, ?it/s]

Minsearch RAG relevance proportions:
relevance
RELEVANT           0.78
PARTLY_RELEVANT    0.22
Name: proportion, dtype: float64
Elasticsearch RAG relevance proportions:
relevance
RELEVANT           0.78
PARTLY_RELEVANT    0.22
Name: proportion, dtype: float64


In [40]:
# --- 9. Summary ---

# Retrieval metrics computed earlier over the full ground-truth set (top 10 results).
print("\n=== RETRIEVAL METRICS ===")
print("Minsearch:", metrics_minsearch)
print("Elasticsearch:", metrics_es)
# Share of judged answers labelled exactly 'RELEVANT'; every other label
# (including 'PARTLY_RELEVANT' and 'ERROR') counts as not relevant here.
print("\n=== RAG LLM-as-Judge (proportion RELEVANT) ===")
print("Minsearch:", (df_eval_minsearch['relevance'] == 'RELEVANT').mean())
print("Elasticsearch:", (df_eval_es['relevance'] == 'RELEVANT').mean())

# NOTE(review): only the ground truth and the RAG judge evaluations are written to CSV in
# this notebook; the retrieval metrics above are printed but not saved.
print("\nAll evaluation results saved to CSV in ../data/")


=== RETRIEVAL METRICS ===
Minsearch: {'hit_rate': 0.206, 'mrr': 0.10825555555555563}
Elasticsearch: {'hit_rate': 0.232, 'mrr': 0.10885634920634935}

=== RAG LLM-as-Judge (proportion RELEVANT) ===
Minsearch: 0.78
Elasticsearch: 0.78

All evaluation results saved to CSV in ../data/


**How to interpret the LLM-as-Judge RAG evaluation output:**

The table shows the proportions of answers classified as "RELEVANT" or "PARTLY_RELEVANT" by the LLM-as-judge for both Minsearch and Elasticsearch RAG pipelines.  
- **RELEVANT (0.78):** 78% of generated answers were judged fully relevant to the user's question.
- **PARTLY_RELEVANT (0.22):** 22% were judged partially relevant.

This indicates that, for ingredient- and attribute-based queries, the majority of answers generated by the RAG pipeline are highly relevant to user needs, regardless of the retrieval backend. This relevance-based evaluation provides a more realistic measure of user experience than strict retrieval metrics alone.