In [1]:
import minsearch
import os
import pandas as pd
import re
import time

from tqdm.auto import tqdm

from openai import OpenAI

In [9]:
api_key = os.environ["OPENAI_API_KEY"]
model = "gpt-4o-mini"

client = OpenAI(api_key=api_key)

In [10]:
df = pd.read_csv('../data/rag_dataset.csv')

In [11]:
df = df.fillna('')

In [12]:
documents = df.to_dict(orient='records')

In [13]:
documents[0]

{'id': 0,
 'breed_name': 'Affenpinscher',
 'history': "The word 'Affenpinscher' derives from Affe, German for 'ape' or 'monkey'; it is sometimes translated as 'Monkey Terrier', although the dog is a pinscher and not a terrier.\nThe origins of the Pinscher group of dogs are unknown. Dogs of this type, both rough-haired and smooth-haired, were traditionally kept as carriage dogs or as stable dogs, and so were sometimes known as Stallpinscher; they were capable ratters. Until the late nineteenth century, both rough-haired and smooth-haired types were known as Deutscher Pinscher, and came from the same lineage; puppies of both types could occur in the same litter.\nIn 1880 the Pinscher was recorded in the Deutschen Hundestammbuch of the Verein zur Veredelung der Hunderassen. In 1895 Ludwig Beckmann described five varieties of Pinscher – the rough- and smooth-haired Pinscher, the rough- and smooth-haired Miniature Pinscher, and the Affenpinscher. In 1895 a breed society, the Pinscher-Schnau

In [14]:
prompt_template = """
You emulate a user of our Pawfect Mate application who is interesting to addopt or buy a dog, but not sure what breed is the best for your family, activity and environment.
Formulate 5 questions this person might ask based on a provided dataset with dog breeds.
The record should contain the answer to the questions, and the questions should be complete and not too short. Use as fewer words as possible from the record. 

The record:
breed_name: {breed_name}
history: {history}
health: {health}
description: {description}
characteristics: {characteristics}
appearance: {appearance}
temperament: {temperament}

Return only valid JSON. 
Do not include Markdown code fences, explanations, or extra text.
Valid format: {{"questions": ["question1", "question2", ..., "question5"]}}
""".strip()

In [15]:
prompt = prompt_template.format(**documents[1])

In [18]:
def llm(prompt, model="gpt-4o-mini"):
    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}]
    )
    
    return response.choices[0].message.content

In [19]:
questions = llm(prompt)

In [21]:
cleaned = re.sub(r"^```json\s*|\s*```$", "", questions.strip(), flags=re.DOTALL)

In [22]:
import json

In [23]:
json.loads(cleaned)

{'questions': ['What is the history and origin of the Afghan Hound breed?',
  'How tall and heavy do Afghan Hounds typically get?',
  "What is the grooming requirement for the Afghan Hound's coat?",
  'What is the temperament of the Afghan Hound like?',
  'How intelligent and obedient are Afghan Hounds compared to other breeds?']}

In [25]:
def generate_questions(doc):
    prompt = prompt_template.format(**doc)

    response = client.chat.completions.create(
        model='gpt-4o-mini',
        messages=[{"role": "user", "content": prompt}]
    )
    
    json_reponse = response.choices[0].message.content
    return json_reponse

In [26]:
results = {}

In [28]:
# Due to:
# {"object":"error","message":"Service tier capacity exceeded for this model.",
# "type":"service_tier_capacity_exceeded","param":null,"code":"3505"}
# I divide documents into smaller chunks

midpoint = len(documents) // 2
chunks = [documents[:midpoint], documents[midpoint:]]

for chunk_idx, chunk in enumerate(chunks, start=1):
    print(f"Processing chunk {chunk_idx} with {len(chunk)} documents...")

    for doc in tqdm(chunk):
        doc_id = doc['id']
        if doc_id in results:
            continue

        questions = generate_questions(doc)

        # Clean JSON if AI adds code fences
        questions = json.loads(questions.strip())

        # Handle {"questions": [...]} vs just [...]
        if isinstance(questions, dict) and "questions" in questions:
            questions = questions["questions"]

        results[doc_id] = questions
        time.sleep(1)
    time.sleep(30)

Processing chunk 1 with 176 documents...


  0%|          | 0/176 [00:00<?, ?it/s]

Processing chunk 2 with 177 documents...


  0%|          | 0/177 [00:00<?, ?it/s]

In [29]:
results

{0: ['What is the lifespan of an Affenpinscher compared to other dog breeds?',
  'What health issues are common in Affenpinschers that I should be aware of?',
  'What size and weight should I expect from an Affenpinscher?',
  'What grooming requirements does an Affenpinscher have, especially if I want to maintain its coat?',
  'How has the Affenpinscher historically been used, and does it make a good companion dog for families?'],
 1: ['What is the history of the Afghan Hound and how did it develop over time?',
  "What are the grooming requirements for an Afghan Hound's coat?",
  'How tall and heavy does an Afghan Hound typically get?',
  'What is the temperament of Afghan Hounds and how do they behave around people?',
  'What are the specific training challenges associated with Afghan Hounds?'],
 2: ["What is the Aidi's native history and primary role in its original environment?",
  'How well does the Aidi adapt to urban living and what are its needs as a pet?',
  'Can you describe t

In [30]:
final_results = []

for doc_id, questions in results.items():
    for q in questions:
        final_results.append((doc_id, q))

In [31]:
final_results

[(0, 'What is the lifespan of an Affenpinscher compared to other dog breeds?'),
 (0,
  'What health issues are common in Affenpinschers that I should be aware of?'),
 (0, 'What size and weight should I expect from an Affenpinscher?'),
 (0,
  'What grooming requirements does an Affenpinscher have, especially if I want to maintain its coat?'),
 (0,
  'How has the Affenpinscher historically been used, and does it make a good companion dog for families?'),
 (1,
  'What is the history of the Afghan Hound and how did it develop over time?'),
 (1, "What are the grooming requirements for an Afghan Hound's coat?"),
 (1, 'How tall and heavy does an Afghan Hound typically get?'),
 (1,
  'What is the temperament of Afghan Hounds and how do they behave around people?'),
 (1,
  'What are the specific training challenges associated with Afghan Hounds?'),
 (2,
  "What is the Aidi's native history and primary role in its original environment?"),
 (2,
  'How well does the Aidi adapt to urban living and 

In [32]:
df_q = pd.DataFrame(final_results, columns=['id', 'question'])

In [None]:
# df_q.to_csv('../data/ground_truth_retrieval.csv', index=False)

### Retrieval evaluation

In [34]:
df_questions = pd.read_csv('../data/ground_truth_retrieval.csv')

In [35]:
df_questions.head()

Unnamed: 0,id,question
0,0,What is the lifespan of an Affenpinscher compa...
1,0,What health issues are common in Affenpinscher...
2,0,What size and weight should I expect from an A...
3,0,What grooming requirements does an Affenpinsch...
4,0,How has the Affenpinscher historically been us...


In [36]:
ground_truth = df_questions.to_dict(orient='records')
ground_truth[0]

{'id': 0,
 'question': 'What is the lifespan of an Affenpinscher compared to other dog breeds?'}

In [37]:
def hit_rate(relevance_total):
    cnt = 0

    for line in relevance_total:
        if True in line:
            cnt = cnt + 1 
    
    return cnt/len(relevance_total)

def mrr(relevance_total):
    total_score = 0.0

    for line in relevance_total:
        for rank in range(len(line)):
            if line[rank] == True:
                total_score = total_score + 1 / (rank + 1)
    
    return total_score / len(relevance_total)

In [38]:
def minsearch_search(query):
    boost = {}

    results = index.search(
        query=query,
        filter_dict={},
        boost_dict=boost,
        num_results=10
    )

    return results

In [39]:
def evaluate(ground_truth, search_function):
    relevance_total = []

    for q in tqdm(ground_truth):
        doc_id = q['id']
        results = search_function(q)
        relevance = [d['id'] == doc_id for d in results]
        relevance_total.append(relevance)

    return {
        'hit_rate': hit_rate(relevance_total),
        'mrr': mrr(relevance_total),
    }

In [40]:
index = minsearch.Index(
    text_fields=['breed_name', 'history', 'health', 'description', 'characteristics',
       'appearance', 'temperament'],
    keyword_fields=['id']
)

In [41]:
index.fit(documents)

<minsearch.minsearch.Index at 0x1b3888021d0>

In [42]:
evaluate(ground_truth, lambda q: minsearch_search(q['question']))

  0%|          | 0/1765 [00:00<?, ?it/s]

{'hit_rate': 0.973371104815864, 'mrr': 0.9132188947344753}

In [43]:
df_validation = df_questions[:100]
df_test = df_questions[100:]

### Finding the best parameters

In [44]:
# from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
# from hyperopt.pyll import scope

import random

In [45]:
def simple_optimize(param_ranges, objective_function, n_iterations=10):
    best_params = None
    best_score = float('-inf')  # Assuming we're minimizing. Use float('-inf') if maximizing.

    for _ in range(n_iterations):
        # Generate random parameters
        current_params = {}
        for param, (min_val, max_val) in param_ranges.items():
            if isinstance(min_val, int) and isinstance(max_val, int):
                current_params[param] = random.randint(min_val, max_val)
            else:
                current_params[param] = random.uniform(min_val, max_val)
        
        # Evaluate the objective function
        current_score = objective_function(current_params)
        
        # Update best if current is better
        if current_score > best_score:  # Change to > if maximizing
            best_score = current_score
            best_params = current_params
    
    return best_params, best_score

In [46]:
gt_val = df_validation.to_dict(orient='records')

In [47]:
def minsearch_search(query, boost=None):
    if boost is None:
        boost = {}

    results = index.search(
        query=query,
        filter_dict={},
        boost_dict=boost,
        num_results=10
    )

    return results

In [48]:
param_ranges = {
    'breed_name': (0.0, 3.0),
    'history': (0.0, 3.0),
    'health': (0.0, 3.0),
    'description': (0.0, 3.0),
    'characteristics': (0.0, 3.0),
    'appearance': (0.0, 3.0),
    'temperament': (0.0, 3.0),
}

def objective(boost_params):
    def search_function(q):
        return minsearch_search(q['question'], boost_params)

    results = evaluate(gt_val, search_function)
    return results['mrr']

In [49]:
simple_optimize(param_ranges, objective, n_iterations=20)

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

({'breed_name': 1.0519134826436933,
  'history': 1.501112921482636,
  'health': 0.37981794720744166,
  'description': 0.9537711637772556,
  'characteristics': 0.9640923514228378,
  'appearance': 1.2326646408363218,
  'temperament': 0.8259837961261812},
 0.9520000000000001)

In [50]:
def minsearch_improved(query):
    boost = {
        'breed_name': 1.05,
        'history': 1.5,
        'health': 0.38,
        'description': 0.95,
        'characteristics': 0.94,
        'appearance': 1.23,
        'temperament': 0.83,
    }

    results = index.search(
        query=query,
        filter_dict={},
        boost_dict=boost,
        num_results=10
    )

    return results

evaluate(ground_truth, lambda q: minsearch_improved(q['question']))

  0%|          | 0/1765 [00:00<?, ?it/s]

{'hit_rate': 0.976770538243626, 'mrr': 0.9172680875938666}

### RAG evaluation

#### LLM-as-a-Judge

In [51]:
prompt_template = """
You're a dog behaviour expert and assist families to find the best dog breed match for their preferences. Answer the QUESTION based on the CONTEXT from the FAQ database.
Use only the facts from the CONTEXT when answering the QUESTION.

QUESTION: {question}

CONTEXT: 
{context}
""".strip()

entry_template = """
breed_name: {breed_name}
history: {history}
health: {health}
description: {description}
characteristics: {characteristics}
appearance: {appearance}
temperament: {temperament}
""".strip()

In [52]:
def build_prompt(query, search_results):
    context = ""
    
    for doc in search_results:
        context = context + entry_template.format(**doc) + "\n\n"

    prompt = prompt_template.format(question=query, context=context).strip()
    return prompt

In [53]:
client = OpenAI(api_key=api_key)

In [54]:
def llm(prompt, model="gpt-5-nano"):
    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}]
    )
    
    return response.choices[0].message.content

In [55]:
def rag(query, model="gpt-5-nano"):
    search_results = minsearch_search(query)
    prompt = build_prompt(query, search_results)
    #print(prompt)
    answer = llm(prompt, model=model)
    return answer

In [56]:
prompt_template_judge = """
You are an expert evaluator for a RAG system.
Your task is to analyze the relevance of the generated answer to the given question.
Based on the relevance of the generated answer, you will classify it
as "NON_RELEVANT", "PARTLY_RELEVANT", or "RELEVANT".

Here is the data for evaluation:

Question: {question}
Generated Answer: {answer_llm}

Please analyze the content and context of the generated answer in relation to the question
and provide your evaluation in parsable JSON without using code blocks:

{{
  "Relevance": "NON_RELEVANT" | "PARTLY_RELEVANT" | "RELEVANT",
  "Explanation": "[Provide a brief explanation for your evaluation]"
}}
""".strip()

### Evaluation for all

In [57]:
api_key = os.environ["OPENAI_API_KEY"]
# model = "mistral-large-latest"

client = OpenAI(api_key=api_key)

In [58]:
df_sample = df_questions.sample(n=50, random_state=1)
sample = df_sample.to_dict(orient='records')

In [59]:
evaluations = []

for record in tqdm(sample):
    
    question = record['question']
    answer_llm = rag(question)

    prompt = prompt_template_judge.format(question=question, answer_llm=answer_llm)
    evaluation = llm(prompt)
    evaluation = json.loads(evaluation)

    evaluations.append((record, answer_llm, evaluation))
    time.sleep(1)

  0%|          | 0/50 [00:00<?, ?it/s]

In [60]:
df_eval = pd.DataFrame(evaluations, columns=['record', 'answer', 'evaluation'])

In [61]:
df_eval['id'] = df_eval.record.apply(lambda d: d['id'])
df_eval['question'] = df_eval.record.apply(lambda d: d['question'])
df_eval['relevance'] = df_eval.evaluation.apply(lambda d: d['Relevance'])
df_eval['explanation'] = df_eval.evaluation.apply(lambda d: d['Explanation'])

In [62]:
df_eval.head()

Unnamed: 0,record,answer,evaluation,id,question,relevance,explanation
0,"{'id': 128, 'question': 'What kind of living s...",- Not suited to a strictly kenneled lifestyle....,"{'Relevance': 'RELEVANT', 'Explanation': 'The ...",128,What kind of living situation is best suited f...,RELEVANT,The answer directly describes the suitable liv...
1,"{'id': 271, 'question': 'What are the common h...",Common health issues in Rhodesian Ridgebacks:\...,"{'Relevance': 'RELEVANT', 'Explanation': 'The ...",271,What are the common health issues associated w...,RELEVANT,The answer directly lists health issues associ...
2,"{'id': 132, 'question': 'What is the historica...",- Historical origin\n - The Galgo Español is ...,"{'Relevance': 'RELEVANT', 'Explanation': 'The ...",132,What is the historical origin of the Galgo Esp...,RELEVANT,The answer directly addresses the historical o...
3,"{'id': 141, 'question': 'What are the essentia...",Essential hunting traits and skills of the Bla...,"{'Relevance': 'RELEVANT', 'Explanation': 'The ...",141,What are the essential traits and skills this ...,RELEVANT,The answer lists hunting-relevant traits (trac...
4,"{'id': 227, 'question': 'What is the exercise ...",The Norwegian Buhund requires extensive daily ...,"{'Relevance': 'RELEVANT', 'Explanation': 'The ...",227,What is the exercise requirement for the Norwe...,RELEVANT,The answer directly states the exercise needs ...


In [63]:
del df_eval['record']
del df_eval['evaluation']

In [64]:
df_eval.relevance.value_counts()

relevance
RELEVANT           41
PARTLY_RELEVANT     6
NON_RELEVANT        3
Name: count, dtype: int64