In [1]:
import pandas as pd
import json
from tqdm.auto import tqdm
from openai import OpenAI
import os
import torch
from sentence_transformers import SentenceTransformer
from pinecone import Pinecone
from pinecone import ServerlessSpec
import time
import requests
from groq import Groq
from rank_bm25 import BM25Okapi
import numpy as np



  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from dotenv import load_dotenv

# Load the environment variables from the .env file
load_dotenv(dotenv_path="../.env")
GROQ_API_KEY = os.getenv('GROQ_API_KEY')
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY") 
OPENROUTER_API_KEY = os.getenv('OPENROUTER_API_KEY')

In [3]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = SentenceTransformer('all-MiniLM-L6-v2', device=device)


openrouter_client = OpenAI(
  api_key = OPENROUTER_API_KEY,
  base_url = "https://openrouter.ai/api/v1",
)

groq_client = Groq(
  api_key=GROQ_API_KEY
)

### Data Generation

In [5]:
df = pd.read_csv('../data/clean.csv')
documents = df.to_dict(orient='records')

In [6]:
documents[0]

{'name': 'Doro Wat',
 'country': 'Ethiopia',
 'ingredients': 'Chicken, onions, garlic, ginger, berbere spice mix, niter kibbeh',
 'instructions': 'In a pot, sauté onions, garlic, and ginger in niter kibbeh until soft. Add berbere spice mix, cook for a few minutes, then add chicken. Cook until the chicken is tender.',
 'meal_type': 'Main',
 'spice_level': 'High',
 'cooking_time_(minutes)': 90,
 'vegetarian': 'No',
 'main_cooking_method': 'Stewing',
 'serving_temperature': 'Hot',
 'how_to_make': 'Start by preparing niter kibbeh, a spiced clarified butter. Then sauté onions, garlic, and ginger in the niter kibbeh. Add berbere spice mix and stir for a few minutes. Add chicken pieces, cover, and cook until the chicken is thoroughly cooked.',
 'id': 'f2ff4980-fda2-4bfe-8a05-aca63be38345'}

In [7]:
prompt_template = """
You emulate a user of our Chief assistant application.
Formulate 5 questions this user might ask based on a provided exercise.
Make the questions specific to this exercise.
The record should contain the answer to the questions, and the questions should
be complete and not too short. Use as fewer words as possible from the record. 

The record:


name: {name},
ingredients: {ingredients},
instructions: {instructions},
meal_type: {meal_type},
spice_level: {spice_level},
cooking_time_(minutes): {cooking_time_(minutes)},
vegetarian: {vegetarian},
main_cooking_method: {main_cooking_method},
serving_temperature: {serving_temperature},
how_to_make: {how_to_make}


Provide the output in parsable JSON without using code blocks and don't add anything else:

{{"questions": ["question1", "question2", ..., "question5"]}}
""".strip()

In [7]:
prompt = prompt_template.format(**documents[0])


In [8]:
def llm(prompt):
    response = client.chat.completions.create(
        model="meta-llama/llama-3.1-8b-instruct:free",
        messages=[{"role": "user", "content": prompt}]
    )
    
    return response.choices[0].message.content

In [23]:
questions = llm(prompt)


In [24]:
questions

'{"questions": ["What is the specific spice mix called in this recipe that adds high spice level?", "How long does it take to cook Doro Wat according to the instructions?", "Is Doro Wat suitable for vegetarians?", "What is the main cooking method for this Ethiopian dish?", "At what temperature should Doro Wat be served?"]}'

In [25]:
json.loads(questions)


{'questions': ['What is the specific spice mix called in this recipe that adds high spice level?',
  'How long does it take to cook Doro Wat according to the instructions?',
  'Is Doro Wat suitable for vegetarians?',
  'What is the main cooking method for this Ethiopian dish?',
  'At what temperature should Doro Wat be served?']}

In [28]:
def generate_questions(doc):
    prompt = prompt_template.format(**doc)

    response = client.chat.completions.create(
        model="meta-llama/llama-3.1-8b-instruct:free",
        messages=[{"role": "user", "content": prompt}]
    )

    json_response = response.choices[0].message.content
    return json_response

In [29]:
results = {}
for doc in tqdm(documents): 
    doc_id = doc['id']
    if doc_id in results:
        continue

    questions_raw = generate_questions(doc)
    questions = json.loads(questions_raw)
    results[doc_id] = questions['questions']




100%|██████████| 166/166 [06:26<00:00,  2.33s/it]


In [30]:
final_results = []

for doc_id, questions in results.items():
    for q in questions:
        final_results.append((doc_id, q))


In [31]:
final_results[0]


('f2ff4980-fda2-4bfe-8a05-aca63be38345',
 'What is the recommended spice level for the Doro Wat dish?')

In [32]:
final_results[-1]

('36d849e4-35c1-4848-84e2-987fd1b33b9d',
 'What is the recommended temperature at which Mitarashi Dango should be served?')

In [33]:
df_results = pd.DataFrame(final_results, columns=['id', 'question'])


In [34]:
df_results.to_csv('../data/ground-truth-retrieval.csv', index=False)

In [35]:
df_results.head()

Unnamed: 0,id,question
0,f2ff4980-fda2-4bfe-8a05-aca63be38345,What is the recommended spice level for the Do...
1,f2ff4980-fda2-4bfe-8a05-aca63be38345,How long does it take to cook the Doro Wat dish?
2,f2ff4980-fda2-4bfe-8a05-aca63be38345,Is the Doro Wat dish suitable for a vegetarian...
3,f2ff4980-fda2-4bfe-8a05-aca63be38345,What is the main cooking method used for the D...
4,f2ff4980-fda2-4bfe-8a05-aca63be38345,Should the Doro Wat dish be served hot or cold?


In [42]:
df[["id", "name"]].head()

Unnamed: 0,id,name
0,f2ff4980-fda2-4bfe-8a05-aca63be38345,Doro Wat
1,78c8ff6c-af26-4012-a637-af925fda17f7,Injera
2,3437bb23-0c42-4548-9165-f31f586b968f,Sushi
3,ec101289-7120-4818-99ec-40345acf99b8,Tacos
4,8395a5cd-7bd2-45f7-a19d-c4a232d02fa4,Paella


### Retrieval Evaluation

In [10]:
df_question = pd.read_csv('../data/ground-truth-retrieval.csv')


In [11]:
df_question.head()


Unnamed: 0,id,question
0,f2ff4980-fda2-4bfe-8a05-aca63be38345,What is the recommended spice level for the Do...
1,f2ff4980-fda2-4bfe-8a05-aca63be38345,How long does it take to cook the Doro Wat dish?
2,f2ff4980-fda2-4bfe-8a05-aca63be38345,Is the Doro Wat dish suitable for a vegetarian...
3,f2ff4980-fda2-4bfe-8a05-aca63be38345,What is the main cooking method used for the D...
4,f2ff4980-fda2-4bfe-8a05-aca63be38345,Should the Doro Wat dish be served hot or cold?


In [12]:
ground_truth = df_question.to_dict(orient='records')
ground_truth[0]


{'id': 'f2ff4980-fda2-4bfe-8a05-aca63be38345',
 'question': 'What is the recommended spice level for the Doro Wat dish?'}

In [13]:
def hit_rate(relevance_total):
    cnt = 0

    for line in relevance_total:
        if True in line:
            cnt = cnt + 1

    return cnt / len(relevance_total)

def mrr(relevance_total):
    total_score = 0.0

    for line in relevance_total:
        for rank in range(len(line)):
            if line[rank]:
                total_score = total_score + 1 / (rank + 1)

    return total_score / len(relevance_total)

In [7]:
pc = Pinecone(api_key=PINECONE_API_KEY)


spec = ServerlessSpec(
    cloud="aws", region="us-east-1"
)

index_name = 'semantic-search-4'

# connect to index
index = pc.Index(index_name)
time.sleep(1)
# view index stats
index.describe_index_stats()

{'dimension': 384,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 166}},
 'total_vector_count': 166}

In [15]:
def get_all_documents(batch_size=10):
    documents = []
    total_vectors = index.describe_index_stats()['total_vector_count']
    # print(f"Total vectors in index: {total_vectors}")
    
    random_vector = np.random.rand(384).tolist()  # Assuming 384 is the dimension of your vectors
    
    # Query all vectors
    query_response = index.query(
        vector=random_vector,
        top_k=total_vectors,
        include_metadata=True
    )
    
    # print(f"Retrieved {len(query_response['matches'])} vectors")
    
    for match in query_response['matches']:
        # print(f"Vector ID: {match.id}")
        # print(f"Vector metadata: {match.metadata}")
        
        if 'text' in match.metadata:
            documents.append(match.metadata['text'])
        else:
            print(f"Warning: 'text' not found in metadata for vector {match.id}")
    
    # print(f"Total documents retrieved: {len(documents)}")
    return documents


def keyword_search(query, documents, top_k=15):
    tokenized_corpus = [doc.split() for doc in documents]
    bm25 = BM25Okapi(tokenized_corpus)
    tokenized_query = query.split()
    bm25_scores = bm25.get_scores(tokenized_query)
    top_n = np.argsort(bm25_scores)[::-1][:top_k]
    return [(idx, bm25_scores[idx]) for idx in top_n]

def query_pinecone(query, top_k=5):
    xq = model.encode(query).tolist()
    xc = index.query(vector=xq, top_k=top_k, include_metadata=True)
    results = []
    for match in xc.matches:
        result = {
            "id": match.id,
            "score": match.score,
            "metadata": match.metadata
        }
        
        results.append(result)
    # print(results)
    
    return results




def hybrid_query_pinecone(query, documents, top_k=15, alpha=0.8):
    # Vector search
    xq = model.encode(query).tolist()
    vector_results = index.query(vector=xq, top_k=top_k, include_metadata=True)
    
    # Keyword search
    keyword_results = keyword_search(query, documents, top_k=top_k)
    
    # Combine results
    combined_results = {}
    for match in vector_results.matches:
        combined_results[match.id] = {
            "id": match.id,
            "vector_score": match.score,
            "keyword_score": 0,
            "metadata": match.metadata
        }
    
    for idx, score in keyword_results:
        if idx in combined_results:
            combined_results[idx]["keyword_score"] = score
        else:
            combined_results[idx] = {
                "id": idx,
                "vector_score": 0,
                "keyword_score": score,
                "metadata": None  # You might want to fetch metadata for these results
            }
    
    # Calculate hybrid score
    for result in combined_results.values():
        result["hybrid_score"] = alpha * result["vector_score"] + (1 - alpha) * result["keyword_score"]
    
    # Sort by hybrid score and return top results
    sorted_results = sorted(combined_results.values(), key=lambda x: x["hybrid_score"], reverse=True)[:top_k]
    
    return [result for result in sorted_results if result["hybrid_score"] > 0.2]

def evaluate(ground_truth, search_function):
    relevance_total = []
    count = 0

    for q in tqdm(ground_truth):
        doc_id = q['id']
        # print(doc_id)
        # try:
        #     results = search_function(q)
        #     count += 1
        # except:
        #     pass
        results = search_function(q)
        count += 1
        # print(results)
        relevance = [d['id'] == doc_id for d in results]
        # print(relevance)
        relevance_total.append(relevance)
        # break

    # print(count)

    return {
        'hit_rate': hit_rate(relevance_total),
        'mrr': mrr(relevance_total),
    }

#### Only Vector Search

In [14]:
evaluate(ground_truth, lambda q: query_pinecone(q['question']))


100%|██████████| 822/822 [05:29<00:00,  2.49it/s]

822





{'hit_rate': 0.8880778588807786, 'mrr': 0.848337388483374}

- {'hit_rate': 0.8880778588807786, 'mrr': 0.848337388483374}


#### Hybrid Search

In [23]:
all_documents = get_all_documents()
print(f"Length of all_documents: {len(all_documents)}")
print("First 5 documents:")
print(all_documents[:5])  # Print the first 5 documents to see their content


Length of all_documents: 166
First 5 documents:
['Biryani India Rice, chicken, saffron, spices, yogurt Layer rice with marinated chicken and spices, cook until fragrant. Main High 75 No Simmering Hot Layer partially cooked rice with marinated chicken, saffron, and spices, cook on low heat until rice is fully cooked and aromatic. 0af546e3-ef7f-4848-9d1d-b01f2c99553a', 'Stuffed Peppers Spain Bell peppers, rice, ground beef, tomatoes, spices Stuff peppers with rice and beef mixture, bake until peppers are tender. Main Medium 60 No Baking Hot Mix cooked rice with ground beef and tomatoes, stuff into hollowed bell peppers, bake until peppers are tender. 8276e8fd-3bc6-44ac-b6c6-ce78985c7099', 'Pasta Primavera Italy Pasta, tomatoes, zucchini, bell peppers, garlic Cook pasta with sautéed vegetables, serve with cheese. Main  20 Yes Boiling Hot Boil pasta until al dente, sauté tomatoes, zucchini, and bell peppers with garlic, toss with pasta and Parmesan. b305c0d9-8fc3-42a2-a90f-bb5efd40667a', '

In [24]:
evaluate(ground_truth, lambda q: hybrid_query_pinecone(q['question'], all_documents))

100%|██████████| 822/822 [04:57<00:00,  2.77it/s]


{'hit_rate': 0.3746958637469586, 'mrr': 0.06387102016664079}

### RAG Evaluation

In [57]:
prompt2_template = """
You are an expert evaluator for a RAG system.
Your task is to analyze the relevance of the generated answer to the given question.
Based on the relevance of the generated answer, you will classify it
as "NON_RELEVANT", "PARTLY_RELEVANT", or "RELEVANT".

Here is the data for evaluation:

Question: {question}
Generated Answer: {answer_llm}

Please analyze the content and context of the generated answer in relation to the question
and provide your evaluation in parsable JSON without using code blocks and make sure it is json parsable and nothing else is added
to the below structure:

{{
  "Relevance": "NON_RELEVANT" | "PARTLY_RELEVANT" | "RELEVANT",
  "Explanation": "[Provide a brief explanation for your evaluation]"
}}
""".strip()

In [58]:
len(ground_truth)


822

In [50]:
ground_truth[0]["question"]

'What is the recommended spice level for the Doro Wat dish?'

In [43]:
GROQ_API_KEY = os.getenv('GROQ_API_KEY')



In [59]:

def format_dish_info(dish):
    return "\n".join([f"{key}: {value}" for key, value in dish['metadata'].items() if value])

def query_openrouter(prompt):
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {os.getenv('OPENROUTER_API_KEY')}"
    }
    
    data = {
        "model": "meta-llama/llama-3.1-8b-instruct:free",
        "messages": [{"role": "user", "content": prompt}],
        "max_tokens": 150,
        "temperature": 0.0
    }
    
    response = requests.post("https://openrouter.ai/api/v1/chat/completions", headers=headers, json=data)
    # print(response.json())
    return response.json()['choices'][0]['message']['content']

def qa_function(question):
    # Query Pinecone
    results = query_pinecone(question)
    # print(results)
    if not results:
        return "I'm sorry, I couldn't find any relevant information to answer your question."
    
    # Format the dish information
    all_dish_info = "\n\n".join([format_dish_info(dish) for dish in results])

    # print(all_dish_info)
    
    # Create the prompt
    prompt = f"""
    Based on the following information about a dish, please answer the question: {question}

    Dish information:
    {all_dish_info}

    Answer:
    """
    
    # Use OpenRouter to generate an answer
    response = llm(prompt)
    
    return response

def llm(prompt, model="llama-3.1-70b-versatile"):

    client = Groq(
        api_key=GROQ_API_KEY
    )

    chat_completion = client.chat.completions.create(
        messages=[
            {
                "role": "user",
                "content": prompt
            }
        ],
        model=model,
    )

    # print(chat_completion.choices[0].message.content)
    return chat_completion.choices[0].message.content

# Test the QA function
question = ground_truth[0]["question"]
answer = qa_function(question)
print(f"Question: {question}")
print(f"Answer: {answer}")

Question: What is the recommended spice level for the Doro Wat dish?
Answer: The recommended spice level for the Doro Wat dish is High.


In [60]:
prompt = prompt2_template.format(question=question, answer_llm=answer)
print(prompt)

You are an expert evaluator for a RAG system.
Your task is to analyze the relevance of the generated answer to the given question.
Based on the relevance of the generated answer, you will classify it
as "NON_RELEVANT", "PARTLY_RELEVANT", or "RELEVANT".

Here is the data for evaluation:

Question: What is the recommended spice level for the Doro Wat dish?
Generated Answer: The recommended spice level for the Doro Wat dish is High.

Please analyze the content and context of the generated answer in relation to the question
and provide your evaluation in parsable JSON without using code blocks and make sure it is json parsable and nothing else is added
to the below structure:

{
  "Relevance": "NON_RELEVANT" | "PARTLY_RELEVANT" | "RELEVANT",
  "Explanation": "[Provide a brief explanation for your evaluation]"
}


In [61]:
df_sample = df_question.sample(n=200, random_state=1)

In [62]:
df_sample

Unnamed: 0,id,question
528,8276e8fd-3bc6-44ac-b6c6-ce78985c7099,How long does it take to cook Stuffed Peppers?
331,f946cf79-a23e-413e-a1c1-75d859aefbcf,At what temperature should Dolma be served?
735,865d6381-f4ad-4125-946f-6174e69032dc,What is the recommended temperature to serve S...
17,ec101289-7120-4818-99ec-40345acf99b8,What is the recommended spice level for Tacos?
388,96cc1ca3-31d5-4c98-99ad-dcc6569a2191,What is the minimum amount of time I need to c...
...,...,...
202,d69296f1-90b2-4c19-986c-5a67d1064f15,What type of main course is this recipe
161,5435a146-92b5-4229-816d-b58ded66553e,Can you confirm the main cooking method for ma...
752,47734007-2544-4d23-9deb-b57411a199db,What are the ingredients needed to make Saltim...
375,940feb4d-c0f8-4489-816a-f04342567840,At what temperature should Plov be served?


In [63]:
sample = df_sample.to_dict(orient='records')


In [66]:
evaluations = []

for record in tqdm(sample):
    question = record['question']
    answer_llm = qa_function(question) 

    prompt = prompt2_template.format(
        question=question,
        answer_llm=answer_llm
    )

    evaluation = llm(prompt)
    try:
        evaluation = json.loads(evaluation)
        evaluations.append((record, answer_llm, evaluation))
    except:
        pass

100%|██████████| 200/200 [19:02<00:00,  5.71s/it]


In [67]:
evaluations[0]

({'id': '8276e8fd-3bc6-44ac-b6c6-ce78985c7099',
  'question': 'How long does it take to cook Stuffed Peppers?'},
 'It takes 60 minutes to cook Stuffed Peppers.',
 {'Relevance': 'RELEVANT',
  'Explanation': 'The generated answer directly addresses the question by providing a specific cooking time for Stuffed Peppers, making it a relevant response.'})

In [68]:
df_eval = pd.DataFrame(evaluations, columns=['record', 'answer', 'evaluation'])

df_eval['id'] = df_eval.record.apply(lambda d: d['id'])
df_eval['question'] = df_eval.record.apply(lambda d: d['question'])

df_eval['relevance'] = df_eval.evaluation.apply(lambda d: d['Relevance'])
df_eval['explanation'] = df_eval.evaluation.apply(lambda d: d['Explanation'])

del df_eval['record']
del df_eval['evaluation']

In [70]:
df_eval.relevance.value_counts(normalize=True)


relevance
RELEVANT           0.809783
PARTLY_RELEVANT    0.146739
NON_RELEVANT       0.043478
Name: proportion, dtype: float64

In [73]:
df_eval[df_eval["relevance"] == "NON_RELEVANT"][["question", "answer"]]

Unnamed: 0,question,answer
20,What is the recommended serving temperature fo...,The provided information does not include deta...
30,What is the recommended spice level for this d...,"Based on the information provided, the recomme..."
45,What is the recommended spice level for this d...,This query relates to one dish and as per that...
64,What is the spice level of Tzatziki dip?,The information provided about Tzatziki dip do...
122,What is the average time it takes to cook Onig...,"Unfortunately, there is no information about O..."
137,Can I make Moussaka without cooking the eggpla...,To determine how uncooking or frying differs \...
159,What is the main cooking method used for this ...,The main cooking method used for this dessert ...
172,What is the main cooking method used in this r...,There are multiple dishes in the information p...


In [74]:
df_eval.to_csv('../data/rag-eval-llama-3.1-70b-versatile-groq.csv', index=False)
