In [1]:
import pandas as pd
import json
import os
from dotenv import load_dotenv
import openai
from tqdm.auto import tqdm

# Data Ingestion and Indexing

In [2]:
# Load the raw recipe collection from disk into a DataFrame.
df = pd.read_json("../data/recipes.json")

In [3]:
# One plain dict per DataFrame row, keyed by column name.
receipes = df.to_dict(orient="records")

In [4]:
# Show how many recipe records were loaded.
print(len(receipes))

360


In [5]:
# Copy every recipe without its 'output' and 'date' keys; all other fields are kept unchanged.
clean_recipes = [
    {field: content for field, content in entry.items() if field not in ['output', 'date']}
    for entry in receipes
]

In [6]:
# Peek at the first cleaned record to confirm which fields remain.
clean_recipes[0]

{'title': 'Creamy Mashed Potatoes',
 'tags': ['potato', 'side', 'cheesefare'],
 'introduction': "![Creamy Mashed Potatoes](/pix/creamy-mashed-potatoes.webp) Mashed potatoes is a really great recipe that is often relegated to the position of side dish. This recipe is a spin of the classical mashed potatoes recipe that's got itself more going on. You can serve this dish for a relatively light meal, or you can also serve it as a side dish if you want to have a really hearty meal.",
 'ingredients': 'The quantities here are for about four adult portions. If you are planning on eating this as a side dish, it might be more like 6-8 portions. * 1kg potatoes * 200ml milk* * 200ml mayonnaise* * ~100g cheese * Garlic powder * 12-16 strips of bacon * Butter * 3-4 green onions * Black pepper * Salt  *You can play with the proportions depending on how creamy or dry you want the mashed potatoes to be.',
 'direction': '1. Peel and cut the potatoes into medium sized pieces. 2. Put the potatoes in a pot

In [7]:
from sentence_transformers import SentenceTransformer

# Sentence-embedding checkpoint; the same `model` object encodes recipes and user questions.
model_name = "all-MiniLM-L12-v2"
model = SentenceTransformer(model_name)



In [77]:
from elasticsearch import Elasticsearch

es_client = Elasticsearch('http://localhost:9200')

index_name = "food_recipes"

index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0,
    },
    "mappings": {
        "properties": {
            # Every free-text recipe field is mapped as a plain `text` field.
            **{
                text_field: {"type": "text"}
                for text_field in ("title", "tags", "introduction", "ingredients", "direction")
            },
            # Vector field queried by the kNN search.
            # NOTE(review): dims must equal the length of the vectors produced by the
            # embedding model; 384 is assumed to match it -- confirm if the model changes.
            "combined_vector": {
                "type": "dense_vector",
                "dims": 384,
                "index": True,
                "similarity": "cosine",
            },
        },
    },
}

# Start from an empty index on every run: drop it if it exists, then recreate it.
es_client.indices.delete(index=index_name, ignore_unavailable=True)
es_client.indices.create(index=index_name, body=index_settings)
                    
                        
    
    

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'food_recipes'})

In [78]:
# `tqdm` comes from the import cell at the top of the notebook.
# (title, error message) for every recipe that Elasticsearch refused to index.
failed_recipes = []

for recipe in tqdm(clean_recipes):
    # Embed title, ingredients and direction together as one vector per recipe.
    combined_text = ' '.join([recipe['title'], recipe['ingredients'], recipe['direction']])
    # Build a separate document instead of writing the vector back into
    # `clean_recipes`, so the cleaned records stay as they were displayed earlier.
    document = {**recipe, 'combined_vector': model.encode(combined_text)}
    try:
        es_client.index(index=index_name, document=document)
    except Exception as e:
        # Best-effort indexing: keep going, but remember what failed so a
        # partially populated index cannot go unnoticed.
        print(e)
        failed_recipes.append((recipe['title'], str(e)))

print(f"Indexed {len(clean_recipes) - len(failed_recipes)} of {len(clean_recipes)} recipes")
            
 

  0%|          | 0/360 [00:00<?, ?it/s]

In [10]:
def elastic_search_knn_query(field, vector, k=5, num_candidates=360):
    """Run a kNN search against the recipe index and return the matching documents.

    Args:
        field: Name of the dense-vector field to search.
        vector: Query embedding to compare against `field`.
        k: Number of nearest neighbours to return. Defaults to 5, the value
            that was previously hard-coded.
        num_candidates: Value sent as the kNN `num_candidates` option.
            Defaults to 360, the value that was previously hard-coded.

    Returns:
        A list with the `_source` of each hit, restricted to the title, tags,
        introduction, ingredients and direction fields.
    """
    knn = {
        "field": field,
        "query_vector": vector,
        "k": k,
        "num_candidates": num_candidates,
    }
    search_query = {
        "knn": knn,
        # Leave the stored vector out of the response; only the text fields are used downstream.
        "_source": ["title", "tags", "introduction", "ingredients", "direction"],
    }
    es_results = es_client.search(index=index_name, body=search_query)
    return [hit['_source'] for hit in es_results['hits']['hits']]
            

In [11]:
def vector_query(q):
    """Embed the text under q['questions'] and search the 'combined_vector' field with it."""
    query_embedding = model.encode(q['questions'])
    return elastic_search_knn_query('combined_vector', query_embedding)
            

In [12]:
#vector_query(dict(
    #questions='how to make mashed potatoes?'
    #))

# RAG flow

In [12]:
def build_prompt(query,search_results):
        """Fill the chef-assistant prompt with the question and the retrieved recipes.

        `query` is interpolated as-is after "QUESTION:". Each entry of
        `search_results` must provide the keys title, tags, introduction,
        ingredients and direction; the entries are rendered one after another
        and placed after "CONTEXT:". Surrounding whitespace is stripped from
        the final prompt.
        """
        prompt_template = """
            you are a chefs assistant . Answer the QUESTION based on CONTEXT from the  database.
            use only the facts from the CONTEXT when answering the QUESTION.
        QUESTION:{question}
        CONTEXT:{context}
        """.strip()
        # Render every retrieved recipe as one text block, then glue the blocks together.
        rendered_docs = [
            f"title : {doc['title']} \n tags : {doc['tags']} \n introduction : {doc['introduction']} \n ingredients :{doc['ingredients']} \n direction : {doc['direction']}\n\n"
            for doc in search_results
        ]
        filled_prompt = prompt_template.format(question=query, context="".join(rendered_docs))
        return filled_prompt.strip()
        
        

In [13]:
# Read variables from a .env file into the environment, then hand the key to the openai module.
load_dotenv()
openai.api_key = os.environ.get("OPENAI_API_KEY")

In [14]:
from openai import OpenAI

client = OpenAI()


def llm(prompt, model='gpt-4o-mini'):
    """Send `prompt` as a single user message to the chat-completions API.

    Returns the message content of the first choice in the response.
    """
    chat_messages = [{"role": "user", "content": prompt}]
    completion = client.chat.completions.create(model=model, messages=chat_messages)
    first_choice = completion.choices[0]
    return first_choice.message.content
        

In [16]:
def rag(query: dict, model='gpt-4o-mini') -> str:
    """Answer a question with retrieval-augmented generation.

    Args:
        query: Dict holding the user's question text under the key 'questions'.
        model: Chat model name forwarded to `llm`.

    Returns:
        The answer text produced by `llm` for the prompt built from the
        question and the retrieved recipes.
    """
    search_results = vector_query(query)
    # build_prompt interpolates its first argument verbatim, so pass the question
    # text. Passing the dict itself put "{'questions': '...'}" into the prompt.
    prompt = build_prompt(query['questions'], search_results)
    return llm(prompt, model=model)

In [17]:
# Try the full pipeline on a single free-form question.
question = {'questions': 'tell me a recipe with potato and cheese with minimum preparation time?'}
answer = rag(question)
print(answer)

For a quick recipe featuring potatoes and cheese, consider making the **Irish Potato Casserole**. It has a minimum preparation time of just 5 minutes. 

**Ingredients:**
- 2 cups Potatoes, peeled and shredded
- 1/2 cup Butter, melted
- 2 whole Eggs, beaten
- 1 tsp Onion, minced
- 1 tsp Salt
- 1/4 tsp Paprika
- 1/2 cup Milk
- 1/2 cup Sharp Cheddar Cheese, shredded

**Directions:**
1. Preheat oven to 350F. Butter a 1.5 quart baking dish.
2. In a medium bowl, combine the shredded potatoes, melted butter, eggs, minced onion, salt, and paprika. Mix well.
3. Place the potato mixture into the prepared baking dish and pour milk over the top.
4. Bake in the preheated oven for 40 minutes. Sprinkle the top with cheese and return to the oven until the cheese melts and is slightly browned. 

This dish is easy to prepare and delicious!


In [43]:
# Load the generated evaluation questions.
ground_truth_data = pd.read_csv("../data/ground_truth_data.csv")

In [44]:
# First rows of the ground-truth questions.
ground_truth_data.head(n=5)

Unnamed: 0,id,questions
0,83f4,What type of potatoes are best for creamy mash...
1,83f4,Can I use a different type of cheese if I don'...
2,83f4,How do I adjust the recipe for a larger group ...
3,83f4,Is there a substitute for mayonnaise in this r...
4,83f4,What's a good alternative to bacon for a veget...


In [45]:
# Keep a fixed-seed random subset of 250 questions for the evaluation runs.
# NOTE: this overwrites the frame it samples from, so re-running the cell draws
# from the already-sampled frame instead of the full data set.
ground_truth_data = ground_truth_data.sample(n=250, random_state=1)

In [46]:
# One dict per sampled question, ready to iterate over.
sample = ground_truth_data.to_dict(orient="records")

In [68]:
# Spot-check one sampled record.
print(sample[9])

{'id': '4ab9', 'questions': 'What temperature should I heat the oil to for frying the chicken?'}


## RAG evaluation

In [20]:
# Prompt for the LLM-as-judge step. {question} and {answer_llm} are filled in with
# str.format; the doubled braces render as literal braces around the JSON skeleton.
# The reply is expected to be a JSON object with "Relevance" and "Explanation" keys.
prompt_eval_template="""
You are an expert evaluator for a RAG system.
Your task is to analyze the relevance of the generated answer to the given question.
Based on the relevance of the generated answer, you will classify it
as "NON_RELEVANT", "PARTLY_RELEVANT", or "RELEVANT".

Here is the data for evaluation:

Question: {question}
Generated Answer: {answer_llm}

Please analyze the content and context of the generated answer in relation to the question
and provide your evaluation in parsable JSON without using code blocks:

{{
  "Relevance": "NON_RELEVANT" | "PARTLY_RELEVANT" | "RELEVANT",
  "Explanation": "[Provide a brief explanation for your evaluation]"
}}
""".strip()

In [26]:
# Judge every sampled question with gpt-4o-mini.
# Each entry is (ground-truth record, generated answer, parsed judge verdict).
evaluations = []
for record in tqdm(sample):
    answer_llm = rag({'questions': record['questions']})

    # Give the judge the plain question text; formatting the wrapper dict would
    # put "{'questions': '...'}" into the prompt instead.
    prompt = prompt_eval_template.format(
        question=record['questions'],
        answer_llm=answer_llm,
    )
    raw_evaluation = llm(prompt, model='gpt-4o-mini')
    try:
        evaluation = json.loads(raw_evaluation)
    except json.JSONDecodeError as e:
        # One malformed reply must not abort the whole run and discard the
        # answers collected so far; report it and move on to the next record.
        print(f"Error decoding JSON: {e}")
        print(f"Offending JSON string: {raw_evaluation}")
        continue

    evaluations.append((record, answer_llm, evaluation))


  0%|          | 0/250 [00:00<?, ?it/s]

In [27]:
# Tabulate the (record, answer, verdict) triples, then flatten the nested dicts
# into plain columns.
df_eval = pd.DataFrame(evaluations, columns=['record', 'answer_llm', 'evaluation'])
df_eval['id'] = [rec['id'] for rec in df_eval['record']]
df_eval['question'] = [rec['questions'] for rec in df_eval['record']]
df_eval['relevance'] = [verdict['Relevance'] for verdict in df_eval['evaluation']]
df_eval['explanation'] = [verdict['Explanation'] for verdict in df_eval['evaluation']]


In [28]:
# First rows of the gpt-4o-mini evaluation table.
df_eval.head(n=5)

Unnamed: 0,record,answer_llm,evaluation,id,question,relevance,explanation
0,"{'id': '11ac', 'questions': 'Is there a vegeta...","Yes, you can use a vegetable bouillon or veget...","{'Relevance': 'PARTLY_RELEVANT', 'Explanation'...",11ac,Is there a vegetarian alternative for the vege...,PARTLY_RELEVANT,The generated answer provides information abou...
1,"{'id': '82cb', 'questions': 'What can I substi...",You can substitute bulghur wheat with quinoa f...,"{'Relevance': 'RELEVANT', 'Explanation': 'The ...",82cb,What can I substitute for bulghur wheat if I w...,RELEVANT,The generated answer directly addresses the qu...
2,"{'id': '7493', 'questions': 'How can I adjust ...",To adjust the spice level for a milder taste i...,"{'Relevance': 'RELEVANT', 'Explanation': 'The ...",7493,How can I adjust the spice level if I prefer a...,RELEVANT,The generated answer directly addresses the qu...
3,"{'id': 'cddf', 'questions': 'Are there any sug...",The provided context does not specifically men...,"{'Relevance': 'NON_RELEVANT', 'Explanation': '...",cddf,Are there any suggested variations to the reci...,NON_RELEVANT,The generated answer does not address the ques...
4,"{'id': 'c720', 'questions': 'What can I use as...",If you have a nut allergy and need a substitut...,"{'Relevance': 'RELEVANT', 'Explanation': 'The ...",c720,What can I use as a substitute for the crushed...,RELEVANT,The generated answer directly addresses the qu...


In [29]:
# Drop the nested dict columns now that their contents live in flat columns.
# errors="ignore" keeps the cell re-runnable; `del` raised KeyError on a second run.
df_eval = df_eval.drop(columns=['record', 'evaluation'], errors='ignore')

In [30]:
# Persist the gpt-4o-mini evaluation without the index column.
df_eval.to_csv("../data/rag_eval_gpt_4o_mini.csv", index=False)

In [31]:
# Share of answers per relevance label.
df_eval['relevance'].value_counts(normalize=True)

relevance
RELEVANT           0.844
PARTLY_RELEVANT    0.140
NON_RELEVANT       0.016
Name: proportion, dtype: float64

In [32]:
# Inspect the answers the judge labelled as not relevant.
df_eval.loc[df_eval['relevance'] == 'NON_RELEVANT']

Unnamed: 0,answer_llm,id,question,relevance,explanation
3,The provided context does not specifically men...,cddf,Are there any suggested variations to the reci...,NON_RELEVANT,The generated answer does not address the ques...
96,The context provided does not specify the type...,7332,What type of apple is best to use for the Spic...,NON_RELEVANT,The generated answer does not directly address...
101,The provided context does not specify any garn...,7afb,What are some garnish options to enhance the p...,NON_RELEVANT,The generated answer does not address the ques...
248,The context does not specify a particular type...,207f,What type of potatoes work best for making fri...,NON_RELEVANT,The generated answer does not address the ques...


In [43]:
# Judge every sampled question with gpt-4o.
# Each entry is (ground-truth record, generated answer, parsed judge verdict).
# NOTE(review): the answers still come from rag()'s default model; only the judge
# is gpt-4o here -- confirm that this is the intended comparison.
evaluations_gpt4o = []
for record in tqdm(sample):
    answer_llm = rag({'questions': record['questions']})

    # Give the judge the plain question text; formatting the wrapper dict would
    # put "{'questions': '...'}" into the prompt instead.
    prompt = prompt_eval_template.format(
        question=record['questions'],
        answer_llm=answer_llm,
    )
    raw_evaluation = llm(prompt, model='gpt-4o')
    try:
        evaluation = json.loads(raw_evaluation)
    except json.JSONDecodeError as e:
        # The exception class lives in the json module; the bare name was never
        # imported, so the handler itself raised NameError. Skip the record too:
        # appending the unparsed string would break the later ['Relevance'] lookup.
        print(f"Error decoding JSON: {e}")
        print(f"Offending JSON string: {raw_evaluation}")
        continue

    evaluations_gpt4o.append((record, answer_llm, evaluation))

  0%|          | 0/250 [00:00<?, ?it/s]

In [44]:
# Tabulate the gpt-4o verdicts, flatten the nested dicts into plain columns,
# then discard the nested originals.
df_eval4o = pd.DataFrame(evaluations_gpt4o, columns=['record', 'answer_llm', 'evaluation'])
df_eval4o['id'] = [rec['id'] for rec in df_eval4o['record']]
df_eval4o['question'] = [rec['questions'] for rec in df_eval4o['record']]
df_eval4o['relevance'] = [verdict['Relevance'] for verdict in df_eval4o['evaluation']]
df_eval4o['explanation'] = [verdict['Explanation'] for verdict in df_eval4o['evaluation']]
df_eval4o = df_eval4o.drop(columns=['record', 'evaluation'])

In [45]:
# Persist the gpt-4o evaluation without the index column.
df_eval4o.to_csv("../data/rag_eval_gpt_4o.csv", index=False)

In [1]:
#print(evaluations_gpt4o)

In [47]:
# First rows of the gpt-4o evaluation table.
df_eval4o.head(n=5)

Unnamed: 0,answer_llm,id,question,relevance,explanation
0,"Yes, you can use vegetable stock or broth as a...",11ac,Is there a vegetarian alternative for the vege...,PARTLY_RELEVANT,The generated answer suggests using vegetable ...
1,If you want a gluten-free option as a substitu...,82cb,What can I substitute for bulghur wheat if I w...,NON_RELEVANT,The generated answer suggests using rice flour...
2,To adjust the spice level for a milder taste i...,7493,How can I adjust the spice level if I prefer a...,PARTLY_RELEVANT,The generated answer provides a relevant appro...
3,The provided context does not explicitly menti...,cddf,Are there any suggested variations to the reci...,RELEVANT,The generated answer directly addresses the qu...
4,"If you have a nut allergy, you can use roasted...",c720,What can I use as a substitute for the crushed...,RELEVANT,The generated answer provides suitable substit...


In [48]:
# Share of answers per relevance label according to the gpt-4o judge.
df_eval4o['relevance'].value_counts(normalize=True)

relevance
RELEVANT           0.748
PARTLY_RELEVANT    0.228
NON_RELEVANT       0.024
Name: proportion, dtype: float64

In [49]:
# Inspect the answers the gpt-4o judge labelled as relevant.
df_eval4o.loc[df_eval4o['relevance'] == 'RELEVANT']

Unnamed: 0,answer_llm,id,question,relevance,explanation
3,The provided context does not explicitly menti...,cddf,Are there any suggested variations to the reci...,RELEVANT,The generated answer directly addresses the qu...
4,"If you have a nut allergy, you can use roasted...",c720,What can I use as a substitute for the crushed...,RELEVANT,The generated answer provides suitable substit...
5,"To tell when the dough has risen sufficiently,...",94ba,How can I tell when the dough has risen suffic...,RELEVANT,The answer accurately addresses both parts of ...
7,"Yes, it is possible to make Zopf bread ahead o...",92a0,Is it possible to make Zopf bread ahead of tim...,RELEVANT,The generated answer addresses both parts of t...
8,"To make the sauce spicier, you can add chili p...",e731,How can I make the sauce spicier if I like mor...,RELEVANT,The generated answer directly addresses the qu...
...,...,...,...,...,...
244,For making a quesadilla to achieve an authenti...,3aa3,What type of cheese do you recommend for makin...,RELEVANT,The generated answer directly addresses the qu...
245,To make the mashed potatoes extra cheesy witho...,83f4,What can I do to make the mashed potatoes extr...,RELEVANT,The generated answer directly addresses the qu...
246,"Yes, you can add other ingredients like tomato...",4110,Can I add other ingredients like tomatoes or c...,RELEVANT,The generated answer directly addresses the qu...
247,"Yes, you can add vegetables to the soup. For t...",d8c7,"Can I add any vegetables to the soup, and if s...",RELEVANT,The generated answer directly addresses the qu...
