In [1]:
import pandas as pd
import json
import os
from dotenv import load_dotenv
import openai

# Data Ingestion and Indexing

In [5]:
# Load the raw recipe collection from the JSON file into a DataFrame.
df = pd.read_json('../data/recipes.json')

In [6]:
# One plain dict per DataFrame row, keyed by column name.
receipes = df.to_dict('records')

In [7]:
# Sanity check: how many recipe records were loaded.
print(len(receipes))

360


In [8]:
# Copy every recipe record, leaving out its 'output' and 'date' keys.
clean_recipes = [
    {key: value for key, value in recipe.items() if key not in ('output', 'date')}
    for recipe in receipes
]

In [9]:
# Display the first cleaned record to eyeball the remaining fields.
clean_recipes[0]

{'title': 'Creamy Mashed Potatoes',
 'tags': ['potato', 'side', 'cheesefare'],
 'introduction': "![Creamy Mashed Potatoes](/pix/creamy-mashed-potatoes.webp) Mashed potatoes is a really great recipe that is often relegated to the position of side dish. This recipe is a spin of the classical mashed potatoes recipe that's got itself more going on. You can serve this dish for a relatively light meal, or you can also serve it as a side dish if you want to have a really hearty meal.",
 'ingredients': 'The quantities here are for about four adult portions. If you are planning on eating this as a side dish, it might be more like 6-8 portions. * 1kg potatoes * 200ml milk* * 200ml mayonnaise* * ~100g cheese * Garlic powder * 12-16 strips of bacon * Butter * 3-4 green onions * Black pepper * Salt  *You can play with the proportions depending on how creamy or dry you want the mashed potatoes to be.',
 'direction': '1. Peel and cut the potatoes into medium sized pieces. 2. Put the potatoes in a pot

In [10]:
from sentence_transformers import SentenceTransformer
# Sentence-embedding model used for both the indexed recipes and the queries.
# NOTE(review): the index mapping declares "dims": 384 for these vectors -
# keep the two in sync if the model name is changed.
model_name = 'all-MiniLM-L12-v2'
model = SentenceTransformer(model_name)

  from tqdm.autonotebook import tqdm, trange


In [12]:
from elasticsearch import Elasticsearch

# Client for the Elasticsearch instance at localhost:9200.
es_client = Elasticsearch('http://localhost:9200')

index_name = "food_recipes"

# All textual recipe fields share the same plain "text" mapping.
text_fields = ["title", "tags", "introduction", "ingredients", "direction"]
field_mappings = {field_name: {"type": "text"} for field_name in text_fields}

# Vector field targeted by the kNN search: 384 dimensions, indexed, cosine similarity.
field_mappings["combined_vector"] = {
    "type": "dense_vector",
    "dims": 384,
    "index": True,
    "similarity": "cosine",
}

index_settings = {
    "settings": {"number_of_shards": 1, "number_of_replicas": 0},
    "mappings": {"properties": field_mappings},
}

# Start from a clean slate on every run: drop the index if present, then recreate it.
es_client.indices.delete(index=index_name, ignore_unavailable=True)
es_client.indices.create(index=index_name, body=index_settings)
                    
                        
    
    

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'food_recipes'})

In [13]:
from tqdm.auto import tqdm

# Embed every recipe and index it as one Elasticsearch document.
# Per-document failures do not stop the run, but they are collected so that a
# partially populated index is reported instead of passing unnoticed.
failed_titles = []
for recipe in tqdm(clean_recipes):
    # Title, ingredients and directions are embedded together as a single text.
    text_to_embed = ' '.join([recipe['title'], recipe['ingredients'], recipe['direction']])
    # The vector is stored on the record itself, so it is sent along with the text fields.
    recipe['combined_vector'] = model.encode(text_to_embed)
    try:
        es_client.index(index=index_name, document=recipe)
    except Exception as e:
        failed_titles.append(recipe['title'])
        print(f"Failed to index {recipe['title']!r}: {e}")

if failed_titles:
    print(f"{len(failed_titles)} of {len(clean_recipes)} recipes were not indexed")
            
 

  0%|          | 0/360 [00:00<?, ?it/s]

In [14]:
def elastic_search_knn_query(field, vector, k=5, num_candidates=360):
    """Run a kNN search against the recipe index and return the matching documents.

    Args:
        field: Name of the dense-vector field to search.
        vector: Query embedding to compare against ``field``.
        k: Number of nearest neighbours to return. Defaults to 5.
        num_candidates: Size of the candidate pool passed to Elasticsearch's
            ``knn`` option. Defaults to 360, the previously hard-coded value.

    Returns:
        A list with the ``_source`` dict of every hit, restricted to the
        title, tags, introduction, ingredients and direction fields.
    """
    search_query = {
        "knn": {
            "field": field,
            "query_vector": vector,
            "k": k,
            "num_candidates": num_candidates,
        },
        # Leave the stored embedding out of the response; only text fields are needed.
        "_source": ["title", "tags", "introduction", "ingredients", "direction"],
    }
    es_results = es_client.search(index=index_name, body=search_query)
    return [hit['_source'] for hit in es_results['hits']['hits']]
            

In [28]:
def vector_query(q):
    """Embed the question stored under ``q['questions']`` and search by vector.

    Returns whatever ``elastic_search_knn_query`` yields for the
    'combined_vector' field and the encoded question.
    """
    query_embedding = model.encode(q['questions'])
    return elastic_search_knn_query('combined_vector', query_embedding)
            

In [16]:
#vector_query(dict(
    #questions='how to make mashed potatoes?'
    #))

# RAG flow

In [17]:
def build_prompt(query,search_results):
    """Fill the chef-assistant prompt with a question and the retrieved recipes.

    Args:
        query: Value interpolated after ``QUESTION:`` (formatted with ``str.format``).
        search_results: Iterable of recipe dicts providing the keys 'title',
            'tags', 'introduction', 'ingredients' and 'direction'.

    Returns:
        The prompt text with surrounding whitespace stripped.
    """
    prompt_template = """
            you are a chefs assistant . Answer the QUESTION based on CONTEXT from the  database.
            use only the facts from the CONTEXT when answering the QUESTION.
        QUESTION:{question}
        CONTEXT:{context}
        """.strip()
    # One text block per retrieved recipe, concatenated in retrieval order.
    recipe_blocks = [
        f"title : {doc['title']} \n tags : {doc['tags']} \n introduction : {doc['introduction']} \n ingredients :{doc['ingredients']} \n direction : {doc['direction']}\n\n"
        for doc in search_results
    ]
    filled = prompt_template.format(question=query, context="".join(recipe_blocks))
    return filled.strip()
        
        

In [18]:
# Read the .env file (if any) into the process environment, then hand the
# OPENAI_API_KEY value to the openai module (None when the variable is unset).
load_dotenv()
openai.api_key = os.environ.get("OPENAI_API_KEY")

In [19]:
from openai import OpenAI
# Single client instance shared by every llm() call in this notebook.
client = OpenAI()


def llm(prompt, model='gpt-4o-mini'):
    """Send ``prompt`` as one user message and return the reply text.

    Args:
        prompt: Text placed in the single user message.
        model: Chat model name passed to the completions endpoint.

    Returns:
        The message content of the first choice in the response.
    """
    messages = [{"role": "user", "content": prompt}]
    completion = client.chat.completions.create(model=model, messages=messages)
    return completion.choices[0].message.content
        

In [20]:
def rag(query:dict , model='gpt-4o-mini') -> str:
    """Answer a question with retrieval-augmented generation.

    Args:
        query: Dict holding the question text under the 'questions' key
            (extra keys such as 'id' are ignored).
        model: Chat model name forwarded to ``llm``.

    Returns:
        The answer text produced by the LLM.
    """
    search_results = vector_query(query)
    # build_prompt interpolates its first argument verbatim, so pass the
    # question text; passing the whole record would put a dict repr
    # (including unrelated keys like 'id') into the prompt.
    prompt = build_prompt(query['questions'], search_results)
    return llm(prompt, model=model)

In [31]:
# End-to-end smoke test of the RAG pipeline on a hand-written question.
question = {'questions': 'tell me a recipe with potato and cheese with minimum preparation time?'}
answer = rag(question)
print(answer)

A quick recipe featuring potatoes and cheese with minimal preparation time is the "Irish Potato Casserole." Here’s how to make it:

**Prep Time:** 5 min  
**Cook Time:** 45 min  
**Servings:** 6  

**Ingredients:**
- 2 cups Potatoes, peeled and shredded
- 1/2 cup Butter, melted
- 2 whole Eggs, beaten
- 1 tsp Onion, minced
- 1 tsp Salt
- 1/4 tsp Paprika
- 1/2 cup Milk
- 1/2 cup Sharp Cheddar Cheese, shredded

**Directions:**
1. Preheat the oven to 350°F and butter a 1.5 quart baking dish.
2. In a medium bowl, combine the shredded potatoes, melted butter, eggs, minced onion, salt, and paprika. Mix well.
3. Place the potato mixture into the prepared baking dish and pour milk over the top. 
4. Bake in the preheated oven for 40 minutes. Sprinkle the top with cheese and return to the oven until the cheese melts and is slightly browned.

This recipe is quick to prepare and delicious!


## RAG evaluation

In [22]:
# Load the ground-truth evaluation questions.
# NOTE(review): the head() output recorded in this notebook shows an
# 'Unnamed: 0' column, which suggests the CSV was saved with its index -
# confirm, and consider reading it with index_col=0 if so.
ground_truth_data = pd.read_csv('../data/ground_truth_data.csv')

In [23]:
# Peek at the first rows of the ground-truth questions.
ground_truth_data.head()

Unnamed: 0,id,questions
0,83f4,What type of potatoes are best for making crea...
1,83f4,Can I use a substitute for mayonnaise in the r...
2,83f4,How can I adjust the recipe if I want to make ...
3,83f4,What can I do to make the mashed potatoes extr...
4,83f4,Is it possible to prepare the creamy mashed po...


In [25]:
# One dict per ground-truth row, so each record can be passed straight to rag().
ground_truth = ground_truth_data.to_dict('records')

In [34]:
# Inspect a single ground-truth record (position 8).
ground_truth[8]

{'id': 'f114',
 'questions': 'What are some ideal dishes to pair with this red sauce besides pasta?'}

In [35]:
# Run the RAG pipeline on that ground-truth record; the returned answer is shown as the cell output.
rag(ground_truth[8])

'Some ideal dishes to pair with this red sauce, besides pasta, could include:\n\n- Rice dishes, as red sauce can complement the flavors well.\n- Grilled or roasted vegetables, which can be drizzled with the sauce.\n- Polenta, allowing the sauce to be served on top for added flavor.\n- Baked or stuffed meats (like chicken or eggplant), where the red sauce can serve as a delicious topping.\n- Dipping bread or using it as a spread in sandwiches to enhance the flavors.'

In [36]:
# Generated answers keyed by position in ground_truth. The generation loop skips
# positions already present here, so re-running THIS cell discards that progress.
answers={}

In [37]:
# Generate an answer for every ground-truth question.
# Positions already stored in `answers` are skipped, which lets an interrupted
# run be resumed by simply executing this cell again.
for i, gt_record in enumerate(tqdm(ground_truth)):
    if i not in answers:
        answers[i] = {
            "answer_llm": rag(gt_record),
            "question": gt_record['questions'],
            "ques_id": gt_record['id'],
        }

  0%|          | 0/1780 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:
# Merge every generated answer with its ground-truth record and tabulate the result.
# The list is named with underscores because '-' is not allowed in a Python
# identifier. Only positions that actually have an answer are included (in
# ground-truth order), so a partially completed generation run does not leave
# None placeholders for the DataFrame constructor to choke on.
results_gpt4o_mini = [
    {**answers[i], **ground_truth[i]}
    for i in sorted(answers)
]
df_gpt4omini = pd.DataFrame(results_gpt4o_mini)

In [None]:
# Persist the generated answers; index=False keeps the row index out of the CSV.
df_gpt4omini.to_csv('../data/results-gpt4omini.csv', index=False)

In [None]:
# LLM-as-judge prompt. It is filled via str.format with `question` and
# `answer_llm`; the doubled braces are format escapes that come out as single
# literal braces, so the model is shown the JSON skeleton it should answer with.
prompt_eval_template="""
You are an expert evaluator for a RAG system.
Your task is to analyze the relevance of the generated answer to the given question.
Based on the relevance of the generated answer, you will classify it
as "NON_RELEVANT", "PARTLY_RELEVANT", or "RELEVANT".

Here is the data for evaluation:

Question: {question}
Generated Answer: {answer_llm}

Please analyze the content and context of the generated answer in relation to the question
and provide your evaluation in parsable JSON without using code blocks:

{{
  "Relevance": "NON_RELEVANT" | "PARTLY_RELEVANT" | "RELEVANT",
  "Explanation": "[Provide a brief explanation for your evaluation]"
}}
""".strip()

In [None]:
# Evaluate on a reproducible random subset (fixed random_state).
# The size is capped at the number of available rows: sampling without
# replacement raises when n exceeds the frame length, which happens whenever
# answer generation was stopped before 150 answers were produced.
df_sample = df_gpt4omini.sample(n=min(150, len(df_gpt4omini)), random_state=1)

In [None]:
# One dict per sampled row, ready to be fed to the evaluation prompt.
sample = df_sample.to_dict('records')

In [None]:
# Ask the judge model to rate every sampled answer.
# Each entry of `evaluations` is (record, answer text, parsed judge verdict).
evaluations = []
for record in tqdm(sample):
    # Take the question and answer from the record being judged, not from
    # notebook-level variables left behind by earlier cells.
    prompt = prompt_eval_template.format(
        question=record['question'],
        answer_llm=record['answer_llm'],
    )
    raw_evaluation = llm(prompt, model='gpt-4o-mini')
    # The prompt requests bare JSON; json.loads raises if the model replies otherwise.
    evaluation = json.loads(raw_evaluation)

    evaluations.append((record, record['answer_llm'], evaluation))
