In [1]:
import pandas as pd
import json
import os
from tqdm.auto import tqdm
from dotenv import load_dotenv

In [2]:
# Load the cleaned recipe collection (path is relative to this notebook).
data = pd.read_json("../data/cleandata_recipes.json")

In [3]:
# One plain dict per DataFrame row, keyed by column name.
recipes = data.to_dict(orient="records")

In [4]:
# Show the first record to see which fields a recipe carries.
print(recipes[0])

{'title': 'Creamy Mashed Potatoes', 'tags': "['potato', 'side', 'cheesefare']", 'introduction': "![Creamy Mashed Potatoes](/pix/creamy-mashed-potatoes.webp) Mashed potatoes is a really great recipe that is often relegated to the position of side dish. This recipe is a spin of the classical mashed potatoes recipe that's got itself more going on. You can serve this dish for a relatively light meal, or you can also serve it as a side dish if you want to have a really hearty meal.", 'ingredients': 'The quantities here are for about four adult portions. If you are planning on eating this as a side dish, it might be more like 6-8 portions. * 1kg potatoes * 200ml milk* * 200ml mayonnaise* * ~100g cheese * Garlic powder * 12-16 strips of bacon * Butter * 3-4 green onions * Black pepper * Salt  *You can play with the proportions depending on how creamy or dry you want the mashed potatoes to be.', 'direction': '1. Peel and cut the potatoes into medium sized pieces. 2. Put the potatoes in a pot w

In [5]:
from sentence_transformers import SentenceTransformer
# Sentence-embedding model used to encode both the recipes and the questions.
model_name = "all-MiniLM-L12-v2"
model = SentenceTransformer(model_name)



In [6]:
from elasticsearch import Elasticsearch

es_client = Elasticsearch('http://localhost:9200')

# Index definition: one primary shard and no replicas, plus the field mapping.
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0,
    },
    "mappings": {
        "properties": {
            "title": {"type": "text"},
            "tags": {"type": "keyword"},
            "id": {"type": "keyword"},
            "introduction": {"type": "text"},
            "ingredients": {"type": "text"},
            "direction": {"type": "text"},
            # Embedding of title + ingredients + direction (filled in by the
            # indexing cell). `dims` has to equal the length of the vectors
            # returned by `model.encode`.
            "combined_vector": {
                "type": "dense_vector",
                "dims": 384,
                "index": True,
                "similarity": "cosine",
            },
        },
    },
}
index_name = "food_recipes"

# Drop any previous copy first so re-running this cell starts from an empty index.
es_client.indices.delete(index=index_name, ignore_unavailable=True)
# Pass settings/mappings as named arguments: the catch-all `body=` parameter is
# deprecated in the 8.x Python client, and the other calls in this notebook
# already use the named-argument style (`document=`, `query=`, `knn=`).
es_client.indices.create(
    index=index_name,
    settings=index_settings["settings"],
    mappings=index_settings["mappings"],
)
                    

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'food_recipes'})

In [7]:
from tqdm.auto import tqdm

# Embed every recipe and index it. Indexing stays best-effort (one bad document
# does not abort the run), but failures are collected and summarised so a
# partially populated index cannot go unnoticed.
failed_recipes = []

for recipe in tqdm(recipes):
    title = recipe['title']
    ingredients = recipe['ingredients']
    direction = recipe['direction']
    # A single vector over the concatenated text fields; it is stored on the
    # record itself so it is sent to Elasticsearch with the other fields.
    recipe['combined_vector'] = model.encode(title + ' ' + ingredients + ' ' + direction)
    try:
        es_client.index(index=index_name, document=recipe)
    except Exception as e:
        failed_recipes.append(title)
        print(f"Failed to index {title!r}: {e}")

if failed_recipes:
    print(f"{len(failed_recipes)} of {len(recipes)} recipes could not be indexed")

  0%|          | 0/360 [00:00<?, ?it/s]

In [8]:
# Sample question used to try the hybrid search by hand.
query = "What cooking method should I use for the Red Sauce (Ragu all Italiana)"

In [9]:
# Embed the question with the same model that encoded the recipes.
question_vector = model.encode(query)

In [10]:
# Vector half of the hybrid search: kNN lookup on the recipe embeddings.
knn_query = dict(
    field="combined_vector",
    query_vector=question_vector,
    k=5,
    num_candidates=10000,
    boost=0.5,
)

In [11]:
# Lexical half of the hybrid search: the question text is matched against the
# tags, title and direction fields.
multi_match_clause = {
    "query": query,
    "fields": ["tags", "title", "direction"],
    "type": "best_fields",
    "boost": 0.5,
}
keyword_query = {"bool": {"must": {"multi_match": multi_match_clause}}}

In [12]:
# Send the keyword clause and the kNN clause in one request; keep the 5 best hits.
response = es_client.search(index=index_name, query=keyword_query, knn=knn_query, size=5)

In [13]:
# Inspect the returned hits; each one carries `_index`, `_id`, `_score` and `_source`.
response['hits']['hits']

[{'_index': 'food_recipes',
  '_id': 'iPm7aJIBW3litJhB7t1p',
  '_score': 7.4374833,
  '_source': {'title': 'Ragu Napoletano',
   'tags': "['tomato', 'sauce', 'italian', 'pork', 'veal']",
   'introduction': 'This recipe is known by Italian-Americans as "Sunday Gravy", which originated from the Southern Italian dish Ragu Napoletano. This is my variation of my family\'s version which was passed down 3 generations to me. ![Ragu Napoletano](pix/ragu-napoletano-01.webp) - ⏲️ Prep time: 30-60 min - 🍳Cook time: 4-5 hrs - 🍽️ Servings: 10-12',
   'ingredients': '### Tomato Sauce - 6-10 lbs San Marzano plum tomatoes (fresh or canned peeled) - Tomato paste (optional) ### Aromatics and Herbs - 8 cloves garlic - 1.5 white onions - 6-8 oz red wine - 8 fresh basil leaves - Dry oregano - Red pepper ### Meat - 1-2 lb pork tenderloin - 1-2 lb sweet and or spicy Italian sausage - 1 lb cubed stewing beef (chuck, short rib, or oxtail) ### Meatballs - 1-2 lb total of ground beef chuck, veal, and pork - 1 1/2

In [14]:
# Evaluation set: questions paired with the id of the recipe that answers them.
df_ground_truth = pd.read_csv("../data/ground_truth_data_imp.csv")

In [15]:
# One dict per ground-truth row, so the evaluation can iterate over plain records.
ground_truth = df_ground_truth.to_dict(orient="records")

In [16]:
# Look at one ground-truth record: an `id` and the `questions` text.
ground_truth[8]

{'id': 'f114',
 'questions': "What cooking method should I use for the Red Sauce (Ragu all'Italiana)?"}

In [17]:
# Number of ground-truth questions that will be evaluated.
len(ground_truth)

1795

In [18]:
def hit_rate(relevance_total):
    """Return the fraction of queries whose results contain the expected document.

    Args:
        relevance_total: One sequence of booleans per query; an entry is True
            when the result at that position is the expected document.

    Returns:
        The share of queries with at least one True entry, as a float.
        An empty `relevance_total` gives 0.0 instead of raising
        ZeroDivisionError.
    """
    if len(relevance_total) == 0:
        return 0.0

    hits = sum(1 for line in relevance_total if True in line)
    return hits / len(relevance_total)

In [19]:
def mrr(relevance_total):
    """Return the mean reciprocal rank over all queries.

    Args:
        relevance_total: One sequence of booleans per query, ordered by rank;
            an entry is True when the result at that position is the expected
            document.

    Returns:
        The average over all queries of 1 / rank of the first relevant result
        (a query with no relevant result contributes 0). An empty
        `relevance_total` gives 0.0 instead of raising ZeroDivisionError.
    """
    if len(relevance_total) == 0:
        return 0.0

    total_score = 0.0
    for line in relevance_total:
        for rank, is_relevant in enumerate(line, start=1):
            if is_relevant:
                total_score += 1 / rank
                # Only the first relevant result counts; without this, a result
                # list with several matches would add several reciprocal ranks
                # and push the score above 1.
                break

    return total_score / len(relevance_total)

In [20]:
def elastic_search_hybrid(field, query, vector, k=5, num_candidates=10000,
                          knn_boost=0.5, keyword_boost=0.5):
    """Run a hybrid (kNN + keyword) search and return the matching documents.

    Uses the notebook-level `es_client` and `index_name`.

    Args:
        field: Name of the vector field the kNN clause searches.
        query: Question text matched against the `tags`, `title` and
            `direction` fields.
        vector: Query vector sent as `query_vector` in the kNN clause.
        k: Number of nearest neighbours requested and number of hits
            returned. Defaults to 5.
        num_candidates: `num_candidates` value of the kNN clause.
            Defaults to 10000.
        knn_boost: Boost applied to the kNN clause. Defaults to 0.5.
        keyword_boost: Boost applied to the keyword clause. Defaults to 0.5.

    Returns:
        A list with the `_source` dict of every hit, in the order
        Elasticsearch returned them.
    """
    knn_query = {
        "field": field,
        "query_vector": vector,
        "k": k,
        "num_candidates": num_candidates,
        "boost": knn_boost,
    }
    keyword_query = {
        "bool": {
            "must": {
                "multi_match": {
                    "query": query,
                    "fields": ["tags", "title", "direction"],
                    "type": "best_fields",
                    "boost": keyword_boost,
                }
            },
        }
    }

    es_results = es_client.search(
        index=index_name,
        query=keyword_query,
        knn=knn_query,
        size=k,
    )

    return [hit['_source'] for hit in es_results['hits']['hits']]

In [21]:
def question_hybrid(q):
    """Run the hybrid search for one ground-truth record.

    The record's 'questions' text is embedded with `model`, then the text and
    its vector are passed to `elastic_search_hybrid` against the
    'combined_vector' field. Returns whatever that search returns.
    """
    text = q['questions']
    embedding = model.encode(text)
    return elastic_search_hybrid('combined_vector', text, embedding)

In [22]:
def evaluate(ground_truth, search_function):
    """Score a search function against the ground truth with hit rate and MRR.

    Args:
        ground_truth: Iterable of records; each has an 'id' (the expected
            document id) and is passed unchanged to `search_function`.
        search_function: Callable taking one ground-truth record and returning
            a list of result documents (dicts that may carry an 'id').

    Returns:
        A dict with the keys 'hit_rate' and 'mrr'. Both values are also
        printed.
    """
    relevance_total = []
    for q in ground_truth:
        doc_id = q['id']
        results = search_function(q)

        if not results:
            print("No results returned for query:", q)

        # A query without results is recorded as an empty relevance list, i.e.
        # a miss. Skipping it would drop it from the denominator and inflate
        # both metrics.
        relevance = [d.get('id') == doc_id for d in (results or [])]
        relevance_total.append(relevance)

    # Nothing to score: report zeros rather than dividing by zero.
    if relevance_total:
        hr = hit_rate(relevance_total)
        mrr_value = mrr(relevance_total)
    else:
        hr = 0.0
        mrr_value = 0.0

    print("Hit Rate:", hr)
    print("MRR:", mrr_value)

    return {'hit_rate': hr, 'mrr': mrr_value}



In [23]:
# Score the hybrid search over every ground-truth question.
evaluate(ground_truth, question_hybrid)

Hit Rate: 0.9654596100278552
MRR: 0.8497400185701018


{'hit_rate': 0.9654596100278552, 'mrr': 0.8497400185701018}