In [1]:
import pandas as pd
import json
import os
from tqdm.auto import tqdm
from dotenv import load_dotenv

In [2]:
data=pd.read_json('../data/cleandata_recipes.json')

In [3]:
recipes=data.to_dict(orient='records')

In [5]:
print(recipes[0])

{'title': 'Creamy Mashed Potatoes', 'tags': "['potato', 'side', 'cheesefare']", 'introduction': "![Creamy Mashed Potatoes](/pix/creamy-mashed-potatoes.webp) Mashed potatoes is a really great recipe that is often relegated to the position of side dish. This recipe is a spin of the classical mashed potatoes recipe that's got itself more going on. You can serve this dish for a relatively light meal, or you can also serve it as a side dish if you want to have a really hearty meal.", 'ingredients': 'The quantities here are for about four adult portions. If you are planning on eating this as a side dish, it might be more like 6-8 portions. * 1kg potatoes * 200ml milk* * 200ml mayonnaise* * ~100g cheese * Garlic powder * 12-16 strips of bacon * Butter * 3-4 green onions * Black pepper * Salt  *You can play with the proportions depending on how creamy or dry you want the mashed potatoes to be.', 'direction': '1. Peel and cut the potatoes into medium sized pieces. 2. Put the potatoes in a pot w

In [6]:
from sentence_transformers import SentenceTransformer
model_name='all-MiniLM-L12-v2'
model=SentenceTransformer(model_name)



In [7]:
from elasticsearch import Elasticsearch

es_client=Elasticsearch('http://localhost:9200')

index_settings={
    "settings":{
            "number_of_shards":1,
            "number_of_replicas":0
    },
    "mappings":{
            "properties":{
                    "title":{"type":"text"},
                    "tags":{"type":"keyword"},
                    "id":{"type":"keyword"},
                    "introduction":{"type":"text"},
                    "ingredients":{"type":"text"},
                    "direction":{"type":"text"},
                    "combined_vector":{
                            "type":"dense_vector",
                            "dims":384,
                            "index":True,
                            "similarity":"cosine"
                     },
                      
            },
    }
}
index_name="food_recipes"
es_client.indices.delete(index=index_name,ignore_unavailable=True)
es_client.indices.create(index=index_name,body=index_settings)                
                    

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'food_recipes'})

In [8]:
from tqdm.auto import tqdm

for recipe in tqdm(recipes):
            title = recipe['title']
            ingredients = recipe['ingredients']
            direction = recipe['direction']
            recipe['combined_vector']=model.encode(title + ' '+ ingredients + ' '+ direction )
            try:
                es_client.index(index=index_name,document=recipe)  
            except Exception as e:
                print(e)

  0%|          | 0/360 [00:00<?, ?it/s]

In [26]:
query='tell us a recipe with banana'

In [27]:
question_vector = model.encode(query)

In [28]:
knn_query = {
    "field": "combined_vector",
    "query_vector": question_vector,
    "k": 5,
    "num_candidates": 10000,
    "boost": 0.5,
    }

In [29]:
keyword_query = {
    "bool": {
        "must": {
            "multi_match": {
                "query": query,
                "fields": ["tags", "title", "direction"],
                "type": "best_fields",
                "boost": 0.5,
            }
        },
        
    }
}

In [30]:
response = es_client.search(
    index=index_name,
    query=keyword_query,
    knn=knn_query,
    size=5
)

In [31]:
response['hits']['hits']

[{'_index': 'food_recipes',
  '_id': 'CzZoaJIBLgUo6blykCf5',
  '_score': 6.3481503,
  '_source': {'title': 'Sunday Morning Milkshake',
   'tags': "['sweet', 'breakfast', 'drink', 'cheesefare']",
   'introduction': "![Sunday Morning Milkshake](/pix/sunday-milkshake.webp) A sweet and refreshing milkshake that barely requires any effort to make. You're not required to make it and drink it on Sunday, it was just almost a tradition in my family to make it on the weekends, and it somehow does taste better on a Saturday or Sunday morning.",
   'ingredients': 'These are for about 4 glasses or 1 liter: * 1 banana (or 2 if they are small) * ~120 grams of berries. Can be any berries you like. * <1 liter of milk * Vanilla * Sugar',
   'direction': "1. Chop the banana so that it fits better in the blender. 2. Add the banana pieces, berries, vanilla and sugar to taste to the blender.   Usually I use about 6 tea spoons of sugar. 3. Pour enough milk so that the mix fills a liter of the blender volume.

In [32]:
df_ground_truth=pd.read_csv('../data/ground_truth_data.csv')

In [33]:
ground_truth=df_ground_truth.to_dict(orient='records')

In [43]:
ground_truth[8]

{'id': 'f114', 'questions': 'Is there a way to make this sauce spicier?'}

In [50]:
len(ground_truth)

1795

In [51]:
def hit_rate(relevance_total):
    cnt = 0

    for line in relevance_total:
        if True in line:
            cnt = cnt + 1

    return cnt / len(relevance_total)

In [52]:
def mrr(relevance_total):
    total_score = 0.0

    for line in relevance_total:
        for rank in range(len(line)):
            if line[rank] == True:
                total_score = total_score + 1 / (rank + 1)

    return total_score / len(relevance_total)

In [65]:
def elastic_search_hybrid(field, query, vector):
    knn_query = {
        "field": field,
        "query_vector": vector,
        "k": 5,
        "num_candidates": 1795 ,
        "boost": 0.5,
        
    }

    
    search_query = {
        "knn": knn_query,
        "size": 5,
        "_source": ["title","tags","introduction","ingredients","direction"]
    }

    es_results = es_client.search(
        index=index_name,
        body=search_query
    )
    
    result_docs = []
    
    for hit in es_results['hits']['hits']:
        result_docs.append(hit['_source'])

    return result_docs

In [66]:
def question_hybrid(q):
    question = q['questions']
    v_q = model.encode(question)

    return elastic_search_hybrid('combined_vector', question, v_q)

In [74]:
def evaluate(ground_truth, search_function):
    relevance_total = []
    for q in ground_truth:
        doc_id = q['id']
        results = search_function(q)

        if not results:
            print("No results returned for query:", q)
            continue

        relevance = [d.get('id') == doc_id for d in results]
        print("Relevance for query:", relevance)
        relevance_total.append(relevance)

    hit_rate = hit_rate(relevance_total)
    mrr = mrr(relevance_total)
    print("Hit Rate:", hit_rate)
    print("MRR:", mrr)

    return {'hit_rate': hit_rate, 'mrr': mrr}



In [75]:
evaluate(ground_truth, question_hybrid)

Relevance for query: [False, False, False, False, False]
Relevance for query: [False, False, False, False, False]
Relevance for query: [False, False, False, False, False]
Relevance for query: [False, False, False, False, False]
Relevance for query: [False, False, False, False, False]
Relevance for query: [False, False, False, False, False]
Relevance for query: [False, False, False, False, False]
Relevance for query: [False, False, False, False, False]
Relevance for query: [False, False, False, False, False]
Relevance for query: [False, False, False, False, False]
Relevance for query: [False, False, False, False, False]
Relevance for query: [False, False, False, False, False]
Relevance for query: [False, False, False, False, False]
Relevance for query: [False, False, False, False, False]
Relevance for query: [False, False, False, False, False]
Relevance for query: [False, False, False, False, False]
Relevance for query: [False, False, False, False, False]
Relevance for query: [False, Fa

UnboundLocalError: cannot access local variable 'hit_rate' where it is not associated with a value