<a href="https://colab.research.google.com/github/sonalvrshny/IR23-MRRS/blob/sonal-search-queries/Project-2_changes.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
pip install nltk spacy tensorflow torch


In [4]:
pip install langdetect



In [None]:
!pip install sentence_transformers
!pip install faiss-cpu

In [None]:
# For english
!python -m spacy download en_core_web_sm

# For spanish
!python -m spacy download es_core_news_sm

# For hindi
!python -m spacy download xx_ent_wiki_sm


In [None]:
pip install googletrans==4.0.0-rc1

In [9]:
# Required imports
import json
from langdetect import detect
import spacy
from sentence_transformers import SentenceTransformer
import numpy as np
import faiss
import langid


In [16]:
from inltk.inltk import setup, tokenize


In [8]:
# Loading and setting up language models
nlp_en = spacy.load("en_core_web_sm")  # Multilingual model for English
nlp_es = spacy.load("es_core_news_sm") # Multilingual model for Spanish
nlp_hi = spacy.load("xx_ent_wiki_sm")  # Multilingual model for Hindi

In [15]:
# Advanced query parsing for Hindi
def parse_hindi_query(query):
    return tokenize(query, 'hi')

In [10]:
from langdetect import detect

def detect_language(query):
    try:
        return langid.classify(query)[0]
        # return detect(query)
    except Exception as e:
        return "Error: " + str(e)


In [21]:
import spacy

# Load the language model for each language
nlp_en = spacy.load("en_core_web_sm")
nlp_es = spacy.load("es_core_news_sm")
nlp_hi = spacy.load("en_core_web_sm")  # Hindi is not directly supported, so using a multilingual model

def parse_query(query, lang):
    if lang == 'en':
        doc = nlp_en(query)
    elif lang == 'es':
        doc = nlp_es(query)
        print(doc)
    elif lang == 'hi':
        doc = parse_hindi_query(query)
    else:
        return "Unsupported language"

    keywords = [token.lemma_ for token in doc if token.is_alpha and not token.is_stop]
    return keywords


In [24]:
def process_query(query):
    lang = detect_language(query)
    if "Error" in lang:
        return lang

    if lang == 'hi':
        # For Hindi, directly use the tokenized output
        keywords = parse_hindi_query(query)
    else:
        # For other languages, continue using spaCy
        if lang == 'en':
            doc = nlp_en(query)
        elif lang == 'es':
            doc = nlp_es(query)
        else:
            return "Unsupported language"
        keywords = [token.lemma_ for token in doc if token.is_alpha and not token.is_stop]

    return {"language": lang, "keywords": keywords}



In [25]:
sample_queries = ["How to make traditional Mexican guacamole?", "Receta de paella de mariscos", "बर्गर चिकन की रेसिपी"]

for query in sample_queries:
    result = process_query(query)
    print(f"Query: {query}\nResult: {result}\n")


Query: How to make traditional Mexican guacamole?
Result: {'language': 'en', 'keywords': ['traditional', 'mexican', 'guacamole']}

Query: Receta de paella de mariscos
Result: {'language': 'es', 'keywords': ['Receta', 'paella', 'marisco']}

Query: बर्गर चिकन की रेसिपी
Result: {'language': 'hi', 'keywords': ['▁बर्ग', 'र', '▁चिकन', '▁की', '▁रे', 'सिप', 'ी']}



In [102]:
from googletrans import Translator

def translate_query(query, target_language):
    translator = Translator()
    translation = translator.translate(query, dest=target_language)
    return translation.text

def get_translated_queries(user_query):
    lang = detect_language(user_query)
    print("\n")
    print("The detected language for the query is: ", lang)
    if lang == "hi":
        translated_queries = {
                'hindi': user_query,
                'spanish': translate_query(user_query, 'es'),
                'english': translate_query(user_query, 'en')
            }
    elif lang == "es":
        translated_queries = {
                'hindi': translate_query(user_query, 'hi'),
                'spanish': user_query,
                'english': translate_query(user_query, 'en')
            }
    elif lang == "en":
        translated_queries = {
                'hindi': translate_query(user_query, 'hi'),
                'spanish': translate_query(user_query, 'es'),
                'english': user_query
            }
    else : # added this else, if there is a scenario of no language detected, translate query into all lang
        translated_queries = {
                'hindi': translate_query(user_query, 'hi'),
                'spanish': translate_query(user_query, 'es'),
                'english': translate_query(user_query, 'en')
        }

    return translated_queries

user_queries = ["How to make traditional Mexican guacamole?", "Receta de paella de mariscos", "बटर चिकन की रेसिपी"]
for q in user_queries:
    print(get_translated_queries(q))



The detected language for the query is:  en
{'hindi': 'पारंपरिक मैक्सिकन गुआकामोल कैसे बनाएं?', 'spanish': '¿Cómo hacer guacamole mexicano tradicional?', 'english': 'How to make traditional Mexican guacamole?'}


The detected language for the query is:  es
{'hindi': 'सीफूड पेला नुस्खा', 'spanish': 'Receta de paella de mariscos', 'english': 'Seafood Paella Recipe'}


The detected language for the query is:  hi
{'hindi': 'बटर चिकन की रेसिपी', 'spanish': 'Receta de pollo con mantequilla', 'english': 'Butter chicken recipe'}


In [103]:
# Load recipe JSON data
with open('recipes.json', 'r') as file:
    recipes = json.load(file)

In [82]:
# Embedding function for text
def embed_text(text, model):
    return model.encode(text)

In [75]:
import torch

In [81]:
# Function to embed a recipe
def embed_recipe(recipe, model):
    combined_text = f"{recipe['recipeName']} {' '.join(recipe['ingredients'])} {' '.join(recipe['instruction'])}"
    return embed_text(combined_text, model)

In [83]:
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

In [84]:
# Embedding recipes and building FAISS index
database_embeddings = [embed_recipe(recipe, model) for recipe in recipes]
dim = len(database_embeddings[0])
index = faiss.IndexFlatL2(dim)
index.add(np.array(database_embeddings).astype('float32'))

In [85]:
# Function to search the database
def search_database(query_embedding, index, database, k=5):
    _, indices = index.search(np.array([query_embedding]).astype('float32'), k)
    return [database[i] for i in indices[0]]

In [86]:
# from sentence_transformers import SentenceTransformer, util

# def calculate_ingredient_similarity(recipe_ingredients, query_keywords):
#     recipe_ingredients_str = ' '.join(recipe_ingredients)
#     query_keywords_str = ' '.join(query_keywords)

#     recipe_emb = model.encode(recipe_ingredients_str, convert_to_tensor=True)
#     query_emb = model.encode(query_keywords_str, convert_to_tensor=True)

#     similarity_score = util.pytorch_cos_sim(recipe_emb, query_emb).item()
#     return similarity_score

# def calculate_recipe_alignment(recipe, query_keywords):
#     recipe_tags_str = ' '.join(recipe)
#     query_keywords_str = ' '.join(query_keywords)

#     recipe_emb = model.encode(recipe_tags_str, convert_to_tensor=True)
#     query_emb = model.encode(query_keywords_str, convert_to_tensor=True)

#     alignment_score = util.pytorch_cos_sim(recipe_emb, query_emb).item()
#     return alignment_score


In [87]:
# def calculate_score(recipe, query_keywords, query_language):
#     ingredient_similarity = calculate_ingredient_similarity(recipe['ingredients'], query_keywords)
#     language_match = 1 if detect_language(recipe['recipeName']) == query_language else 0
#     recipe_alignment = calculate_recipe_alignment(recipe['instruction'], query_keywords)
#     score = (ingredient_similarity * 0.2) + (language_match * 0.4) + (recipe_alignment * 0.4)
#     return score

In [92]:
# from sentence_transformers import util

# def calculate_score(recipe, query_embedding, model):
#     # Embed the entire recipe content
#     recipe_embedding = embed_recipe(recipe, model)

#     # Calculate semantic similarity
#     semantic_similarity = util.pytorch_cos_sim(recipe_embedding, query_embedding).item()

#     # Old scoring components
#     ingredient_similarity = calculate_ingredient_similarity(recipe['ingredients'], query_embedding)
#     language_match = 1 if detect_language(recipe['recipeName']) == detect_language(query_embedding) else 0
#     recipe_alignment = calculate_recipe_alignment(recipe['instruction'], query_embedding)

#     # Combine scores with adjusted weights
#     score = (semantic_similarity * 0.5) + (ingredient_similarity * 0.2) + (language_match * 0.2) + (recipe_alignment * 0.1)
#     return score


In [95]:
from sentence_transformers import util

# Calculates ingredient similarity between recipe ingredients and query
def calculate_ingredient_similarity(recipe_ingredients, query_embedding, model):
    recipe_ingredients_str = ' '.join(recipe_ingredients)

    recipe_emb = model.encode(recipe_ingredients_str, convert_to_tensor=True)

    # Calculate cosine similarity between recipe and query embeddings
    similarity_score = util.pytorch_cos_sim(recipe_emb, query_embedding).item()

    return similarity_score

# Calculates alignment of the recipe with user's dietary preferences
def calculate_recipe_alignment(recipe_instructions, query_embedding, model):

    recipe_instructions_str = ' '.join(recipe_instructions)

    recipe_emb = model.encode(recipe_instructions_str, convert_to_tensor=True)

    # Calculate cosine similarity between recipe instructions and query embeddings
    alignment_score = util.pytorch_cos_sim(recipe_emb, query_embedding).item()
    return alignment_score


# Calculates the overall score for a recipe based on various components
def calculate_score(recipe, query_embedding, model):
    recipe_embedding = embed_recipe(recipe, model)
    # Calculate semantic similarity between recipe and query embeddings
    semantic_similarity = util.pytorch_cos_sim(recipe_embedding, query_embedding).item()
    # Calculate ingredient similarity
    ingredient_similarity = calculate_ingredient_similarity(recipe['ingredients'], query_embedding, model)
    language_match = 1 if detect_language(recipe['recipeName']) == detect_language(query_embedding) else 0
    # Calculate alignment with instructions
    recipe_alignment = calculate_recipe_alignment(recipe['instruction'], query_embedding, model)
    # Combine all scores with adjusted weights
    score = (semantic_similarity * 0.5) + (ingredient_similarity * 0.2) + (language_match * 0.2) + (recipe_alignment * 0.1)
    return score


In [96]:
from collections import defaultdict
# Function to search and score recipes
def search_and_score_recipes(query):
    translated_queries = get_translated_queries(query)

    # Score the search results
    scored_results = defaultdict(list)
    for lang, translated_query in translated_queries.items():
        # Embed the translated queries
        query_embedding = embed_text(translated_query, model)
        search_results = search_database(query_embedding, index, recipes)
        for recipe in search_results:
            score = calculate_score(recipe, query_embedding, model)
            scored_results[lang].append((recipe, score))

    # sort each result in the dictionary
    for lang, results in scored_results.items():
        scored_results[lang] = sorted(results, key=lambda x: x[1], reverse=True)
    return scored_results

In [108]:
user_query = "चिकन करी"
results = search_and_score_recipes(user_query)

for lang, result in results.items():
    print(f"===== Results for {lang.upper()} =====")
    for idx, (recipe, score) in enumerate(result, 1):
        print(f"{idx}. Recipe: {recipe['recipeName']}\n   Score: {score}\n")
    print("-" * 40)




The detected language for the query is:  hi
===== Results for HINDI =====
1. Recipe: रिसोटो
   Score: 0.5913658857345581

2. Recipe: टाकोस
   Score: 0.5747587442398071

3. Recipe: चिकन करी
   Score: 0.5744102895259857

4. Recipe: गजपाचो
   Score: 0.5739732503890992

5. Recipe: बिरयानी
   Score: 0.562332683801651

----------------------------------------
===== Results for SPANISH =====
1. Recipe: Pollo al Curry
   Score: 0.2990070402622223

2. Recipe: Burger
   Score: 0.28818393051624297

3. Recipe: Biryani
   Score: 0.28458072543144225

4. Recipe: Paneer Tikka
   Score: 0.2827745795249939

5. Recipe: Paella
   Score: 0.28202430009841917

----------------------------------------
===== Results for ENGLISH =====
1. Recipe: Chicken Curry
   Score: 0.4906628847122192

2. Recipe: Paella
   Score: 0.390840619802475

3. Recipe: Biryani
   Score: 0.3879463255405426

4. Recipe: Tacos
   Score: 0.3610280632972718

5. Recipe: Ramen
   Score: 0.3151261150836945

----------------------------------