In [11]:
import pandas as pd
import re

In [54]:
recipes_df = pd.read_csv('./data/recipes.csv')
comment_df = pd.read_csv('./data/reviews.csv')

In [55]:
comment_df.dropna(inplace=True)
recipes_df.dropna(inplace=True)

In [56]:
indexImage = recipes_df[( recipes_df['Images'] == 'character(0)')].index
recipes_df.drop(indexImage, inplace=True)

indexImage = recipes_df[(recipes_df['Images'] == 'c("")' )].index
recipes_df.drop(indexImage, inplace=True)


In [57]:
recipes_df.to_csv('./out/recipes.csv', sep=',', encoding='utf-8', index=False)

### Insert to database

In [4]:
import pymongo

In [59]:
import re

def convert_to_array(string):
    # Remove leading and trailing quotation marks if present
    string = string.strip('"\'')

    # Check if the input string is empty
    if not string:
        return []  # Return an empty list if the input is empty

    # Check if the input string contains "c(" and ")"
    if 'c(' in string and ')' in string:
        # Use regular expression to split the string at commas, considering quotes and spaces
        array = re.findall(r'"[^"]*"|[^",\s]+', string)

        array[:] = (value for value in array if value != '')        
        array[:] = (value for value in array if value != 'c(')        
        array[:] = (value for value in array if value != ')')        
        array[:] = (value for value in array if value != ' ')        
        # Remove any leading or trailing whitespace from each element
        array = [s.strip('"') for s in array]
        return list(set(array))
    else:
        return list(set([string]))  # Return the input string as a single-element list

In [70]:
client = pymongo.MongoClient("mongodb://localhost:27017")
db = client["food"] 
col = db["recipeDIP"] 

cleaned_recipe_collection = db['cleaned_recipe']

for recipe in col.find({}): 
    cleaned_recipe = {
        'RecipeId': recipe['RecipeId'] ,
        'Name': recipe['Name']  ,
        'AuthorId': recipe['AuthorId'] ,
        'AuthorName': recipe['AuthorName'] ,
        'CookTime':  recipe['CookTime'] ,
        'PrepTime':  recipe['PrepTime'],
        'TotalTime': recipe['TotalTime'] ,
        'DatePublished': recipe['DatePublished'] ,
        'Description': recipe['Description'],
        'Images': convert_to_array(recipe['Images']),
        'RecipeCategory': recipe['RecipeCategory'],
        'Keywords': convert_to_array(recipe['Keywords']),
        'RecipeIngredientQuantities': convert_to_array(recipe['RecipeIngredientQuantities']),
        'RecipeIngredientParts':  convert_to_array(recipe['RecipeIngredientParts']),
        'AggregatedRating': recipe['AggregatedRating'],
        'Calories': recipe['Calories'],
        'FatContent': recipe['FatContent'],
        'SaturatedFatContent': recipe['SaturatedFatContent'],
        'CholesterolContent': recipe['CholesterolContent'],
        'SodiumContent': recipe['SodiumContent'],
        'CarbohydrateContent': recipe['CarbohydrateContent'],
        'FiberContent': recipe['FiberContent'],
        'SugarContent': recipe['SugarContent'],
        'ProteinContent': recipe['ProteinContent'],
        'RecipeServings': recipe['RecipeServings'],
        'RecipeYield': recipe['RecipeYield'],
        'RecipeInstructions':  convert_to_array(recipe['RecipeInstructions']),
    }

    # print(cleaned_recipe)
    cleaned_recipe_collection.insert_one(cleaned_recipe)

In [72]:
client = pymongo.MongoClient("mongodb://localhost:27017")
db = client["food"] 
col = db["cleaned_recipe"] 
cleaned_recipe_collection = db['cleaned_recipe']

### Try Query

In [73]:
myquery = { "RecipeCategory": "Quick Breads" }
results = cleaned_recipe_collection.find(myquery).limit(5)

for match in results:
    print(match)

{'_id': ObjectId('65f8121ae3777c5d4262d1cb'), 'RecipeId': 153, 'Name': 'Amish Friendship Bread and Starter', 'AuthorId': 1540, 'AuthorName': 'gowiththefro', 'CookTime': 'PT1H10M', 'PrepTime': 'PT0S', 'TotalTime': 'PT1H10M', 'DatePublished': datetime.datetime(1999, 9, 6, 21, 41), 'Description': "Many recipes have been posted for the Amish bread, but none have included the starter, so I thought I'd send this along.  Happy baking!  Amish Friendship Bread and Starter", 'Images': ['https://img.sndimg.com/food/image/upload/w_555,h_416,c_fit,fl_progressive,q_95/v1/img/recipes/15/3/picldR2s3.jpg', 'https://img.sndimg.com/food/image/upload/w_555,h_416,c_fit,fl_progressive,q_95/v1/img/recipes/15/3/picjFhmLD.jpg', 'https://img.sndimg.com/food/image/upload/w_555,h_416,c_fit,fl_progressive,q_95/v1/img/recipes/15/3/picqwMJR8.jpg'], 'RecipeCategory': 'Quick Breads', 'Keywords': ['Breads', '< 4 Hours', 'Easy', 'Oven'], 'RecipeIngredientQuantities': ['1 1/2', '2', '1/2', '1', '3'], 'RecipeIngredientPar

### Put the recipe into elasticsearch

In [5]:
from elasticsearch import Elasticsearch
import json
from bson import ObjectId
from elasticsearch import helpers

In [6]:
class ElasticIndexer:
    def __init__(self):
        self.es_client = Elasticsearch('https://localhost:9200', basic_auth=("elastic", "azjt*71kG2zv+cVkTTD8"),
                    ca_certs="./http_ca.crt")
        self.mongo_client = pymongo.MongoClient("mongodb://localhost:27017")
        self.mongo_collection = self.mongo_client['food']['cleaned_recipe']

    def run_indexer(self):
        self.es_client.indices.create(index='recipe', ignore=400)
        self.es_client.indices.delete(index='recipe', ignore=[400, 404])
        actions = []
        
        for doc in self.mongo_collection.find({}):
            _id = str(doc['_id'])
            del doc["_id"]
            action = {
                "_index": 'recipe',
                "_source": doc
            }
            actions.append(action)

        response = helpers.bulk(self.es_client, actions)
        return response


In [8]:
es = ElasticIndexer()
es.run_indexer()

  self.es_client.indices.create(index='recipe', ignore=400)
  self.es_client.indices.delete(index='recipe', ignore=[400, 404])


(15462, [])

In [9]:
es_client = Elasticsearch('https://localhost:9200', basic_auth=("elastic", "azjt*71kG2zv+cVkTTD8"),
                    ca_certs="./http_ca.crt")

In [12]:
query = {
    "query_string": {
        "query": "Sweet Rolls Easy"
    }
}

results = es_client.search(index='recipe', query=query)
results_df = pd.DataFrame([[hit["_source"]['Name'], hit["_source"]['Keywords'], hit["_source"]
['RecipeIngredientParts'], hit["_score"]] for hit in results['hits']['hits']], columns=['Name', 'Keywords', 'Ingredient', 
'score'])
results_df

Unnamed: 0,Name,Keywords,Ingredient,score
0,Breakfast Egg Rolls,"[Brunch, < 60 Mins]","[onion, green pepper, eggs, cheese, water, mil...",13.042132
1,Honey Orange Butter,"[Beginner Cook, Inexpensive, < 15 Mins, Low Pr...","[honey, butter, margarine]",12.088933
2,Easy Crunchy Toffee,"[Easy, Beginner Cook, For Large Groups]","[butter, chocolate chips, brown sugar]",11.833335
3,The Best Sweet Potato Casserole,"[Beginner Cook, Inexpensive, Potato, Kid Frien...","[vanilla, butter, salt, brown sugar, eggs, wal...",10.93132
4,Sweet Kielbasa,"[Potluck, For Large Groups, Meat]","[kielbasa, brown sugar, water, apple cider, on...",10.792521
5,Hawaiian Bread Ham &amp; Cheese Rolls,[For Large Groups],"[butter, prepared yellow mustard, deli ham, sw...",10.75276
6,Sweet Potato Pancakes With Caramel Sauce,"[< 30 Mins, Potato, Yam/Sweet Potato, Vegetabl...","[butter, canned sweet potatoes, ground allspic...",10.670794
7,Sweet Rolls,"[Sweet, Kid Friendly, Thanksgiving, For Large ...","[active dry yeast, butter, ground cinnamon, sa...",10.655209
8,Mini Cajun Burgers With Easy R&eacute;moulade,"[Cajun, < 30 Mins]","[creole mustard, sausage, green leaf lettuce, ...",10.290481
9,Coconut Shrimp With Guava Sweet and Sour Sauce,"[< 30 Mins, Summer, Polynesian, Fruit, Weeknight]","[coconut flakes, panko breadcrumbs, white vine...",10.101153


### Recommendations

In [105]:
# Get top 5 similar to the selected dish

query = { 
   "more_like_this":{
    # Breakfast Eggcake ID:'65d5e4928598535be43ec668' 
    "fields":["Name", "Keywords", "RecipeIngredientParts", "RecipeCategory"],"like":[{"_id": '65d5e4928598535be43ec668'}],"min_term_freq":1,"min_doc_freq":5,"max_query_terms":20
    }
}

results = es_client.search(index='recipe', query=query, size=5)
results_df = pd.DataFrame([[hit["_source"]['Name'], hit["_source"]['Keywords'], hit["_source"]
['RecipeIngredientParts'], hit["_score"]] for hit in results['hits']['hits']], columns=['Name', 'Keywords', 'Ingredient', 
'score'])
results_df

Unnamed: 0,Name,Keywords,Ingredient,score


### Personalized by interestedCategory

In [85]:
users = db['users']

user = users.find({'_id': ObjectId('65f017e507c1a9b66d854764')})
user_df = pd.DataFrame(user)

In [86]:
user_df['interestedCategory'][0]

KeyError: 'interestedCategory'

In [106]:
query_term="pancoke"
query={
            
                "query":{
                    "dis_max": {
                        "queries": [
                            { "match": { "Name": query_term }},
                            { "match": { "RecipeIngredientParts": query_term }},
                            { "match": { "RecipeInstructions": query_term }},
                        ],
                        "tie_breaker": 0.3
                    },
                },
                
            
        }

results = es_client.search(index='recipe', body=query, size=10000)
results_df = pd.DataFrame([[hit["_source"]['Name'], hit["_source"]['Keywords'], hit["_source"]
['RecipeIngredientParts'], hit["_score"]] for hit in results['hits']['hits']], columns=['Name', 'Keywords', 'Ingredient', 
'score'])
results

  results = es_client.search(index='recipe', body=query, size=10000)


ObjectApiResponse({'took': 5, 'timed_out': False, '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0}, 'hits': {'total': {'value': 0, 'relation': 'eq'}, 'max_score': None, 'hits': []}})

### Score by weight

In [None]:
query = {
    "dis_max": {
        "queries": [
            {
                "more_like_this": {
                    "fields": ["Name", "Keywords", "RecipeIngredientParts", "RecipeCategory"],
                    # Anzac Biscuits
                    "like": [{"_id": "65d5e48d8598535be43eb8f3"}],
                    "min_term_freq": 1,
                    "min_doc_freq": 5,
                    "max_query_terms": 20,
                    "boost": 0.2
                }
            },
            {
                "more_like_this": {
                    "fields": ["Name", "Keywords", "RecipeIngredientParts", "RecipeCategory"],
                    # TOFU KEBAB
                    "like": [{"_id": "65d5e48d8598535be43eb8f1"}],
                    "min_term_freq": 1,
                    "min_doc_freq": 5,
                    "max_query_terms": 20,
                    "boost": 0.5
                }
            }
            #   More Recipe in bookmarks 
            #   Does boost make sense or not?
        ]
    }
}


results = es_client.search(index='recipe', query=query, size=15)
results_df = pd.DataFrame([[hit["_source"]['Name'], hit["_source"]['Keywords'], hit["_source"]
['RecipeIngredientParts'], hit["_score"]] for hit in results['hits']['hits']], columns=['Name', 'Keywords', 'Ingredient', 
'score'])
results_df

Unnamed: 0,Name,Keywords,Ingredient,score
0,Healthy Vegetable Tofu Kebabs,"[< 30 Mins, Peppers, Vegetable, Beans]","[firm tofu, salt, green pepper, button mushroo...",32.092094
1,Tofu and Bok Choy Stir Fry,"[Vegan, < 15 Mins, Chinese, Vegetable, Low Cho...","[bok choy, low sodium soy sauce, extra firm to...",24.713129
2,Tasty Tofu Veggie Scramble,"[Vegan, < 30 Mins, Lactose Free, Free Of..., E...","[garlic powder, green bell pepper, low sodium ...",23.104267
3,Pf Chang's Tofu Lettuce Wraps,"[Vegan, Asian, Beans, < 60 Mins]","[garlic, brown sugar, extra firm tofu, water, ...",23.036478
4,Spicy &quot;Barbecued&quot; Tofu Burgers,"[Soy/Tofu, Inexpensive, Broil/Grill, Vegetable...","[pickle, barbecue sauce, lemon-pepper seasonin...",22.06025
5,Barbecue Tofu,"[Beginner Cook, Inexpensive, Brunch, Vegan, Su...","[firm tofu, onion powder, blackstrap molasses,...",21.835814
6,Vegan Baked Tofu Strips,"[Vegan, Microwave, Free Of..., < 60 Mins, Potl...","[panko breadcrumbs, sea salt, poultry seasonin...",21.595547
7,"Sydney Broccoli, Red Pepper &amp; Tofu Stir Fr...","[Soy/Tofu, From Scratch, Peppers, < 30 Mins, S...","[extra firm tofu, scallions, reduced sodium so...",19.839397
8,Chinese Bourbon Tofu,"[Soy/Tofu, Vegan, Chinese, Low Cholesterol, As...","[garlic, low sodium soy sauce, brown sugar, ex...",19.690624
9,Tofu Fish (Or Not-Fish),"[Beginner Cook, Inexpensive, < 30 Mins, Free O...","[firm tofu, nori, paprika, wheat germ, polenta...",19.055569


### Not interested topic?

In [None]:
un_prefer = set(user['uninterestedCategory'])   # boost down
prefer = set(user['interestedCategory'])  - un_prefer # boost up

query = {
    "dis_max": {
        "queries": [
            {
                "boosting": {
                    "positive" : {
                        "match": {"Keywords": ' '.join(prefer)},
                        # "term": {"_id": "65d5e49a8598535be43ee02b"}
                    },
                    "negative": {
                        "match": {"Keywords": ' '.join(un_prefer)}
                    },
                    "negative_boost": 0.5
                }
            },
            {
                "more_like_this": {
                    "fields": ["Name", "Keywords", "RecipeIngredientParts", "RecipeCategory"],
                    # TOFU KEBAB
                    "like": [{"_id": "65d5e48d8598535be43eb8f1"}],
                    "min_term_freq": 1,
                    "min_doc_freq": 5,
                    "max_query_terms": 20,
                    "boost": 1.1
                }
            },
                        {
                "more_like_this": {
                    "fields": ["Name", "Keywords", "RecipeIngredientParts", "RecipeCategory"],
                    # Anzac Biscuits
                    "like": [{"_id": "65d5e48d8598535be43eb8f3"}],
                    "min_term_freq": 1,
                    "min_doc_freq": 5,
                    "max_query_terms": 20,
                    "boost": 1.1
                }
            },
            # {
            #     "query_string": {
            #         "query": ' '.join(prefer),
            #         "boost": 1.2
            #     }
            # }
            # {
            #     "query_string": {
            #         "query": ' '.join(un_prefer),
            #         "boost": -1
            #     }                
            # }
        ]
    }
}

results = es_client.search(index='recipe', query=query, size=100)
results_df = pd.DataFrame([[hit["_source"]['Name'], hit["_source"]['Keywords'], hit["_source"]
['RecipeIngredientParts'], hit["_score"]] for hit in results['hits']['hits']], columns=['Name', 'Keywords', 'Ingredient', 
'score']).sort_values(by=['score'], ascending=True)

# results_df
print('Unprefer recipe score')
print(results_df.head().to_markdown())
print('Prefer recipe score')
print(results_df.tail().to_markdown())

Unprefer recipe score
|    | Name                                    | Keywords                                                                                                                                                      | Ingredient                                                                                                                                                                                                  |   score |
|---:|:----------------------------------------|:--------------------------------------------------------------------------------------------------------------------------------------------------------------|:------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|--------:|
| 99 | Sultana Bran Cookies                    | ['Beginner Cook', '< 30 Mins', 'Kid Friendly', 'Cookie & Brownie', 'Easy']     

In [None]:
user = users.find_one({'_id': ObjectId('65f017e507c1a9b66d854764')})
queries = []

prefer = set(user['interestedCategory']) # boost up

un_prefer = set(user['uninterestedCategory']) -  prefer # boost down

if len(user['interestedCategory']) > 0:
    queries.append(
        {
            
            "boosting": {
                "positive" : {
                    "match": {"Keywords": ' '.join(prefer)},
                },
                "negative": {
                    "match": {"Keywords": ' '.join(un_prefer)}
                },
                "negative_boost": 0.5
            }
        
    })

else:
    queries.append({
        "function_score": {
            "query": {"match_all": {}},
            "random_score": {}
        }
    })

if len(user['interestedRecipe']) > 0:
    like = []
    for recipe in user['interestedRecipe']:
        like.append({'_id': recipe})
    queries.append({
        'more_like_this': {
            "fields": ["Name", "Keywords", "RecipeIngredientParts", "RecipeCategory"],
            "like": like, "min_term_freq": 1, "min_doc_freq": 5, "max_query_terms": 20,
            'boost': 1
        }
    })


query = {"dis_max": {
        "queries": queries
}}

query

{'dis_max': {'queries': [{'boosting': {'positive': {'match': {'Keywords': 'Oven Beginner Cook Easy'}},
     'negative': {'match': {'Keywords': 'Australian Freezer'}},
     'negative_boost': 0.5}},
   {'more_like_this': {'fields': ['Name',
      'Keywords',
      'RecipeIngredientParts',
      'RecipeCategory'],
     'like': [{'_id': '65d5e49a8598535be43ee02b'},
      {'_id': '65d5e49a8598535be43ee18a'},
      {'_id': '65d5e48f8598535be43ebdc9'},
      {'_id': '65d5e4918598535be43ec36a'},
      {'_id': '65d5e48e8598535be43eb9f0'},
      {'_id': '65d5e4998598535be43edcd9'},
      {'_id': '65d5e49f8598535be43ef357'},
      {'_id': '65d5e49d8598535be43eed42'},
      {'_id': '65d5e4988598535be43eda42'}],
     'min_term_freq': 1,
     'min_doc_freq': 5,
     'max_query_terms': 20,
     'boost': 1}}]}}

In [None]:
results = es_client.search(index='recipe', query=query, size=100)
results_df = pd.DataFrame([[hit["_source"]['Name'], hit["_source"]['Keywords'], hit["_source"]
['RecipeIngredientParts'], hit["_score"]] for hit in results['hits']['hits']], columns=['Name', 'Keywords', 'Ingredient', 
'score']).sort_values(by=['score'], ascending=False)

results_df

Unnamed: 0,Name,Keywords,Ingredient,score
0,Sammy's Beef and Sausage Meatloaf,"[Easy, Oven, Beginner Cook, < 4 Hours]","[garlic powder, water, ketchup, seasoning salt...",7.335291
2,Crab Casserole (Chinese Buffet),"[Easy, Oven, Beginner Cook, < 60 Mins]","[butter, salt, onions, parsley, lump crabmeat,...",7.335291
3,Bea's Banana Bread,"[Easy, Oven, Beginner Cook, < 4 Hours]","[bananas, Miracle Whip, egg, baking soda, flou...",7.335291
4,Salmon Puffs,"[Easy, Oven, Beginner Cook, < 60 Mins]","[salmon, garlic powder, salt, eggs, parsley, l...",7.335291
5,Cod Vera Cruz,"[Easy, Oven, Beginner Cook, < 30 Mins]","[garlic powder, condensed chicken broth, dried...",7.335291
...,...,...,...,...
91,Cinn-ful Fudgy Rum Pudding Cake,"[Beginner Cook, Kid Friendly, Low Protein, Eas...","[butter, salt, cocoa powder, ground cloves, va...",5.728935
90,Spoon Rolls,"[Beginner Cook, Inexpensive, Brunch, < 30 Mins...","[dry yeast, margarine, egg, self rising flour,...",5.728935
89,Creamy Lemon Meringue Pie,"[Beginner Cook, < 30 Mins, Spring, Summer, Fru...","[eggs, cream of tartar, lemon juice, sweetened...",5.728935
87,BBQ Muffins,"[Beginner Cook, < 30 Mins, Easy, Meat, Kid Fri...","[brown sugar, cider vinegar, chili powder, ket...",5.728935


### Ranking evaluation 

In [87]:
query = {
   "more_like_this":{
        "fields":["Name", "Keywords", "RecipeIngredientParts", "RecipeCategory"],"like":[{"_id": "65d5e4928598535be43ec8fc"}],"min_term_freq":1,"min_doc_freq":5,"max_query_terms":20
    }
}

ratings = [
     { "_index": "recipe", "_id": "65d5e4928598535be43ec668", "rating": 0 },
     { "_index": "recipe", "_id": "65d5e48e8598535be43eba08", "rating": 3 },
     { "_index": "recipe", "_id": "65d5e4918598535be43ec3eb", "rating": 0 },
     { "_index": "recipe", "_id": "65d5e4928598535be43ec8fc", "rating": 5 },
     { "_index": "recipe", "_id": "65d5e49a8598535be43ee0c9", "rating": 2 },
     { "_index": "recipe", "_id": "65d5e49a8598535be43ee26f", "rating": 1 },
     { "_index": "recipe", "_id": "65d5e49b8598535be43ee65e", "rating": 4 },
     { "_index": "recipe", "_id": "65d5e49b8598535be43ee374", "rating": 5 },
]

requests = [
    { "id": "Query_1", "request": {"query": query} , 'ratings': ratings}
]

metric =  {
    "dcg": {
      "k": 20,
      "normalize": False
    }
}

results = es_client.rank_eval(index='recipes', requests=requests, metric= metric)
# mertic_score_df = pd.DataFrame()

# mertic_score_df
results['metric_score']

'NaN'

In [88]:
results_df = pd.DataFrame([[hit['hit']["_id"], hit['hit']["_score"], hit["rating"]] for hit in results['details']['Query_1']['hits']], columns=['ID', 'Score', 'Rating'])

results_df

KeyError: 'Query_1'