In [1]:
# prompt: install faiss

!pip install faiss-cpu

Collecting faiss-cpu
  Downloading faiss_cpu-1.8.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (27.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.0/27.0 MB[0m [31m21.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: faiss-cpu
Successfully installed faiss-cpu-1.8.0


In [2]:
%pip install fuzzywuzzy

Collecting fuzzywuzzy
  Downloading fuzzywuzzy-0.18.0-py2.py3-none-any.whl (18 kB)
Installing collected packages: fuzzywuzzy
Successfully installed fuzzywuzzy-0.18.0


In [None]:
!pip install keybert

In [4]:
# Get recipe dataset
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
import pandas as pd
from pandas import DataFrame
from transformers import BertTokenizer, BertModel
import torch
import numpy as np
import faiss
from keybert import KeyBERT
import time

In [58]:
# Read Recipes and Filter by Calories_Per_Meal
recipes = pd.read_csv('/content/drive/My Drive/CS329_Project/scraped-07-05-21.csv') # make path name your own.

selected_recipes = []

# search key words for dietary restriction, name + recipe to see if a recipe is... options for dietary restriction
calories_per_meal = 800
caloric_multiplier = 0.2
caloric_deviation = calories_per_meal * caloric_multiplier
min_calories_per_meal, max_calories_per_meal = calories_per_meal - caloric_deviation, calories_per_meal + caloric_deviation

null_calories_count = recipes['calories'].isnull().sum()
# print(null_calories_count)
selected_recipes_df = recipes[(recipes['calories'] >= min_calories_per_meal) & (recipes['calories'] <= max_calories_per_meal)]
selected_recipes_df.reset_index(drop=True, inplace=True)

In [None]:
for index, recipe in selected_recipes_df.iterrows():
  recipe_info = recipe['summary'] + ' ' + recipe['name']
  if 'gluten' in recipe_info:
    print(recipe_info)

In [60]:
# For testing: To Select Random Recipes or Recipes in Order

def select_recipes(recipes: DataFrame, n: int, random_selection: bool) -> DataFrame:
  if n > len(recipes):
    raise ValueError("Choose smaller n")

  if random_selection:
    selected_recipes = recipes.sample(n=n, random_state=42)  # Use a fixed random state for reproducibility
  else:
    selected_recipes = recipes.head(n)
  return selected_recipes

recipe_count = 1000
random_selection = False
sample_recipes = select_recipes(selected_recipes_df, recipe_count, random_selection)
# print(sample_recipes)

In [61]:
RECIPE_KEYWORDS = [] # pre-calculate the keywords to make KeyBERT methods faster. GLOBAL
keybert_model = KeyBERT() # dont repeat instances

# Extract keywords for each recipe in 'first100'
for _, recipe in sample_recipes.iterrows():
    summary           = recipe['summary']
    title             = recipe['name']
    title_and_summary = title + " " + summary
    RECIPE_KEYWORDS.append(keybert_model.extract_keywords(title_and_summary))

In [15]:
# BERT AND COS

# Need to implement NER/Topic modeling first, both on the user's preferences and the filtered recipes, ideally it would speed up the process
# Sample BERT model that ranks recipes based on the user's preferences


from transformers import BertTokenizer, BertModel
import torch
from sklearn.metrics.pairwise import cosine_similarity

class RecipeRankerCos:
    def __init__(self):
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        self.model = BertModel.from_pretrained('bert-base-uncased')
        self.model.eval()  # Set model to evaluation mode

    def encode(self, text):
        """
        Encodes a given text into embeddings using BERT.
        """
        inputs = self.tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
        with torch.no_grad():
            outputs = self.model(**inputs)
        # Use the pooler output (representing te entire sentence) for simplicity
        return outputs.pooler_output

    def rank_recipes_by_taste_profile(self, recipes, taste_profile):
        """
        Ranks recipes by their cosine similarity to the user's taste profile.
        """
        # Convert the user's taste profile into an embedding
        taste_profile_embedding = self.encode(taste_profile)

        # Calculate similarity scores and rank recipes
        ranked_recipes = []
        for recipe in recipes.iterrows():
            recipe_embedding = self.encode(recipe[1]['summary'])
            similarity_score = cosine_similarity(taste_profile_embedding, recipe_embedding)
            ranked_recipes.append((recipe, similarity_score.item()))

        # Sort recipes based on similarity score
        ranked_recipes.sort(key=lambda x: x[1], reverse=True)



        # Return the sorted list of recipes
        return [recipe for recipe, score in ranked_recipes]

In [16]:
# BERT AND FAISS

class RecipeRankerFAISS:
    def __init__(self):
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        self.model = BertModel.from_pretrained('bert-base-uncased')

        self.model.eval()  # Set model to evaluation mode

    def encode(self, text):
        """
        Encodes a given text into embeddings using BERT.
        """
        inputs = self.tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
        with torch.no_grad():
            outputs = self.model(**inputs)
        # Use the pooler output (representing the entire sentence) for simplicity
        return outputs.pooler_output.numpy()

    def rank_recipes_by_taste_profile(self, recipes, taste_profile):
        """
        Ranks recipes by their cosine similarity to the user's taste profile.
        """
        # Convert the user's taste profile into an embedding
        taste_profile_embedding = self.encode(taste_profile)

        # Encode all recipes into embeddings
        recipe_embeddings = np.vstack([self.encode(summary) for summary in recipes['summary']])

        # Setup FAISS index
        d = taste_profile_embedding.shape[1]  # Dimension of the embeddings
        index = faiss.IndexFlatL2(d)
        index.add(recipe_embeddings)

        # Perform a search to find the most similar recipes
        k = 10  # Number of recipes to retrieve
        _, indices = index.search(taste_profile_embedding, k)

        # Retrieve the ranked recipes
        ranked_recipes = recipes.iloc[indices[0]]

        return ranked_recipes

In [42]:
from keybert import KeyBERT
from sklearn.metrics.pairwise import cosine_similarity

class RecipeRankerKeyBERT:
    def __init__(self):
        self.kw_model = KeyBERT()

    def rank_recipes_by_taste_profile(self, recipes, taste_profile):
        """
        Ranks recipes by their KeyBERT similarity to the user's taste profile.
        """
        ranked_indices = []
        # Extract keywords from the taste profile

        taste_keywords = self.kw_model.extract_keywords(taste_profile)
        key_scores = [score for _, score in taste_keywords]

        for index in range(len(RECIPE_KEYWORDS)):
          recipe_tuples = RECIPE_KEYWORDS[index]
          recipe_scores = [score for _, score in recipe_tuples] # extract scores
          if len(recipe_scores) >= 5:
            similarity_score = cosine_similarity([key_scores], [recipe_scores])[0][0]
            ranked_indices.append((index, similarity_score))

        ranked_indices.sort(key=lambda x: x[1], reverse=True)
        return [index for index, _ in ranked_indices]

In [18]:
import numpy as np
import faiss
from sklearn.metrics.pairwise import cosine_similarity

class RecipeRankerKeyBERT_FAISS:
    def __init__(self):
        self.kw_model = KeyBERT()

    def rank_recipes_by_taste_profile(self, recipes, taste_profile):
        """
        Ranks recipes by their KeyBERT similarity to the user's taste profile using FAISS.
        """

        # Encode the taste profile
        taste_keywords = self.kw_model.extract_keywords(taste_profile)
        taste_vector = np.array([score for _, score in taste_keywords], dtype=np.float32)
        # recipe_embeddings = np.vstack([np.array([score for _, score in self.kw_model.extract_keywords(summary)], dtype=np.float32) for summary in recipes['summary']])

        recipe_embeddings = []

        for index in range(len(RECIPE_KEYWORDS)):
          recipe_tuples = RECIPE_KEYWORDS[index]
          recipe_scores = [score for _, score in recipe_tuples] # extract scores
          if len(recipe_scores) >= 5:
            recipe_embeddings.append(recipe_scores)

        recipe_embeddings = np.array(recipe_embeddings, dtype=np.float32)

        # Setup FAISS index
        d = recipe_embeddings.shape[1]  # Dimension of the embeddings
        index = faiss.IndexFlatL2(d)
        index.add(recipe_embeddings)

        # Perform a search to find the most similar recipes
        k = 10  # Number of recipes to retrieve
        _, indices = index.search(np.expand_dims(taste_vector, axis=0), k)

        # Retrieve the ranked recipes
        ranked_indices = indices[0]

        # Return the sorted list of recipes
        return ranked_indices


In [50]:
from fuzzywuzzy import fuzz

class RecipeRankerKeyBERTfuzzy:
    def __init__(self):
        self.kw_model = KeyBERT()

    def rank_recipes_by_taste_profile(self, recipes, taste_profile) -> list:
        """
        Ranks recipes by their KeyBERT similarity to the user's taste profile.
        """

        similarity_scores = []

        taste_keywords = self.kw_model.extract_keywords(taste_profile)
        key_words  = [word for word, _ in taste_keywords]

        start = time.time()

        for index in range(len(RECIPE_KEYWORDS)):
          recipe_tuples = RECIPE_KEYWORDS[index]
          recipe_words = [word for word, _ in recipe_tuples] # extract words
          denom = len(key_words)

          if denom != 0: # avoid div by 0
            score = 0
            for key_word in key_words:
              score += fuzz.partial_ratio(key_word, recipe_words) / denom
            similarity_scores.append((index, score))

        end = time.time()
        print(f'Running Time: {end - start}')

        similarity_scores.sort(key=lambda x: x[1], reverse=True) # sort by highest mean
        # print(similarity_scores[:10])

        top_indices = [index for index, _ in similarity_scores]

        return top_indices


In [63]:
def top10(ranked_recipes):
  count = 0
  for index, recipe in ranked_recipes:
        print(recipe[0])
        # print(recipe[4])
        count +=1
        if count == 10:
          break

def top10_indices(ranked_indices):
  count = 0
  for index in ranked_indices:
    print(sample_recipes['name'][index])
    # print(sample_recipes['summary'][index])
    count += 1
    if count >= 10: break

In [52]:
taste_profile = "I want a dish that is Asian and has beef, vegetables, and rice"

ranker_KeyBERT_fuzzy = RecipeRankerKeyBERTfuzzy()
RR_KeyBERT_fuzzy_indices = ranker_KeyBERT_fuzzy.rank_recipes_by_taste_profile(sample_recipes, taste_profile)

Running Time: 0.9597389698028564


In [65]:
taste_keywords = keybert_model.extract_keywords(taste_profile)
key_words  = [word for word, _ in taste_keywords]

print(key_words)
print()
print('========= KEYBERT FUZZY =============')
print()
top10_indices(RR_KeyBERT_fuzzy_indices)

['rice', 'dish', 'asian', 'vegetables', 'beef']


Malaysian Beef Rendang
Italian Rice Balls
Baked Rice (Ross Fil-Forn)
Easy After Work Chicken Francaise
Corned Beef and Cabbage I
Authentic Seafood Paella
Easy Smoked Sausage Skillet
Best Bobotie
Pasta Verde
Chef John's Lasagna


In [37]:
# FUZZY TESTING for Best Method of fuzz string matching

recipe_words1 = ['rice', 'dish', 'asian', 'beef', 'vegetables']
recipe_words2 = ['oatmeal', 'cookies', 'raisins', 'moist', 'soft']
recipe_words3 = ['recipe', 'chicken', 'ingredient', 'baked', 'cooked']
recipe_words4 = ['cake', 'cupcakes', 'white', 'simplest', 'tasting']
recipe_words5 = ['banana', 'bread', 'recipe', 'joy', 'seconds']
recipe_words6 = ['tortellini', 'pesto', 'salads', 'salad', 'pasta']
recipe_words7 = ['rice', 'restaurant', 'peas', 'vegetables', 'carrots']

user_words    = ['rice', 'dish', 'asian', 'vegetables', 'beef']

denom = 25
total_score1, total_score2 = 0, 0
for word in user_words:
  for word2 in user_words:
    total_score1 += fuzz.partial_ratio(word, word2) / 5
for word in user_words:
  for word2 in recipe_words1:
    total_score2 += fuzz.partial_ratio(word, word2) / 5

print(f'Total Score 1: {total_score1}')
print(f'Total Score 2: {total_score2}')
print()

total_score1, total_score2 = 0, 0
for word in user_words:
  total_score1 += fuzz.partial_ratio(word, user_words)
for word in user_words:
  total_score2 += fuzz.partial_ratio(word, recipe_words1)

print(f'Total Score 1: {total_score1 / 5}')
print(f'Total Score 2: {total_score2 / 5}')
print()
# print(f'Ratio: {total_score / total_score2}')

total_score1, total_score2 = 0, 0

total_score1 = fuzz.partial_ratio(user_words, user_words)
total_score2 = fuzz.partial_ratio(user_words, recipe_words1)

print(f'Total Score 1: {total_score1}')
print(f'Total Score 2: {total_score2}')

# partial ratio:
# "rice"
# "riceeeesnenw"
# Ratio: 100

Total Score 1: 199.2
Total Score 2: 199.2

Total Score 1: 100.0
Total Score 2: 100.0

Total Score 1: 100
Total Score 2: 83
