In [2]:
import ast

import numpy as np
import pandas as pd


In [3]:
# Read the dataset from data/final_df.csv and display its (rows, columns) shape.
df = pd.read_csv(filepath_or_buffer="data/final_df.csv")
df.shape

(32696, 50)

In [4]:
def _ingredients_to_text(raw):
    """Join the ingredients of one row into a single space-separated string.

    Args:
        raw: Either the string representation of a Python list of ingredient
            names, or an already-parsed iterable of ingredient strings.

    Returns:
        The ingredient names joined with single spaces.
    """
    # SECURITY: cell text is parsed with ast.literal_eval rather than eval().
    # literal_eval only accepts Python literals, so text stored in the CSV
    # cannot execute arbitrary code when the notebook runs.
    items = ast.literal_eval(raw) if isinstance(raw, str) else raw
    return ' '.join(items)


# Flatten each ingredient list into plain text so it can be vectorized.
df['high_level_ingredients_str'] = df['high_level_ingredients'].apply(_ingredients_to_text)

# One free-text document per recipe: "<name> <category> <ingredients>".
# Iterating the three columns in lockstep avoids materialising a Series for
# every row, which a row-wise DataFrame.apply(axis=1) would do.
df['features'] = [
    f"{name} {category} {ingredients}"
    for name, category, ingredients in zip(
        df['name'], df['category'], df['high_level_ingredients_str']
    )
]
df['features']

0        Simple Macaroni and Cheese main-dish elbow all...
1        Gourmet Mushroom Risotto main-dish chicken bro...
2        Dessert Crepes breakfast-and-brunch all - purp...
3        Pork Steaks meat-and-poultry soy sauce bunch g...
4        Chicken Parmesan world-cuisine bread crumbs al...
                               ...                        
32691    Spicy Deviled Eggs appetizers-and-snacks Worce...
32692    Nori Chips appetizers-and-snacks salt olive oi...
32693    Deep Fried Jalapeno Slices appetizers-and-snac...
32694    Jalapeno Hummus appetizers-and-snacks canned j...
32695    Easy Baked Zucchini Chips appetizers-and-snack...
Name: features, Length: 32696, dtype: object

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA

# Vectorize the free-text recipe features with TF-IDF. Both `tfidf` and
# `features_tfidf` are reused by the search functions defined further down.
tfidf = TfidfVectorizer()
features_tfidf = tfidf.fit_transform(df['features'])

# Reduce dimensionality using PCA (50 components).
# NOTE: .toarray() materialises the full TF-IDF matrix as a dense array, which
# is memory-hungry when the vocabulary is large.
# random_state is pinned so the projection is reproducible from run to run;
# depending on the input shape PCA may pick a randomized SVD solver.
pca = PCA(n_components=50, random_state=100)
features_pca = pca.fit_transform(features_tfidf.toarray())

In [6]:
# from sklearn.cluster import KMeans

# kmeans = KMeans(n_clusters=20, random_state=100) 
# df['cluster'] = kmeans.fit_predict(features_pca)

# df[['features', 'cluster']].head()

In [7]:
from sklearn.metrics.pairwise import cosine_similarity

# Disable column-width truncation so long cell values print in full
pd.options.display.max_colwidth = None

def find_top_k_similar_recipes(query, k=5):
    """Return the ``k`` recipes whose TF-IDF features are most similar to ``query``.

    Args:
        query: Free-text search string. It is vectorized with the module-level
            ``tfidf`` vectorizer, which must already be fitted.
        k: Maximum number of recipes to return. Values <= 0 yield an empty
            result.

    Returns:
        A ``(recipes, scores)`` tuple. ``recipes`` is a DataFrame holding the
        ``name``, ``category`` and ``high_level_ingredients`` columns of the
        best matches, ordered from most to least similar. ``scores`` is the
        array of their cosine similarities, in the same order.
    """
    query_tfidf = tfidf.transform([query])

    # One cosine similarity per row of `features_tfidf`.
    similarities = cosine_similarity(query_tfidf, features_tfidf).flatten()

    # Reverse first, slice afterwards: `argsort()[-k:]` would return *every*
    # row for k == 0 because `-0` is the same slice start as `0`. max() also
    # keeps a negative k from producing a surprising partial slice.
    top_k_indices = similarities.argsort()[::-1][:max(k, 0)]

    return df.iloc[top_k_indices][['name', 'category', 'high_level_ingredients']], similarities[top_k_indices]

# Demo: plain TF-IDF retrieval for a free-text query.
query = "chickpea stew without dairy"
top_k_recipes, scores = find_top_k_similar_recipes(query, k=5)

print(f"Query: {query}")
# `scores` is zipped in only so the loop stops after as many rows as there are
# scores; the score itself is not printed here.
for rank, (name, category, _score) in enumerate(
    zip(top_k_recipes['name'], top_k_recipes['category'], scores), start=1
):
    print(f"Rank {rank}: {name} (Category: {category})")

print(top_k_recipes)

Query: chickpea stew without dairy
Rank 1: Italian Chickpea Bread (Category: bread)
Rank 2: Vegan Chickpea Curry without Coconut Milk (Category: world-cuisine)
Rank 3: Dairy-Free Vanilla Frosting (Category: desserts)
Rank 4: Roasted Garlic without Foil (Category: side-dish)
Rank 5: Dairy-Free Scalloped Potatoes (Category: side-dish)
                                            name       category  \
4430                      Italian Chickpea Bread          bread   
6174   Vegan Chickpea Curry without Coconut Milk  world-cuisine   
4744                 Dairy-Free Vanilla Frosting       desserts   
21540                Roasted Garlic without Foil      side-dish   
21769              Dairy-Free Scalloped Potatoes      side-dish   

                                                                                                                                           high_level_ingredients  
4430                                                           ['water', 'chickpea flour', 'p salt

In [8]:
def check_dietary_restrictions(ingredients, restrictions):
    """Return True when no ingredient conflicts with any requested restriction.

    Matching is case-insensitive and substring-based: the keyword 'cheese'
    rejects 'cheddar cheese' and 'egg' rejects 'eggs'. This deliberately errs
    on the side of rejecting a recipe, so look-alikes such as 'eggplant' or
    'coconut milk' are rejected as well.

    Args:
        ingredients: Iterable of ingredient strings for one recipe.
        restrictions: Iterable of restriction names. Recognised names are
            'vegan', 'vegetarian', 'gluten free', 'lactose free' and
            'peanut free'; case is ignored and hyphens and spaces are
            interchangeable (so 'lactose-free' and 'Gluten-Free' both work).
            Unrecognised names are ignored.

    Returns:
        False if any ingredient contains a keyword banned by one of the
        restrictions, True otherwise.
    """
    # Common non-vegetarian ingredients
    non_vegetarian = ['meat', 'chicken', 'beef', 'pork', 'fish']
    # Common dairy ingredients
    dairy = ['milk', 'cheese', 'cream', 'butter', 'yogurt']
    # Banned keywords per restriction, keyed by the normalized restriction name.
    banned_by_restriction = {
        # Common non-vegan ingredients
        'vegan': ['milk', 'cheese', 'cream', 'butter', 'egg', 'honey',
                  'meat', 'chicken', 'beef', 'pork', 'fish'],
        'vegetarian': non_vegetarian,
        # Common gluten ingredients
        'gluten free': ['flour', 'bread', 'pasta', 'wheat', 'barley', 'rye'],
        'lactose free': dairy,
        # Peanut ingredients
        'peanut free': ['peanut', 'peanut butter'],
    }

    ingredients_lower = [ing.lower() for ing in ingredients]

    for restriction in restrictions:
        # Normalize 'Lactose-Free' / 'lactose  free' -> 'lactose free'.
        key = ' '.join(restriction.lower().replace('-', ' ').split())
        banned = banned_by_restriction.get(key, ())
        # Substring test against each ingredient phrase; an exact list
        # membership test would almost never fire because recipe ingredients
        # are phrases such as 'cheddar cheese', not bare keywords.
        if any(keyword in ing for ing in ingredients_lower for keyword in banned):
            return False
    return True

In [12]:
def check_ingredients_match(recipe_ingredients, have_ingredients, avoid_ingredients):
    """Reject recipes containing avoided ingredients and score pantry overlap.

    Matching is case-insensitive and substring-based for both the avoided and
    the available ingredients: the term 'mushroom' matches the recipe
    ingredient 'sliced mushrooms'. Blank terms are ignored, because an empty
    string is a substring of everything.

    Args:
        recipe_ingredients: Ingredient strings of the recipe.
        have_ingredients: Ingredient terms the user has available.
        avoid_ingredients: Ingredient terms the recipe must not contain.

    Returns:
        A ``(is_valid, match_score)`` tuple. ``is_valid`` is False (with a
        score of 0) when any avoided term occurs in the recipe. Otherwise
        ``match_score`` is the number of available terms found in the recipe
        divided by the number of recipe ingredients, capped at 1.0; it is 0
        for a recipe without ingredients.
    """
    recipe_terms = [ing.lower() for ing in recipe_ingredients]
    have_terms = [term for term in (ing.lower().strip() for ing in have_ingredients) if term]
    avoid_terms = [term for term in (ing.lower().strip() for ing in avoid_ingredients) if term]

    def appears_in_recipe(term):
        """Return True if `term` occurs inside any recipe ingredient."""
        return any(term in recipe_ing for recipe_ing in recipe_terms)

    # Check if recipe contains any ingredients to avoid. This uses the same
    # substring rule as the availability check below, so 'bell pepper' also
    # excludes 'red bell pepper'.
    if any(appears_in_recipe(term) for term in avoid_terms):
        return False, 0

    # empty recipe ingredients case
    if not recipe_terms:
        return True, 0

    # Calculate match score based on available ingredients. Several available
    # terms can hit the same recipe ingredient, so cap the ratio at 1.0.
    matching_ingredients = sum(1 for term in have_terms if appears_in_recipe(term))
    match_score = min(1.0, matching_ingredients / len(recipe_terms))

    return True, match_score

In [None]:
def find_recipes_with_preferences(query, have_ingredients=None, avoid_ingredients=None, dietary_restrictions=None, k=5):
    """Rank recipes by text similarity and pantry overlap, honouring exclusions.

    Args:
        query: Free-text search string, vectorized with the fitted
            module-level ``tfidf`` vectorizer.
        have_ingredients: Ingredient terms the user has available
            (default: none).
        avoid_ingredients: Ingredient terms a recipe must not contain
            (default: none).
        dietary_restrictions: Restriction names passed on to
            ``check_dietary_restrictions`` (default: none).
        k: Maximum number of recipes to return. Values <= 0 yield an empty
            result.

    Returns:
        A ``(recipes, scores)`` tuple. ``recipes`` is a DataFrame with the
        ``name``, ``category`` and ``high_level_ingredients`` columns of the
        best surviving recipes, best first. ``scores`` is the list of their
        combined scores (0.7 * cosine similarity + 0.3 * ingredient match).
    """
    # None defaults avoid sharing one mutable list object between calls.
    have_ingredients = [] if have_ingredients is None else have_ingredients
    avoid_ingredients = [] if avoid_ingredients is None else avoid_ingredients
    dietary_restrictions = [] if dietary_restrictions is None else dietary_restrictions

    query_tfidf = tfidf.transform([query])
    similarities = cosine_similarity(query_tfidf, features_tfidf).flatten()

    # Pull the column out once: indexing df.iloc[idx] inside the loop would
    # build a full row Series for every recipe.
    raw_ingredient_cells = df['high_level_ingredients'].tolist()

    recipe_scores = []

    for idx, (sim_score, raw) in enumerate(zip(similarities, raw_ingredient_cells)):
        # SECURITY: parse the stored list text with ast.literal_eval rather
        # than eval(), so CSV content can never execute code. Cells that are
        # already parsed (non-string) are used as-is.
        recipe_ingredients = ast.literal_eval(raw) if isinstance(raw, str) else raw

        # check diet restrictions
        if dietary_restrictions and not check_dietary_restrictions(recipe_ingredients, dietary_restrictions):
            continue

        # check ingredients match
        valid_ingredients, ing_match_score = check_ingredients_match(recipe_ingredients, have_ingredients, avoid_ingredients)
        if not valid_ingredients:
            continue

        # combined score: text similarity dominates, pantry overlap refines
        combined_score = 0.7 * sim_score + 0.3 * ing_match_score
        recipe_scores.append((idx, combined_score))

    recipe_scores.sort(key=lambda pair: pair[1], reverse=True)
    # max() keeps a negative k from slicing "all but the last |k|" entries.
    best = recipe_scores[:max(k, 0)]
    top_k_indices = [idx for idx, _ in best]
    top_k_scores = [score for _, score in best]

    return df.iloc[top_k_indices][['name', 'category', 'high_level_ingredients']], top_k_scores

In [None]:
# Demo: preference-aware search combining a query, pantry items, exclusions
# and dietary restrictions.
query = "chickpea stew"
have_ingredients = ['chickpea', 'onion', 'garlic', 'tomato']
avoid_ingredients = ['mushroom', 'bell pepper']
dietary_restrictions = ['vegetarian', 'lactose-free']

top_recipes, scores = find_recipes_with_preferences(
    query,
    have_ingredients=have_ingredients,
    avoid_ingredients=avoid_ingredients,
    dietary_restrictions=dietary_restrictions,
    k=5,
)

# Echo the search parameters first ...
for label, value in (
    ("Query:", query),
    ("Available ingredients:", have_ingredients),
    ("Avoiding ingredients:", avoid_ingredients),
    ("Dietary restrictions:", dietary_restrictions),
):
    print(label, value)

# ... then the ranked hits with their combined scores.
print("\nTop Recommendations:")
for rank, (name, category, ingredients, score) in enumerate(
    zip(
        top_recipes['name'],
        top_recipes['category'],
        top_recipes['high_level_ingredients'],
        scores,
    ),
    start=1,
):
    print(f"\nRank {rank}: {name} (Score: {score:.2f})")
    print(f"Category: {category}")
    print(f"Ingredients: {ingredients}")

Query: chickpea stew
Available ingredients: ['chickpea', 'onion', 'garlic', 'tomato']
Avoiding ingredients: ['mushroom', 'bell pepper']
Dietary restrictions: ['vegetarian', 'lactose-free']

Top Recommendations:

Rank 1: Italian Chickpea Bread (Score: 0.44)
Category: bread
Ingredients: ['water', 'chickpea flour', 'p salt', 'cooking spray', 'Italian seasoning', 'black pepper', 'oil']

Rank 2: Elsy's Chickpea Burger (Score: 0.35)
Category: main-dish
Ingredients: ['roughly   cilantro leaves', 'eggs', 'chickpea flour', 'chili powder', 'garlic', 'vegetable oil', 'dried chickpeas garbanzo beans', 'salt', 'cumin', 'water   cover']

Rank 3: Chickpea Salad (Score: 0.33)
Category: salad
Ingredients: ['red wine vinegar', 'garbanzo beans', 'onion', 'tomato', 'balsamic vinegar', 'cucumber']

Rank 4: Red Cabbage and Chickpea Salad (Score: 0.32)
Category: salad
Ingredients: ['onion', 'tomato', 'pepper', 'tahini salad', 'red cabbage', 'salt', 'chickpeas']

Rank 5: Mediterranean Chickpea Salad II (Score