In [None]:
import ast

import numpy as np
import pandas as pd


In [11]:
# Load the recipe table from CSV and echo its (rows, columns) shape as a quick sanity check.
df = pd.read_csv("data/final_df.csv")
df.shape

(32696, 50)

In [12]:
def _ingredients_to_text(raw):
    """Flatten one ``high_level_ingredients`` cell into a space-separated string.

    Args:
        raw: Either the textual form of a Python list literal (as it is stored
            in the CSV) or an already-materialised iterable of strings.

    Returns:
        The individual ingredient strings joined by single spaces.

    Raises:
        ValueError, SyntaxError: If ``raw`` is a string that is not a valid
            Python literal.
    """
    # literal_eval only parses Python literals, so a crafted CSV cell cannot
    # execute arbitrary code the way the previous eval() call could.
    items = ast.literal_eval(raw) if isinstance(raw, str) else raw
    return ' '.join(items)


df['high_level_ingredients_str'] = df['high_level_ingredients'].apply(_ingredients_to_text)

# One free-text document per recipe: name, category and ingredients, separated by spaces.
df['features'] = df.apply(lambda row: f"{row['name']} {row['category']} {row['high_level_ingredients_str']}", axis=1)
df['features']

0        Simple Macaroni and Cheese main-dish elbow all...
1        Gourmet Mushroom Risotto main-dish chicken bro...
2        Dessert Crepes breakfast-and-brunch all - purp...
3        Pork Steaks meat-and-poultry soy sauce bunch g...
4        Chicken Parmesan world-cuisine bread crumbs al...
                               ...                        
32691    Spicy Deviled Eggs appetizers-and-snacks Worce...
32692    Nori Chips appetizers-and-snacks salt olive oi...
32693    Deep Fried Jalapeno Slices appetizers-and-snac...
32694    Jalapeno Hummus appetizers-and-snacks canned j...
32695    Easy Baked Zucchini Chips appetizers-and-snack...
Name: features, Length: 32696, dtype: object

In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA

# Vectorize the combined text features; `tfidf` stays fitted so that later
# cells can project new text into the same vocabulary.
tfidf = TfidfVectorizer()
features_tfidf = tfidf.fit_transform(df['features'])

# Reduce dimensionality using PCA.
# random_state is pinned so that the components are reproducible across
# re-runs whenever scikit-learn picks a randomized SVD solver for this input.
pca = PCA(n_components=50, random_state=100)
# NOTE(review): .toarray() materialises the full dense documents x vocabulary
# matrix in memory before PCA is fitted.
features_pca = pca.fit_transform(features_tfidf.toarray())

In [None]:
# NOTE(review): disabled clustering experiment. Every statement in this cell is
# commented out, so no `cluster` column is created when the notebook is run from
# a fresh kernel; the table shown below this cell is leftover output from an
# earlier execution.
# from sklearn.cluster import KMeans

# kmeans = KMeans(n_clusters=20, random_state=100)
# df['cluster'] = kmeans.fit_predict(features_pca)

# df[['features', 'cluster']].head()

Unnamed: 0,features,cluster
0,Simple Macaroni and Cheese main-dish elbow all...,13
1,Gourmet Mushroom Risotto main-dish chicken bro...,4
2,Dessert Crepes breakfast-and-brunch all - purp...,3
3,Pork Steaks meat-and-poultry soy sauce bunch g...,6
4,Chicken Parmesan world-cuisine bread crumbs al...,19


In [27]:
from sklearn.metrics.pairwise import cosine_similarity

# Disable pandas' column-width truncation so long cell values (such as the
# ingredient lists) are printed in full. This is a session-wide display option.
pd.set_option('display.max_colwidth', None)

def find_top_k_similar_recipes(query, k=5):
    """Return the recipes whose TF-IDF features are most similar to a text query.

    The query is transformed with the already-fitted module-level ``tfidf``
    vectorizer and compared, by cosine similarity, against every row of
    ``features_tfidf``.

    Args:
        query: Free-text search string.
        k: Maximum number of recipes to return. Defaults to 5. ``k == 0``
            yields an empty result.

    Returns:
        A tuple ``(recipes, scores)``. ``recipes`` is the selection of ``df``
        rows (columns ``name``, ``category`` and ``high_level_ingredients``)
        ordered from most to least similar; ``scores`` holds the matching
        cosine similarities in the same order.

    Raises:
        ValueError: If ``k`` is negative.
    """
    if k < 0:
        raise ValueError(f"k must be non-negative, got {k}")

    query_tfidf = tfidf.transform([query])

    similarities = cosine_similarity(query_tfidf, features_tfidf).flatten()

    # Reverse first, then truncate. The previous ``[-k:][::-1]`` form returned
    # the entire corpus for k == 0, because ``[-0:]`` is the full slice.
    top_k_indices = similarities.argsort()[::-1][:k]

    return df.iloc[top_k_indices][['name', 'category', 'high_level_ingredients']], similarities[top_k_indices]

# Example search: retrieve the five closest recipes for a free-text query.
query = "chickpea stew without dairy"
top_k_recipes, scores = find_top_k_similar_recipes(query, k=5)

print(f"Query: {query}")

# Walk the matches best-first; ranks are reported 1-based.
ranked_matches = zip(top_k_recipes.iterrows(), scores)
for rank, ((_, match), _similarity) in enumerate(ranked_matches, start=1):
    print(f"Rank {rank}: {match['name']} (Category: {match['category']})")

# Plain-text dump of the full result frame, including the ingredient lists.
print(top_k_recipes)

Query: chickpea stew without dairy
Rank 1: Italian Chickpea Bread (Category: bread)
Rank 2: Vegan Chickpea Curry without Coconut Milk (Category: world-cuisine)
Rank 3: Dairy-Free Vanilla Frosting (Category: desserts)
Rank 4: Roasted Garlic without Foil (Category: side-dish)
Rank 5: Dairy-Free Scalloped Potatoes (Category: side-dish)
                                            name       category  \
4430                      Italian Chickpea Bread          bread   
6174   Vegan Chickpea Curry without Coconut Milk  world-cuisine   
4744                 Dairy-Free Vanilla Frosting       desserts   
21540                Roasted Garlic without Foil      side-dish   
21769              Dairy-Free Scalloped Potatoes      side-dish   

                                                                                                                                           high_level_ingredients  
4430                                                           ['water', 'chickpea flour', 'p salt