In [1]:
import pandas as pd
import re
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
import duckdb

In [9]:
# Connect to the DuckDB database
con = duckdb.connect('recipes.duckdb')

# Load the recipes data
recipes_df = con.execute("SELECT * FROM recipes").fetchdf()

# Preprocess ingredients
def preprocess_ingredients(ingredients):
    if not ingredients:
        return []
    ingredients = ingredients.lower()
    ingredients = re.sub(r'[^\w\s]', '', ingredients)
    return ingredients.split()

recipes_df['ingredients'] = recipes_df['ingredients'].apply(preprocess_ingredients)

# Vectorize using TF-IDF
recipes_df['ingredients_joined'] = recipes_df['ingredients'].apply(lambda x: ' '.join(x))
tfidf_vectorizer = TfidfVectorizer()
tfidf_encoded = tfidf_vectorizer.fit_transform(recipes_df['ingredients_joined'])

# Clustering with K-Means
kmeans = KMeans(n_clusters=5, random_state=42)
clusters = kmeans.fit_predict(tfidf_encoded)

# Add cluster labels to the DataFrame
recipes_df['cluster'] = clusters

# Display the resulting DataFrame
recipes_df.head()


Unnamed: 0,recipe_title,url,record_health,vote_count,rating,description,cuisine,course,diet,prep_time,cook_time,ingredients,instructions,author,tags,category,ingredients_joined,cluster
0,Roasted Peppers And Mushroom Tortilla Pizza Re...,https://www.archanaskitchen.com/roasted-pepper...,good,434,4.958525,is a quicker version pizza to satisfy your cr...,Mexican,Dinner,Vegetarian,15 M,15 M,"[tortillasextra, virgin, olive, oilgarlicmozza...",To begin making the Roasted Peppers And Mushro...,Divya Shivaraman,Party Food Recipes|Tea Party Recipes|Mushroom ...,Pizza Recipes,tortillasextra virgin olive oilgarlicmozzarell...,0
1,Thakkali Gotsu Recipe | Thakkali Curry | Spicy...,https://www.archanaskitchen.com/tomato-gotsu-r...,good,3423,4.932223,also known as the is a quick and easy to ma...,South Indian Recipes,Lunch,Vegetarian,10 M,20 M,"[sesame, gingelly, oilmustard, seeds, rai, kad...",To begin making Tomato Gotsu Recipe/ Thakkali ...,Archana Doshi,Vegetarian Recipes|Tomato Recipes|South Indian...,Indian Curry Recipes,sesame gingelly oilmustard seeds rai kadugucur...,3
2,Spicy Grilled Pineapple Salsa Recipe,https://www.archanaskitchen.com/spicy-grilled-...,good,2091,4.945959,Spicy Grilled Pineapple Salsa is a simple reci...,Mexican,Side Dish,Vegetarian,10 M,0 M,"[extra, virgin, olive, oilpineapplewhite, onio...",To begin making the Spicy Grilled Pineapple Sa...,Archana's Kitchen,Party Starter & Appetizer Recipes|Pineapple Re...,Mexican Recipes,extra virgin olive oilpineapplewhite onionred ...,0
3,Karwar Style Dali Thoy Recipe - Toor dal Curry,https://www.archanaskitchen.com/dali-thoy-reci...,good,990,4.888889,The is a quintessential of Konkani dish whic...,Coastal Karnataka,Side Dish,High Protein Vegetarian,5 M,20 M,"[arhar, dal, split, toor, dalturmeric, powder,...",To prepare Karwar Style Dali Thoy Recipe (Toor...,Jyothi Rajesh,Side Dish Recipes|South Indian Recipes|Indian ...,Indian Curry Recipes,arhar dal split toor dalturmeric powder haldis...,4
4,Rajma Kofta In Milk And Poppy Seed Gravy Recipe,https://www.archanaskitchen.com/rajma-kofta-in...,good,345,4.828986,Koftas are traditional Indian recipes mostly w...,North Indian Recipes,Side Dish,High Protein Vegetarian,20 M,30 M,"[rajma, large, kidney, beanscashew, nutssultan...",To begin making Rajma Kofta In Milk And Poppy ...,RUBY PATHAK,Side Dish Recipes|Indian Lunch Recipes|Office ...,Kofta Recipes,rajma large kidney beanscashew nutssultana rai...,1


This code loads our recipes and preprocesses the ingredients. With help of using TF-IDF they are then vectorized and K-Means is applied for clustering in order to group similar recipes together.