# Ingredient-Based Clustering of Pinoy Dishes


## Data loading and preview

In [1]:
import json

In [2]:
with open("panlasang_pinoy_recipes.json") as file:
    recipes = json.load(file)

len(recipes)

1873

In [3]:
recipes[0]

{'link': 'https://panlasangpinoy.com/fried-eggplant-crispy-fritters/',
 'name': 'Fried Eggplant (Crispy Fritters / Talong Okoy)',
 'ingredients': ['Chinese eggplants',
  'eggs',
  'green onions',
  'cornstarch',
  'all-purpose flour',
  'baking powder',
  'salt',
  'ground black pepper',
  'water',
  'cooking oil',
  'white vinegar',
  'soy sauce',
  'red onion',
  'green onion',
  'Thai chili pepper',
  'garlic',
  'sugar',
  'salt',
  'ground black pepper'],
 'instructions': ['Grate the eggplants and arrange in a bowl. Sprinkle 4 teaspoons of salt. Toss until the eggplants are evenly coated. Let it stay for 10 minutes.',
  'Add 2 cups of water into the bowl. Wash the eggplants to get rid of the salt. Grab the eggplants with your hand and squeeze until the water drains completely. You can also put this in a cheesecloth and wring it until the water drains out.',
  'Combine flour, cornstarch, and baking powder in a bowl. Mix well and then set aside.',
  'Beat the eggs in a large bowl. A

## Preprocessing

In [4]:
import spacy
from tqdm import tqdm

In [5]:
nlp = spacy.load("en_core_web_lg")

In [6]:
def clean_ingredient(ingredient):
    doc = nlp(ingredient)
    lemmas = [
        token.lemma_ for token in doc if token.is_alpha and not token.is_stop
    ]
    return " ".join(lemmas)

In [7]:
ingredients_by_recipe = []

for recipe in tqdm(recipes):
    ingredients = []
    for ingredient in recipe["ingredients"]:
        ingredients.append(clean_ingredient(ingredient))
    ingredients_by_recipe.append(ingredients)

ingredients_by_recipe[0]

100%|██████████| 1873/1873 [01:04<00:00, 29.15it/s]


['chinese eggplant',
 'egg',
 'green onion',
 'cornstarch',
 'purpose flour',
 'bake powder',
 'salt',
 'ground black pepper',
 'water',
 'cook oil',
 'white vinegar',
 'soy sauce',
 'red onion',
 'green onion',
 'thai chili pepper',
 'garlic',
 'sugar',
 'salt',
 'ground black pepper']

## Vectorization

In [8]:
from sklearn.feature_extraction.text import CountVectorizer

In [9]:
vectorizer = CountVectorizer()
vectors = vectorizer.fit_transform(
    " ".join(ingredients) for ingredients in ingredients_by_recipe
)

## Dimensionality reduction

In [10]:
from sklearn.decomposition import TruncatedSVD

In [11]:
svd = TruncatedSVD(n_components=min(20, vectors.shape[1] - 1))
reduced_data = svd.fit_transform(vectors)

## Clustering

In [12]:
from sklearn.cluster import AgglomerativeClustering

In [13]:
optimal_clusters = 7  # ???: How to determine the optimal number of clusters?
clustering = AgglomerativeClustering(
    n_clusters=optimal_clusters,
    linkage="ward",
)
cluster_labels = clustering.fit_predict(reduced_data)

## Visualization

In [14]:
from sklearn.decomposition import PCA
import plotly.graph_objects as go

In [15]:
pca = PCA(n_components=2)
plot_data = pca.fit_transform(reduced_data)

fig = go.Figure()

recipe_names = [recipe["name"] for recipe in recipes]

for i in range(optimal_clusters):
    cluster_data = plot_data[cluster_labels == i]
    fig.add_trace(
        go.Scatter(
            x=cluster_data[:, 0],
            y=cluster_data[:, 1],
            mode="markers",
            marker=dict(
                size=8,
                opacity=0.8,
            ),
            name=f"Cluster {i}",
            text=[
                recipe_names[j]
                for j in range(len(recipe_names))
                if cluster_labels[j] == i
            ],  # Display recipe names on hover
        )
    )

fig.update_layout(title_text="Visualization of Recipe Clusters (PCA)")
fig.show()