# Ingredient-Based Clustering of Pinoy Dishes


## Data loading and preview


In [1]:
import json

with open("panlasang_pinoy_recipes.json") as file:
    recipes = json.load(file)

recipes[0]

{'link': 'https://panlasangpinoy.com/fried-eggplant-crispy-fritters/',
 'name': 'Fried Eggplant (Crispy Fritters / Talong Okoy)',
 'ingredients': ['Chinese eggplants',
  'eggs',
  'green onions',
  'cornstarch',
  'all-purpose flour',
  'baking powder',
  'salt',
  'ground black pepper',
  'water',
  'cooking oil',
  'white vinegar',
  'soy sauce',
  'red onion',
  'green onion',
  'Thai chili pepper',
  'garlic',
  'sugar',
  'salt',
  'ground black pepper'],
 'instructions': ['Grate the eggplants and arrange in a bowl. Sprinkle 4 teaspoons of salt. Toss until the eggplants are evenly coated. Let it stay for 10 minutes.',
  'Add 2 cups of water into the bowl. Wash the eggplants to get rid of the salt. Grab the eggplants with your hand and squeeze until the water drains completely. You can also put this in a cheesecloth and wring it until the water drains out.',
  'Combine flour, cornstarch, and baking powder in a bowl. Mix well and then set aside.',
  'Beat the eggs in a large bowl. A

In [2]:
len(recipes)  # 1873

1873

In [3]:
def get_all_unique_ingredients(recipes):
    ingredients = set()
    for recipe in recipes:
        for ingredient in recipe["ingredients"]:
            ingredients.add(ingredient)
    return list(ingredients)


all_ingredients = get_all_unique_ingredients(recipes)
len(all_ingredients)

3340

In [4]:
with open("raw_ingredients.txt", "w", encoding="utf-8") as file:
    file.writelines("\n".join(sorted(all_ingredients)))

## Preprocessing


In [5]:
import spacy
from tabulate import tabulate
from tqdm import tqdm

nlp = spacy.load("en_core_web_lg")

In [6]:
def display_str_as_doc(text):
    """Display a spaCy doc in a table format."""
    rows = []
    for token in nlp(text):
        row = [
            token.text,
            token.lemma_,
            token.pos_,
            token.tag_,
            token.dep_,
            token.is_alpha,
        ]
        rows.append(row)
    headers = ["text", "lemma", "pos", "tag", "dep", "is_alpha"]
    print(tabulate(rows, headers=headers))


SAMPLE_INGREDIENT = "14 oz. bean curd, sliced into 1/2 inch thick flat pieces"
display_str_as_doc(SAMPLE_INGREDIENT)

text    lemma    pos    tag    dep       is_alpha
------  -------  -----  -----  --------  ----------
14      14       NUM    CD     nummod    False
oz      oz       NOUN   NN     compound  True
.       .        PUNCT  .      punct     False
bean    bean     PROPN  NNP    compound  True
curd    curd     PROPN  NNP    ROOT      True
,       ,        PUNCT  ,      punct     False
sliced  slice    VERB   VBN    acl       True
into    into     ADP    IN     prep      True
1/2     1/2      NUM    CD     nummod    False
inch    inch     NOUN   NN     npadvmod  True
thick   thick    ADJ    JJ     amod      True
flat    flat     ADJ    JJ     amod      True
pieces  piece    NOUN   NNS    pobj      True


In [7]:
def lemmatize_valid_nouns(ingredient):
    """Lemmatize valid nouns in a spaCy doc."""
    doc = nlp(ingredient)
    clean_tokens = []
    for token in doc:
        if token.is_alpha and token.pos_ in ["NOUN", "PROPN"]:
            clean_tokens.append(token.lemma_)
    return " ".join(clean_tokens)


lemmatized = lemmatize_valid_nouns(SAMPLE_INGREDIENT)
display_str_as_doc(lemmatized)

text    lemma    pos    tag    dep       is_alpha
------  -------  -----  -----  --------  ----------
oz      oz       NOUN   NN     compound  True
bean    bean     NOUN   NN     compound  True
curd    curd     NOUN   NN     compound  True
inch    inch     NOUN   NN     compound  True
piece   piece    NOUN   NN     ROOT      True


In [8]:
with open(f"culinary_stopwords.txt") as file:
    culinary_stopwords = [line.strip() for line in file]


def filter_stopwords(terms):
    """Filter out culinary stopwords from a string of terms."""
    clean_terms = []
    for term in terms.split():
        if term not in culinary_stopwords:
            clean_terms.append(term)
    return " ".join(clean_terms)


filtered = filter_stopwords(lemmatized)
display_str_as_doc(filtered)

text    lemma    pos    tag    dep       is_alpha
------  -------  -----  -----  --------  ----------
bean    bean     PROPN  NNP    compound  True
curd    curd     NOUN   NN     ROOT      True


In [9]:
with open("ingredient_synonyms.json") as file:
    ingredient_synonyms = json.load(file)


def handle_synonyms(ingredient):
    """Replace ingredient synonyms with a common name."""
    return ingredient_synonyms.get(ingredient, ingredient)

In [10]:
def preprocess_ingredient(ingredient):
    """Preprocess an ingredient string to extract key terms."""
    # Remove any text in parentheses
    ingredient = ingredient.split("(")[0]

    # Get the first item in a list of alternatives
    if " or " in ingredient:
        ingredient = ingredient.split(" or ")[0]

    lowered = ingredient.lower().strip()
    lemmatized = lemmatize_valid_nouns(lowered)
    filtered = filter_stopwords(lemmatized)
    common_name = handle_synonyms(filtered)
    return common_name


clean_ingredient = preprocess_ingredient(SAMPLE_INGREDIENT)
clean_ingredient

'bean curd'

In [11]:
ingredients_by_recipe = []

for recipe in tqdm(recipes):
    ingredients = set()
    for terms in recipe["ingredients"]:
        clean_terms = preprocess_ingredient(terms)
        if clean_terms:
            ingredients.add(clean_terms)
    ingredients_by_recipe.append(list(ingredients))

100%|██████████| 1873/1873 [01:19<00:00, 23.68it/s]


In [12]:
ingredients_by_recipe[0]

['cornstarch',
 'salt',
 'flour',
 'oil',
 'soy sauce',
 'onion',
 'egg',
 'sugar',
 'eggplant',
 'chili pepper',
 'vinegar']

## Vectorization


In [13]:
from sklearn.feature_extraction.text import CountVectorizer

In [14]:
vectorizer = CountVectorizer()
vectors = vectorizer.fit_transform(
    " ".join(ingredients) for ingredients in ingredients_by_recipe
)

## Dimensionality reduction


In [15]:
from sklearn.decomposition import TruncatedSVD

In [16]:
svd = TruncatedSVD(n_components=min(20, vectors.shape[1] - 1))
reduced_data = svd.fit_transform(vectors)

## Clustering


In [17]:
from sklearn.cluster import AgglomerativeClustering

In [18]:
optimal_clusters = 7  # ???: How to determine the optimal number of clusters?
clustering = AgglomerativeClustering(
    n_clusters=optimal_clusters,
    linkage="ward",
)
cluster_labels = clustering.fit_predict(reduced_data)

## Visualization


In [19]:
from sklearn.decomposition import PCA
import plotly.graph_objects as go

In [20]:
pca = PCA(n_components=2)
plot_data = pca.fit_transform(reduced_data)

fig = go.Figure()

recipe_names = [recipe["name"] for recipe in recipes]

for i in range(optimal_clusters):
    cluster_data = plot_data[cluster_labels == i]
    fig.add_trace(
        go.Scatter(
            x=cluster_data[:, 0],
            y=cluster_data[:, 1],
            mode="markers",
            marker=dict(
                size=8,
                opacity=0.8,
            ),
            name=f"Cluster {i}",
            text=[
                recipe_names[j]
                for j in range(len(recipe_names))
                if cluster_labels[j] == i
            ],  # Display recipe names on hover
        )
    )

fig.update_layout(title_text="Visualization of Recipe Clusters (PCA)")
fig.show()