# Ingredient-Based Clustering of Pinoy Dishes


## Data loading and preview


In [1]:
import os
import json

DATA_FOLDER = "data"

# os.listdir() returns entries in arbitrary, filesystem-dependent order.
# Sorting the names makes the order of `recipes` (and of every row index
# derived from it in later cells) reproducible across machines and runs.
recipe_files = sorted(
    file
    for file in os.listdir(DATA_FOLDER)
    if file.endswith("recipes.json")
)

# Every matching file is parsed as JSON and its items are appended to one
# flat list of recipes.
recipes = []
for file in recipe_files:
    file_path = os.path.join(DATA_FOLDER, file)
    with open(file_path, encoding="utf-8") as f:
        recipe_data = json.load(f)
        recipes.extend(recipe_data)

# Preview the first loaded recipe.
recipes[0]

{'link': 'https://www.kawalingpinoy.com/cheese-cupcakes/',
 'name': 'Cheese Cupcakes',
 'ingredients': ['flour, sifted',
  'baking powder',
  'salt',
  'butter, softened',
  'sugar',
  'eggs',
  '(14 ounces) sweetened condensed milk',
  'quick-melt cheese, shredded',
  'cheddar cheese, shredded'],
 'instructions': ['Line muffin pan with cupcake liners.',
  'In a large bowl, combine flour, baking powder,\xa0and salt.',
  'In a medium bowl, beat butter, sugar, and eggs with an electric mixer until combined.',
  'Beginning and ending with flour mixture, add flour mixture in thirds and ½ of sweetened condensed milk in between (⅓ flour mixture, ½ condensed milk, ⅓ flour mixture, ½ condensed milk and ⅓ flour mixture) to egg mixture. Beat at low speed with every addition.',
  'Add quick-melt cheese and stir to combine.',
  'Using a scoop or spoon, fill each muffin hole up to ¾ full. Top with cheddar cheese.',
  'Bake in a 350 F oven for about 20 to 25 minutes or until a toothpick inserted com

In [2]:
# Total number of recipes loaded from all matching files.
len(recipes)

2815

In [3]:
def get_unique_ingredients(recipes):
    """Return every distinct ingredient string found in ``recipes``, sorted.

    Recipes whose ``"ingredients"`` value is falsy (for example ``None`` or
    an empty list) contribute nothing to the result.
    """
    distinct = {
        item
        for entry in recipes
        if entry["ingredients"]
        for item in entry["ingredients"]
    }
    ordered = list(distinct)
    ordered.sort()
    return ordered


# Collect the distinct raw ingredient strings and show how many there are.
unique_ingredients = get_unique_ingredients(recipes)
len(unique_ingredients)

5024

In [4]:
# Dump the distinct raw ingredient strings, one per line. The payload is a
# single pre-joined string, so `write` is the right call; `writelines` would
# iterate over that string character by character.
with open("data/raw_ingredients.txt", "w", encoding="utf-8") as file:
    file.write("\n".join(unique_ingredients))

## Preprocessing


In [5]:
import spacy
from tabulate import tabulate
from tqdm import tqdm

In [6]:
# Load the spaCy pipeline named "en_core_web_lg"; the helper functions defined
# in the following cells call `nlp` to tokenize and tag ingredient text.
nlp = spacy.load("en_core_web_lg")

In [7]:
# Keep only the recipes whose "ingredients" value is truthy, i.e. drop the
# ones with a missing or empty ingredient list.
non_empty_recipes = list(filter(lambda entry: entry["ingredients"], recipes))

# (number of recipes kept, number of recipes dropped)
len(non_empty_recipes), len(recipes) - len(non_empty_recipes)

(2469, 346)

In [8]:
def display_tokens(text):
    """Print a table of spaCy token attributes for a given string.

    The string is run through the module-level ``nlp`` pipeline and one row
    is printed per token with its text, lemma, coarse and fine part-of-speech
    tags, dependency label, ``is_alpha`` flag and entity type.

    Args:
        text: The string to tokenize and inspect.

    Returns:
        None. The table is written to standard output.
    """
    headers = [
        "text",
        "lemma",
        "pos",
        "tag",
        "dep",
        "is_alpha",
        "ent_type",
    ]
    rows = [
        [
            token.text,
            token.lemma_,
            token.pos_,
            token.tag_,
            token.dep_,
            token.is_alpha,
            token.ent_type_,
        ]
        for token in nlp(text)
    ]
    print(tabulate(rows, headers=headers))


# Sample raw ingredient string, reused by the demo calls in the cells below.
SAMPLE_INGREDIENT = "14 oz. bean curd, sliced into 1/2 inch thick flat pieces"
display_tokens(SAMPLE_INGREDIENT)

text    lemma    pos    tag    dep       is_alpha    ent_type
------  -------  -----  -----  --------  ----------  ----------
14      14       NUM    CD     nummod    False       QUANTITY
oz      oz       NOUN   NN     compound  True        QUANTITY
.       .        PUNCT  .      punct     False
bean    bean     PROPN  NNP    compound  True
curd    curd     PROPN  NNP    ROOT      True
,       ,        PUNCT  ,      punct     False
sliced  slice    VERB   VBN    acl       True
into    into     ADP    IN     prep      True
1/2     1/2      NUM    CD     nummod    False       QUANTITY
inch    inch     NOUN   NN     npadvmod  True        QUANTITY
thick   thick    ADJ    JJ     amod      True
flat    flat     ADJ    JJ     amod      True
pieces  piece    NOUN   NNS    pobj      True


In [9]:
def lemmatize_valid_nouns(ingredient):
    """Return the lemmas of the alphabetic nouns in a string.

    The string is parsed with the module-level ``nlp`` pipeline. A token is
    kept only if it is purely alphabetic (``is_alpha``) and tagged ``NOUN``
    or ``PROPN``.

    Args:
        ingredient: Raw ingredient text.

    Returns:
        The kept lemmas joined by single spaces, in their original order
        (an empty string when no token qualifies).
    """
    return " ".join(
        token.lemma_
        for token in nlp(ingredient)
        if token.is_alpha and token.pos_ in ("NOUN", "PROPN")
    )


# Reduce the sample to its noun lemmas, then tokenize the result again to
# inspect what is left.
lemmatized = lemmatize_valid_nouns(SAMPLE_INGREDIENT)
display_tokens(lemmatized)

text    lemma    pos    tag    dep       is_alpha    ent_type
------  -------  -----  -----  --------  ----------  ----------
oz      oz       NOUN   NN     compound  True
bean    bean     NOUN   NN     compound  True
curd    curd     NOUN   NN     compound  True
inch    inch     NOUN   NN     compound  True
piece   piece    NOUN   NN     ROOT      True


In [10]:
# Culinary stopwords, one term per line (surrounding whitespace stripped).
# The encoding is stated explicitly, like the other file reads and writes in
# this notebook, so the result does not depend on the platform's locale.
with open("data/culinary_stopwords.txt", encoding="utf-8") as file:
    culinary_stopwords = [line.strip() for line in file]


def filter_stopwords(ingredient):
    """Drop every whitespace-separated term that is a culinary stopword.

    The surviving terms are returned joined by single spaces, in their
    original order.
    """
    kept = [
        word
        for word in ingredient.split()
        if word not in culinary_stopwords
    ]
    return " ".join(kept)


# Remove culinary stopwords from the lemmatized sample and inspect the result.
filtered = filter_stopwords(lemmatized)
display_tokens(filtered)

text    lemma    pos    tag    dep       is_alpha    ent_type
------  -------  -----  -----  --------  ----------  ----------
bean    bean     PROPN  NNP    compound  True
curd    curd     NOUN   NN     ROOT      True


In [11]:
# Lookup table used to replace an ingredient name with its common name.
# The file is opened as UTF-8 explicitly rather than with the platform's
# locale encoding, which could mis-decode non-ASCII ingredient names.
with open("data/ingredient_synonyms.json", encoding="utf-8") as file:
    ingredient_synonyms = json.load(file)


def handle_synonym(ingredient):
    """Return the common name for an ingredient, or the ingredient itself.

    The lookup uses the module-level ``ingredient_synonyms`` mapping; names
    that are not in the mapping are returned unchanged.
    """
    if ingredient in ingredient_synonyms:
        return ingredient_synonyms[ingredient]
    return ingredient

# Replace the filtered sample with its common name (if one is defined) and
# inspect the result.
common_name = handle_synonym(filtered)
display_tokens(common_name)

text    lemma    pos    tag    dep    is_alpha    ent_type
------  -------  -----  -----  -----  ----------  ----------
bean    bean     NOUN   NN     ROOT   True


In [12]:
def preprocess_ingredient(ingredient):
    """Preprocess an ingredient string to extract its key terms.

    Steps, in order:
      1. Keep only the text before the first ``"("``.
      2. Keep only the text before the first ``" or "``.
      3. Lowercase and strip the remainder.
      4. Parse it with the module-level ``nlp`` pipeline and keep the lemmas
         of alphabetic ``NOUN``/``PROPN`` tokens that are not in
         ``culinary_stopwords``.
      5. Join the lemmas with spaces and replace the result with its entry in
         ``ingredient_synonyms`` when one exists.

    Args:
        ingredient: Raw ingredient text as it appears in a recipe.

    Returns:
        The cleaned ingredient name; an empty string when no token survives
        the filtering (unless the synonym table maps ``""`` to something).
    """
    # Note: the " or " split runs before lowercasing, so it is case-sensitive.
    key_text = ingredient.split("(")[0]
    key_text = key_text.split(" or ")[0]
    key_text = key_text.lower().strip()

    clean_tokens = [
        token.lemma_
        for token in nlp(key_text)
        if (
            token.is_alpha
            and token.pos_ in ("NOUN", "PROPN")
            and token.lemma_ not in culinary_stopwords
        )
    ]

    cleaned = " ".join(clean_tokens)
    return ingredient_synonyms.get(cleaned, cleaned)


# Run the full preprocessing on the sample ingredient and show the result.
clean_ingredient = preprocess_ingredient(SAMPLE_INGREDIENT)
clean_ingredient

'bean'

In [13]:
# Build NEW recipe dicts instead of editing the loaded ones. list.copy() is
# shallow, so assigning into cleaned_recipes[i] also overwrote the raw
# ingredient lists held by `non_empty_recipes` and `recipes`; re-running the
# cell then preprocessed text that had already been cleaned.
cleaned_recipes = []
for recipe in tqdm(non_empty_recipes):
    ingredients = set()
    for ingredient in recipe["ingredients"]:
        clean_terms = preprocess_ingredient(ingredient)
        # Skip ingredients that preprocessing reduced to nothing.
        if clean_terms:
            ingredients.add(clean_terms)
    # Copy every field of the recipe, replacing only the ingredient list with
    # the de-duplicated, sorted cleaned names.
    cleaned_recipes.append({**recipe, "ingredients": sorted(ingredients)})

100%|██████████| 2469/2469 [01:37<00:00, 25.40it/s]


In [14]:
import numpy as np

# Keep only the recipes whose number of cleaned ingredients is strictly above
# 80% of the mean ingredient count.
# NOTE(review): this cell reassigns `cleaned_recipes` from itself, so running
# it a second time filters again against a new, higher mean.
ingredient_counts = list(
    map(lambda entry: len(entry["ingredients"]), cleaned_recipes)
)
min_ingredient = np.mean(ingredient_counts) * 0.8
cleaned_recipes = [
    entry
    for entry, count in zip(cleaned_recipes, ingredient_counts)
    if count > min_ingredient
]

len(cleaned_recipes)

1808

## Vectorization


In [15]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words representation: each recipe becomes one "document" made of its
# cleaned ingredient names joined by spaces, and the vectorizer counts terms.
# NOTE(review): with CountVectorizer's default settings a multi-word
# ingredient (e.g. "soy sauce") is presumably split into separate word
# features - confirm this is intended.
vectorizer = CountVectorizer()
embeddings = vectorizer.fit_transform(
    list(map(lambda entry: " ".join(entry["ingredients"]), cleaned_recipes))
)

# (number of recipes, vocabulary size)
embeddings.shape

(1808, 751)

## Dimensionality reduction

In [16]:
from sklearn.decomposition import TruncatedSVD

dimensions_to_keep = 150
# TruncatedSVD uses a randomized solver by default, so without a fixed seed
# the reduced vectors (and the clusters derived from them) can change from
# run to run. Pin `random_state` to make the notebook reproducible.
svd = TruncatedSVD(n_components=dimensions_to_keep, random_state=42)
reduced_embeddings = svd.fit_transform(embeddings)

# (number of recipes, number of components kept)
reduced_embeddings.shape


(1808, 150)

## Clustering


In [17]:
from sklearn.cluster import AgglomerativeClustering

# Ward-linkage agglomerative clustering of the reduced recipe vectors into a
# fixed number of groups; `cluster_labels` holds one label per recipe.
n_clusters = 7
clustering = AgglomerativeClustering(linkage="ward", n_clusters=n_clusters)
cluster_labels = clustering.fit(reduced_embeddings).labels_

## Visualization


In [18]:
import plotly.express as px
import pandas as pd

# 2-D projection used ONLY for plotting. It gets its own names so that the
# `reduced_embeddings`, `svd` and `dimensions_to_keep` used for clustering are
# not overwritten. Previously, re-running this cell projected the 2-D data
# again, and re-running the clustering cell afterwards clustered on the 2-D
# data instead of the full reduced vectors.
plot_dimensions = 2
# Fixed seed: the default randomized solver would otherwise move the points
# between runs.
plot_svd = TruncatedSVD(n_components=plot_dimensions, random_state=42)
plot_embeddings = plot_svd.fit_transform(reduced_embeddings)

# One ingredient per line in the hover tooltip (plotly renders <br> as a
# line break).
ingredients_format = [
    "<br>".join(recipe["ingredients"]) for recipe in cleaned_recipes
]

df = pd.DataFrame(
    {
        "x": plot_embeddings[:, 0],
        "y": plot_embeddings[:, 1],
        # Labels are stored as strings so plotly treats the clusters as
        # discrete categories (one colour + legend entry each) rather than
        # as a continuous numeric colour scale.
        "cluster": [str(label) for label in cluster_labels],
        "recipe_name": [recipe["name"] for recipe in cleaned_recipes],
        "ingredients": ingredients_format,
    }
)

fig = px.scatter(
    df,
    x="x",
    y="y",
    color="cluster",
    # Show legend entries in numeric cluster order.
    category_orders={"cluster": sorted(df["cluster"].unique(), key=int)},
    hover_name="recipe_name",
    hover_data=["ingredients"],
    width=800,
    height=800,
)

fig.update_traces(marker=dict(size=5))
fig.update_layout(title="Ingredient-Based Clustering of Filipino Dishes")

fig.show()