# Ingredient-Based Clustering of Pinoy Dishes


## Data loading and preview


In [None]:
import json

# JSON text is UTF-8 by specification; naming the encoding explicitly keeps
# decoding independent of the platform's locale default.
with open("panlasang_pinoy_recipes.json", encoding="utf-8") as file:
    recipes = json.load(file)

# Preview the first record to inspect the shape of a recipe entry.
recipes[0]

In [None]:
len(recipes)  # Number of loaded recipes; noted as 1873 by the original author

In [None]:
def get_all_unique_ingredients(recipes):
    """Collect every distinct raw ingredient string across all recipes.

    Each recipe must expose an iterable of ingredient strings under its
    "ingredients" key. Duplicates are removed; the returned list carries no
    particular order.
    """
    unique = {item for entry in recipes for item in entry["ingredients"]}
    return list(unique)


# Gather the distinct raw ingredient strings and report how many there are.
all_ingredients = get_all_unique_ingredients(recipes)
len(all_ingredients)

In [None]:
# Persist the raw ingredient strings, sorted and one per line, for inspection.
with open("raw_ingredients.txt", "w", encoding="utf-8") as file:
    # write() takes the joined string as a whole; writelines() would iterate
    # over it one character at a time to produce the same file.
    file.write("\n".join(sorted(all_ingredients)))

## Preprocessing


In [None]:
import spacy
from tabulate import tabulate
from tqdm import tqdm

# Shared spaCy pipeline; the helper functions in this notebook call it to
# tokenize, tag and lemmatize ingredient strings.
nlp = spacy.load("en_core_web_lg")

In [None]:
def display_str_as_doc(text):
    """Print a table with one row of spaCy annotations per token of ``text``.

    The columns are the token text, lemma, coarse part of speech, fine-grained
    tag, dependency label, and whether the token is purely alphabetic.
    """
    columns = ["text", "lemma", "pos", "tag", "dep", "is_alpha"]
    table = [
        [tok.text, tok.lemma_, tok.pos_, tok.tag_, tok.dep_, tok.is_alpha]
        for tok in nlp(text)
    ]
    print(tabulate(table, headers=columns))


# Representative raw ingredient line (quantity, unit, name, preparation note)
# used to demonstrate each preprocessing step in this notebook.
SAMPLE_INGREDIENT = "14 oz. bean curd, sliced into 1/2 inch thick flat pieces"
display_str_as_doc(SAMPLE_INGREDIENT)

In [None]:
def lemmatize_valid_nouns(ingredient):
    """Return the space-joined lemmas of the alphabetic nouns in ``ingredient``.

    The string is parsed with the shared spaCy pipeline. Only tokens that are
    purely alphabetic and tagged NOUN or PROPN are kept, so numbers, units
    written with punctuation, verbs and adjectives are dropped.
    """
    noun_tags = ["NOUN", "PROPN"]
    lemmas = [
        tok.lemma_
        for tok in nlp(ingredient)
        if tok.is_alpha and tok.pos_ in noun_tags
    ]
    return " ".join(lemmas)


# Lemmatize the sample line and show how spaCy analyzes the reduced string.
lemmatized = lemmatize_valid_nouns(SAMPLE_INGREDIENT)
display_str_as_doc(lemmatized)

In [None]:
# Load the culinary stopwords, one term per line. They are only ever used for
# membership tests, so a set gives constant-time lookups and drops duplicates.
# The path is a plain literal (no f-string needed) and the encoding is stated
# explicitly so reading does not depend on the platform's locale default.
with open("culinary_stopwords.txt", encoding="utf-8") as file:
    culinary_stopwords = {line.strip() for line in file}


def filter_stopwords(terms):
    """Drop culinary stopwords from a whitespace-separated string of terms.

    The surviving terms keep their original order and are re-joined with
    single spaces; an input made only of stopwords yields an empty string.
    """
    kept = [word for word in terms.split() if word not in culinary_stopwords]
    return " ".join(kept)


# Remove culinary stopwords from the lemmatized sample and inspect the result.
filtered = filter_stopwords(lemmatized)
display_str_as_doc(filtered)

In [None]:
# Synonym table consulted by handle_synonyms to map a cleaned ingredient name
# to a common name. JSON text is UTF-8 by specification, so the encoding is
# stated explicitly instead of relying on the platform's locale default.
with open("ingredient_synonyms.json", encoding="utf-8") as file:
    ingredient_synonyms = json.load(file)


def handle_synonyms(ingredient):
    """Map ``ingredient`` to its common name, or return it unchanged.

    The lookup is an exact match against the keys of ``ingredient_synonyms``.
    """
    if ingredient in ingredient_synonyms:
        return ingredient_synonyms[ingredient]
    return ingredient

In [None]:
def preprocess_ingredient(ingredient):
    """Reduce a raw ingredient line to its key terms.

    Steps, in order:
      1. Remove parenthetical notes wherever they appear in the string.
      2. Lowercase and collapse runs of whitespace.
      3. Keep only the first of several " or "-separated alternatives.
      4. Keep the lemmas of alphabetic nouns (``lemmatize_valid_nouns``).
      5. Drop culinary stopwords (``filter_stopwords``).
      6. Replace a known synonym with its common name (``handle_synonyms``).

    Returns the cleaned name, which may be an empty string when nothing
    survives the filters.
    """
    import re  # Local import keeps this notebook cell self-contained.

    # Remove only the parenthesized text. Cutting the string at the first "("
    # would also throw away the ingredient in lines such as
    # "1 (14 oz.) can coconut milk". An unclosed "(" removes the remainder.
    without_notes = re.sub(r"\([^)]*(?:\)|$)", " ", ingredient)

    # Normalize case and spacing first so " OR " / " Or " are recognized as
    # alternative separators and the removed notes leave no double spaces.
    normalized = " ".join(without_notes.lower().split())

    # Get the first item in a list of alternatives. split() returns the whole
    # string as its first element when the separator is absent.
    first_choice = normalized.split(" or ")[0].strip()

    lemmatized = lemmatize_valid_nouns(first_choice)
    filtered = filter_stopwords(lemmatized)
    return handle_synonyms(filtered)


# Run the full preprocessing pipeline on the sample line and show the result.
clean_ingredient = preprocess_ingredient(SAMPLE_INGREDIENT)
clean_ingredient

In [None]:
# One string per recipe: its distinct cleaned ingredient names, sorted and
# joined with single spaces. Ingredients that clean down to an empty string
# are left out.
ingredients_by_recipe = []

for recipe in tqdm(recipes):
    cleaned = [preprocess_ingredient(raw) for raw in recipe["ingredients"]]
    unique_names = {name for name in cleaned if name}
    ingredients_by_recipe.append(" ".join(sorted(unique_names)))

In [None]:
# Preview the cleaned ingredient string built for the first recipe.
ingredients_by_recipe[0]

## Vectorization


In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words counts: one row per recipe, one column per token found in the
# cleaned ingredient strings.
# NOTE(review): with the default tokenizer a multi-word ingredient name is
# split into separate word tokens — confirm this is the intended granularity.
vectorizer = CountVectorizer()
embeddings = vectorizer.fit_transform(ingredients_by_recipe)

## Dimensionality reduction

In [None]:
from sklearn.decomposition import TruncatedSVD

dimensions_to_keep = 3  # For 3D visualization
# TruncatedSVD's default solver is randomized; fixing random_state makes the
# projection, and the clusters later derived from it, reproducible across runs.
svd = TruncatedSVD(n_components=dimensions_to_keep, random_state=42)
reduced_embeddings = svd.fit_transform(embeddings)

## Clustering


In [None]:
from sklearn.cluster import AgglomerativeClustering

n_clusters = 7  # Number of recipe groups to produce
# Agglomerative (bottom-up hierarchical) clustering with Ward linkage, applied
# to the reduced 3-component embeddings.
clustering = AgglomerativeClustering(n_clusters=n_clusters, linkage="ward")
# fit_predict returns one cluster label per row of reduced_embeddings.
cluster_labels = clustering.fit_predict(reduced_embeddings)

## Visualization


In [None]:
import plotly.express as px
import pandas as pd

# One row per recipe: 3D coordinates, cluster label, and the text shown on
# hover (recipe name and cleaned ingredient string).
df = pd.DataFrame(
    {
        "x": reduced_embeddings[:, 0],
        "y": reduced_embeddings[:, 1],
        "z": reduced_embeddings[:, 2],
        # Cluster ids are categories, not magnitudes. Stored as strings,
        # Plotly Express gives each cluster its own discrete colour and legend
        # entry instead of shading points along a continuous colour bar.
        "cluster": [str(label) for label in cluster_labels],
        "recipe_name": [recipe["name"] for recipe in recipes],
        "ingredients": ingredients_by_recipe,
    }
)

fig = px.scatter_3d(
    df,
    x="x",
    y="y",
    z="z",
    color="cluster",
    # List the legend entries in numeric cluster order rather than in order
    # of first appearance in the data.
    category_orders={"cluster": sorted(df["cluster"].unique(), key=int)},
    hover_name="recipe_name",
    hover_data=["ingredients"],
    width=800,
    height=800,
)

fig.update_traces(marker=dict(size=5))
fig.update_layout(title="Ingredient-Based Clustering of Filipino Dishes")

fig.show()