# Ingredient-Based Clustering of Pinoy Dishes

The goal is to identify clusters of Filipino dishes based on their ingredients. The results can be used to define the most common ingredients of a Filipino pantry. This project leverages techniques from data mining, natural language processing, and unsupervised learning. The dataset is a collection of Filipino recipes gathered from various online recipe websites; each recipe has a name, a list of ingredients, and instructions.

## Data loading and preview

### Read JSON files

In [None]:
import json
import os

RECIPES_PATH = "data/recipes"


def load_recipes(path):
    """Combine the recipe data of every file in a directory into one list.

    Each regular file directly inside ``path`` is parsed as UTF-8 JSON and its
    top-level items are appended to the result with ``list.extend``.

    Args:
        path: Directory that holds the recipe JSON files.

    Returns:
        A single list with the items of all files, in file-name order.
    """
    recipes = []
    # os.listdir() returns entries in arbitrary order; sorting keeps the
    # combined list reproducible between runs and machines.
    for file in sorted(os.listdir(path)):
        file_path = os.path.join(path, file)
        # Sub-directories (e.g. ".ipynb_checkpoints") cannot be opened as
        # files, so skip anything that is not a regular file.
        if not os.path.isfile(file_path):
            continue
        with open(file_path, encoding="utf-8") as f:
            recipes.extend(json.load(f))

    return recipes


# Load every recipe file under RECIPES_PATH and preview the first record.
recipes = load_recipes(RECIPES_PATH)
recipes[0]

### Convert JSON to DataFrame

In [None]:
import pandas as pd


# Build a DataFrame from the loaded records and preview its first rows.
recipe_df = pd.DataFrame(recipes)
recipe_df.head()

In [None]:
recipe_df.info()

### Display some statistics

In [None]:
%matplotlib inline

import matplotlib.pyplot as plt
import seaborn as sns


def plot_ingredient_analysis(
    ingredient_series,
    n=30,
    most_common=True,
    figsize=(8, 10),
):
    """Draw a two-panel summary of the ingredients used across recipes.

    The upper panel is a histogram of the number of ingredients per recipe;
    the lower panel is a bar chart of the ``n`` most (or least) frequent
    ingredients.

    Args:
        ingredient_series: Series whose entries are list-like collections of
            ingredient names; missing entries are ignored.
        n: Number of ingredients shown in the bar chart.
        most_common: Show the head of the frequency table when true, its tail
            otherwise.
        figsize: Figure size forwarded to ``plt.subplots``.
    """
    non_missing = ingredient_series.dropna()
    frequency = non_missing.explode().value_counts()
    per_recipe = non_missing.apply(len)

    _, (hist_ax, bar_ax) = plt.subplots(
        2,
        1,
        figsize=figsize,
        gridspec_kw={"height_ratios": [1, 5]},
    )

    # Upper panel: distribution of recipe sizes.
    sns.histplot(per_recipe, kde=True, binwidth=1, ax=hist_ax)
    hist_ax.set_title(f"Number of ingredients in {len(per_recipe)} recipes")
    hist_ax.set_xlabel("Number of ingredients")
    hist_ax.set_ylabel("Number of recipes")

    # Lower panel: the selected slice of the frequency table.
    selected = frequency.head(n) if most_common else frequency.tail(n)

    # Right-align labels to 20 characters and mark truncated names.
    labels = []
    for name in selected.index:
        suffix = "..." if len(name) > 20 else ""
        labels.append(f"{name[:20]:>20}{suffix}")
    sns.barplot(x=selected.values, y=labels, ax=bar_ax)

    bar_ax.set_title(
        f"Top {n} {'Most' if most_common else 'Least'} Common Ingredients"
    )
    bar_ax.set_xlabel("Number of Recipes")
    bar_ax.set_ylabel("Ingredient")

    plt.tight_layout()
    plt.show()


plot_ingredient_analysis(recipe_df["ingredients"])

## Preprocessing


### Remove recipes with no ingredients

In [None]:
# Keep only the rows whose "ingredients" value is present, then print the
# DataFrame summary again.
recipe_df = recipe_df.dropna(subset=["ingredients"])
recipe_df.info()

### Tokenize text for manageable pieces

In [None]:
import spacy
from tabulate import tabulate

# Example ingredient line used to demonstrate each preprocessing step below.
SAMPLE_INGREDIENT = "small lemon lemons or 6 to 7 pieces calamansi"

# spaCy pipeline shared by the tokenization and lemmatization helpers.
nlp = spacy.load("en_core_web_lg")


def display_tokens(text):
    """Print a table of token attributes for a string.

    The text is processed with the module-level ``nlp`` pipeline and one row
    per token is printed with its ``text``, ``lemma_``, ``pos_``, ``tag_``,
    ``dep_`` and ``is_alpha`` attributes.

    Args:
        text: The string to analyse.
    """
    headers = ["text", "lemma", "pos", "tag", "dep", "is_alpha"]
    rows = [
        [
            token.text,
            token.lemma_,
            token.pos_,
            token.tag_,
            token.dep_,
            token.is_alpha,
        ]
        for token in nlp(text)
    ]
    print(tabulate(rows, headers=headers))


display_tokens(SAMPLE_INGREDIENT)

### Lemmatize words and remove non-ingredient words

In [None]:
def fix_pos_tags(token):
    """Force selected ingredient words to be tagged as nouns.

    Args:
        token: Token-like object with a ``text`` attribute and a writable
            ``pos_`` attribute.

    Returns:
        The same token object; its ``pos_`` is set to ``"NOUN"`` when its
        text is one of the overridden words and left untouched otherwise.
    """
    if token.text in ("cauliflower", "baking"):
        token.pos_ = "NOUN"  # Override POS tag for some words
    return token


def lemmatize_valid_nouns(ingredient):
    """Return the lemmas of the valid noun tokens of an ingredient string.

    A token is kept when it is alphabetic, is not a stop word and, after the
    overrides applied by ``fix_pos_tags``, is tagged ``NOUN`` or ``PROPN``.

    Args:
        ingredient: Ingredient text to process with the ``nlp`` pipeline.

    Returns:
        The lemmas of the kept tokens joined by single spaces (an empty
        string when no token qualifies).
    """
    valid_lemmas = []
    for token in nlp(ingredient):
        token = fix_pos_tags(token)
        is_noun = token.pos_ in ("NOUN", "PROPN")
        if token.is_alpha and not token.is_stop and is_noun:
            valid_lemmas.append(token.lemma_)
    return " ".join(valid_lemmas)


# Demonstrate the step on the sample ingredient and show the resulting tokens.
lemmatized = lemmatize_valid_nouns(SAMPLE_INGREDIENT)
display_tokens(lemmatized)

### Remove culinary stop words

In [None]:
# Culinary stopwords: one entry per line of the text file, with surrounding
# whitespace stripped.
with open("data/culinary_stopwords.txt", encoding="utf-8") as file:
    stopwords = list(map(str.strip, file))


def filter_stopwords(text):
    """Drop every whitespace-separated term that is listed in ``stopwords``.

    Returns the remaining terms joined by single spaces, in their original
    order.
    """
    return " ".join(term for term in text.split() if term not in stopwords)


# Apply the stopword filter to the lemmatized sample and show the result.
filtered = filter_stopwords(lemmatized)
display_tokens(filtered)

### Handle synonyms at word level

In [None]:
# Word-level synonym table: maps a word to the common name that replaces it.
# Read explicitly as UTF-8 (as the other data files are) so non-ASCII names
# do not depend on the platform's default encoding.
with open("data/word_synonyms.json", encoding="utf-8") as file:
    word_synonyms = json.load(file)


def handle_word_synonym(ingredient):
    """Replace each word of an ingredient with its entry in ``word_synonyms``.

    Words without an entry are kept unchanged; the result is joined by single
    spaces.
    """
    return " ".join(
        word_synonyms.get(word, word) for word in ingredient.split()
    )


# Apply the word-level synonym replacement to the sample and show the result.
normalized_words = handle_word_synonym(filtered)
display_tokens(normalized_words)

### Remove duplicate terms

In [None]:
def remove_duplicates(text):
    """Collapse repeated terms, keeping the first occurrence of each.

    The text is split on whitespace and the distinct terms are re-joined with
    single spaces in their original order.
    """
    # dict keys are unique and keep insertion order.
    return " ".join(dict.fromkeys(text.split()))

# Remove repeated terms from the normalized sample and show the result.
unique = remove_duplicates(normalized_words)
display_tokens(unique)

### Handle synonyms at phrase level

In [None]:
# Phrase-level synonym table: maps a whole ingredient phrase to its
# replacement. Read explicitly as UTF-8 (as the other data files are) so
# non-ASCII names do not depend on the platform's default encoding.
with open("data/phrase_synonyms.json", encoding="utf-8") as file:
    phrase_synonyms = json.load(file)


def handle_phrase_synonym(ingredient):
    """Look up a whole ingredient phrase in ``phrase_synonyms``.

    Returns the mapped value when the phrase has an entry, otherwise the
    phrase itself. NOTE(review): the preprocessing pipeline handles both
    string and list results, so mapped values are presumably either.
    """
    if ingredient in phrase_synonyms:
        return phrase_synonyms[ingredient]
    return ingredient

# Apply the phrase-level synonym lookup to the sample and show the result.
common_name = handle_phrase_synonym(unique)
display_tokens(common_name)

### Add flavor tags to ingredients

In [None]:
from functools import lru_cache


@lru_cache(maxsize=None)
def _load_ingredient_flavors(path="data/flavor_map.json"):
    """Read the flavor map once and invert it into ingredient -> flavor."""
    with open(path, encoding="utf-8") as file:
        flavor_map_data = json.load(file)

    # The file is a list of {flavor: [ingredient, ...]} objects.
    ingredient_flavors = {}
    for entry in flavor_map_data:
        for flavor, ingredients in entry.items():
            for name in ingredients:
                ingredient_flavors[name] = flavor
    return ingredient_flavors


def map_ingredient_to_flavor(ingredient):
    """Prefix an ingredient with its flavor tag.

    The flavor map file is parsed only on the first call; later calls reuse
    the cached lookup table instead of re-reading the file per ingredient.

    Args:
        ingredient: Cleaned ingredient name, or ``None``/``""`` when an
            earlier step filtered the ingredient out.

    Returns:
        ``"<flavor> <ingredient>"`` when the ingredient has a flavor entry,
        the ingredient unchanged when it has none, and the falsy input itself
        (``None`` or ``""``) when there is nothing to tag.
    """
    # Keep "no ingredient" falsy so callers can filter it out; formatting it
    # would yield truthy junk such as " None" or " ".
    if not ingredient:
        return ingredient

    flavor = _load_ingredient_flavors().get(ingredient, "")
    # Without a known flavor, avoid the stray leading space of " <ingredient>".
    return f"{flavor} {ingredient}" if flavor else ingredient


map_ingredient_to_flavor(common_name)

### Define the preprocessing pipeline

In [None]:
import re


def filter_common_ingredient(ingredient):
    """Return the ingredient, or ``None`` if it is a hard-coded common one.

    The common ingredients are water, oil, salt, onion, sugar, pepper and
    garlic; any other value is returned unchanged.
    """
    common_ingredients = (
        "water",
        "oil",
        "salt",
        "onion",
        "sugar",
        "pepper",
        "garlic",
    )
    if ingredient in common_ingredients:
        return None
    return ingredient


def _flavor_tag_if_kept(phrase):
    """Flavor-tag a phrase unless it is empty or a common ingredient."""
    kept = filter_common_ingredient(phrase)
    # Never pass None/"" on to the flavor mapper: formatting it would produce
    # a truthy placeholder string instead of "no ingredient".
    return map_ingredient_to_flavor(kept) if kept else None


def preprocess_ingredient(ingredient):
    """Preprocess a raw ingredient string to extract its key terms.

    Steps: remove parenthesised text, keep only the part before the first
    " or ", lowercase and strip, then lemmatize the nouns, drop culinary
    stopwords, apply word-level synonyms, remove duplicate terms, apply
    phrase-level synonyms, discard common ingredients and add a flavor tag.

    Args:
        ingredient: Raw ingredient text of a recipe.

    Returns:
        A string when the phrase resolves to one ingredient, a list of
        strings when the phrase synonym is a list (filtered-out entries are
        dropped), or ``None`` when the ingredient is empty or common.

    Raises:
        TypeError: If the phrase synonym is neither a string nor a list.
    """
    no_parenthesis = re.sub(r"\([^)]*\)", "", ingredient)
    first_option = no_parenthesis.split(" or ")[0]
    formatted = first_option.lower().strip()

    lemmatized = lemmatize_valid_nouns(formatted)
    filtered = filter_stopwords(lemmatized)
    word_synonym = handle_word_synonym(filtered)
    unique = remove_duplicates(word_synonym)
    phrase_synonym = handle_phrase_synonym(unique)

    if isinstance(phrase_synonym, str):
        return _flavor_tag_if_kept(phrase_synonym)
    if isinstance(phrase_synonym, list):
        tagged = (_flavor_tag_if_kept(phrase) for phrase in phrase_synonym)
        return [phrase for phrase in tagged if phrase]

    # Previously this case fell through to an UnboundLocalError.
    raise TypeError(
        "phrase synonym must be a str or a list, got "
        f"{type(phrase_synonym).__name__}"
    )


# Run the whole preprocessing pipeline on the sample ingredient.
clean_ingredient = preprocess_ingredient(SAMPLE_INGREDIENT)
clean_ingredient

### Apply the preprocessing pipeline

In [None]:
from tqdm.auto import tqdm

tqdm.pandas()


def clean_ingredients(ingredients):
    """Preprocess every ingredient of a recipe into one flat, sorted list.

    List results of ``preprocess_ingredient`` are flattened, falsy results
    are discarded and duplicates are removed.
    """
    flattened = []
    for raw in ingredients:
        result = preprocess_ingredient(raw)
        flattened += result if isinstance(result, list) else [result]

    return sorted({item for item in flattened if item})


# Clean the ingredient list of every recipe; progress_apply (made available
# by tqdm.pandas()) behaves like apply but shows a progress bar.
recipe_df["cleaned_ingredients"] = (
    recipe_df["ingredients"].progress_apply(clean_ingredients)
)

In [None]:
plot_ingredient_analysis(recipe_df["cleaned_ingredients"])

### Save list of preprocessed ingredients

In [None]:
cleaned = recipe_df["cleaned_ingredients"]
# Recipes whose cleaned list is empty explode to NaN; drop those as well so
# the literal string "nan" is not written out as an ingredient.
cleaned = cleaned.dropna().explode().dropna().unique().astype(str)
cleaned.sort()

filename = "data/cleaned_ingredients.txt"
with open(filename, "w", encoding="utf-8") as file:
    # write(), not writelines(): the joined text is a single string.
    file.write("\n".join(cleaned))

### Remove rare ingredients

In [None]:
# Ingredients that occur in at most MIN_FREQUENCY recipes count as rare.
MIN_FREQUENCY = 5

counts = recipe_df["cleaned_ingredients"].explode().value_counts()
rare_ingredients = counts.loc[counts.le(MIN_FREQUENCY)].index

# Save the rare ingredients, one per line, in sorted order.
with open("data/rare_ingredients.txt", "w", encoding="utf-8") as file:
    file.write("\n".join(sorted(rare_ingredients)))

In [None]:
def remove_common_ingredients(ingredients):
    """Return the ingredients that are not listed in ``rare_ingredients``.

    NOTE: despite its name, this function drops the *rare* ingredients, not
    the common ones.
    """
    kept = []
    for candidate in ingredients:
        if candidate in rare_ingredients:
            continue
        kept.append(candidate)
    return kept

# Work on a copy so recipe_df keeps the full cleaned ingredient lists.
reduced_df = recipe_df.copy()

# NOTE: despite its name, remove_common_ingredients drops the entries that
# are listed in rare_ingredients.
reduced_df.loc[:, "cleaned_ingredients"] = (
    reduced_df["cleaned_ingredients"].apply(remove_common_ingredients)
)

plot_ingredient_analysis(reduced_df["cleaned_ingredients"])

In [None]:
plot_ingredient_analysis(reduced_df["cleaned_ingredients"], most_common=False)

### Remove recipes with too few or too many ingredients

In [None]:
# TODO: Derive these bounds from the distribution of ingredient counts (e.g.
# from its 1st and 3rd quartiles) instead of hard-coding them.

# Inclusive bounds on the number of cleaned ingredients a recipe may have.
MIN_INGREDIENT = 4
MAX_INGREDIENT = 9

# Keep only the recipes whose ingredient count lies within the bounds.
counts = reduced_df["cleaned_ingredients"].apply(len)
reduced_df = reduced_df[(counts >= MIN_INGREDIENT) & (counts <= MAX_INGREDIENT)]

plot_ingredient_analysis(reduced_df["cleaned_ingredients"])

## Vectorization


In [None]:
from sentence_transformers import SentenceTransformer

# Load the sentence-embedding model.
model = SentenceTransformer("all-mpnet-base-v2")
# Represent each recipe as one space-separated string of its sorted
# ingredients.
ingredients_as_texts = [
    " ".join(sorted(ingredients))
    for ingredients in reduced_df["cleaned_ingredients"]
]
# Encode every recipe text with the model.
embeddings = model.encode(
    ingredients_as_texts,
    show_progress_bar=True,
)

In [None]:
embeddings.shape

## Dimensionality reduction

In [None]:
from sklearn.decomposition import TruncatedSVD

# Number of components kept for clustering and plotting.
dimensions_to_keep = 2
# Fix the seed: the default randomized solver would otherwise give slightly
# different projections (and therefore different clusters) on every run.
svd = TruncatedSVD(n_components=dimensions_to_keep, random_state=42)
reduced_embeddings = svd.fit_transform(embeddings)

## Clustering


In [None]:
from sklearn.cluster import AgglomerativeClustering

# Group the reduced embeddings into a fixed number of clusters using
# agglomerative clustering with Ward linkage.
n_clusters = 7
clustering = AgglomerativeClustering(n_clusters=n_clusters, linkage="ward")
cluster_labels = clustering.fit_predict(reduced_embeddings)

## Visualization


In [None]:
import plotly.express as px
import pandas as pd


def plot_clusters_3d(reduced_embeddings, cluster_labels, recipe_df):
    """Plot an interactive scatter plot of the recipe clusters.

    The plot is 3D when ``reduced_embeddings`` has at least three columns and
    2D otherwise, so it follows the number of components kept by the
    dimensionality reduction step. Buttons let the viewer isolate a single
    cluster or show all clusters again.

    Args:
        reduced_embeddings: 2-D array with one row per recipe and at least
            two columns, used as plot coordinates.
        cluster_labels: Array with one cluster label per recipe.
        recipe_df: DataFrame with "name" and "cleaned_ingredients" columns,
            in the same row order as ``reduced_embeddings``.
    """
    base_title = "Ingredient-Based Clustering of Filipino Dishes"
    use_3d = reduced_embeddings.shape[1] >= 3

    columns = {
        "x": reduced_embeddings[:, 0],
        "y": reduced_embeddings[:, 1],
    }
    if use_3d:
        columns["z"] = reduced_embeddings[:, 2]
    columns["cluster"] = cluster_labels.astype(str)
    # Use a plain array so names are matched by position rather than by the
    # (possibly filtered, non-contiguous) index of recipe_df.
    columns["recipe_name"] = recipe_df["name"].to_numpy()
    columns["cleaned_ingredients"] = [
        "<br>".join(ingredients)
        for ingredients in recipe_df["cleaned_ingredients"]
    ]
    final_df = pd.DataFrame(columns)

    shared_options = dict(
        color="cluster",
        hover_name="recipe_name",
        hover_data=["cleaned_ingredients"],
        width=800,
        height=1000,
        color_discrete_sequence=px.colors.qualitative.Bold,
    )
    if use_3d:
        fig = px.scatter_3d(final_df, x="x", y="y", z="z", **shared_options)
    else:
        fig = px.scatter(final_df, x="x", y="y", **shared_options)

    fig.update_traces(marker=dict(size=5))
    fig.update_layout(title=base_title)

    # One trace is drawn per cluster, in order of first appearance, which is
    # the order unique() returns; the visibility masks rely on that.
    unique_clusters = final_df["cluster"].unique()
    # Reset button: without it a viewer cannot return to the full plot after
    # isolating a cluster.
    buttons = [
        dict(
            label="All clusters",
            method="update",
            args=[
                {"visible": [True] * len(unique_clusters)},
                {"title": base_title},
            ],
        )
    ]
    buttons.extend(
        dict(
            label=f"Cluster {cluster}",
            method="update",
            args=[
                {"visible": [cluster == c for c in unique_clusters]},
                {"title": f"Ingredient-Based Clustering - Cluster {cluster}"},
            ],
        )
        for cluster in unique_clusters
    )
    fig.update_layout(
        updatemenus=[
            dict(
                type="buttons",
                direction="down",
                buttons=buttons,
            )
        ]
    )

    fig.show()


plot_clusters_3d(reduced_embeddings, cluster_labels, reduced_df)