# Preprocessing and Clustering the Panlasang Pinoy Recipe Dataset


In [1]:
import json
import spacy
from sklearn.cluster import AgglomerativeClustering
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer
from tqdm import tqdm
import plotly.graph_objects as go

# Pipeline: load recipes -> lemmatize ingredient text -> bag-of-words counts
# -> truncated SVD -> Ward agglomerative clustering -> 2D PCA scatter plot.

# Load the data. Each recipe is read below through its "name" and
# "ingredients" keys.
# Explicit UTF-8: without it, open() falls back to the platform's locale
# encoding, so the same file could fail to decode on another machine.
with open("panlasang_pinoy_recipes.json", encoding="utf-8") as file:
    recipes = json.load(file)

# Initialize NLP model
nlp = spacy.load("en_core_web_sm")

recipe_names = [recipe["name"] for recipe in recipes]

# Preprocess the ingredients for each recipe: lowercase every ingredient
# string, keep only alphabetic tokens (drops quantities and punctuation),
# and replace each kept token by its lemma. One space-joined string is
# produced per recipe.
processed_ingredients_list = []
for recipe in tqdm(recipes):
    lowered_ingredients = [
        ingredient.lower() for ingredient in recipe["ingredients"]
    ]
    # nlp.pipe processes the recipe's ingredient strings as a batch and
    # yields the docs in input order, avoiding one pipeline call per string.
    recipe_ingredients = [
        " ".join(token.lemma_ for token in doc if token.is_alpha)
        for doc in nlp.pipe(lowered_ingredients)
    ]
    processed_ingredients_list.append(" ".join(recipe_ingredients))

# Vectorize the processed ingredients (one row of token counts per recipe)
vectorizer = CountVectorizer()
vectors = vectorizer.fit_transform(processed_ingredients_list)

# Dimensionality reduction.
# n_components is capped below the vocabulary size.
# Fixed seed: the default randomized solver is otherwise seeded differently
# on every run, so the embedding (and the clusters built on top of it)
# would not be reproducible.
svd = TruncatedSVD(
    n_components=min(20, vectors.shape[1] - 1),
    random_state=42,
)
reduced_data = svd.fit_transform(vectors)

# Clustering
optimal_clusters = 7  # Adjust the number of clusters as needed
clustering = AgglomerativeClustering(
    n_clusters=optimal_clusters,
    linkage="ward",
)
cluster_labels = clustering.fit_predict(reduced_data)

# Visualization using PCA for 2D plotting
pca = PCA(n_components=2)
plot_data = pca.fit_transform(reduced_data)

# Plotting with Plotly: one scatter trace per cluster so that each cluster
# gets its own legend entry.
fig = go.Figure()

for i in range(optimal_clusters):
    # Boolean mask over recipes; reused for both the points and their names
    # so the two stay aligned.
    in_cluster = cluster_labels == i
    cluster_data = plot_data[in_cluster]
    hover_names = [
        name for name, selected in zip(recipe_names, in_cluster) if selected
    ]
    fig.add_trace(
        go.Scatter(
            x=cluster_data[:, 0],
            y=cluster_data[:, 1],
            mode="markers",
            marker=dict(
                size=8,
                opacity=0.8,
            ),
            name=f"Cluster {i}",
            text=hover_names,  # Display recipe names on hover
        )
    )

fig.update_layout(title_text="Visualization of Recipe Clusters (PCA)")
fig.show()

100%|██████████| 1873/1873 [00:55<00:00, 33.58it/s]
