# Cooking Recipes

This notebook shows how to train dense vectors (embeddings) in both unsupervised and supervised scenarios, using cooking recipes.

See [this README](./data/recipes/README.md) for more information about the dataset. Note that the dataset is not provided and should be manually downloaded.

In [1]:
import numpy as np

from scipy.sparse import csr_matrix

import pandas as pd

import umap

from bokeh.plotting import ColumnDataSource, figure, show
from bokeh.io import output_notebook

from itembed import (
    pack_itemsets,
    prune_itemsets,
    initialize_syn,
    CompoundTask,
    UnsupervisedTask,
    SupervisedTask,
    train,
    softmax,
    normalize,
)

In [2]:
# Initialize Bokeh so that figures passed to `show` are displayed in the notebook
output_notebook()

In [3]:
# Pin the seed of NumPy's global random generator so reruns draw the same values.
# NOTE(review): this only seeds NumPy's Python-level global generator; if the
# training code samples from a separately seeded generator, results may still
# vary between runs -- confirm against the itembed implementation.
seed_value = 42
np.random.seed(seed_value)

In [4]:
# Read the raw recipes (one row per recipe) from the manually downloaded JSON file
train_json_path = "./data/recipes/train.json"
recipe_df = pd.read_json(train_json_path)

# Preview the first rows
recipe_df.head(10)

Unnamed: 0,id,cuisine,ingredients
0,10259,greek,"[romaine lettuce, black olives, grape tomatoes..."
1,25693,southern_us,"[plain flour, ground pepper, salt, tomatoes, g..."
2,20130,filipino,"[eggs, pepper, salt, mayonaise, cooking oil, g..."
3,22213,indian,"[water, vegetable oil, wheat, salt]"
4,13162,indian,"[black pepper, shallots, cornflour, cayenne pe..."
5,6602,jamaican,"[plain flour, sugar, butter, eggs, fresh ginge..."
6,42779,spanish,"[olive oil, salt, medium shrimp, pepper, garli..."
7,3735,italian,"[sugar, pistachio nuts, white almond bark, flo..."
8,16903,mexican,"[olive oil, purple onion, fresh pineapple, por..."
9,12734,italian,"[chopped tomatoes, fresh basil, garlic, extra-..."


In [5]:
# One itemset per recipe: the list of its ingredient names
ingredient_itemsets = recipe_df["ingredients"].values

# Pack the itemsets into contiguous arrays (labels, flat indices, offsets)
ingredient_labels, ingredient_indices, ingredient_offsets = pack_itemsets(
    ingredient_itemsets,
    min_count=10,
    min_length=0,
)

# Size of the ingredient vocabulary kept by the packing step
num_ingredient_label = len(ingredient_labels)

In [6]:
# One single-element itemset per recipe: its cuisine style,
# obtained by turning the (n,) column into an (n, 1) array
style_itemsets = np.expand_dims(recipe_df["cuisine"].values, axis=1)

# Pack the itemsets into contiguous arrays (labels, flat indices, offsets)
style_labels, style_indices, style_offsets = pack_itemsets(
    style_itemsets,
    min_count=1,
    min_length=0,
)

# Number of distinct cuisine styles
num_style_label = len(style_labels)

In [7]:
# Itemset sizes are the differences between consecutive offsets;
# not every itemset is large enough for every task
ingredient_lengths = np.diff(ingredient_offsets)
style_lengths = np.diff(style_offsets)

# The unsupervised task needs at least 2 ingredients in a recipe
unsupervised_mask = ingredient_lengths >= 2

# The supervised task needs at least 1 ingredient and 1 style in a recipe
supervised_mask = np.logical_and(ingredient_lengths >= 1, style_lengths >= 1)

In [8]:
# Every embedding set shares the same dimensionality
num_dimension = 64

# Two embedding sets for ingredients (both are handed to the unsupervised task)
ingredient_syn0, ingredient_syn1 = (
    initialize_syn(num_ingredient_label, num_dimension),
    initialize_syn(num_ingredient_label, num_dimension),
)

# A single embedding set for cuisine styles
style_syn0 = initialize_syn(num_style_label, num_dimension)

In [9]:
# Keep only the recipes selected by the unsupervised mask.
# Task-specific names are used so these arrays cannot be confused with, or
# silently replaced by, any other pruned copy of the ingredient itemsets.
(
    unsupervised_ingredient_indices,
    unsupervised_ingredient_offsets,
) = prune_itemsets(ingredient_indices, ingredient_offsets, mask=unsupervised_mask)

# Define unsupervised task, i.e. using co-occurrences of ingredients within a recipe
ingredient_task = UnsupervisedTask(
    unsupervised_ingredient_indices,
    unsupervised_ingredient_offsets,
    ingredient_syn0,
    ingredient_syn1,
    num_negative=5,
)

In [10]:
# Restrict both sides to the recipes selected by the supervised mask, so that
# ingredient itemsets and style itemsets are pruned with the same mask
filtered_ingredient_indices, filtered_ingredient_offsets = prune_itemsets(
    ingredient_indices,
    ingredient_offsets,
    mask=supervised_mask,
)
filtered_style_indices, filtered_style_offsets = prune_itemsets(
    style_indices,
    style_offsets,
    mask=supervised_mask,
)

# Supervised task: map a recipe's ingredients to its cuisine style
ingredient_to_style_task = SupervisedTask(
    filtered_ingredient_indices,
    filtered_ingredient_offsets,
    filtered_style_indices,
    filtered_style_offsets,
    ingredient_syn0,
    style_syn0,
    num_negative=5,
)

In [11]:
# Combine the unsupervised and supervised tasks into a single task for training
task = CompoundTask(ingredient_task, ingredient_to_style_task)

In [12]:
# Train the combined task for 100 epochs
# Note: due to a different sampling strategy, more epochs than word2vec are needed
train(task, num_epoch=100)

100%|██████████████████████████████████████████████████████████████████████████| 62200/62200 [00:58<00:00, 1061.17it/s]


In [13]:
# Pick the embeddings used for the rest of the analysis: the first ingredient
# set (the notebook treats both ingredient sets as equivalent) and the single
# style set
ingredient_syn, style_syn = ingredient_syn0, style_syn0

In [14]:
# Configure UMAP to compare ingredient embeddings with the cosine metric;
# the fixed random_state keeps the layout stable across runs
model = umap.UMAP(
    metric="cosine",
    verbose=1,
    random_state=42,
)

# Fit on the ingredient embeddings and get their low-dimensional coordinates
projection = model.fit_transform(ingredient_syn)

UMAP(angular_rp_forest=True, dens_frac=0.0, dens_lambda=0.0, metric='cosine',
     random_state=42, verbose=1)
Construct fuzzy simplicial set
Fri Sep  3 12:43:30 2021 Finding Nearest Neighbors
Fri Sep  3 12:43:33 2021 Finished Nearest Neighbor Search
Disconnection_distance = 1 has removed 233460 edges.  This is not a problem as no vertices were disconnected.
Fri Sep  3 12:43:35 2021 Construct embedding
	completed  0  /  500 epochs
	completed  50  /  500 epochs
	completed  100  /  500 epochs
	completed  150  /  500 epochs
	completed  200  /  500 epochs
	completed  250  /  500 epochs
	completed  300  /  500 epochs
	completed  350  /  500 epochs
	completed  400  /  500 epochs
	completed  450  /  500 epochs
Fri Sep  3 12:43:40 2021 Finished embedding


In [15]:
# Pack the projected coordinates and the ingredient names as a Bokeh data source
source = ColumnDataSource(
    data=dict(
        x=projection[:, 0],
        y=projection[:, 1],
        label=ingredient_labels,
    )
)

# Create plot; a title and axis labels let the figure stand on its own,
# and the tooltip shows the ingredient name on hover
p = figure(
    width=900,
    height=600,
    title="Ingredient embeddings (UMAP projection, cosine metric)",
    x_axis_label="UMAP 1",
    y_axis_label="UMAP 2",
    tooltips=[
        ("label", "@label"),
    ],
)

# Draw one point per ingredient
p.scatter(
    "x",
    "y",
    source=source,
)

# Show in notebook
show(p)

In [16]:
# With normalized vectors, a plain dot product gives the cosine similarity,
# so normalize both embedding sets once up front
ingredient_syn_normalized, style_syn_normalized = (
    normalize(ingredient_syn),
    normalize(style_syn),
)

In [17]:
# Rank all ingredients by cosine similarity to "apples" and print the 10 best
# (the query itself comes first, with similarity 1)
i = ingredient_labels.index("apples")
similarities = ingredient_syn_normalized @ ingredient_syn_normalized[i]
for j in (-similarities).argsort()[:10]:
    print(f"#{j} {ingredient_labels[j]}: {similarities[j]}")

#452 apples: 1.0
#569 granny smith apples: 0.7091999650001526
#1891 tart apples: 0.69163578748703
#2048 gala apples: 0.6536399126052856
#1286 golden delicious apples: 0.6279377937316895
#971 apple juice: 0.5824869871139526
#1612 calvados: 0.5787948369979858
#2300 wheels: 0.5740875005722046
#2243 dried fruit: 0.5700628757476807
#311 peaches: 0.5412274599075317


In [18]:
# Rank all cuisine styles by cosine similarity to "greek" and print the 5 best
# (the query itself comes first)
i = style_labels.index("greek")
similarities = style_syn_normalized @ style_syn_normalized[i]
for j in (-similarities).argsort()[:5]:
    print(f"#{j} {style_labels[j]}: {similarities[j]}")

#9 greek: 1.0000001192092896
#13 moroccan: 0.5139353275299072
#0 italian: 0.4910340905189514
#10 spanish: 0.4846884608268738
#5 french: 0.46558481454849243


In [19]:
# Build a (recipe x ingredient) CSR matrix directly from the packed arrays:
# the flat ingredient indices are the column indices and the itemset offsets
# are the row pointers; every stored entry has value 1
indices = ingredient_indices
indptr = ingredient_offsets
data = np.ones(len(indices), dtype=np.float32)

# One row per itemset, one column per ingredient label
M, N = len(indptr) - 1, num_ingredient_label
matrix = csr_matrix((data, indices, indptr), shape=(M, N))

In [20]:
# Score every recipe against every cuisine style: sum the embeddings of the
# recipe's ingredients, then take dot products with the style embeddings
logits = matrix @ ingredient_syn0 @ style_syn0.T
probabilities = softmax(logits)

# Predicted style = most probable one; its probability is the confidence
labels = probabilities.argmax(axis=1)

# Attach predictions to a copy of the training set, leaving the raw frame untouched
augmented_recipe_df = recipe_df.copy()
augmented_recipe_df["prediction"] = [style_labels[i] for i in labels]
augmented_recipe_df["confidence"] = probabilities.max(axis=1)
augmented_recipe_df.head(10)

Unnamed: 0,id,cuisine,ingredients,prediction,confidence
0,10259,greek,"[romaine lettuce, black olives, grape tomatoes...",mexican,0.688536
1,25693,southern_us,"[plain flour, ground pepper, salt, tomatoes, g...",southern_us,0.909131
2,20130,filipino,"[eggs, pepper, salt, mayonaise, cooking oil, g...",mexican,0.829995
3,22213,indian,"[water, vegetable oil, wheat, salt]",mexican,0.477049
4,13162,indian,"[black pepper, shallots, cornflour, cayenne pe...",indian,1.0
5,6602,jamaican,"[plain flour, sugar, butter, eggs, fresh ginge...",southern_us,0.998861
6,42779,spanish,"[olive oil, salt, medium shrimp, pepper, garli...",mexican,0.99993
7,3735,italian,"[sugar, pistachio nuts, white almond bark, flo...",italian,0.973192
8,16903,mexican,"[olive oil, purple onion, fresh pineapple, por...",mexican,1.0
9,12734,italian,"[chopped tomatoes, fresh basil, garlic, extra-...",italian,0.999652
