# Cooking Recipes

This notebook shows how to train dense vectors in both unsupervised and supervised scenarios, using cooking recipes.

See [this README](./data/recipes/README.md) for more information about the dataset. Note that the dataset is not provided and should be manually downloaded.

In [1]:
import numpy as np

from scipy.sparse import csr_matrix

import pandas as pd

import umap

from bokeh.plotting import ColumnDataSource, figure, show
from bokeh.io import output_notebook

from itembed import (
    pack_itemsets,
    prune_itemsets,
    initialize_syn,
    CompoundTask,
    UnsupervisedTask,
    SupervisedTask,
    train,
    softmax,
    normalize,
)

In [2]:
# Initialize Bokeh so that figures passed to `show` are rendered in the
# notebook output
output_notebook()

In [3]:
# Load raw dataset.
# The file is not shipped with the notebook and must be downloaded manually
# beforehand. Each row is one recipe with an `id`, a `cuisine` label and a
# list of `ingredients`.
recipe_df = pd.read_json("./data/recipes/train.json")
recipe_df.head(10)

Unnamed: 0,id,cuisine,ingredients
0,10259,greek,"[romaine lettuce, black olives, grape tomatoes..."
1,25693,southern_us,"[plain flour, ground pepper, salt, tomatoes, g..."
2,20130,filipino,"[eggs, pepper, salt, mayonaise, cooking oil, g..."
3,22213,indian,"[water, vegetable oil, wheat, salt]"
4,13162,indian,"[black pepper, shallots, cornflour, cayenne pe..."
5,6602,jamaican,"[plain flour, sugar, butter, eggs, fresh ginge..."
6,42779,spanish,"[olive oil, salt, medium shrimp, pepper, garli..."
7,3735,italian,"[sugar, pistachio nuts, white almond bark, flo..."
8,16903,mexican,"[olive oil, purple onion, fresh pineapple, por..."
9,12734,italian,"[chopped tomatoes, fresh basil, garlic, extra-..."


In [4]:
# One itemset per recipe: the list of its ingredient names
ingredient_itemsets = recipe_df["ingredients"].values

# Pack the itemsets into a label list plus contiguous index/offset arrays.
# NOTE(review): `min_length=0` presumably keeps every recipe (even ones left
# empty once rare ingredients are dropped by `min_count=10`) so that rows stay
# aligned with the cuisine styles used by the supervised task - confirm
# against the itembed documentation.
ingredient_labels, ingredient_indices, ingredient_offsets = pack_itemsets(
    ingredient_itemsets,
    min_count=10,
    min_length=0,
)
num_ingredient_label = len(ingredient_labels)

In [5]:
# One itemset per recipe as well: wrap each cuisine label in its own
# single-element row, so that styles have the same "list of list" layout
# as the ingredients
style_itemsets = recipe_df["cuisine"].values[:, np.newaxis]

# Pack the itemsets into a label list plus contiguous index/offset arrays
style_labels, style_indices, style_offsets = pack_itemsets(
    style_itemsets,
    min_count=1,
    min_length=1,
)
num_style_label = len(style_labels)

In [6]:
# All embeddings must have the same size, since ingredient and style vectors
# are trained against each other and later combined with dot products
num_dimension = 64

# Initialize two embedding sets for ingredients (both are required by the
# unsupervised task; syn0 is also shared with the supervised task)
ingredient_syn0 = initialize_syn(num_ingredient_label, num_dimension)
ingredient_syn1 = initialize_syn(num_ingredient_label, num_dimension)

# Initialize a single embedding set for cuisine styles
style_syn0 = initialize_syn(num_style_label, num_dimension)

In [7]:
# Unsupervised task: learn ingredient embeddings from co-occurrences.
# Itemsets are first pruned with `min_length=2`.
# NOTE(review): presumably because a recipe with a single remaining
# ingredient has no co-occurring pair to learn from - confirm.
filtered_ingredient_indices, filtered_ingredient_offsets = prune_itemsets(
    ingredient_indices,
    ingredient_offsets,
    min_length=2,
)
ingredient_task = UnsupervisedTask(
    filtered_ingredient_indices,
    filtered_ingredient_offsets,
    ingredient_syn0,
    ingredient_syn1,
    num_negative=5,
)

# Supervised task: learn the mapping from a recipe's ingredients to its
# cuisine style. The unpruned ingredient arrays are used here, paired with
# the style arrays; `ingredient_syn0` is shared with the task above.
ingredient_to_style_task = SupervisedTask(
    ingredient_indices,
    ingredient_offsets,
    style_indices,
    style_offsets,
    ingredient_syn0,
    style_syn0,
    num_negative=5,
)

# Train both objectives together as a single task
task = CompoundTask(ingredient_task, ingredient_to_style_task)

In [8]:
# Do training (the embedding arrays handed to the tasks are the ones that are
# read back afterwards, so no return value is captured here)
# Note: due to a different sampling strategy, more epochs are needed than with
# word2vec; the recorded run of 100 epochs took about one minute
train(task, num_epoch=100)

100%|██████████████████████████████████████████████████████████████████████████| 62200/62200 [01:01<00:00, 1011.77it/s]


In [9]:
# Ingredients have two embedding sets (syn0 and syn1); they are considered
# equivalent here, so just choose one of them.
# Styles only have a single embedding set.
ingredient_syn = ingredient_syn0
style_syn = style_syn0

In [10]:
# Project ingredient embeddings to 2D with UMAP, using cosine similarity measure
# NOTE(review): no `random_state` is given (the recorded repr shows
# `random_state=None`), so the layout presumably changes from run to run -
# consider fixing a seed if a reproducible plot is needed.
model = umap.UMAP(metric="cosine", verbose=1)
projection = model.fit_transform(ingredient_syn)

UMAP(a=None, angular_rp_forest=True, b=None,
     force_approximation_algorithm=False, init='spectral', learning_rate=1.0,
     local_connectivity=1.0, low_memory=False, metric='cosine',
     metric_kwds=None, min_dist=0.1, n_components=2, n_epochs=None,
     n_neighbors=15, negative_sample_rate=5, output_metric='euclidean',
     output_metric_kwds=None, random_state=None, repulsion_strength=1.0,
     set_op_mix_ratio=1.0, spread=1.0, target_metric='categorical',
     target_metric_kwds=None, target_n_neighbors=-1, target_weight=0.5,
     transform_queue_size=4.0, transform_seed=42, unique=False, verbose=1)
Construct fuzzy simplicial set
Mon Jul  6 14:09:23 2020 Finding Nearest Neighbors
Mon Jul  6 14:09:25 2020 Finished Nearest Neighbor Search
Mon Jul  6 14:09:27 2020 Construct embedding
	completed  0  /  500 epochs
	completed  50  /  500 epochs
	completed  100  /  500 epochs
	completed  150  /  500 epochs
	completed  200  /  500 epochs
	completed  250  /  500 epochs
	completed  300  

In [11]:
# Bokeh data source: 2D coordinates of each ingredient, plus its name so that
# it can be shown in the hover tooltip
source = ColumnDataSource(
    data={
        "x": projection[:, 0],
        "y": projection[:, 1],
        "label": ingredient_labels,
    }
)

# Figure with a hover tooltip displaying the `label` column
p = figure(width=900, height=600, tooltips=[("label", "@label")])

# One point per ingredient
p.scatter("x", "y", source=source)

# Render in the notebook
show(p)

In [12]:
# Cosine similarity is equivalent to dot product with normalized vectors,
# so normalize once and use plain matrix products for similarity queries
ingredient_syn_normalized = normalize(ingredient_syn)
style_syn_normalized = normalize(style_syn)

In [13]:
# Closest ingredients to apples
# NOTE(review): `.index` fails if the label is missing, e.g. if it was dropped
# by the `min_count` filter when packing - presumably a ValueError, confirm.
i = ingredient_labels.index("apples")
# Dot product of every normalized ingredient vector with the query vector,
# i.e. the cosine similarity to "apples"
similarities = ingredient_syn_normalized @ ingredient_syn_normalized[i]
# Sort by decreasing similarity and keep the top 10 (the query itself is
# expected to come first, with a similarity of about 1)
for j in np.argsort(-similarities)[:10]:
    print("#{} {}: {}".format(j, ingredient_labels[j], similarities[j]))

#452 apples: 0.9999999403953552
#569 granny smith apples: 0.7230942845344543
#1891 tart apples: 0.6819778680801392
#2048 gala apples: 0.6746957898139954
#1286 golden delicious apples: 0.6271365284919739
#2300 wheels: 0.6031620502471924
#627 hard-boiled egg: 0.5725231766700745
#971 apple juice: 0.5711477398872375
#1612 calvados: 0.566547691822052
#2243 dried fruit: 0.5598695278167725


In [14]:
# Closest styles to greek
i = style_labels.index("greek")
# Dot product of every normalized style vector with the query vector,
# i.e. the cosine similarity to "greek"
similarities = style_syn_normalized @ style_syn_normalized[i]
# Sort by decreasing similarity and keep the top 5 (the query itself is
# expected to come first, with a similarity of about 1)
for j in np.argsort(-similarities)[:5]:
    print("#{} {}: {}".format(j, style_labels[j], similarities[j]))

#9 greek: 0.9999998807907104
#13 moroccan: 0.5197018384933472
#0 italian: 0.49499642848968506
#10 spanish: 0.4829043447971344
#5 french: 0.46361032128334045


In [15]:
# Store recipes as a sparse (recipe x ingredient) matrix in CSR format.

# Shape: one row per itemset (there is one more offset than there are
# itemsets), one column per ingredient label
M = len(ingredient_offsets) - 1
N = num_ingredient_label

# The packed arrays are reused directly as the CSR column indices and row
# pointers; every stored entry has weight 1
indices = ingredient_indices
indptr = ingredient_offsets
data = np.ones(len(ingredient_indices), dtype=np.float32)

matrix = csr_matrix((data, indices, indptr), shape=(M, N))

In [16]:
# Do prediction: add up the ingredient embeddings of each recipe, then score
# the result against every style embedding
recipe_syn = matrix @ ingredient_syn0
logits = recipe_syn @ style_syn0.T
probabilities = softmax(logits)
labels = probabilities.argmax(axis=1)

# Apply on training set: attach the predicted style and its probability to a
# copy of the raw frame (the original `recipe_df` is left untouched)
row_ids = np.arange(len(labels))
augmented_recipe_df = recipe_df.assign(
    prediction=[style_labels[k] for k in labels],
    confidence=probabilities[row_ids, labels],
)
augmented_recipe_df.head(10)

Unnamed: 0,id,cuisine,ingredients,prediction,confidence
0,10259,greek,"[romaine lettuce, black olives, grape tomatoes...",mexican,0.764093
1,25693,southern_us,"[plain flour, ground pepper, salt, tomatoes, g...",southern_us,0.924874
2,20130,filipino,"[eggs, pepper, salt, mayonaise, cooking oil, g...",mexican,0.811258
3,22213,indian,"[water, vegetable oil, wheat, salt]",mexican,0.49368
4,13162,indian,"[black pepper, shallots, cornflour, cayenne pe...",indian,1.0
5,6602,jamaican,"[plain flour, sugar, butter, eggs, fresh ginge...",southern_us,0.998673
6,42779,spanish,"[olive oil, salt, medium shrimp, pepper, garli...",mexican,0.999946
7,3735,italian,"[sugar, pistachio nuts, white almond bark, flo...",italian,0.984641
8,16903,mexican,"[olive oil, purple onion, fresh pineapple, por...",mexican,1.0
9,12734,italian,"[chopped tomatoes, fresh basil, garlic, extra-...",italian,0.999709
