# Comparison with word2vec

This notebook evaluates both _itembed_ and _word2vec_ on a simple classification task. It is by no means a definitive conclusion about which method is better suited to this kind of problem.

See [this README](./data/recipes/README.md) for more information about the dataset. Note that the dataset is not provided and should be manually downloaded.

In [1]:
import numpy as np

from scipy.sparse import csr_matrix

import pandas as pd

from itembed import (
    pack_itemsets,
    initialize_syn,
    UnsupervisedTask,
    train,
)

from gensim.models import Word2Vec

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

import umap

from bokeh.plotting import ColumnDataSource, figure, show
from bokeh.io import output_notebook

In [2]:
# Helper to display scatter plot
def plot(syn):
    """Show a 2-D UMAP projection of an embedding matrix as a Bokeh scatter plot.

    Parameters
    ----------
    syn : array-like
        Embedding matrix handed to ``umap.UMAP.fit_transform``.

    Notes
    -----
    The hover tooltips read the notebook-level ``labels`` variable, so it must
    be defined (and aligned with the rows of ``syn``) before this is called.
    Nothing is returned; the figure is displayed through ``show``.
    """
    # Reduce to two dimensions with UMAP under the cosine metric (fixed seed)
    reducer = umap.UMAP(metric="cosine", verbose=1, random_state=42)
    coords = reducer.fit_transform(syn)

    # Figure with a hover tooltip bound to the "label" column
    hover = [("label", "@label")]
    fig = figure(width=900, height=600, tooltips=hover)

    # Columns backing the glyphs: both projected axes plus the label text
    points = ColumnDataSource(data={
        "x": coords[:, 0],
        "y": coords[:, 1],
        "label": labels,
    })

    # One marker per projected row
    fig.scatter("x", "y", source=points)

    # Render in the notebook output
    show(fig)

In [3]:
# Initialize Bokeh so that later show() calls render inside the notebook
output_notebook()

In [4]:
# Make it reproducible by seeding NumPy's global random state
# NOTE(review): presumably itembed draws from this global state — confirm
np.random.seed(42)

## Load Dataset

In [5]:
# Load the raw recipes file into a DataFrame and preview its first 10 rows
# (the path is relative; the file has to be downloaded manually beforehand)
df = pd.read_json("./data/recipes/train.json")
df.head(10)

Unnamed: 0,id,cuisine,ingredients
0,10259,greek,"[romaine lettuce, black olives, grape tomatoes..."
1,25693,southern_us,"[plain flour, ground pepper, salt, tomatoes, g..."
2,20130,filipino,"[eggs, pepper, salt, mayonaise, cooking oil, g..."
3,22213,indian,"[water, vegetable oil, wheat, salt]"
4,13162,indian,"[black pepper, shallots, cornflour, cayenne pe..."
5,6602,jamaican,"[plain flour, sugar, butter, eggs, fresh ginge..."
6,42779,spanish,"[olive oil, salt, medium shrimp, pepper, garli..."
7,3735,italian,"[sugar, pistachio nuts, white almond bark, flo..."
8,16903,mexican,"[olive oil, purple onion, fresh pineapple, por..."
9,12734,italian,"[chopped tomatoes, fresh basil, garlic, extra-..."


In [6]:
# Split into train and test sets with a fixed seed; no size is passed, so
# train_test_split applies its default proportions (reported below)
train_df, test_df = train_test_split(df, random_state=42)
print(f"Train set: {len(train_df)} ({100 * len(train_df) / len(df):.1f}%)")
print(f"Test set: {len(test_df)} ({100 * len(test_df) / len(df):.1f}%)")

Train set: 29830 (75.0%)
Test set: 9944 (25.0%)


In [7]:
# Extract the "ingredients" column of each split: one list of ingredient
# strings per recipe, held in a NumPy array (`.values`), not a plain list
train_itemsets = train_df["ingredients"].values
test_itemsets = test_df["ingredients"].values

## Prepare Training

In [8]:
# Both methods will use the same configuration, when applicable
num_dimension = 64  # embedding size, used for both itembed and word2vec
min_count = 5  # passed as min_count to both pack_itemsets and Word2Vec
num_negative = 5  # negative-sampling count given to both trainers
num_epochs_word2vec = 5  # word2vec epochs; the itembed count is derived from it

In [9]:
# As word2vec does not sample pairs the same way, we need to estimate the equivalent number of epochs for itembed
# Lengths are measured on the raw training itemsets, before any min_count filtering
lengths = np.array([len(itemset) for itemset in train_itemsets])

# itembed does a linear number of pairs per itemset (i.e. one per item)
itembed_pair_count = lengths.sum()
print(f"One epoch in itembed is {itembed_pair_count} pairs")

# word2vec does a quadratic number of pairs per itemset (i.e. all of them):
# n * (n - 1) ordered pairs for an itemset of n items
word2vec_pair_count = (lengths * (lengths - 1)).sum()
print(f"One epoch in word2vec is {word2vec_pair_count} pairs")

# Compute epoch count by scaling the word2vec epochs with the pair ratio;
# int() truncates the product instead of rounding it
factor = word2vec_pair_count / itembed_pair_count
num_epochs_itembed = int(num_epochs_word2vec * factor)
print(f"Hence, itembed needs {factor:.1f} times more epochs, i.e. {num_epochs_word2vec} vs {num_epochs_itembed}")

One epoch in itembed is 321355 pairs
One epoch in word2vec is 3726490 pairs
Hence, itembed needs 11.6 times more epochs, i.e. 5 vs 57


## `itembed` Embeddings

In [10]:
# Pack the training itemsets into contiguous arrays; min_count and
# min_length are forwarded to pack_itemsets as filtering thresholds
labels, indices, offsets = pack_itemsets(train_itemsets, min_count=min_count, min_length=2)

# Vocabulary size, plus a lookup from each label to its position in `labels`
num_label = len(labels)
label_map = dict(zip(labels, range(num_label)))

In [11]:
# Initialize embeddings sets from uniform distribution
# Two separate matrices are created with the same (num_label, num_dimension) arguments
syn0 = initialize_syn(num_label, num_dimension)
syn1 = initialize_syn(num_label, num_dimension)

In [12]:
# Define unsupervised task, i.e. using co-occurrences
# The packed itemsets and both embedding matrices are handed to the task;
# presumably train() then updates syn0/syn1 in place — confirm against itembed
task = UnsupervisedTask(indices, offsets, syn0, syn1, num_negative=num_negative)

In [13]:
%%time

# Do training, using the itembed epoch count derived from the pair-count ratio
# Note: due to a different sampling strategy, more epochs than word2vec are needed
train(task, num_epoch=num_epochs_itembed)

100%|██████████████████████████████████████████████████████████████████████████| 26562/26562 [00:16<00:00, 1568.67it/s]

Wall time: 16.9 s





In [14]:
# Both embedding sets are equivalent, just choose one of them
# This binds a second name to syn0; it does not copy the array
itembed_syn = syn0

In [15]:
# Show the UMAP projection of the itembed embeddings
plot(itembed_syn)

UMAP(angular_rp_forest=True, dens_frac=0.0, dens_lambda=0.0, metric='cosine',
     random_state=42, verbose=1)
Construct fuzzy simplicial set
Fri Sep  3 11:45:51 2021 Finding Nearest Neighbors
Fri Sep  3 11:45:54 2021 Finished Nearest Neighbor Search
Disconnection_distance = 1 has removed 426 edges.  This is not a problem as no vertices were disconnected.
Fri Sep  3 11:45:57 2021 Construct embedding
	completed  0  /  500 epochs
	completed  50  /  500 epochs
	completed  100  /  500 epochs
	completed  150  /  500 epochs
	completed  200  /  500 epochs
	completed  250  /  500 epochs
	completed  300  /  500 epochs
	completed  350  /  500 epochs
	completed  400  /  500 epochs
	completed  450  /  500 epochs
Fri Sep  3 11:46:04 2021 Finished embedding


## `word2vec` Embeddings

In [16]:
%%time

# Train using "infinite" window, which emulates itembed behaviour
# Dimension, min_count, negative count and seed mirror the itembed settings
# NOTE(review): `size` and `iter` are the gensim 3.x keyword names; gensim 4
# renamed them to `vector_size` and `epochs` — confirm the installed version
model = Word2Vec(
    train_itemsets,
    size=num_dimension,
    window=999999,
    min_count=min_count,
    sg=1,
    negative=num_negative,
    iter=num_epochs_word2vec,
    workers=1,
    seed=42,
)

Wall time: 11.9 s


In [17]:
# The model re-shuffles the vocabulary, so we need to remap: each word vector
# is copied into the row that the label occupies in the itembed `labels` order
# Labels absent from the word2vec vocabulary keep an all-zero row
# NOTE(review): `model.wv.vocab` is the gensim 3.x API; gensim 4 exposes
# `model.wv.key_to_index` instead — confirm the installed version
syn = np.zeros((num_label, num_dimension), dtype=np.float32)
for i, word in enumerate(labels):
    vocab = model.wv.vocab.get(word)
    if vocab is not None:
        index = vocab.index
        syn[i] = model.wv.vectors[index]

In [18]:
# Keep the final embeddings under a method-specific name
# (same array object as `syn`, not a copy)
word2vec_syn = syn

In [19]:
# Show the UMAP projection of the word2vec embeddings; rows follow the
# itembed label order, so the hover labels used by plot() still line up
plot(word2vec_syn)

UMAP(angular_rp_forest=True, dens_frac=0.0, dens_lambda=0.0, metric='cosine',
     random_state=42, verbose=1)
Construct fuzzy simplicial set
Fri Sep  3 11:46:22 2021 Finding Nearest Neighbors
Fri Sep  3 11:46:22 2021 Finished Nearest Neighbor Search
Disconnection_distance = 1 has removed 15936 edges.  This is not a problem as no vertices were disconnected.
Fri Sep  3 11:46:22 2021 Construct embedding
	completed  0  /  500 epochs
	completed  50  /  500 epochs
	completed  100  /  500 epochs
	completed  150  /  500 epochs
	completed  200  /  500 epochs
	completed  250  /  500 epochs
	completed  300  /  500 epochs
	completed  350  /  500 epochs
	completed  400  /  500 epochs
	completed  450  /  500 epochs
Fri Sep  3 11:46:29 2021 Finished embedding


## Evaluation

In [20]:
# Helper to convert itemsets into a sparse matrix
def to_sparse(itemsets):
    """Encode itemsets as a sparse (itemset x label) float32 matrix.

    Each known item contributes 1.0 at ``(itemset position, label index)``;
    items missing from the notebook-level ``label_map`` are ignored, and an
    item repeated inside one itemset accumulates (duplicates are summed when
    the CSR matrix is built).

    Parameters
    ----------
    itemsets : sequence of iterables
        One iterable of item labels per row.

    Returns
    -------
    scipy.sparse.csr_matrix
        Matrix of shape ``(len(itemsets), num_label)`` with dtype float32.
    """
    # Collect (row, column) coordinates for every item present in the vocabulary
    coords = [
        (row, label_map[item])
        for row, itemset in enumerate(itemsets)
        for item in itemset
        if item in label_map
    ]
    row_idx = [row for row, _ in coords]
    col_idx = [col for _, col in coords]

    # Every occurrence carries unit weight
    weights = [1.0] * len(coords)

    return csr_matrix(
        (weights, (row_idx, col_idx)),
        shape=(len(itemsets), num_label),
        dtype=np.float32,
    )

# Convert train and test sets; both are encoded against the vocabulary built
# from the training itemsets, so test ingredients outside it are dropped
train_matrix = to_sparse(train_itemsets)
test_matrix = to_sparse(test_itemsets)

In [21]:
# Helper to train and evaluate a simple model
def evaluate(syn):
    """Fit a logistic-regression cuisine classifier on recipe embeddings and score it.

    A recipe is embedded as its sparse row times ``syn``, which adds up the
    vectors of its ingredients. Rows are not divided by the ingredient count,
    so this is a sum rather than a mean.

    Reads the notebook-level ``train_matrix``, ``test_matrix``, ``train_df``
    and ``test_df``.

    Parameters
    ----------
    syn : array
        Embedding matrix multiplied on the right of the sparse recipe matrices.

    Returns
    -------
    tuple
        ``(train_score, test_score, clf)``: the classifier's ``score`` on the
        train and test features, followed by the fitted classifier itself.
    """
    # Targets: the cuisine of each recipe
    y_train, y_test = train_df["cuisine"], test_df["cuisine"]

    # Features: summed ingredient embeddings per recipe
    X_train, X_test = train_matrix @ syn, test_matrix @ syn

    # Plain logistic regression, allowed up to 300 solver iterations
    classifier = LogisticRegression(max_iter=300)
    classifier.fit(X_train, y_train)

    # Scores on both splits; the classifier is handed back too, just in case
    return (
        classifier.score(X_train, y_train),
        classifier.score(X_test, y_test),
        classifier,
    )

In [22]:
# Evaluate
# Same classifier setup for both embedding sets; the fitted classifiers are discarded
itembed_train_score, itembed_test_score, _ = evaluate(itembed_syn)
word2vec_train_score, word2vec_test_score, _ = evaluate(word2vec_syn)

In [23]:
# Report the train/test scores of both methods as percentages, one per line
print(
    "itembed:",
    f"  train: {100 * itembed_train_score:.2f}%",
    f"  test: {100 * itembed_test_score:.2f}%",
    "word2vec:",
    f"  train: {100 * word2vec_train_score:.2f}%",
    f"  test: {100 * word2vec_test_score:.2f}%",
    sep="\n",
)

itembed:
  train: 73.72%
  test: 71.97%
word2vec:
  train: 73.08%
  test: 71.32%
