# Comparison with word2vec

This notebook evaluates both _itembed_ and _word2vec_ on a simple classification task. This is by no mean a definitive conclusion of which method is better-suited for this kind of problem.

In [52]:
import numpy as np

from scipy.sparse import csr_matrix

import pandas as pd

from itembed import (
    pack_itemsets,
    prune_itemsets,
    initialize_syn,
    CompoundTask,
    UnsupervisedTask,
    SupervisedTask,
    train,
    softmax,
    normalize,
)

from gensim.models import Word2Vec

from sklearn.linear_model import LogisticRegression

from bokeh.plotting import ColumnDataSource, figure, show
from bokeh.io import output_notebook

In [None]:
# Initialize Bokeh
output_notebook()

In [2]:
# Load raw dataset
recipe_df = pd.read_json("recipe_train.json")
recipe_df.head(10)

Unnamed: 0,id,cuisine,ingredients
0,10259,greek,"[romaine lettuce, black olives, grape tomatoes..."
1,25693,southern_us,"[plain flour, ground pepper, salt, tomatoes, g..."
2,20130,filipino,"[eggs, pepper, salt, mayonaise, cooking oil, g..."
3,22213,indian,"[water, vegetable oil, wheat, salt]"
4,13162,indian,"[black pepper, shallots, cornflour, cayenne pe..."
5,6602,jamaican,"[plain flour, sugar, butter, eggs, fresh ginge..."
6,42779,spanish,"[olive oil, salt, medium shrimp, pepper, garli..."
7,3735,italian,"[sugar, pistachio nuts, white almond bark, flo..."
8,16903,mexican,"[olive oil, purple onion, fresh pineapple, por..."
9,12734,italian,"[chopped tomatoes, fresh basil, garlic, extra-..."


In [None]:
# TODO split test set

In [3]:
# Get ingredients as a list of list of string
itemsets = recipe_df["ingredients"].values

In [4]:
# Pack itemsets into contiguous arrays
labels, indices, offsets = pack_itemsets(itemsets, min_count=min_count, min_length=2)
num_label = len(labels)
label_map = {label: i for i, label in enumerate(labels)}

In [11]:
# Both methods will use the same configuration, when applicable
num_dimension = 64
min_count = 5
num_negative = 5

In [17]:
# Helper to display scatter plot
def plot(syn):
    
    # Project with UMAP, using cosine similarity measure
    model = umap.UMAP(metric='cosine', verbose=1)
    projection = model.fit_transform(syn)
    
    # Pack as a Bokeh data source
    source = ColumnDataSource(data=dict(
        x=projection[:, 0],
        y=projection[:, 1],
        label=labels,
    ))

    # Create plot
    p = figure(
        width=900,
        height=600,
        tooltips=[
            ('label', '@label'),
        ],
    )

    # Draw tags as points
    p.scatter(
        'x', 'y',
        source=source,
    )

    # Show in notebook
    show(p)

In [22]:
# As word2vec does not sample pairs the same way, we need to estimate the equivalent number of epochs
lengths = np.array([len(itemset) for itemset in itemsets])

# itembed does a linear number of pairs per itemset (i.e. one per item)
itembed_pair_count = lengths.sum()
print(f"One epoch in itembed is {itembed_pair_count} pairs")

# word2vec does a quadratic number of pairs per itemset (i.e. all of them)
word2vec_pair_count = (lengths * (lengths - 1)).sum()
print(f"One epoch in word2vec is {word2vec_pair_count} pairs")

# Compute epoch count
factor = word2vec_pair_count / itembed_pair_count
num_epochs_word2vec = 5
num_epochs_itembed = int(num_epochs_word2vec * factor)
print(f"Hence, itembed needs {factor:.1f} times more epochs, i.e. {num_epochs_word2vec} vs {num_epochs_itembed}")

One epoch in itembed is 428275 pairs
One epoch in word2vec is 4963448 pairs
Hence, itembed needs 11.6 times more epochs, i.e. 5 vs 57


## itembed embeddings

In [None]:
# Initialize embeddings sets from uniform distribution
syn0 = initialize_syn(num_label, num_dimension)
syn1 = initialize_syn(num_label, num_dimension)

In [None]:
# Define unsupervised task, i.e. using co-occurrences
task = UnsupervisedTask(indices, offsets, syn0, syn1, num_negative=num_negative)

In [None]:
# Do training
# Note: due to a different sampling strategy, more epochs than word2vec are needed
train(task, num_epoch=num_epochs_itembed)

In [None]:
# Both embedding sets are equivalent, just choose one of them
itembed_syn = syn0

In [None]:
# Show projection
plot(itembed_syn)

## word2vec embeddings

In [12]:
# Train using "infinite" window, 
model = Word2Vec(
    itemsets,
    size=num_dimension,
    window=999999,
    min_count=min_count,
    sg=1,
    negative=num_negative,
    iter=num_epochs_word2vec,
    workers=4,
)

In [37]:
# The model re-shuffle the vocabulary, so we need to remap
syn = np.zeros((num_label, num_dimension), dtype=np.float32)
for i, word in enumerate(labels):
    vocab = model.wv.vocab.get(word)
    if vocab is not None:
        index = vocab.index
        syn[i] = model.wv.vectors[index]

In [38]:
# Keep the final embeddings
word2vec_syn = syn

In [None]:
# Show projection
plot(word2vec_syn)

## Evaluation

In [73]:
# Store itemsets as sparse matrix
rows = []
cols = []
for i, itemset in enumerate(itemsets):
    for item in itemset:
        j = label_map.get(item)
        if j is not None:
            rows.append(i)
            cols.append(j)
data = [1.0] * len(rows)
recipe = csr_matrix((data, (rows, cols)), shape=(len(itemsets), num_label), dtype=np.float32)

In [74]:
# Use mean ingredient embedding as formula embedding
X = recipe @ word2vec_syn

In [75]:
# Predict cuisine style based on ingredients
y = recipe_df["cuisine"]

In [76]:
# Use simple logistic regression
clf = LogisticRegression().fit(X, y)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [79]:
# TODO eval on test set
clf.score(X, y)

0.6453964901694574

In [80]:
clf.predict(X[:2])

array(['greek', 'southern_us'], dtype=object)

In [None]:
# TODO apply on both methods