# Online Retail Transactions

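This notebook shows how to train dense vector representations (embeddings) of products from the transactions in which they appear together, and how to visualize the result in 2-D with UMAP. See [this README](./data/retail/README.md) for more information about the dataset.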

In [1]:
import numpy as np

import pandas as pd

import umap

from bokeh.plotting import ColumnDataSource, figure, show
from bokeh.io import output_notebook

from itembed import (
    pack_itemsets,
    initialize_syn,
    UnsupervisedTask,
    train,
    normalize,
)

In [2]:
# Initialize Bokeh so that the plot shown at the end of the notebook is
# rendered inline in the cell output
output_notebook()

In [3]:
# Make it reproducible: seed NumPy's global random number generator.
# NOTE(review): this only affects code that draws from np.random's global
# state; confirm that itembed's initialization and training rely on it.
np.random.seed(42)

In [4]:
# Read the CSV dumps: the transaction table, and the "name" column of the
# product-name file (kept as a pandas Series)
transaction_df = pd.read_csv("./data/retail/transaction.csv")
names = pd.read_csv("./data/retail/name.csv").loc[:, "name"]

In [5]:
# Turn each transaction's ";"-separated "Products" field into a list of
# integer product IDs, giving a list of lists of int
itemsets = (
    transaction_df["Products"]
    .str.split(";")
    .map(lambda ids: [int(pid) for pid in ids])
    .tolist()
)

In [6]:
# Pack the itemsets into contiguous arrays (filtered with min_count=5 and
# min_length=2), then replace the kept labels by their product names
labels, indices, offsets = pack_itemsets(itemsets, min_length=2, min_count=5)
labels = names.loc[labels]
num_label = len(labels)

In [7]:
# Initialize the two embedding sets from uniform distribution: syn0 first,
# then syn1, each built for num_label labels and num_dimension dimensions
num_dimension = 64
syn0, syn1 = (initialize_syn(num_label, num_dimension) for _ in range(2))

In [8]:
# Define unsupervised task, i.e. using co-occurrences.
# It is built from the packed itemsets (indices, offsets) and both embedding
# sets (syn0, syn1).
# NOTE(review): num_negative=5 presumably sets the number of negative samples
# drawn per positive pair; confirm against the itembed documentation.
task = UnsupervisedTask(indices, offsets, syn0, syn1, num_negative=5)

In [9]:
# Do training for 100 epochs.
# The return value is not used: the embeddings are read back from syn0
# afterwards, so train presumably updates the task's arrays in place.
# Note: due to a different sampling strategy, more epochs than word2vec are needed
train(task, num_epoch=100)

100%|███████████████████████████████████████████████████████████████████████████| 62900/62900 [01:30<00:00, 695.51it/s]


In [10]:
# Both embedding sets are equivalent, just choose one of them;
# syn is the matrix that gets projected and plotted
syn = syn0

In [11]:
# Project with UMAP, using cosine similarity measure.
# random_state=42 pins UMAP's randomness so the layout is repeatable, and
# verbose=1 prints the progress log shown below the cell.
# Columns 0 and 1 of the resulting projection are used as plot coordinates.
model = umap.UMAP(metric="cosine", verbose=1, random_state=42)
projection = model.fit_transform(syn)

UMAP(angular_rp_forest=True, dens_frac=0.0, dens_lambda=0.0, metric='cosine',
     random_state=42, verbose=1)
Construct fuzzy simplicial set
Fri Sep  3 11:42:08 2021 Finding Nearest Neighbors
Fri Sep  3 11:42:08 2021 Building RP forest with 8 trees
Fri Sep  3 11:42:09 2021 NN descent for 12 iterations
	 1  /  12
	 2  /  12
	 3  /  12
	 4  /  12
	 5  /  12
	Stopping threshold met -- exiting after 5 iterations
Fri Sep  3 11:42:25 2021 Finished Nearest Neighbor Search
Fri Sep  3 11:42:28 2021 Construct embedding
	completed  0  /  500 epochs
	completed  50  /  500 epochs
	completed  100  /  500 epochs
	completed  150  /  500 epochs
	completed  200  /  500 epochs
	completed  250  /  500 epochs
	completed  300  /  500 epochs
	completed  350  /  500 epochs
	completed  400  /  500 epochs
	completed  450  /  500 epochs
Fri Sep  3 11:42:38 2021 Finished embedding


In [12]:
# Bundle the two projected coordinates and the product names into a Bokeh
# data source
source = ColumnDataSource(
    data={
        "x": projection[:, 0],
        "y": projection[:, 1],
        "label": labels,
    }
)

# Set up the figure; the hover tooltip displays the "label" column
p = figure(width=900, height=600, tooltips=[("label", "@label")])

# Draw one point per product
p.scatter("x", "y", source=source)

# Show in notebook
show(p)