# Online Retail Transactions

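This notebook shows how to train dense vector embeddings for products from the sets of products that appear together in retail transactions, using `itembed`, and how to visualize the result with UMAP and Bokeh. See [this README](./data/retail/README.md) for more information about the dataset.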

In [1]:
import numpy as np

import pandas as pd

import umap

from bokeh.plotting import ColumnDataSource, figure, show
from bokeh.io import output_notebook

from itembed import (
    pack_itemsets,
    initialize_syn,
    UnsupervisedTask,
    train,
    normalize,
)

In [2]:
# Initialize Bokeh so that plots passed to `show` later on are rendered
# inline in the notebook output instead of a separate page
output_notebook()

In [3]:
# Read the transaction table, then the product name table; only the "name"
# column of the latter is kept, as a pandas Series
transaction_df = pd.read_csv("./data/retail/transaction.csv")
name_df = pd.read_csv("./data/retail/name.csv")
names = name_df["name"]

In [4]:
# Turn each transaction's ";"-separated "Products" field into a list of
# integers, producing one plain Python list of int per transaction
itemsets = [
    [int(product_id) for product_id in products]
    for products in transaction_df["Products"].str.split(";")
]

In [5]:
# Pack the itemsets into contiguous arrays (`indices` and `offsets`); the
# first return value holds the retained item labels
# NOTE(review): min_count=5 presumably discards items seen fewer than 5 times
# -- confirm against the itembed documentation
kept_items, indices, offsets = pack_itemsets(itemsets, min_count=5)

# Look up the product name of every retained item
labels = names[kept_items]
num_label = len(labels)

In [6]:
# Create the two embedding sets, both sized from the number of labels and a
# 64-dimensional vector space (per the original note, values are drawn from a
# uniform distribution); syn0 is created first, then syn1
num_dimension = 64
syn0, syn1 = (initialize_syn(num_label, num_dimension) for _ in range(2))

In [7]:
# Build the unsupervised task (i.e. learning from co-occurrences) over the
# packed itemsets and both embedding sets, with num_negative set to 5
task = UnsupervisedTask(
    indices,
    offsets,
    syn0,
    syn1,
    num_negative=5,
)

In [8]:
# Train the task for a fixed number of epochs
# Note: due to a different sampling strategy, more epochs than word2vec are needed
num_epoch = 100
train(task, num_epoch=num_epoch)

100%|███████████████████████████████████████████████████████████████████████████| 62900/62900 [01:42<00:00, 613.42it/s]


In [9]:
# Select the embedding set used for the projection below.
# Per the original note, both embedding sets (syn0 and syn1) are considered
# equivalent, so either one could be chosen; syn0 is used here.
syn = syn0

In [10]:
# Project the embeddings to 2D with UMAP, using the cosine similarity measure.
# UMAP is stochastic: a fixed random_state makes the projection repeatable for
# a given `syn`. The training step of this notebook is not seeded, so `syn`
# itself (and therefore the final layout) may still vary between full runs.
model = umap.UMAP(metric="cosine", random_state=42, verbose=1)
projection = model.fit_transform(syn)

UMAP(a=None, angular_rp_forest=True, b=None,
     force_approximation_algorithm=False, init='spectral', learning_rate=1.0,
     local_connectivity=1.0, low_memory=False, metric='cosine',
     metric_kwds=None, min_dist=0.1, n_components=2, n_epochs=None,
     n_neighbors=15, negative_sample_rate=5, output_metric='euclidean',
     output_metric_kwds=None, random_state=None, repulsion_strength=1.0,
     set_op_mix_ratio=1.0, spread=1.0, target_metric='categorical',
     target_metric_kwds=None, target_n_neighbors=-1, target_weight=0.5,
     transform_queue_size=4.0, transform_seed=42, unique=False, verbose=1)
Construct fuzzy simplicial set
Thu Nov 12 12:09:40 2020 Finding Nearest Neighbors
Thu Nov 12 12:09:40 2020 Building RP forest with 8 trees
Thu Nov 12 12:09:41 2020 NN descent for 12 iterations
	 0  /  12
	 1  /  12
	 2  /  12
	 3  /  12
	 4  /  12
Thu Nov 12 12:09:53 2020 Finished Nearest Neighbor Search
Thu Nov 12 12:09:56 2020 Construct embedding
	completed  0  /  500 epochs
	comp

In [11]:
# Pack the 2D coordinates and the product names as a Bokeh data source
source = ColumnDataSource(data=dict(
    x=projection[:, 0],
    y=projection[:, 1],
    label=labels,
))

# Create plot; the title and axis labels let the figure stand on its own
# when the notebook is skimmed
p = figure(
    title="UMAP projection of product embeddings",
    x_axis_label="UMAP 1",
    y_axis_label="UMAP 2",
    width=900,
    height=600,
    tooltips=[
        ("label", "@label"),
    ],
)

# Draw each product as a point; hovering over it shows its name
p.scatter(
    "x", "y",
    source=source,
)

# Show in notebook
show(p)