# Stack Overflow tags

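This notebook shows how to train dense embedding vectors for Stack Overflow question tags with `itembed`, using only tag co-occurrences within questions. The trained vectors are then projected with UMAP, plotted with Bokeh, and queried for the tags most similar to a given tag. See [this README](./data/stackoverflow/README.md) for more information about the dataset.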

In [1]:
import numpy as np

import pandas as pd

import umap

from bokeh.plotting import ColumnDataSource, figure, show
from bokeh.io import output_notebook

from itembed import (
    pack_itemsets,
    initialize_syn,
    UnsupervisedTask,
    train,
    normalize,
)

In [2]:
# Initialize Bokeh: route its output to the notebook so that the plot
# passed to `show(...)` further down is rendered inline in the cell output
output_notebook()

In [3]:
# Fix the seed of NumPy's global random generator so that repeated runs
# draw the same random numbers
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

In [4]:
# Read the raw tag table from the CSV file, then preview its first ten rows
tags_csv_path = "./data/stackoverflow/tags.csv"
tag_df = pd.read_csv(tags_csv_path)
tag_df.head(10)

Unnamed: 0,id,tags
0,4,c#;floating-point;type-conversion;double;decimal
1,11,c#;datetime;time;datediff;relative-time-span
2,13,html;browser;timezone;user-agent;timezone-offset
3,16,c#;linq;web-services;.net-3.5
4,17,mysql;database;binary-data;data-storage
5,19,performance;algorithm;language-agnostic;unix;pi
6,25,c++;c;sockets;mainframe;zos
7,36,sql;sql-server;datatable;rdbms
8,39,c#;.net;vb.net;timer
9,42,php;plugins;architecture;hook


In [5]:
# Split every ";"-separated tag string into its list of tag names and
# expose the result as an array holding one list of strings per row
tag_lists = tag_df["tags"].str.split(";")
itemsets = tag_lists.to_numpy()

In [6]:
# Pack the itemsets into contiguous arrays (labels, indices, offsets).
# NOTE(review): presumably min_count / min_length filter out rare tags and
# too-short itemsets -- confirm against the itembed documentation
labels, indices, offsets = pack_itemsets(
    itemsets,
    min_count=10,
    min_length=2,
)

# Number of distinct labels returned by the packing step
num_label = len(labels)

In [7]:
# Create the two embedding sets, each sized by the number of labels and the
# embedding dimension (the original note states that they are initialized
# from a uniform distribution). syn0 is created first, then syn1.
num_dimension = 64
syn0, syn1 = (initialize_syn(num_label, num_dimension) for _ in range(2))

In [8]:
# Set up the unsupervised task, i.e. the one learning from co-occurrences,
# over the packed itemsets and both embedding sets
task = UnsupervisedTask(
    indices,
    offsets,
    syn0,
    syn1,
    num_negative=5,
)

In [9]:
# Run the training loop on the task defined above
# Note: due to a different sampling strategy, more epochs than word2vec are needed
num_epoch = 100
train(task, num_epoch=num_epoch)

100%|██████████████████████████████████████████████████████████████████████| 1562400/1562400 [07:12<00:00, 3612.82it/s]


In [10]:
# Select the embeddings used by the projection and similarity cells below.
# The original note states that both embedding sets (syn0 and syn1) are
# equivalent, so either one could be chosen.
# NOTE(review): confirm that equivalence against the itembed documentation
syn = syn0

In [11]:
# Project the embeddings with UMAP, comparing vectors with the cosine metric.
# random_state is fixed to the same value as the global seed and verbose=1
# makes UMAP log its progress.
model = umap.UMAP(
    metric="cosine",
    verbose=1,
    random_state=42,
)
projection = model.fit_transform(syn)

UMAP(angular_rp_forest=True, dens_frac=0.0, dens_lambda=0.0, metric='cosine',
     random_state=42, verbose=1)
Construct fuzzy simplicial set
Fri Sep  3 11:48:46 2021 Finding Nearest Neighbors
Fri Sep  3 11:48:46 2021 Building RP forest with 11 trees
Fri Sep  3 11:48:47 2021 NN descent for 14 iterations
	 1  /  14
	 2  /  14
	 3  /  14
	 4  /  14
	 5  /  14
	Stopping threshold met -- exiting after 5 iterations
Fri Sep  3 11:49:02 2021 Finished Nearest Neighbor Search
Fri Sep  3 11:49:04 2021 Construct embedding
	completed  0  /  200 epochs
	completed  20  /  200 epochs
	completed  40  /  200 epochs
	completed  60  /  200 epochs
	completed  80  /  200 epochs
	completed  100  /  200 epochs
	completed  120  /  200 epochs
	completed  140  /  200 epochs
	completed  160  /  200 epochs
	completed  180  /  200 epochs
Fri Sep  3 11:49:14 2021 Finished embedding


In [12]:
# Bundle the two projected coordinates and the tag names into a Bokeh data source
source = ColumnDataSource(
    data={
        "x": projection[:, 0],
        "y": projection[:, 1],
        "label": labels,
    }
)

# Create a 900x600 figure whose hover tooltip displays the tag name
hover_tooltips = [("label", "@label")]
p = figure(width=900, height=600, tooltips=hover_tooltips)

# Draw one scatter marker per tag, reading columns "x" and "y" from the source
p.scatter("x", "y", source=source)

# Show in notebook
show(p)

In [13]:
# Cosine similarity is equivalent to dot product with normalized vectors, so
# normalize the embeddings once here and use plain dot products afterwards.
# NOTE(review): presumably itembed's `normalize` rescales each embedding to
# unit length -- confirm against the itembed documentation
syn_normalized = normalize(syn)

In [14]:
# Closest tags to "word": score every tag by its dot product with the
# normalized vector of "word", then print the ten highest-scoring tags
# (the query tag itself ranks first)
i = labels.index("word")
similarities = syn_normalized @ syn_normalized[i]
top_matches = np.argsort(-similarities)[:10]  # negate to sort in descending order
for j in top_matches:
    print(f"#{j} {labels[j]}: {similarities[j]}")

#2679 word: 1.0
#13959 phrases: 0.8163633346557617
#4274 words: 0.7451512813568115
#11359 sentence: 0.7253150939941406
#11833 phrase: 0.7204039692878723
#9693 string-split: 0.7081019878387451
#10262 alphabet: 0.7034153342247009
#7991 chars: 0.7018148899078369
#2834 string-matching: 0.700367271900177
#6726 linguistics: 0.6950852274894714
