# Stack Overflow Tags

This notebook shows how to train dense vectors for question tags.

The dataset has been extracted using the [Stack Exchange Data Explorer](https://data.stackexchange.com/), and is released under [CC BY-SA 4.0](http://creativecommons.org/licenses/by-sa/4.0/). The first one million questions with at least 4 tags were extracted:

```sql
SELECT Id, Tags
FROM Posts
WHERE LEN(Tags) - LEN(REPLACE(Tags, '<', '')) >= 4
ORDER BY Id
```

In [1]:
import numpy as np

import pandas as pd

import umap

from bokeh.plotting import ColumnDataSource, figure, show
from bokeh.io import output_notebook

from itembed import (
    pack_itemsets,
    initialize_syn,
    UnsupervisedTask,
    train,
    normalize,
)

  import numba.targets


In [2]:
# Initialize Bokeh
output_notebook()

In [3]:
# Load raw dataset
tag_df = pd.read_csv('stackoverflow.csv')
tag_df.head(10)

Unnamed: 0,id,tags
0,4,c#;floating-point;type-conversion;double;decimal
1,11,c#;datetime;time;datediff;relative-time-span
2,13,html;browser;timezone;user-agent;timezone-offset
3,16,c#;linq;web-services;.net-3.5
4,17,mysql;database;binary-data;data-storage
5,19,performance;algorithm;language-agnostic;unix;pi
6,25,c++;c;sockets;mainframe;zos
7,36,sql;sql-server;datatable;rdbms
8,39,c#;.net;vb.net;timer
9,42,php;plugins;architecture;hook


In [4]:
# Get tags as a list of list of string
itemsets = tag_df.tags.str.split(';').values

In [5]:
# Pack itemsets into contiguous arrays
labels, indices, offsets = pack_itemsets(itemsets, min_count=10)
num_label = len(labels)

In [6]:
# Initialize embeddings sets from uniform distribution
num_dimension = 64
syn0 = initialize_syn(num_label, num_dimension)
syn1 = initialize_syn(num_label, num_dimension)

In [7]:
# Define unsupervised task, i.e. using co-occurrences
task = UnsupervisedTask(indices, offsets, syn0, syn1, num_negative=5)

In [8]:
# Do training
# Note: due to a different sampling strategy, more epochs than word2vec are needed
train(task, num_epoch=100)

100%|██████████████████████████████████████████████████████████████████████| 1562400/1562400 [13:08<00:00, 1981.59it/s]


In [9]:
# Both embedding sets are equivalent, just choose one of them
syn = syn0

In [10]:
# Project with UMAP, using cosine similarity measure
model = umap.UMAP(metric='cosine', verbose=1)
projection = model.fit_transform(syn)

UMAP(a=None, angular_rp_forest=True, b=None,
   force_approximation_algorithm=False, init='spectral', learning_rate=1.0,
   local_connectivity=1.0, low_memory=False, metric='cosine',
   metric_kwds=None, min_dist=0.1, n_components=2, n_epochs=None,
   n_neighbors=15, negative_sample_rate=5, output_metric='euclidean',
   output_metric_kwds=None, random_state=None, repulsion_strength=1.0,
   set_op_mix_ratio=1.0, spread=1.0, target_metric='categorical',
   target_metric_kwds=None, target_n_neighbors=-1, target_weight=0.5,
   transform_queue_size=4.0, transform_seed=42, unique=False, verbose=1)
Construct fuzzy simplicial set
Fri May 22 11:28:13 2020 Finding Nearest Neighbors
Fri May 22 11:28:13 2020 Building RP forest with 11 trees
Fri May 22 11:28:17 2020 NN descent for 14 iterations
	 0  /  14
	 1  /  14
	 2  /  14
	 3  /  14
	 4  /  14
	 5  /  14
	 6  /  14
	 7  /  14
	 8  /  14
	 9  /  14
Fri May 22 11:28:34 2020 Finished Nearest Neighbor Search




Fri May 22 11:28:40 2020 Construct embedding
	completed  0  /  200 epochs
	completed  20  /  200 epochs
	completed  40  /  200 epochs
	completed  60  /  200 epochs
	completed  80  /  200 epochs
	completed  100  /  200 epochs
	completed  120  /  200 epochs
	completed  140  /  200 epochs
	completed  160  /  200 epochs
	completed  180  /  200 epochs
Fri May 22 11:28:52 2020 Finished embedding


In [11]:
# Pack as a Bokeh data source
source = ColumnDataSource(data=dict(
    x=projection[:, 0],
    y=projection[:, 1],
    label=labels,
))

# Create plot
p = figure(
    width=900,
    height=600,
    tooltips=[
        ('label', '@label'),
    ],
)

# Draw tags as points
p.scatter(
    'x', 'y',
    source=source,
)

# Show in notebook
show(p)

In [12]:
# Cosine similarity is equivalent to dot product with normalized vectors
syn_normalized = normalize(syn)

In [13]:
# Closest tags to apples
i = labels.index('word')
similarities = syn_normalized @ syn_normalized[i]
for j in np.argsort(-similarities)[:10]:
    print('#{} {}: {}'.format(j, labels[j], similarities[j]))

#2679 word: 1.0000001192092896
#13959 phrases: 0.7703588008880615
#7991 chars: 0.762029230594635
#11359 sentence: 0.7579058408737183
#4274 words: 0.7459167242050171
#11833 phrase: 0.7225632071495056
#7302 text-segmentation: 0.6989665031433105
#7988 punctuation: 0.692477822303772
#10785 word-frequency: 0.6828475594520569
#161 text: 0.6784793734550476
