## Downloading the Data

In [3]:
import urllib
import urllib.request
import os

In [4]:
HOME_DIR = './'
DATA_DIR = os.path.join(HOME_DIR, 'data')

In [5]:
def download_file(url, target_dir):
    target_file = os.path.join(target_dir, os.path.basename(url))
    urllib.request.urlretrieve(url, target_file)
    assert os.path.isfile(target_file), "File not succesfully downloaded"

In [6]:
download_file("https://raw.githubusercontent.com/jhlau/doc2vec/master/toy_data/train_docs.txt", DATA_DIR)

In [7]:
download_file("https://raw.githubusercontent.com/jhlau/doc2vec/master/toy_data/test_docs.txt", DATA_DIR)

## doc2vec training 

In this part, we will train a Paragraph Vectors / doc2vec model using gensim. You can find information on the gensim doc2vec api here: https://radimrehurek.com/gensim/models/doc2vec.html

N.B. You should be using Python 3 for this.

The data folder contains a train and test set with small sets of documents from the "20 newsgroups" dataset.

What we're going to do is the following:
* Read a dataset with documents
* Transform each document into a list of tokens (words)
* Train a doc2vec model (DM)
* Train a second model (DBOW)
* Inspect the outcomes a bit

In [14]:
import os
from gensim.models import doc2vec
from gensim.utils import simple_preprocess

In [15]:
# generic settings
HOMEDIR = './'
CORPUS_FILE = os.path.join(HOMEDIR, "data/train_docs.txt")

# file names for the models we'll be creating
MODEL_FILE_DM = os.path.join(HOMEDIR, "models/doc2vec_DM_v20210404.bin")
MODEL_FILE_DBOW = os.path.join(HOMEDIR, "models/doc2vec_DBOW_v20210404.bin")

In [16]:
pwd

'/Users/sara/Google Drive/doc2vec-workshop-master'

**Read the corpus. Each line is a document / paragraph. Optionally preprocess it first.**

In [17]:
flg_preprocess = False

if flg_preprocess:
    # quick & simple approach
    docs = doc2vec.TaggedLineDocument(CORPUS_FILE)
else:
    # with pre-processing
    with open(CORPUS_FILE, 'r', encoding='utf-8') as f:
        lines = f.readlines()
        docs = [simple_preprocess(line, deacc=False, min_len=1) for line in lines]
        docs = [doc2vec.TaggedDocument(doc, tags=[i]) for i, doc in enumerate(docs)]

In [18]:
# have a look at the data
docs[0]

TaggedDocument(words=['anarchism', 'is', 'a', 'political', 'philosophy', 'that', 'advocates', 'self', 'governed', 'societies', 'with', 'voluntary', 'institutions', 'these', 'are', 'often', 'described', 'as', 'stateless', 'societies', 'but', 'several', 'authors', 'have', 'defined', 'them', 'more', 'specifically', 'as', 'institutions', 'based', 'on', 'non', 'hierarchical', 'free', 'associations', 'anarchism', 'holds', 'the', 'state', 'to', 'be', 'undesirable', 'unnecessary', 'or', 'harmful', 'while', 'anti', 'statism', 'is', 'central', 'anarchism', 'entails', 'opposing', 'authority', 'or', 'hierarchical', 'organisation', 'in', 'the', 'conduct', 'of', 'human', 'relations', 'including', 'but', 'not', 'limited', 'to', 'the', 'state', 'system'], tags=[0])

In [19]:
len(docs)

1000

## Training a DM (Distributed Memory) model

In [20]:
# train DM model
model_dm = doc2vec.Doc2Vec(docs, 
                          # size=200, # vector size, should be the same size as pre-trained embedding size when not using dm_concat
                           window=10, # window size for word context, on each side
                           min_count=1, # minimum nr. of occurrences of a word
                           sample=1e-5, # threshold for undersampling high-frequency words
                           workers=4, # for multicore processing
                           hs=0, # if 1, use hierarchical softmax; if 0, use negative sampling
                           dm=1, # if 1 use PV-DM, if 0 use PM-DBOW
                           negative=5, # how many words to use for negative sampling
                           dbow_words=1, # train word vectors
                           dm_concat=1, # concatenate vectors or sum/average them?
                        #   iter=100 # nr of epochs to train
                          )

In [21]:
# save it for later use
model_dm.save(MODEL_FILE_DM)

## Training a DBOW (Distributed Bag Of Words) model

In [36]:
# train DBOW model
model_dbow = doc2vec.Doc2Vec(docs, 
                          #  size=200, # vector size, should be the same size as pre-trained embedding size when not using dm_concat
                            window=10, # window size for word context, on each side
                            min_count=1, # minimum nr. of occurrences of a word
                            sample=1e-5, # threshold for undersampling high-frequency words
                            workers=4, # for multicore processing
                            hs=0, # if 1, use hierarchical softmax; if 0, use negative sampling
                            dm=0, # if 1 use PV-DM, if 0 use PM-DBOW
                            negative=5, # how many words to use for negative sampling
                            dbow_words=1, # train word vectors
                       #     iter=100 # nr of epochs to train
                            )

In [37]:
# also save this one
model_dbow.save(MODEL_FILE_DBOW)

## **Question: Look at the model files that are now created in the models directory. Can you explain why there are 2 files for the DM model, but only 1 for the DBOW model?**

In [38]:
def show_most_similar(model, docs, ref_doc_id):
    """
    For a given document, display the most similar ones in the corpus
    """
    def print_doc(doc_id):
        doc_txt = ' '.join(docs[doc_id].words)
        print("[Doc {}]: {}".format(doc_id, doc_txt))
        
    print("[Original document]")
    print_doc(ref_doc_id)
    print("\n[Most similar documents]")
    for doc_id, similarity in model.docvecs.most_similar(ref_doc_id, topn=3):
        print("-----------------")
        print("similarity: {}".format(similarity))
        print_doc(doc_id)


In [39]:
show_most_similar(model_dbow, list(docs), 200)

[Original document]
[Doc 200]: single scattering albedo is used to define scattering of electromagnetic waves on small particles it depends on properties of the material lrb refractive index rrb the size of the particle or particles and the wavelength of the incoming radiation

[Most similar documents]
-----------------
similarity: 0.9979395270347595
[Doc 250]: a company of cavalry soldiers from huntsville alabama joined nathan bedford forrest s battalion in hopkinsville kentucky the company wore new uniforms with yellow trim on the sleeves collar and coat tails this led to them being greeted with yellowhammer and the name later was applied to all alabama troops in the confederate army
-----------------
similarity: 0.9979302883148193
[Doc 335]: many commercial technology companies are headquartered in huntsville such as the network access company adtran computer graphics company intergraph design and manufacturer of it infrastructure avocent and provider deltacom cinram manufactures an

  for doc_id, similarity in model.docvecs.most_similar(ref_doc_id, topn=3):


## Prediction phase

In [40]:
test_data_file = os.path.join(HOMEDIR, "data/test_docs.txt")

In [41]:
# read test data: each line into a list of tokens
with open(test_data_file, "r") as f:
    test_docs = [ x.strip().split() for x in f.readlines() ]

In [42]:
# inference hyper-parameters
start_alpha=0.01
infer_epoch=1000

Create the embeddings for the test documents. Remember: this is an inference step that actually trains a network.

In [43]:
test_docvecs = [model_dm.infer_vector(d, alpha=start_alpha, steps=infer_epoch) for d in test_docs]

In [44]:
# see what one document embedding looks like
test_docvecs[0]

array([-3.6078885e-03,  2.4099608e-03, -1.5234356e-03, -3.7185373e-03,
        3.6129707e-03, -2.1931138e-03, -3.5841845e-03, -2.5726026e-03,
        2.8449746e-03, -3.8756663e-03,  1.0904120e-03, -7.3614280e-04,
        2.6318184e-03, -1.9826654e-04,  3.0851443e-04, -4.2312625e-03,
        2.0832699e-03,  3.5922434e-03,  5.5884646e-04, -3.6848725e-03,
       -4.5044329e-03,  4.6895659e-03, -2.0833074e-03,  1.4661247e-03,
       -2.6929993e-03,  4.3324786e-03, -3.1219765e-03,  1.4456222e-03,
        2.0738251e-03,  1.2590468e-03,  1.5912824e-03, -3.4159571e-03,
        3.1452572e-03, -1.7045770e-03,  2.6687214e-03,  2.7725513e-03,
        4.4544777e-03, -1.6557779e-04, -2.5399209e-03,  2.8612791e-03,
       -1.9347386e-03, -4.5050815e-04,  3.1429667e-03,  1.3073338e-03,
       -1.3034290e-04, -6.8750128e-04, -2.5468534e-03, -4.9226894e-03,
        4.6723452e-03, -4.7310749e-03, -1.7769994e-03,  3.2929475e-03,
       -3.8980865e-03,  4.3779421e-03, -2.9601860e-03, -3.4347062e-03,
      

# Evaluating a doc2vec model

What we're going to do in this exercise:
* load a pre-trained doc2vec model
* use it to infer document embeddings for our test set
* cluster the documents based on the embeddings cosine distances
* use t-SNE to visualize the data

In [45]:
import os
import numpy as np
from gensim.models import doc2vec
from gensim.utils import simple_preprocess
from nltk.cluster import kmeans
from nltk.cluster import util
import collections

In [46]:
# generic settings
HOMEDIR = './'

In [47]:
CORPUS_FILE = os.path.join(HOMEDIR, "data/train_docs.txt")
MODEL_FILE_DM = os.path.join(HOMEDIR, "models/doc2vec_DM_v20210404.bin")
MODEL_FILE_DBOW = os.path.join(HOMEDIR, "models/doc2vec_DBOW_v20210404.bin")

NUM_CLUSTERS = 20  # yes, you can change this

## Read corpus file and parse into token lists

In [48]:
with open(CORPUS_FILE, 'r', encoding='utf-8') as f:
    lines = f.readlines()
    docs = [simple_preprocess(line, deacc=False, min_len=1) for line in lines]

# Read existing model and use it to derive document embeddings

In [49]:
# load pre-trained model
model = doc2vec.Doc2Vec.load(MODEL_FILE_DM)  # DM model chosen by default
#model.delete_temporary_training_data(keep_doctags_vectors=True, keep_inference=True)  # only keep what we need

# Combine DM and DBOW models
The authors of the paper suggest that combining the DM and the DBOW model works better than any single one. Do this by concatenating (you could also try to averaging or summing) the embeddings from both models.

In [50]:
model_dm = doc2vec.Doc2Vec.load(MODEL_FILE_DM)
model_dbow = doc2vec.Doc2Vec.load(MODEL_FILE_DBOW)

docvecs_dm = [model_dm.infer_vector(d, alpha=0.01, steps=1000) for d in docs]
docvecs_dbow = [model_dbow.infer_vector(d, alpha=0.01, steps=1000) for d in docs]

docvecs = [docvecs_dm[i] + docvecs_dbow[i] for i, d in enumerate(docs)]

In [51]:
# infer document vectors
docvecs = [model.infer_vector(d, alpha=0.01, steps=1000) for d in docs]

# Now we have document vectors, start clustering

In [52]:
clusterer = kmeans.KMeansClusterer(NUM_CLUSTERS, distance=util.cosine_distance, repeats=3)

In [53]:
cluster_assignments = clusterer.cluster(docvecs, assign_clusters=True)

In [54]:
# how many documents per cluster?
collections.Counter(cluster_assignments)

Counter({17: 52,
         12: 43,
         2: 41,
         16: 90,
         13: 47,
         7: 55,
         6: 57,
         8: 49,
         1: 39,
         5: 40,
         15: 51,
         18: 46,
         11: 64,
         14: 41,
         0: 51,
         10: 53,
         9: 50,
         19: 43,
         3: 49,
         4: 39})

In [55]:
def get_documents_in_cluster(cluster_idx):
    return [doc for i, doc in enumerate(docs) if cluster_assignments[i] == cluster_idx]

In [56]:
def get_document_topics(doc_vec, topic_vecs):
    """
    For a given document, give the topic distribution (softmax probabilities for all topics)
    """
    similarities = [np.dot(doc_vec, topic_vec) for topic_vec in topic_vecs]
    return np.exp(similarities) / np.sum(np.exp(similarities))

You can define the topics as the cluster centroids. Then find the nearest-neighbor words to describe the topic.

In [57]:
topic_vecs = clusterer.means()

# Visualize topics using t-SNE
What we're going to do now:
* reduce 100-dim vector space to 2 dimensions
* plot all documents in this 2D space
* use color to show the clustering
* inspect how close / afar certain documents are

In [58]:
from sklearn.manifold import TSNE
import bokeh.plotting as bp
from bokeh.models import HoverTool
from bokeh.io import push_notebook, output_notebook, show

In [59]:
docs_tsne = TSNE(n_components=2, perplexity=30, init='pca').fit_transform(docvecs)
docs_tsne.shape

(1000, 2)

In [60]:
# create matrix with topic proportion per doc per topic
doc_topic_matrix = [get_document_topics(docvec, topic_vecs) for docvec in docvecs]
# select highest topic prob
prob_max_topic = np.max(doc_topic_matrix, axis=1)

In [61]:
# 20 colors
colormap = np.array([
    "#1f77b4", "#aec7e8", "#ff7f0e", "#ffbb78", "#2ca02c",
    "#98df8a", "#d62728", "#ff9896", "#9467bd", "#c5b0d5",
    "#8c564b", "#c49c94", "#e377c2", "#f7b6d2", "#7f7f7f",
    "#c7c7c7", "#bcbd22", "#dbdb8d", "#17becf", "#9edae5"
])

In [62]:
sourcedata = {
    'x': docs_tsne[:, 0],
    'y': docs_tsne[:, 1],
    'color': colormap[cluster_assignments],
    'alpha': prob_max_topic * 50,
    'content': lines,
    'topic_key': cluster_assignments
}

# Make and show the plot

In [63]:
tsne_plot = bp.figure(plot_width=1600, plot_height=900,
                      title="Topics",
                      tools="pan,wheel_zoom,box_zoom,reset,hover,save",
                      x_axis_type=None, y_axis_type=None, min_border=1)

tsne_plot.scatter(x='x', 
                  y='y',
                  color='color',
                  size='alpha',
                  #size=10,
                  source=bp.ColumnDataSource(sourcedata)
                 )

# add hover tooltips
hover = tsne_plot.select(dict(type=HoverTool))
hover.tooltips = {"content": "@content - topic: @topic_key"}

show(tsne_plot)