## Word2Vec (Word embedding)

This notebook implements the Word2Vec algorithm with TensorFlow 2.0 to compute vector representations of words. The model is trained on a small chunk of Wikipedia articles.

In [1]:
from __future__ import division, print_function, absolute_import

import collections 
import os
import random
import urllib.request
import zipfile

import numpy as np
import tensorflow as tf

In [2]:
# Training parameters
# NOTE(review): display_step and eval_step look like step intervals for
# progress output and evaluation; the training loop is not in this cell - confirm.
learning_rate = 0.1
batch_size = 128
num_steps = 3000000
display_step = 10000
eval_step = 200000

# Evaluation parameters
# NOTE(review): presumably the words inspected during evaluation - confirm against the evaluation code.
eval_words = ['five', 'of', 'going', 'hardware', 'american', 'britain']

# Word2vec parameters
embedding_size = 200 # Dimension of the embedding vector
max_vocabulary_size = 50000 # Upper bound on the number of different words in the vocabulary ('UNK' included)
min_occurrence = 10 # Remove all words that appear fewer than n times
# Stopwords could be removed as well; this is not done for now.
skip_window = 3 # How many words to consider to the left and right
num_skips = 2 # Number of times to reuse an input to generate a label
num_sampled = 64 # Number of negative examples to sample

In [4]:
# Download a small chunk of Wikipedia article collections (the text8 archive).
url = 'http://mattmahoney.net/dc/text8.zip'
data_path = 'text8.zip'
# Skip the download when the archive is already on disk, so re-running this
# cell does not fetch it again.
if not os.path.exists(data_path):
    print("Downloading the dataset... (It may take some time)")
    # urlretrieve writes the archive to data_path; its return value is not needed.
    urllib.request.urlretrieve(url, data_path)
    print("Done!")
# Unzip the dataset file. Text has already been processed.
with zipfile.ZipFile(data_path) as f:
    # ZipFile.read() returns bytes. Decode to str so the words have the same
    # type as the str literals used elsewhere in the notebook (eval_words and
    # the 'UNK' token); bytes and str never compare equal as dictionary keys.
    text_words = f.read(f.namelist()[0]).decode('utf-8').lower().split()

Downloading the dataset... (It may take some time)
Done!


In [None]:
# Build the dictionary and replace the rare words with the UNK token.
# 'UNK' takes the first slot with a placeholder count of -1. It is added
# before filtering so it is never removed, even when every word is rarer
# than 'min_occurrence'.
count = [('UNK', -1)]
# Retrieve the most common words (one vocabulary slot is reserved for 'UNK')
# and keep only those with at least 'min_occurrence' occurrences.
# most_common() is ordered from most to least frequent, so this filter drops
# exactly the rare tail of the list.
count.extend(
    entry
    for entry in collections.Counter(text_words).most_common(max_vocabulary_size - 1)
    if entry[1] >= min_occurrence
)