## Word2Vec (Word embedding)

Implement the Word2Vec algorithm with TensorFlow 2.0 to compute vector representations of words. This example trains on a small chunk of Wikipedia articles.

In [5]:
from __future__ import division, print_function, absolute_import

import collections 
import os
import random
import urllib.request
import zipfile

import numpy as np
import tensorflow as tf

In [6]:
# --- Training parameters ---
learning_rate = 0.1
batch_size = 128
num_steps = 3000000
display_step = 10000
eval_step = 200000

# --- Evaluation parameters ---
# NOTE(review): these are str, while the tokens read from the zip archive
# appear as bytes (e.g. b'the') in the vocabulary output -- confirm that the
# evaluation code converts between the two before looking words up.
eval_words = [
    'five',
    'of',
    'going',
    'hardware',
    'american',
    'britain',
]

# --- Word2Vec parameters ---
# Dimension of the embedding vector.
embedding_size = 200
# Number of different words in the vocabulary.
max_vocabulary_size = 50000
# Remove all words that do not appear at least n times.
# (Stopwords could be removed as well; not done for now.)
min_occurrence = 10
# How many words to consider to the left and right.
skip_window = 3
# Number of times to reuse an input to generate a label.
num_skips = 2
# Number of negative examples to sample.
num_sampled = 64

In [7]:
# Download a small chunk of wikipedia article collections
url = 'http://mattmahoney.net/dc/text8.zip'
data_path = 'text8.zip'
if not os.path.exists(data_path):
    print("Downloading the dataset... (It may take some time)")
    # Download under a temporary name and rename only after the transfer has
    # completed. Writing straight to 'data_path' would leave a truncated
    # archive behind if the download is interrupted, and the existence check
    # above would then treat that corrupt file as a valid cached copy.
    partial_path = data_path + '.part'
    try:
        urllib.request.urlretrieve(url, partial_path)
        os.replace(partial_path, data_path)
    finally:
        # Only true when the download or rename failed: drop the leftovers.
        if os.path.exists(partial_path):
            os.remove(partial_path)
    print("Done!")
# Unzip the dataset file. Text has already been processed.
# ZipFile.read returns bytes, so 'text_words' is a list of bytes tokens
# (b'the', b'of', ...), lower-cased and split on whitespace.
with zipfile.ZipFile(data_path) as f:
    text_words = f.read(f.namelist()[0]).lower().split()

In [9]:
# Build the dictionary and replace the rare words with the UNK token.
# Index 0 is reserved for 'UNK'; its count is a placeholder (-1) until the
# corpus has been encoded further down.
count = [('UNK', -1)]
# Retrieve the most common words (one vocabulary slot is taken by 'UNK').
count.extend(collections.Counter(text_words).most_common(max_vocabulary_size - 1))

# Remove words with fewer than 'min_occurrence' occurrences.
# 'most_common' returns entries ordered from most to least frequent, so the
# rare words sit at the tail and pruning can stop at the first frequent one.
# The 'len(count) > 1' guard keeps the 'UNK' placeholder at index 0: its
# count of -1 is always below the threshold, so without the guard it would
# be popped as well whenever every real word is rare.
while len(count) > 1 and count[-1][1] < min_occurrence:
    count.pop()

# Compute the vocab size
vocabulary_size = len(count)
# Assign an id to each word (its position in the frequency-ordered list).
word2id = {word: i for i, (word, _) in enumerate(count)}

# Encode the corpus as ids; words missing from the dictionary map to 0 (UNK).
data = [word2id.get(word, 0) for word in text_words]
unk_count = data.count(0)
count[0] = ('UNK', unk_count)
# Reverse mapping, from id back to word.
id2word = {i: word for word, i in word2id.items()}

print("Words count:", len(text_words))
print("Unique words:", len(set(text_words)))
print("Vocabulary size:", vocabulary_size)
print("Most common words:", count[:10])


Words count: 17005207
Unique words: 253854
Vocabulary size: 47135
Most common words: [('UNK', 444176), (b'the', 1061396), (b'of', 593677), (b'and', 416629), (b'one', 411764), (b'in', 372201), (b'a', 325873), (b'to', 316376), (b'zero', 264975), (b'nine', 250430)]
