# Skip-gram word2vec

In [2]:
import io
import re
import string
import tqdm
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers

import warnings 
warnings.filterwarnings('ignore')

In [3]:
SEED = 2023
AUTOTUNE = tf.data.AUTOTUNE

## Intuition

In [4]:
sentence = 'The quick brown fox jumps over the lazy dog'

In [6]:
# tokenize and count vocab size
tokens = list(sentence.lower().split())
print(len(tokens))
print(tokens)

9
['the', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog']


Next, we map the words to numbers. the inverse vocab is dictionary of index as key and vocab as value

In [9]:
# Create a vocabulary ; tokens to integer indices:
vocab, index = {}, 1  # start indexing from 1
vocab['<pad>'] = 0  # add a padding token
for token in tokens:
  if token not in vocab:
    vocab[token] = index
    index += 1
vocab_size = len(vocab)
# Create an inverse vocabulary ; integer indices to tokens:
inverse_vocab = {index: token for token, index in vocab.items()}
print(vocab)
print(inverse_vocab)

{'<pad>': 0, 'the': 1, 'quick': 2, 'brown': 3, 'fox': 4, 'jumps': 5, 'over': 6, 'lazy': 7, 'dog': 8}
{0: '<pad>', 1: 'the', 2: 'quick', 3: 'brown', 4: 'fox', 5: 'jumps', 6: 'over', 7: 'lazy', 8: 'dog'}


In [10]:
example_sequence = [vocab[word] for word in tokens]
print(example_sequence)
print(tokens)

[1, 2, 3, 4, 5, 6, 1, 7, 8]
['the', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog']


The tf.keras.preprocessing.sequence module provides useful functions that simplify data preparation for word2vec. You can use the tf.keras.preprocessing.sequence.skipgrams to generate skip-gram pairs from the example_sequence with a given window_size from tokens in the range.

### positive sample generation

Using a window size of 2, we generate the list of all possible positive training samples given the example sentence.

In [11]:
window_size = 2
positive_skip_grams, _ = tf.keras.preprocessing.sequence.skipgrams(
                                                                    example_sequence,
                                                                    vocabulary_size=vocab_size, 
                                                                    window_size=window_size, # 2
                                                                    negative_samples=0 # library cannot correctly generate negative sample so we set at 0
                                                                   )
print(len(positive_skip_grams))

30


In [12]:
for target, context in positive_skip_grams[:10]:
  print(f"({target}, {context}): ({inverse_vocab[target]}, {inverse_vocab[context]})")

(5, 6): (jumps, over)
(7, 1): (lazy, the)
(1, 5): (the, jumps)
(6, 5): (over, jumps)
(3, 1): (brown, the)
(5, 1): (jumps, the)
(3, 2): (brown, quick)
(1, 8): (the, dog)
(6, 7): (over, lazy)
(7, 8): (lazy, dog)


### negative sample generation

The skipgrams function returns all positive skip-gram pairs by sliding over a given window span. To produce additional skip-gram pairs that would serve as negative samples for training, you need to sample random words from the vocabulary. Use the tf.random.log_uniform_candidate_sampler function to sample num_ns number of negative samples for a given target word in a window. You can call the function on one skip-grams's target word and pass the context word as true class to exclude it from being sampled.

In [15]:
# Get target and context words for one positive skip-gram.
target_word, context_word = positive_skip_grams[1]

# Set the number of negative samples per positive context.
num_ns = 4

context_class = tf.reshape(tf.constant(context_word, dtype="int64"), (1, 1))
negative_sampling_candidates, _, _ = tf.random.log_uniform_candidate_sampler(
    true_classes=context_class,  # class that should be sampled as 'positive'
    num_true=1,  # each positive skip-gram has 1 positive context class
    num_sampled=num_ns,  # number of negative context words to sample
    unique=True,  # all the negative samples should be unique
    range_max=vocab_size,  # pick index of the samples from [0, vocab_size]
    seed=SEED,  # seed for reproducibility
    name="negative_sampling"  # name of this operation
)
print(negative_sampling_candidates)
print([inverse_vocab[index.numpy()] for index in negative_sampling_candidates])

tf.Tensor([2 1 0 5], shape=(4,), dtype=int64)
['quick', 'the', '<pad>', 'jumps']


In [16]:
print(negative_sampling_candidates.shape)
print(context_class.shape)

(4,)
(1, 1)


In [17]:
# Add a dimension so you can use concatenation (in the next step).
negative_sampling_candidates = tf.expand_dims(negative_sampling_candidates, 1)

# Concatenate a positive context word with negative sampled words.
context = tf.concat([context_class, negative_sampling_candidates], 0)

# Label the first context word as `1` (positive) followed by `num_ns` `0`s (negative).
label = tf.constant([1] + [0]*num_ns, dtype="int64")

In [18]:
# Reshape the target to shape `(1,)` and context and label to `(num_ns+1,)`.
target = tf.squeeze(target_word)
context = tf.squeeze(context)
label = tf.squeeze(label)
print(f"target_index    : {target}")
print(f"target_word     : {inverse_vocab[target_word]}")
print(f"context_indices : {context}")
print(f"context_words   : {[inverse_vocab[c.numpy()] for c in context]}")
print(f"label           : {label}")

target_index    : 7
target_word     : lazy
context_indices : [1 2 1 0 5]
context_words   : ['the', 'quick', 'the', '<pad>', 'jumps']
label           : [1 0 0 0 0]


# Train Skip-gram model

In [19]:
# Generates skip-gram pairs with negative sampling for a list of sequences
# (int-encoded sentences) based on window size, number of negative samples
# and vocabulary size.
def generate_training_data(sequences, window_size, num_ns, vocab_size, seed):
  # Elements of each training example are appended to these lists.
  targets, contexts, labels = [], [], []

  # Build the sampling table for `vocab_size` tokens.
  sampling_table = tf.keras.preprocessing.sequence.make_sampling_table(vocab_size)

  # Iterate over all sequences (sentences) in the dataset.
  for sequence in tqdm.tqdm(sequences):

    # Generate positive skip-gram pairs for a sequence (sentence).
    positive_skip_grams, _ = tf.keras.preprocessing.sequence.skipgrams(
          sequence,
          vocabulary_size=vocab_size,
          sampling_table=sampling_table,
          window_size=window_size,
          negative_samples=0)

    # Iterate over each positive skip-gram pair to produce training examples
    # with a positive context word and negative samples.
    for target_word, context_word in positive_skip_grams:
      context_class = tf.expand_dims(
          tf.constant([context_word], dtype="int64"), 1)
      negative_sampling_candidates, _, _ = tf.random.log_uniform_candidate_sampler(
          true_classes=context_class,
          num_true=1,
          num_sampled=num_ns,
          unique=True,
          range_max=vocab_size,
          seed=seed,
          name="negative_sampling")

      # Build context and label vectors (for one target word)
      context = tf.concat([tf.squeeze(context_class,1), negative_sampling_candidates], 0)
      label = tf.constant([1] + [0]*num_ns, dtype="int64")

      # Append each element from the training example to global lists.
      targets.append(target_word)
      contexts.append(context)
      labels.append(label)

  return targets, contexts, labels

## Load data 
shakespeare from https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt

In [20]:
path_to_file = tf.keras.utils.get_file('shakespeare.txt', 'https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt')
text_ds = tf.data.TextLineDataset(path_to_file).filter(lambda x: tf.cast(tf.strings.length(x), bool))

Downloading data from https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt
Instructions for updating:
Lambda fuctions will be no more assumed to be used in the statement where they are used, or at least in the same block. https://github.com/tensorflow/tensorflow/issues/56089


data pre-processing

In [21]:
# lower case and remove punctuation.
def custom_standardization(input_data):
  lowercase = tf.strings.lower(input_data)
  return tf.strings.regex_replace(lowercase, '[%s]' % re.escape(string.punctuation), '')

# Define the vocab size and max sequence.
vocab_size = 4096
sequence_length = 10

# create vectorization layer to pre-process data
# normalize, split, and map strings to integers.
vectorize_layer = layers.TextVectorization(
                                            standardize=custom_standardization, # data prep
                                            max_tokens=vocab_size, # max tokens
                                            output_mode='int',
                                            output_sequence_length=sequence_length # pad all samples to the same length.
                                           )

In [22]:
# Call TextVectorization.adapt on the text dataset to create vocabulary.
# batch of 1024
vectorize_layer.adapt(text_ds.batch(1024))

In [23]:
# Save the created vocabulary for reference.
inverse_vocab = vectorize_layer.get_vocabulary()
print(len(inverse_vocab), 'words in vocab => ' ,inverse_vocab[:10])

4096 words in vocab =>  ['', '[UNK]', 'the', 'and', 'to', 'i', 'of', 'you', 'my', 'a']


In [24]:
# Vectorize the data in text_ds.
text_vector_ds = text_ds.batch(1024).prefetch(AUTOTUNE).map(vectorize_layer).unbatch()
sequences = list(text_vector_ds.as_numpy_iterator())
print(len(sequences))
for seq in sequences[:5]: # see first 5
  print(f"{seq} => {[inverse_vocab[i] for i in seq]}")

32777
[ 89 270   0   0   0   0   0   0   0   0] => ['first', 'citizen', '', '', '', '', '', '', '', '']
[138  36 982 144 673 125  16 106   0   0] => ['before', 'we', 'proceed', 'any', 'further', 'hear', 'me', 'speak', '', '']
[34  0  0  0  0  0  0  0  0  0] => ['all', '', '', '', '', '', '', '', '', '']
[106 106   0   0   0   0   0   0   0   0] => ['speak', 'speak', '', '', '', '', '', '', '', '']
[ 89 270   0   0   0   0   0   0   0   0] => ['first', 'citizen', '', '', '', '', '', '', '', '']


## Generate training examples
sequences is now a list of int encoded sentences. Just call the generate_training_data function defined earlier to generate training examples for the word2vec model. To recap, the function iterates over each word from each sequence to collect positive and negative context words. Length of target, contexts and labels should be the same, representing the total number of training examples.

In [25]:
targets, contexts, labels = generate_training_data(
                                                    sequences=sequences,
                                                    window_size=2,
                                                    num_ns=4,
                                                    vocab_size=vocab_size,
                                                    seed=SEED
                                                   )

100%|██████████| 32777/32777 [01:16<00:00, 430.59it/s]


In [26]:
targets = np.array(targets)
contexts = np.array(contexts)
labels = np.array(labels)

print('\n')
print(f"targets.shape: {targets.shape}")
print(f"contexts.shape: {contexts.shape}")
print(f"labels.shape: {labels.shape}")



targets.shape: (65835,)
contexts.shape: (65835, 5)
labels.shape: (65835, 5)


To perform efficient batching for the potentially large number of training examples, use the tf.data.Dataset API. After this step, you would have a tf.data.Dataset object of (target_word, context_word), (label) elements to train your word2vec model!

In [27]:
BATCH_SIZE = 1024
BUFFER_SIZE = 10000
dataset = tf.data.Dataset.from_tensor_slices(((targets, contexts), labels))
dataset = dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)
dataset = dataset.cache().prefetch(buffer_size=AUTOTUNE)
print(dataset)

<PrefetchDataset element_spec=((TensorSpec(shape=(1024,), dtype=tf.int64, name=None), TensorSpec(shape=(1024, 5), dtype=tf.int64, name=None)), TensorSpec(shape=(1024, 5), dtype=tf.int64, name=None))>


## word2vec model

In [28]:
class Word2Vec(tf.keras.Model):
  def __init__(self, vocab_size, embedding_dim):
    super(Word2Vec, self).__init__()
    self.target_embedding = layers.Embedding(vocab_size,
                                             embedding_dim,
                                             input_length=1,
                                             name="w2v_embedding")
    self.context_embedding = layers.Embedding(vocab_size,
                                              embedding_dim,
                                              input_length=num_ns+1)

  def call(self, pair):
    target, context = pair
    # target: (batch, dummy?)  # The dummy axis doesn't exist in TF2.7+
    # context: (batch, context)
    if len(target.shape) == 2:
      target = tf.squeeze(target, axis=1)
    # target: (batch,)
    word_emb = self.target_embedding(target)
    # word_emb: (batch, embed)
    context_emb = self.context_embedding(context)
    # context_emb: (batch, context, embed)
    dots = tf.einsum('be,bce->bc', word_emb, context_emb)
    # dots: (batch, context)
    return dots

In [29]:
embedding_dim = 50
word2vec = Word2Vec(vocab_size, embedding_dim)
word2vec.compile(optimizer='adam', loss=tf.keras.losses.CategoricalCrossentropy(from_logits=True), metrics=['accuracy'])

In [34]:
word2vec.fit(dataset, epochs=50)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x2789b068f40>

# save word embedding

In [38]:
weights = word2vec.get_layer('w2v_embedding').get_weights()[0]
vocab = vectorize_layer.get_vocabulary()

In [43]:
print(vocab[5:6], weights[2:3])

['i'] [[-0.21401028  0.21716307 -0.17976478 -0.6316895   0.10166845  0.252107
   0.7220733   0.27261996 -0.41566148 -0.00573596  0.00827512  0.06708087
  -0.71460193 -0.7662184  -0.20834157  1.2057359  -0.28015885 -0.23546633
  -0.09241361 -0.21304241 -0.11482741  0.7592964   0.1791624  -0.47790322
  -0.7363419   0.37624288  0.5388307   0.41706997  0.5708758  -0.03669158
  -0.42881346 -0.2034348   0.05349772 -0.32028213  0.11303154  0.85656285
  -0.09738274  0.0042937  -0.06563117 -0.66657156  0.27259076 -0.17700285
   0.31220567 -0.12347134 -0.50662225  0.19906326 -0.3557012   0.16824552
   0.570906    0.06169809]]


In [44]:
out_v = io.open('emb_vectors.tsv', 'w', encoding='utf-8')
out_m = io.open('vocab.tsv', 'w', encoding='utf-8')

for index, word in enumerate(vocab):
  if index == 0:
    continue  # skip 0, it's padding.
  vec = weights[index]
  out_v.write('\t'.join([str(x) for x in vec]) + "\n")
  out_m.write(word + "\n")
out_v.close()
out_m.close()

In [49]:
weights[2:3]

array([[-0.21401028,  0.21716307, -0.17976478, -0.6316895 ,  0.10166845,
         0.252107  ,  0.7220733 ,  0.27261996, -0.41566148, -0.00573596,
         0.00827512,  0.06708087, -0.71460193, -0.7662184 , -0.20834157,
         1.2057359 , -0.28015885, -0.23546633, -0.09241361, -0.21304241,
        -0.11482741,  0.7592964 ,  0.1791624 , -0.47790322, -0.7363419 ,
         0.37624288,  0.5388307 ,  0.41706997,  0.5708758 , -0.03669158,
        -0.42881346, -0.2034348 ,  0.05349772, -0.32028213,  0.11303154,
         0.85656285, -0.09738274,  0.0042937 , -0.06563117, -0.66657156,
         0.27259076, -0.17700285,  0.31220567, -0.12347134, -0.50662225,
         0.19906326, -0.3557012 ,  0.16824552,  0.570906  ,  0.06169809]],
      dtype=float32)