In [None]:
import io
import re
import string
import tqdm

import numpy as np

import tensorflow as tf
from tensorflow.keras import layers

In [None]:
sentence = "The wide road shimmered in the hot sun"
# Lowercase first so that "The" and "the" become the same token, then split
# on whitespace (str.split() already returns a list).
tokens = sentence.lower().split()
print(len(tokens))
print(tokens)

8
['the', 'wide', 'road', 'shimmered', 'in', 'the', 'hot', 'sun']


In [None]:
# Map every distinct token to an integer id. Id 0 is reserved for the padding
# token, so real tokens are numbered from 1 in the order they are first seen.
vocab = {'<pad>': 0}
index = 1
for token in tokens:
  if token in vocab:
    continue  # token already has an id
  vocab[token] = index
  index += 1
vocab_size = len(vocab)
print(vocab)

{'<pad>': 0, 'the': 1, 'wide': 2, 'road': 3, 'shimmered': 4, 'in': 5, 'hot': 6, 'sun': 7}


In [None]:
# Reverse lookup table: integer id -> token.
inverse_vocab = dict(zip(vocab.values(), vocab.keys()))
print(inverse_vocab)

{0: '<pad>', 1: 'the', 2: 'wide', 3: 'road', 4: 'shimmered', 5: 'in', 6: 'hot', 7: 'sun'}


In [None]:
# Encode the example sentence as the list of its token ids.
example_sequence = list(map(vocab.__getitem__, tokens))
print(example_sequence)

[1, 2, 3, 4, 5, 1, 6, 7]


In [None]:
# Generate the skip-gram pairs for the example sentence.
window_size = 2
# negative_samples=0 requests positive pairs only; the second return value is
# not needed here and is discarded.
positive_skip_grams, _ = tf.keras.preprocessing.sequence.skipgrams(
      example_sequence,
      vocabulary_size=vocab_size,
      window_size=window_size,
      negative_samples=0)
print(len(positive_skip_grams))
print(positive_skip_grams)

26
[[1, 2], [4, 3], [1, 5], [1, 4], [4, 2], [5, 6], [6, 1], [2, 1], [5, 1], [1, 7], [3, 4], [5, 4], [6, 5], [4, 5], [1, 6], [3, 5], [5, 3], [4, 1], [2, 4], [2, 3], [3, 2], [7, 1], [7, 6], [3, 1], [1, 3], [6, 7]]


In [None]:
# Show the first five pairs both as ids and as the words they stand for.
for target, context in positive_skip_grams[:5]:
  target_token, context_token = inverse_vocab[target], inverse_vocab[context]
  print(f"({target}, {context}): ({target_token}, {context_token})")

(1, 2): (the, wide)
(4, 3): (shimmered, road)
(1, 5): (the, in)
(1, 4): (the, shimmered)
(4, 2): (shimmered, wide)


In [None]:
# Notebook-wide constants: SEED is passed to the negative sampler below and to
# generate_training_data; AUTOTUNE is used for tf.data prefetching.
SEED = 42
AUTOTUNE = tf.data.AUTOTUNE

# Get target and context words for one positive skip-gram.
target_word, context_word = positive_skip_grams[0]

# Set the number of negative samples per positive context.
num_ns = 4

# The sampler takes the true class as an int64 tensor; it is reshaped to
# (1, 1) here, i.e. one example with num_true=1 true class.
context_class = tf.reshape(tf.constant(context_word, dtype="int64"), (1, 1))
# NOTE(review): in the recorded output the true class id also appears among
# the sampled candidates, so the sampler does not seem to exclude it - confirm.
negative_sampling_candidates, _, _ = tf.random.log_uniform_candidate_sampler(
    true_classes=context_class,  # class that should be sampled as 'positive'
    num_true=1,  # each positive skip-gram has 1 positive context class
    num_sampled=num_ns,  # number of negative context words to sample
    unique=True,  # all the negative samples should be unique
    range_max=vocab_size,  # pick index of the samples from [0, vocab_size)
    seed=SEED,  # seed for reproducibility
    name="negative_sampling"  # name of this operation
)
print(negative_sampling_candidates)
print([inverse_vocab[index.numpy()] for index in negative_sampling_candidates])


tf.Tensor([2 1 4 3], shape=(4,), dtype=int64)
['wide', 'the', 'shimmered', 'road']


In [None]:
# NOTE: this cell rebinds negative_sampling_candidates and context, so
# re-running it without re-running the sampling cell first adds another axis.

# Add a dimension so you can use concatenation (on the next step).
negative_sampling_candidates = tf.expand_dims(negative_sampling_candidates, 1)

# Concat positive context word with negative sampled words.
context = tf.concat([context_class, negative_sampling_candidates], 0)

# Label first context word as 1 (positive) followed by num_ns 0s (negative).
label = tf.constant([1] + [0]*num_ns, dtype="int64")

# Squeeze away size-1 axes: target becomes a scalar (shape ()), while context
# and label have shape (num_ns+1,).
target = tf.squeeze(target_word)
context = tf.squeeze(context)
label = tf.squeeze(label)

print(f"target_index    : {target}")
print(f"target_word     : {inverse_vocab[target_word]}")
print(f"context_indices : {context}")
print(f"context_words   : {[inverse_vocab[c.numpy()] for c in context]}")
print(f"label           : {label}")

target_index    : 1
target_word     : the
context_indices : [2 2 1 4 3]
context_words   : ['wide', 'wide', 'the', 'shimmered', 'road']
label           : [1 0 0 0 0]


In [None]:
# Print the raw tensors of the single training example to inspect their
# shapes and dtypes.
print("target  :", target)
print("context :", context)
print("label   :", label)

target  : tf.Tensor(1, shape=(), dtype=int32)
context : tf.Tensor([2 2 1 4 3], shape=(5,), dtype=int64)
label   : tf.Tensor([1 0 0 0 0], shape=(5,), dtype=int64)


In [None]:

def generate_training_data(sequences, window_size, num_ns, vocab_size, seed):
  """Generates skip-gram pairs with negative sampling for a list of sequences.

  Args:
    sequences: Iterable of int-encoded sentences.
    window_size: Window size passed to the skip-gram generator.
    num_ns: Number of negative context ids sampled per positive pair.
    vocab_size: Vocabulary size; used for the sampling table, the skip-gram
      generator and as the (exclusive) upper bound of sampled ids.
    seed: Seed forwarded to the negative sampler.

  Returns:
    A tuple (targets, contexts, labels) of three parallel lists with one entry
    per positive skip-gram pair:
      targets: the target word id.
      contexts: int64 tensor of shape (num_ns + 1, 1); the positive context id
        comes first, followed by the num_ns sampled ids.
      labels: int64 tensor of shape (num_ns + 1,), 1 for the first position
        and 0 for the rest.
  """
  # Elements of each training example are appended to these lists.
  targets, contexts, labels = [], [], []

  # Build the sampling table for vocab_size tokens.
  sampling_table = tf.keras.preprocessing.sequence.make_sampling_table(vocab_size)

  # The label vector is identical for every example (positive first, then
  # num_ns negatives), so build it once instead of once per pair.
  label = tf.constant([1] + [0]*num_ns, dtype="int64")

  # Iterate over all sequences (sentences) in dataset.
  for sequence in tqdm.tqdm(sequences):

    # Generate positive skip-gram pairs for a sequence (sentence).
    positive_skip_grams, _ = tf.keras.preprocessing.sequence.skipgrams(
          sequence,
          vocabulary_size=vocab_size,
          sampling_table=sampling_table,
          window_size=window_size,
          negative_samples=0)

    # Iterate over each positive skip-gram pair to produce training examples
    # with positive context word and negative samples.
    for target_word, context_word in positive_skip_grams:
      context_class = tf.expand_dims(
          tf.constant([context_word], dtype="int64"), 1)
      negative_sampling_candidates, _, _ = tf.random.log_uniform_candidate_sampler(
          true_classes=context_class,
          num_true=1,
          num_sampled=num_ns,
          unique=True,
          range_max=vocab_size,
          # Use the caller-supplied seed; the original read the global SEED
          # and silently ignored this function's `seed` argument.
          seed=seed,
          name="negative_sampling")

      # Build context and label vectors (for one target word)
      negative_sampling_candidates = tf.expand_dims(
          negative_sampling_candidates, 1)

      context = tf.concat([context_class, negative_sampling_candidates], 0)

      # Append each element from the training example to global lists.
      targets.append(target_word)
      contexts.append(context)
      labels.append(label)

  return targets, contexts, labels


**Larger set of data--**

Downloading the Shakespeare data--

In [None]:
# Fetch the Shakespeare corpus and keep the path of the local copy.
path_to_file = tf.keras.utils.get_file(
    fname='shakespeare.txt',
    origin='https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt')

Downloading data from https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt


Reading the lines of the text.

In [None]:
# Read the whole corpus into memory as a list of lines (line breaks removed).
# The encoding is given explicitly so the result does not depend on the
# platform's default locale encoding.
with open(path_to_file, encoding='utf-8') as f:
  lines = f.read().splitlines()
# Preview the first 20 lines only.
for line in lines[:20]:
  print(line)

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.

All:
We know't, we know't.

First Citizen:
Let us kill him, and we'll have corn at our own price.


`tf.data.TextLineDataset` creates a dataset whose elements are the lines of one or more text files.

In [None]:
# Stream the corpus line by line and drop empty lines (a length of 0 casts to
# False in the filter predicate).
text_ds = tf.data.TextLineDataset(path_to_file).filter(lambda x: tf.cast(tf.strings.length(x), bool))
# Preview only a handful of lines: iterating over the full dataset prints
# tens of thousands of lines and previously had to be interrupted by hand.
for element in text_ds.take(10).as_numpy_iterator():
  print(element)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
b"To have't with saying 'Good morrow.'"
b'SICINIUS:'
b'For that he has,'
b'As much as in him lies, from time to time'
b'Envied against the people, seeking means'
b'To pluck away their power, as now at last'
b'Given hostile strokes, and that not in the presence'
b'Of dreaded justice, but on the ministers'
b"That do distribute it; in the name o' the people"
b'And in the power of us the tribunes, we,'
b'Even from this instant, banish him our city,'
b'In peril of precipitation'
b'From off the rock Tarpeian never more'
b"To enter our Rome gates: i' the people's name,"
b'I say it shall be so.'
b'Citizens:'
b'It shall be so, it shall be so; let him away:'
b"He's banish'd, and it shall be so."
b'COMINIUS:'
b'Hear me, my masters, and my common friends,--'
b'SICINIUS:'
b"He's sentenced; no more hearing."
b'COMINIUS:'
b'Let me speak:'
b'I have been consul, and can show for Rome'
b"Her enemies' marks upon me. I do love"
b"My country'

KeyboardInterrupt: ignored

Use the `TextVectorization` layer to vectorize sentences from the corpus.

In [None]:
def custom_standardization(input_data):
  """Lowercases the input strings and strips all ASCII punctuation.

  Used as the `standardize` callable of the TextVectorization layer.
  """
  # Character class matching any single character of string.punctuation.
  punctuation_pattern = '[%s]' % re.escape(string.punctuation)
  lowercase = tf.strings.lower(input_data)
  return tf.strings.regex_replace(lowercase, punctuation_pattern, '')
# Define the vocabulary size and number of words in a sequence.
# Note: this rebinds `vocab_size`, which earlier held the size of the toy
# single-sentence vocabulary.
vocab_size = 4096
sequence_length = 10

# Use the TextVectorization layer to normalize, split, and map strings to
# integers. output_sequence_length fixes the length of every output sequence
# (the recorded output shows short lines padded with 0 and long lines cut to
# 10 ids).
vectorize_layer = layers.TextVectorization(
    standardize=custom_standardization,
    max_tokens=vocab_size,
    output_mode='int',
    output_sequence_length=sequence_length)

Call adapt on the text dataset to create vocabulary.

In [None]:
# Build the layer's vocabulary from the corpus, fed in batches of 1024 lines.
vectorize_layer.adapt(text_ds.batch(1024))


Retrieve the vocabulary with `get_vocabulary()`.

In [None]:
# Save the created vocabulary for reference.
# Note: this rebinds `inverse_vocab` (previously the toy id -> token dict) to
# the vocabulary returned by the layer; it is indexed by token id below.
inverse_vocab = vectorize_layer.get_vocabulary()
print(inverse_vocab[:10])

['', '[UNK]', 'the', 'and', 'to', 'i', 'of', 'you', 'my', 'a']


The vectorize_layer can now be used to generate vectors for each element in the text_ds.

In [None]:
# Vectorize the data in text_ds: batch for efficient vectorization, then
# unbatch so that each element is again a single int-encoded line.
text_vector_ds = text_ds.batch(1024).prefetch(AUTOTUNE).map(vectorize_layer).unbatch()
# Preview only the first few vectors: printing the whole dataset floods the
# output and previously had to be interrupted by hand.
for element in text_vector_ds.take(10).as_numpy_iterator():
  print(element)

[ 89 270   0   0   0   0   0   0   0   0]
[138  36 982 144 673 125  16 106   0   0]
[34  0  0  0  0  0  0  0  0  0]
[106 106   0   0   0   0   0   0   0   0]
[ 89 270   0   0   0   0   0   0   0   0]
[   7   41   34 1286  344    4  200   64    4 3690]
[34  0  0  0  0  0  0  0  0  0]
[1286 1286    0    0    0    0    0    0    0    0]
[ 89 270   0   0   0   0   0   0   0   0]
[  89    7   93 1187  225   12 2442  592    4    2]
[34  0  0  0  0  0  0  0  0  0]
[  36 2655   36 2655    0    0    0    0    0    0]
[ 89 270   0   0   0   0   0   0   0   0]
[  72   79  506   27    3   56   24 1390   57   40]
[644   9   1   0   0   0   0   0   0   0]
[34  0  0  0  0  0  0  0  0  0]
[  32   54 2863  885   72   17   18  163  146  146]
[165 270   0   0   0   0   0   0   0   0]
[ 74 218  46 595   0   0   0   0   0   0]
[ 89 270   0   0   0   0   0   0   0   0]
[  36   41    1  172  595    2 1780   46    0    0]
[  29 1323    1   47   58    1   79   39   60    0]
[ 58 573  79  22   2   1 334  17  76

KeyboardInterrupt: ignored

Collect the sequences from the dataset into a list, so that each sentence can be iterated over to produce positive and negative examples.

In [None]:
# Materialize every vectorized line in memory so the sequences can be
# iterated over in plain Python when generating training examples.
sequences = list(text_vector_ds.as_numpy_iterator())
print(len(sequences))

32777


Example sequences-

In [None]:
# Decode the first five sequences back to tokens as a sanity check; id 0 maps
# to the empty string, i.e. padding.
for seq in sequences[:5]:
  print(f"{seq} => {[inverse_vocab[i] for i in seq]}")

[ 89 270   0   0   0   0   0   0   0   0] => ['first', 'citizen', '', '', '', '', '', '', '', '']
[138  36 982 144 673 125  16 106   0   0] => ['before', 'we', 'proceed', 'any', 'further', 'hear', 'me', 'speak', '', '']
[34  0  0  0  0  0  0  0  0  0] => ['all', '', '', '', '', '', '', '', '', '']
[106 106   0   0   0   0   0   0   0   0] => ['speak', 'speak', '', '', '', '', '', '', '', '']
[ 89 270   0   0   0   0   0   0   0   0] => ['first', 'citizen', '', '', '', '', '', '', '', '']


Call the `generate_training_data()` function, which iterates over each word of each sequence to collect positive and negative context words.

In [None]:
# Build the full training set from the vectorized corpus.
targets, contexts, labels = generate_training_data(
    sequences=sequences,
    window_size=2,
    num_ns=4,
    vocab_size=vocab_size,
    seed=SEED)

# Convert the Python lists to arrays. Each context entry is built by
# concatenating a (1, 1) and a (num_ns, 1) tensor, so it carries a trailing
# axis of size 1, which is indexed away with [:,:,0].
targets = np.array(targets)
contexts = np.array(contexts)[:,:,0]
labels = np.array(labels)

print('\n')
print(f"targets.shape: {targets.shape}")
print(f"contexts.shape: {contexts.shape}")
print(f"labels.shape: {labels.shape}")

100%|██████████| 32777/32777 [00:40<00:00, 802.70it/s] 




targets.shape: (64626,)
contexts.shape: (64626, 5)
labels.shape: (64626, 5)


Batching the dataset

In [None]:
# Batch size and shuffle buffer size (in examples) for the training pipeline.
BATCH_SIZE = 1024
BUFFER_SIZE = 10000
# Each element is ((target, context), label): the model input pair first,
# then the labels.
dataset = tf.data.Dataset.from_tensor_slices(((targets, contexts), labels))
# drop_remainder=True discards the final incomplete batch so every batch has
# exactly BATCH_SIZE examples.
dataset = dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)
print(dataset)

<BatchDataset shapes: (((1024,), (1024, 5)), (1024, 5)), types: ((tf.int64, tf.int64), tf.int64)>


In [None]:
# Build the final input pipeline with cache() placed BEFORE shuffle().
# Caching after shuffle()/batch() stores the batches produced in the first
# epoch, so every later epoch would replay exactly the same order and batch
# composition. Rebuilding from the arrays (instead of chaining onto the
# previous `dataset`) also keeps this cell idempotent when it is re-run.
dataset = (
    tf.data.Dataset.from_tensor_slices(((targets, contexts), labels))
    .cache()
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE, drop_remainder=True)
    .prefetch(buffer_size=AUTOTUNE))
print(dataset)

<PrefetchDataset shapes: (((1024,), (1024, 5)), (1024, 5)), types: ((tf.int64, tf.int64), tf.int64)>


The Word2Vec model is implemented as a classifier to distinguish between true context words from skip-grams and false context words obtained through negative sampling. 

In [None]:
class Word2Vec(tf.keras.Model):
  """Skip-gram word2vec model trained with negative sampling.

  Holds two embedding tables, one for target words and one for context words.
  For every (target, context candidates) pair the model returns the dot
  product between the target embedding and each candidate's embedding; these
  are used as logits.

  Args:
    vocab_size: Number of rows in both embedding tables.
    embedding_dim: Size of each embedding vector.
  """

  def __init__(self, vocab_size, embedding_dim):
    super().__init__()
    # `input_length` is intentionally not passed: it does not affect the
    # computation below, is deprecated in recent Keras versions, and the
    # original value for the context table depended on the notebook-global
    # `num_ns` (hidden state).
    self.target_embedding = layers.Embedding(vocab_size,
                                             embedding_dim,
                                             name="w2v_embedding")
    self.context_embedding = layers.Embedding(vocab_size,
                                              embedding_dim)

  def call(self, pair):
    """Returns logits of shape (batch, context) for a (target, context) pair."""
    target, context = pair
    # target: (batch, dummy?)  # The dummy axis doesn't exist in TF2.7+
    # context: (batch, context)
    if len(target.shape) == 2:
      target = tf.squeeze(target, axis=1)
    # target: (batch,)
    word_emb = self.target_embedding(target)
    # word_emb: (batch, embed)
    context_emb = self.context_embedding(context)
    # context_emb: (batch, context, embed)
    # Dot product of the target embedding with every context embedding.
    dots = tf.einsum('be,bce->bc', word_emb, context_emb)
    # dots: (batch, context)
    return dots

In [None]:
def custom_loss(x_logit, y_true):
  """Element-wise sigmoid cross-entropy between logits and labels.

  NOTE(review): the first parameter is the logits and the second the labels.
  Keras calls loss functions as loss(y_true, y_pred), so passing this function
  directly to `compile` would presumably swap the two - confirm before use.
  """
  per_element_loss = tf.nn.sigmoid_cross_entropy_with_logits(
      labels=y_true, logits=x_logit)
  return per_element_loss

Instantiate the Word2Vec class

In [None]:
# Size of each word embedding vector.
embedding_dim = 128
word2vec = Word2Vec(vocab_size, embedding_dim)
# The model outputs raw dot products, hence from_logits=True. The labels mark
# the positive context with 1 and the sampled negatives with 0.
word2vec.compile(optimizer='adam',
                 loss=tf.keras.losses.CategoricalCrossentropy(from_logits=True),
                 metrics=['accuracy'])

In [None]:
# Callback that writes training logs for TensorBoard to the "logs" directory.
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir="logs")

Training the model-

In [None]:
# Train for 10 epochs on the prepared dataset, logging to TensorBoard.
word2vec.fit(dataset, epochs=10, callbacks=[tensorboard_callback])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f2570f4c490>

-- Questions for Understanding--

- Given the sentence “I like to cuddle dogs”, how many skipgrams are created with a window size of 2?

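There are 7 skip-grams when each pair of words is counted once. (The Keras `skipgrams` function emits every pair in both directions, as in the example output above, which gives 14 ordered (target, context) pairs.) The 7 pairs are: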

I,like;

I,to;

like,to;

like,cuddle;

to,cuddle

to,dogs

cuddle,dogs

- In general, how does the number of skipgrams relate to the size of the dataset (in terms of input-target pairs)?

Skip-gram predicts surrounding context words from the target words. 
Skip-gram treats each context-target pair as a new observation, so it tends to do better on larger datasets: the number of target-context pairs grows with the amount of text, and more of the possible combinations are observed.

- Why is it not a good idea to compute the full softmax for classification?

The training objective of the skip-gram model is to maximize the probability of predicting context words given the target word. The basic skip-gram formulation defines this probability using the softmax function.

- Computing the denominator of the softmax formulation involves performing a full softmax over the entire vocabulary, which is very large.

- To determine the probability we need to compute a sum over all the words in our vocabulary.

- So if the vocabulary size is very large, it becomes computationally inefficient and slow to sum up the denominator.

Solution: recast the task as a binary classification problem (negative sampling) instead of computing a full softmax.

- The way the dataset is created, for a given (target, context) pair, are the negative samples (remember, these are randomly sampled) the same each time this training example is seen, or are they different?

No, the negative examples will be different, because we generate the negative examples randomly with respect to a positive example in each iteration.

- For the given example dataset (Shakespeare), would the code create (target, context) pairs for sentences that span multiple lines? For example, the last word of one line and the first word of the next line?

I think they can, because of the tokenization of the elements.

- Does the code generate skipgrams for padding characters (index 0)?

Yes. Because, skip-gram pairs are generated from the sequence with a window_size of 2 from tokens in the range [0, vocab_size).

- The skipgrams function uses a “sampling table”. In the code, this is shown to be a simple list of probabilities, and it is created without any reference to the actual text data. How/why does this work? I.e. how does the program “know” which words to sample with which probability?

The tf.random.log_uniform_candidate_sampler already assumes that the vocabulary frequency follows a log-uniform (Zipf's) distribution.