In [45]:
import io
import itertools
import numpy as np
import os
import re
import string
import tensorflow as tf
import tqdm

In [46]:
from tensorflow.keras import Model, Sequential
from tensorflow.keras.layers import Activation, Dense, Dot, Embedding, Flatten, GlobalAveragePooling1D, Reshape
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

In [47]:
SEED = 42 
AUTOTUNE = tf.data.AUTOTUNE

In [48]:
sentence = "The wide road shimmered in the hot sun"
tokens = list(sentence.lower().split())
print(len(tokens))

8


In [49]:
vocab, index = {}, 1 # start indexing from 1
vocab['<pad>'] = 0 # add a padding token 
for token in tokens:
    if token not in vocab: 
        vocab[token] = index
        index += 1
vocab_size = len(vocab)
print(vocab)

{'<pad>': 0, 'the': 1, 'wide': 2, 'road': 3, 'shimmered': 4, 'in': 5, 'hot': 6, 'sun': 7}


In [50]:
inverse_vocab = {index: token for token, index in vocab.items()}
print(inverse_vocab)

{0: '<pad>', 1: 'the', 2: 'wide', 3: 'road', 4: 'shimmered', 5: 'in', 6: 'hot', 7: 'sun'}


In [51]:
example_sequence = [vocab[word] for word in tokens]
print(example_sequence)

[1, 2, 3, 4, 5, 1, 6, 7]


In [52]:
window_size = 2
positive_skip_grams, _ = tf.keras.preprocessing.sequence.skipgrams(
      example_sequence, 
      vocabulary_size=vocab_size,
      window_size=window_size,
      negative_samples=0)
print(len(positive_skip_grams), _)

26 [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


In [53]:
print(positive_skip_grams)

[[4, 2], [2, 3], [3, 5], [1, 4], [1, 3], [5, 6], [1, 7], [7, 6], [5, 3], [7, 1], [4, 3], [3, 4], [2, 4], [6, 1], [1, 2], [1, 6], [3, 2], [5, 4], [5, 1], [4, 5], [1, 5], [6, 5], [3, 1], [6, 7], [4, 1], [2, 1]]


In [54]:
for target, context in positive_skip_grams[:5]:
    print(f"({target}, {context}): ({inverse_vocab[target]}, {inverse_vocab[context]})")

(4, 2): (shimmered, wide)
(2, 3): (wide, road)
(3, 5): (road, in)
(1, 4): (the, shimmered)
(1, 3): (the, road)


In [55]:
# Get target and context words for one positive skip-gram.
target_word, context_word = positive_skip_grams[0]

# Set the number of negative samples per positive context. 
num_ns = 4

context_class = tf.reshape(tf.constant(context_word, dtype="int64"), (1, 1))
negative_sampling_candidates, _, _ = tf.random.log_uniform_candidate_sampler(
    true_classes=context_class, # class that should be sampled as 'positive'
    num_true=1, # each positive skip-gram has 1 positive context class
    num_sampled=num_ns, # number of negative context words to sample
    unique=True, # all the negative samples should be unique
    range_max=vocab_size, # pick index of the samples from [0, vocab_size]
    seed=SEED, # seed for reproducibility
    name="negative_sampling" # name of this operation
)
print(negative_sampling_candidates)
print([inverse_vocab[index.numpy()] for index in negative_sampling_candidates])

tf.Tensor([0 1 2 3], shape=(4,), dtype=int64)
['<pad>', 'the', 'wide', 'road']


In [56]:
# Add a dimension so you can use concatenation (on the next step).
negative_sampling_candidates = tf.expand_dims(negative_sampling_candidates, 1)

print(negative_sampling_candidates)

# Concat positive context word with negative sampled words.
context = tf.concat([context_class, negative_sampling_candidates], 0)

# Label first context word as 1 (positive) followed by num_ns 0s (negative).
label = tf.constant([1] + [0]*num_ns, dtype="int64") 

# Reshape target to shape (1,) and context and label to (num_ns+1,).
target = tf.squeeze(target_word)
context = tf.squeeze(context)
label =  tf.squeeze(label)

tf.Tensor(
[[0]
 [1]
 [2]
 [3]], shape=(4, 1), dtype=int64)


In [57]:
print(f"target_index    : {target}")
print(f"target_word     : {inverse_vocab[target_word]}")
print(f"context_indices : {context}")
print(f"context_words   : {[inverse_vocab[c.numpy()] for c in context]}")
print(f"label           : {label}")

target_index    : 4
target_word     : shimmered
context_indices : [2 0 1 2 3]
context_words   : ['wide', '<pad>', 'the', 'wide', 'road']
label           : [1 0 0 0 0]


In [58]:
sampling_table = tf.keras.preprocessing.sequence.make_sampling_table(size=10)
print(sampling_table)

[0.00315225 0.00315225 0.00547597 0.00741556 0.00912817 0.01068435
 0.01212381 0.01347162 0.01474487 0.0159558 ]


https://papers.nips.cc/paper/2013/file/9aa42b31882ec039965f3c4923ce901b-Paper.pdf

Mikulov paper

In [59]:
# Generates skip-gram pairs with negative sampling for a list of sequences
# (int-encoded sentences) based on window size, number of negative samples
# and vocabulary size.
def generate_training_data(sequences, window_size, num_ns, vocab_size, seed):
  # Elements of each training example are appended to these lists.
    targets, contexts, labels = [], [], []

    # Build the sampling table for vocab_size tokens.
    sampling_table = tf.keras.preprocessing.sequence.make_sampling_table(vocab_size)

    print(sampling_table)
    
    
  # Iterate over all sequences (sentences) in dataset.
    for sequence in tqdm.tqdm(sequences):

    # Generate positive skip-gram pairs for a sequence (sentence).
        positive_skip_grams, _ = tf.keras.preprocessing.sequence.skipgrams(
              sequence, 
              vocabulary_size=vocab_size,
              sampling_table=sampling_table,
              window_size=window_size,
              negative_samples=0)

        # Iterate over each positive skip-gram pair to produce training examples 
        # with positive context word and negative samples.
        for target_word, context_word in positive_skip_grams:
            context_class = tf.expand_dims(
              tf.constant([context_word], dtype="int64"), 1)
            negative_sampling_candidates, _, _ = tf.random.log_uniform_candidate_sampler(
              true_classes=context_class,
              num_true=1, 
              num_sampled=num_ns, 
              unique=True, 
              range_max=vocab_size, 
              seed=SEED, 
              name="negative_sampling")

            # Build context and label vectors (for one target word)
            negative_sampling_candidates = tf.expand_dims(
              negative_sampling_candidates, 1)

            context = tf.concat([context_class, negative_sampling_candidates], 0)
            label = tf.constant([1] + [0]*num_ns, dtype="int64")

            # Append each element from the training example to global lists.
            targets.append(target_word)
            contexts.append(context)
            labels.append(label)

    return targets, contexts, labels

In [60]:
path_to_file = tf.keras.utils.get_file('shakespeare.txt', 'https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt')

In [61]:
path_to_file

'C:\\Users\\Vojta\\.keras\\datasets\\shakespeare.txt'

In [62]:
with open(path_to_file) as f: 
    lines = f.read().splitlines()
for line in lines[:20]:
    print(line)

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.

All:
We know't, we know't.

First Citizen:
Let us kill him, and we'll have corn at our own price.


In [63]:
text_ds = tf.data.TextLineDataset(path_to_file).filter(lambda x: tf.cast(tf.strings.length(x), bool))

In [64]:
text_ds

<FilterDataset shapes: (), types: tf.string>

In [65]:
# We create a custom standardization function to lowercase the text and 
# remove punctuation.
def custom_standardization(input_data):
    lowercase = tf.strings.lower(input_data)
    return tf.strings.regex_replace(lowercase,
                                  '[%s]' % re.escape(string.punctuation), '')

# Define the vocabulary size and number of words in a sequence.
vocab_size = 4096
sequence_length = 10

# Use the text vectorization layer to normalize, split, and map strings to
# integers. Set output_sequence_length length to pad all samples to same length.
vectorize_layer = TextVectorization(
    standardize=custom_standardization,
    max_tokens=vocab_size,
    output_mode='int',
    output_sequence_length=sequence_length)

In [66]:
vectorize_layer.adapt(text_ds.batch(1024))

In [67]:
# Save the created vocabulary for reference.
inverse_vocab = vectorize_layer.get_vocabulary()
print(inverse_vocab[:20])

['', '[UNK]', 'the', 'and', 'to', 'i', 'of', 'you', 'my', 'a', 'that', 'in', 'is', 'not', 'for', 'with', 'me', 'it', 'be', 'your']


In [68]:
def vectorize_text(text):
    print(text)
    text = tf.expand_dims(text, -1)
    return tf.squeeze(vectorize_layer(text))

# Vectorize the data in text_ds.
text_vector_ds = text_ds.batch(1024).prefetch(AUTOTUNE).map(vectorize_layer).unbatch()

In [69]:
sequences = list(text_vector_ds.as_numpy_iterator())
print(len(sequences))

32777


In [70]:
print(sequences[0:6])

[array([ 89, 270,   0,   0,   0,   0,   0,   0,   0,   0], dtype=int64), array([138,  36, 982, 144, 673, 125,  16, 106,   0,   0], dtype=int64), array([34,  0,  0,  0,  0,  0,  0,  0,  0,  0], dtype=int64), array([106, 106,   0,   0,   0,   0,   0,   0,   0,   0], dtype=int64), array([ 89, 270,   0,   0,   0,   0,   0,   0,   0,   0], dtype=int64), array([   7,   41,   34, 1286,  344,    4,  200,   64,    4, 3690],
      dtype=int64)]


In [71]:
for seq in sequences[:5]:
    print(f"{seq} => {[inverse_vocab[i] for i in seq]}")

[ 89 270   0   0   0   0   0   0   0   0] => ['first', 'citizen', '', '', '', '', '', '', '', '']
[138  36 982 144 673 125  16 106   0   0] => ['before', 'we', 'proceed', 'any', 'further', 'hear', 'me', 'speak', '', '']
[34  0  0  0  0  0  0  0  0  0] => ['all', '', '', '', '', '', '', '', '', '']
[106 106   0   0   0   0   0   0   0   0] => ['speak', 'speak', '', '', '', '', '', '', '', '']
[ 89 270   0   0   0   0   0   0   0   0] => ['first', 'citizen', '', '', '', '', '', '', '', '']


In [90]:
targets, contexts, labels = generate_training_data(
    sequences=sequences, 
    window_size=2, 
    num_ns=4, 
    vocab_size=vocab_size, 
    seed=SEED)
print(len(targets), len(contexts), len(labels))

  1%|▊                                                            | 445/32777 [00:00<00:07, 4448.66it/s]

[0.00315225 0.00315225 0.00547597 ... 0.60335544 0.60343743 0.60351941]


100%|███████████████████████████████████████████████████████████| 32777/32777 [00:07<00:00, 4164.02it/s]

65377 65377 65377





In [94]:
targets

[673,
 673,
 673,
 673,
 106,
 89,
 1286,
 3690,
 3690,
 1286,
 1286,
 1286,
 592,
 592,
 592,
 592,
 93,
 93,
 93,
 93,
 2655,
 2655,
 2655,
 1390,
 1390,
 1390,
 1390,
 644,
 644,
 595,
 595,
 595,
 1780,
 595,
 595,
 1780,
 1780,
 595,
 39,
 39,
 39,
 131,
 131,
 131,
 131,
 40,
 40,
 40,
 2346,
 2346,
 40,
 2346,
 2346,
 2871,
 2871,
 2461,
 2461,
 2461,
 2461,
 1187,
 1187,
 1187,
 89,
 89,
 89,
 89,
 659,
 659,
 496,
 496,
 496,
 496,
 556,
 556,
 22,
 22,
 22,
 171,
 171,
 171,
 171,
 59,
 59,
 59,
 59,
 12,
 12,
 20,
 20,
 20,
 86,
 71,
 71,
 86,
 86,
 71,
 86,
 2022,
 2022,
 2022,
 2956,
 2956,
 2956,
 2956,
 49,
 446,
 446,
 270,
 270,
 1504,
 1504,
 1504,
 1504,
 60,
 2089,
 60,
 2089,
 60,
 60,
 628,
 628,
 628,
 2840,
 2840,
 2840,
 208,
 208,
 208,
 208,
 1780,
 1780,
 1610,
 1780,
 1780,
 1610,
 2870,
 2870,
 2870,
 25,
 25,
 25,
 25,
 1954,
 1954,
 1954,
 690,
 690,
 690,
 690,
 1623,
 1623,
 3150,
 1623,
 352,
 3150,
 1623,
 352,
 352,
 3150,
 3150,
 787,
 787,
 3612,


In [95]:
contexts

[<tf.Tensor: shape=(5, 1), dtype=int64, numpy=
 array([[982],
        [364],
        [ 77],
        [ 14],
        [ 28]], dtype=int64)>,
 <tf.Tensor: shape=(5, 1), dtype=int64, numpy=
 array([[ 125],
        [   1],
        [   3],
        [  59],
        [1856]], dtype=int64)>,
 <tf.Tensor: shape=(5, 1), dtype=int64, numpy=
 array([[  16],
        [ 481],
        [   0],
        [  22],
        [4032]], dtype=int64)>,
 <tf.Tensor: shape=(5, 1), dtype=int64, numpy=
 array([[ 144],
        [2298],
        [  74],
        [ 659],
        [ 163]], dtype=int64)>,
 <tf.Tensor: shape=(5, 1), dtype=int64, numpy=
 array([[106],
        [  1],
        [  3],
        [177],
        [390]], dtype=int64)>,
 <tf.Tensor: shape=(5, 1), dtype=int64, numpy=
 array([[ 270],
        [2187],
        [   1],
        [ 141],
        [  93]], dtype=int64)>,
 <tf.Tensor: shape=(5, 1), dtype=int64, numpy=
 array([[  41],
        [   1],
        [   0],
        [1883],
        [   2]], dtype=int64)>,
 <tf.Tens

In [96]:
labels

[<tf.Tensor: shape=(5,), dtype=int64, numpy=array([1, 0, 0, 0, 0], dtype=int64)>,
 <tf.Tensor: shape=(5,), dtype=int64, numpy=array([1, 0, 0, 0, 0], dtype=int64)>,
 <tf.Tensor: shape=(5,), dtype=int64, numpy=array([1, 0, 0, 0, 0], dtype=int64)>,
 <tf.Tensor: shape=(5,), dtype=int64, numpy=array([1, 0, 0, 0, 0], dtype=int64)>,
 <tf.Tensor: shape=(5,), dtype=int64, numpy=array([1, 0, 0, 0, 0], dtype=int64)>,
 <tf.Tensor: shape=(5,), dtype=int64, numpy=array([1, 0, 0, 0, 0], dtype=int64)>,
 <tf.Tensor: shape=(5,), dtype=int64, numpy=array([1, 0, 0, 0, 0], dtype=int64)>,
 <tf.Tensor: shape=(5,), dtype=int64, numpy=array([1, 0, 0, 0, 0], dtype=int64)>,
 <tf.Tensor: shape=(5,), dtype=int64, numpy=array([1, 0, 0, 0, 0], dtype=int64)>,
 <tf.Tensor: shape=(5,), dtype=int64, numpy=array([1, 0, 0, 0, 0], dtype=int64)>,
 <tf.Tensor: shape=(5,), dtype=int64, numpy=array([1, 0, 0, 0, 0], dtype=int64)>,
 <tf.Tensor: shape=(5,), dtype=int64, numpy=array([1, 0, 0, 0, 0], dtype=int64)>,
 <tf.Tensor: sha

In [91]:
BATCH_SIZE = 1024
BUFFER_SIZE = 10000
dataset = tf.data.Dataset.from_tensor_slices(((targets, contexts), labels))
dataset = dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)
print(dataset)

<BatchDataset shapes: (((1024,), (1024, 5, 1)), (1024, 5)), types: ((tf.int32, tf.int64), tf.int64)>


In [92]:
dataset = dataset.cache().prefetch(buffer_size=AUTOTUNE)
print(dataset)

<PrefetchDataset shapes: (((1024,), (1024, 5, 1)), (1024, 5)), types: ((tf.int32, tf.int64), tf.int64)>


In [75]:
class Word2Vec(Model):
    def __init__(self, vocab_size, embedding_dim):
        super(Word2Vec, self).__init__()
        self.target_embedding = Embedding(vocab_size, 
                                          embedding_dim,
                                          input_length=1,
                                          name="w2v_embedding", )
        self.context_embedding = Embedding(vocab_size, 
                                           embedding_dim, 
                                           input_length=num_ns+1)
        self.dots = Dot(axes=(3,2))
        self.flatten = Flatten()

    def call(self, pair):
        target, context = pair
        we = self.target_embedding(target)
        ce = self.context_embedding(context)
        dots = self.dots([ce, we])
        return self.flatten(dots)

In [76]:
def custom_loss(x_logit, y_true):
      return tf.nn.sigmoid_cross_entropy_with_logits(logits=x_logit, labels=y_true)

In [77]:
embedding_dim = 128
word2vec = Word2Vec(vocab_size, embedding_dim)
word2vec.compile(optimizer='adam',
              loss=tf.keras.losses.CategoricalCrossentropy(from_logits=True),
              metrics=['accuracy'])

In [80]:
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir="logs")


In [81]:
word2vec.fit(dataset, epochs=20, callbacks=[tensorboard_callback])

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x2697fb21370>

In [84]:
%tensorboard --logdir logs

UsageError: Line magic function `%tensorboard` not found.


In [85]:
weights = word2vec.get_layer('w2v_embedding').get_weights()[0]
vocab = vectorize_layer.get_vocabulary()

In [86]:
vocab

['',
 '[UNK]',
 'the',
 'and',
 'to',
 'i',
 'of',
 'you',
 'my',
 'a',
 'that',
 'in',
 'is',
 'not',
 'for',
 'with',
 'me',
 'it',
 'be',
 'your',
 'his',
 'this',
 'but',
 'he',
 'have',
 'as',
 'thou',
 'him',
 'so',
 'what',
 'thy',
 'will',
 'no',
 'by',
 'all',
 'king',
 'we',
 'shall',
 'her',
 'if',
 'our',
 'are',
 'do',
 'thee',
 'now',
 'lord',
 'good',
 'on',
 'o',
 'come',
 'from',
 'sir',
 'or',
 'which',
 'more',
 'then',
 'well',
 'at',
 'would',
 'was',
 'they',
 'how',
 'here',
 'she',
 'than',
 'their',
 'them',
 'ill',
 'duke',
 'am',
 'hath',
 'say',
 'let',
 'when',
 'one',
 'go',
 'were',
 'love',
 'may',
 'us',
 'make',
 'upon',
 'yet',
 'richard',
 'like',
 'there',
 'must',
 'should',
 'an',
 'first',
 'why',
 'queen',
 'had',
 'know',
 'man',
 'did',
 'tis',
 'where',
 'see',
 'some',
 'too',
 'death',
 'give',
 'who',
 'these',
 'take',
 'speak',
 'edward',
 'york',
 'mine',
 'such',
 'up',
 'out',
 'henry',
 'romeo',
 'can',
 'father',
 'tell',
 'time',
 

In [87]:
weights

array([[ 0.0341787 , -0.01023943, -0.00485866, ..., -0.00732268,
        -0.04417688, -0.02064382],
       [ 0.13962647, -0.26861814,  0.05983509, ..., -0.06853311,
         0.0178367 ,  0.046607  ],
       [ 0.2678029 , -0.06859203, -0.11908495, ..., -0.17488095,
         0.03519425,  0.12001862],
       ...,
       [ 0.21250707,  0.00800893, -0.14988169, ..., -0.13405937,
         0.10454278, -0.17526893],
       [ 0.10683385,  0.21014605, -0.39542514, ..., -0.08317842,
        -0.04608944,  0.03693198],
       [-0.18942982, -0.17860767, -0.1748381 , ..., -0.17548388,
        -0.05703614, -0.12867194]], dtype=float32)

In [88]:
out_v = io.open('vectors.tsv', 'w', encoding='utf-8')
out_m = io.open('metadata.tsv', 'w', encoding='utf-8')

for index, word in enumerate(vocab):
    if  index == 0: continue # skip 0, it's padding.
    vec = weights[index] 
    out_v.write('\t'.join([str(x) for x in vec]) + "\n")
    out_m.write(word + "\n")
out_v.close()
out_m.close()

https://web.stanford.edu/class/cs224n/readings/cs224n-2019-notes01-wordvecs1.pdf