In [2]:
import tensorflow as tf
from tensorflow import keras
from keras.preprocessing import text
from keras.utils import pad_sequences
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

In [4]:
tokenizer = text.Tokenizer()

In [5]:
corpus = [
    # numbers
    "5 2 4 8 6 2 3 6 4",
    "4 8 5 6 9 5 5 6",
    "1 1 5 2 3 3 8",
    "3 6 9 6 8 7 4 6 3",
    "8 9 9 6 1 4 3 4",
    "1 0 2 0 2 1 3 3 3 3 3",
    "9 3 3 0 1 4 7 8",
    "9 9 8 5 6 7 1 2 3 0 1 0",

    # alphabets, expecting that 9 is close to letters
    "a t g q e h 9 u f",
    "e q y u o i p s",
    "q o 9 p l k j o k o k p",
    "h g y i u t a t e q",
    "i k d q r e 9 e a d",
    "o p d g 9 s a f g a",
    "i u y g h k l a s w",
    "o l u y a o g f s",
    "o p i u y g d a s j d l",
    "u k i l o 9 l j s",
    "y g i s h k j l f r f",
    "i o h n 9 9 d 9 f a 9",
]

In [6]:
tokenizer.fit_on_texts(corpus)
word2id = tokenizer.word_index
word2id['PAD'] = 0
id2word = {v:k for k, v in word2id.items()}

In [7]:
corpus = [line.split(' ') for line in corpus]

In [8]:
wids = [[word2id[w] for w in sentence] for sentence in corpus]
vocab_size = len(word2id)

In [9]:
embed_size = 30
window_size = 2

In [10]:
print('Vocabulary Size:', vocab_size)
print('Vocabulary Sample:', list(word2id.items())[:10])

Vocabulary Size: 31
Vocabulary Sample: [('9', 1), ('3', 2), ('o', 3), ('6', 4), ('a', 5), ('1', 6), ('g', 7), ('i', 8), ('4', 9), ('8', 10)]


In [11]:
def generate_context_word_pairs(corpus, window_size, vocab_size):
    pairs = []
    for words in corpus:
        sentence_length = len(words)
        for index, word in enumerate(words):
            context_words = []
            label_word   = []            
            start = index - window_size
            end = index + window_size + 1
            
            context_words.append([words[i] 
                                 for i in range(start, end) 
                                 if 0 <= i < sentence_length 
                                 and i != index])
            label_word.append(word)

            x = context_words
            y = label_word
            pairs.append((x, y))
    return pairs

pairs = generate_context_word_pairs(wids, 2, vocab_size)

In [12]:
for pair in pairs[:5]:
  print(pair[0], pair[1])

[[16, 9]] [15]
[[15, 9, 10]] [16]
[[15, 16, 10, 4]] [9]
[[16, 9, 4, 16]] [10]
[[9, 10, 16, 2]] [4]


In [13]:
context_length = window_size*2
# xs = []
# ys = []
# for pair in pairs:
#   xs.append(sequence.pad_sequences(pair[0], maxlen=context_length))
#   ys.append(tf.one_hot(pair[1], vocab_size))

dataset = [(tf.constant(pad_sequences(pair[0], maxlen=context_length)), tf.one_hot(pair[1], vocab_size))for pair in pairs]

2022-06-24 14:24:34.293651: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [14]:
for x, y in dataset[:3]:
  print(x, y)

tf.Tensor([[ 0  0 16  9]], shape=(1, 4), dtype=int32) tf.Tensor(
[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0.]], shape=(1, 31), dtype=float32)
tf.Tensor([[ 0 15  9 10]], shape=(1, 4), dtype=int32) tf.Tensor(
[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0.]], shape=(1, 31), dtype=float32)
tf.Tensor([[15 16 10  4]], shape=(1, 4), dtype=int32) tf.Tensor(
[[0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0.]], shape=(1, 31), dtype=float32)


In [19]:
class CBOW(keras.Model):
    def __init__(self, v_dim, emb_dim):
        super().__init__()
        self.v_dim = v_dim
        self.embeddings = keras.layers.Embedding(
            input_dim=v_dim, output_dim=emb_dim,  # [n_vocab, emb_dim]
            embeddings_initializer=keras.initializers.RandomNormal(0., 0.1),
        )

        # noise-contrastive estimation
        self.nce_w = self.add_weight(
            name="nce_w", shape=[v_dim, emb_dim],
            initializer=keras.initializers.TruncatedNormal(0., 0.1))  # [n_vocab, emb_dim]
        self.nce_b = self.add_weight(
            name="nce_b", shape=(v_dim,),
            initializer=keras.initializers.Constant(0.1))  # [n_vocab, ]

        self.opt = keras.optimizers.Adam(0.01)

    def call(self, x, training=None, mask=None):
        # x.shape = [n, skip_window*2]
        o = self.embeddings(x)          # [n, skip_window*2, emb_dim]
        o = tf.reduce_mean(o, axis=1)   # [n, emb_dim]
        return o

    # negative sampling: take one positive label and num_sampled negative labels to compute the loss
    # in order to reduce the computation of full softmax
    def loss(self, x, y, training=None):
        embedded = self.call(x, training)
        return tf.reduce_mean(
            tf.nn.nce_loss(
                weights=self.nce_w, biases=self.nce_b, labels=tf.expand_dims(y, axis=1),
                inputs=embedded, num_sampled=5, num_classes=self.v_dim))

    def step(self, x, y):
        with tf.GradientTape() as tape:
            loss = self.loss(x, y, True)
            grads = tape.gradient(loss, self.trainable_variables)
        self.opt.apply_gradients(zip(grads, self.trainable_variables))
        return loss.numpy()


In [20]:
def train(model, data):
    for t in range(2500):
        bx, by = data.sample(8)
        loss = model.step(bx, by)
        if t % 200 == 0:
            print("step: {} | loss: {}".format(t, loss))


In [16]:
m = CBOW(vocab_size, 2)
train(m, d)

Epoch: 20 	Loss: 487.0900717973709

Epoch: 40 	Loss: 426.8071655035019

Epoch: 60 	Loss: 384.61249509453773

Epoch: 80 	Loss: 357.22441962361336

Epoch: 100 	Loss: 339.1613831669092

Epoch: 120 	Loss: 325.25390772521496

Epoch: 140 	Loss: 312.34187307953835

Epoch: 160 	Loss: 300.3946758583188

Epoch: 180 	Loss: 288.66503236442804

Epoch: 200 	Loss: 277.7807198241353

Epoch: 220 	Loss: 269.140151232481

Epoch: 240 	Loss: 262.41783217713237

Epoch: 260 	Loss: 257.26165649294853

Epoch: 280 	Loss: 253.13631142303348

Epoch: 300 	Loss: 249.9606466051191

Epoch: 320 	Loss: 246.67349866963923

Epoch: 340 	Loss: 243.65800760500133

Epoch: 360 	Loss: 240.99420899525285

Epoch: 380 	Loss: 238.40035833604634

Epoch: 400 	Loss: 235.6903238594532

Epoch: 420 	Loss: 233.2212741812691

Epoch: 440 	Loss: 231.10808439739048

Epoch: 460 	Loss: 229.39862131606787

Epoch: 480 	Loss: 227.3760985219851

Epoch: 500 	Loss: 225.3469477649778

Epoch: 520 	Loss: 223.60188008891419

Epoch: 540 	Loss: 222.357610

In [None]:
def show_w2v_word_embedding(model, path):
    word_emb = model.get_weights()[0]
    word_emb = word_emb[1:]
    for i in range(vocab_size -1):
        c = "blue"
        try:
            int(id2word[i])
        except ValueError:
            c = "red"
        plt.text(word_emb[i, 0], word_emb[i, 1], s=id2word[i], color=c, weight="bold")
    plt.xlim(word_emb[:, 0].min() - .5, word_emb[:, 0].max() + .5)
    plt.ylim(word_emb[:, 1].min() - .5, word_emb[:, 1].max() + .5)
    plt.xticks(())
    plt.yticks(())
    plt.xlabel("embedding dim1")
    plt.ylabel("embedding dim2")
    plt.savefig(path, dpi=300, format="png")
    plt.show()

show_w2v_word_embedding(cbow, './cbow.png')