In [26]:
# AIM: Implement Continuous Bag of Words (CBOW) Model

import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Lambda, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [27]:
# ----- Stage a: Data Preparation -----
corpus = [
    "the cat sat on the mat",
    "the dog sat on the log",
    "cats and dogs are great pets",
    "the mat is soft and warm"
]

tokenizer = Tokenizer()
tokenizer.fit_on_texts(corpus)

total_words = len(tokenizer.word_index) + 1
sequences = tokenizer.texts_to_sequences(corpus)

In [28]:
# ----- Stage b: Generate Training Data -----
def generate_training_data(sequences, window_size=2):
    contexts = []
    targets = []
    
    for seq in sequences:
        for i in range(window_size, len(seq) - window_size):
            context = seq[i - window_size:i] + seq[i + 1:i + window_size + 1]
            target = seq[i]
            contexts.append(context)
            targets.append(target)
    return pad_sequences(contexts, maxlen=2 * window_size), np.array(targets)

X, y = generate_training_data(sequences)

In [29]:
# ----- Stage c: CBOW Model -----

# build the model
model = Sequential([
    Embedding(input_dim=total_words, output_dim=10, input_length=X.shape[1]),
    Lambda(lambda x: tf.reduce_mean(x, axis=1)),
    Dense(total_words, activation='softmax')
])

# compile the model
model.compile(
    loss='sparse_categorical_crossentropy', 
    optimizer='adam', 
    metrics=['accuracy'
])

# train the model
model.fit(X, y, epochs=100)

Epoch 1/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2s/step - accuracy: 0.0000e+00 - loss: 2.8353
Epoch 2/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 88ms/step - accuracy: 0.0000e+00 - loss: 2.8315
Epoch 3/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 80ms/step - accuracy: 0.1250 - loss: 2.8276
Epoch 4/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 94ms/step - accuracy: 0.1250 - loss: 2.8237
Epoch 5/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 99ms/step - accuracy: 0.1250 - loss: 2.8199
Epoch 6/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 106ms/step - accuracy: 0.3750 - loss: 2.8160
Epoch 7/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 97ms/step - accuracy: 0.3750 - loss: 2.8121
Epoch 8/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 98ms/step - accuracy: 0.3750 - loss: 2.8082
Epoch 9/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━

<keras.src.callbacks.history.History at 0x1739605b2c0>

In [30]:
# ----- Stage d: Output -----
embeddings = model.layers[0].get_weights()[0]

print("Vocabulary size:", len(tokenizer.word_index))

print("{:<10} | {}".format("Word", "Embedding"))
print("-" * 45)

for w, i in tokenizer.word_index.items():
    print(f"{w:<10} | {np.round(embeddings[i], 3)}")

Vocabulary size: 16
Word       | Embedding
---------------------------------------------
the        | [-0.272  0.048  0.044  0.278 -0.283  0.025 -0.074  0.008  0.25  -0.282]
sat        | [-0.21  -0.214 -0.143  0.141 -0.139 -0.098  0.173 -0.113  0.131 -0.175]
on         | [-0.166  0.129  0.191  0.199 -0.129  0.195 -0.151  0.15   0.178 -0.106]
mat        | [-0.198 -0.205 -0.019  0.012 -0.188 -0.09   0.131  0.03   0.156 -0.155]
and        | [-0.046  0.013 -0.048 -0.035 -0.096 -0.096  0.168  0.176  0.065  0.004]
cat        | [-0.145  0.01  -0.04   0.138 -0.144  0.061 -0.07  -0.012  0.138 -0.105]
dog        | [-0.132 -0.001 -0.012  0.179 -0.163  0.006 -0.011 -0.002  0.169 -0.196]
log        | [-0.075 -0.124 -0.076  0.145 -0.083 -0.096  0.123 -0.13   0.098 -0.136]
cats       | [ 0.135  0.01  -0.083 -0.122 -0.07  -0.089  0.072  0.072 -0.071  0.094]
dogs       | [ 0.136  0.102 -0.128  0.113  0.127  0.123  0.146  0.094  0.07  -0.091]
are        | [ 0.142 -0.    -0.109 -0.058 -0.089 -0.062  0.13