In [25]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing import text
from tensorflow.keras.preprocessing.sequence import skipgrams

In [26]:
# Sample text data
corpus = ["I love natural language processing",
          "Continuous Bag of Words is a model",
          "Word embeddings capture semantic meaning"]

In [27]:
# Data preparation
tokenizer = text.Tokenizer()
tokenizer.fit_on_texts(corpus)
vocab_size = len(tokenizer.word_index) + 1

In [28]:
# Generate training data
def generate_cbow_data(corpus, window_size=1):
    sequences = tokenizer.texts_to_sequences(corpus)
    data = []
    for sequence in sequences:
        pairs, labels = skipgrams(sequence, vocab_size, window_size=window_size, negative_samples=1.0)
        for pair, label in zip(pairs, labels):
            target_word, context_word = pair
            data.append(([target_word, context_word], label))
    return data

window_size = 1
cbow_data = generate_cbow_data(corpus, window_size=window_size)
X_train, y_train = zip(*cbow_data)
X_train = np.array(X_train)
y_train = np.array(y_train)

In [29]:
# Define CBOW model
embedding_dim = 50
cbow_model = tf.keras.Sequential([
    tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=2),
    tf.keras.layers.Lambda(lambda x: tf.reduce_mean(x, axis=1)),
    tf.keras.layers.Dense(vocab_size, activation='softmax')
])


In [30]:
# Compile the model
cbow_model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

In [31]:
# Train model
cbow_model.fit(X_train, y_train, epochs=5, batch_size=32)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x20cb97a6910>

In [32]:
# Output code
print("CBOW Model Summary:")
print(cbow_model.summary())

CBOW Model Summary:
Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 2, 50)             900       
                                                                 
 lambda (Lambda)             (None, 50)                0         
                                                                 
 dense (Dense)               (None, 18)                918       
                                                                 
Total params: 1818 (7.10 KB)
Trainable params: 1818 (7.10 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
None
