# In [None]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Dense, Lambda
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import skipgrams
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.optimizers import Adam
import itertools

# Toy corpus used to fit the tokenizer and derive training pairs
sentences = [
    "I love machine learning and deep learning",
    "The continuous bag of words model is great for NLP",
    "Word embeddings help in finding semantic meaning",
    "The CBOW model is a popular word2vec method",
]

# Context words taken on each side of a target word
window_size = 2

# a. Data Preparation
# Fit the tokenizer on the corpus to obtain the word -> index vocabulary
tokenizer = Tokenizer()
tokenizer.fit_on_texts(sentences)
word2idx = tokenizer.word_index

# Reverse lookup: index -> word
idx2word = dict(zip(word2idx.values(), word2idx.keys()))

# One extra slot on top of the vocabulary (+1 for padding)
vocab_size = 1 + len(word2idx)

# Encode every sentence as a list of word indices
sequences = tokenizer.texts_to_sequences(sentences)

# b. Generate Training Data
def generate_training_data(sequences, window_size, vocab_size):
    """Build (target, context) index pairs from tokenized sentences.

    For every position in every sequence, each neighbour at most
    ``window_size`` positions away on either side (clipped to the sequence
    bounds, and excluding the position itself) contributes one pair.

    Args:
        sequences: Iterable of sequences of integer word indices.
        window_size: Maximum distance between a target and a context word.
        vocab_size: Not used by this function; kept so existing callers
            keep working.

    Returns:
        Tuple ``(targets, contexts)`` of equal-length 1-D integer arrays in
        which ``contexts[i]`` is a neighbour of ``targets[i]``. Both arrays
        are empty (but still integer-typed) when no pairs can be formed.
    """
    targets, contexts = [], []

    for sequence in sequences:
        for idx, word_id in enumerate(sequence):
            # Clip the context window to the sequence boundaries
            start = max(0, idx - window_size)
            end = min(len(sequence), idx + window_size + 1)

            # Context words exclude the target word itself
            neighbours = [sequence[i] for i in range(start, end) if i != idx]

            # One (target, context) pair per neighbour
            contexts.extend(neighbours)
            targets.extend([word_id] * len(neighbours))

    # An explicit integer dtype keeps the arrays usable as indices even when
    # empty; np.array([]) would otherwise default to float64.
    return np.array(targets, dtype=int), np.array(contexts, dtype=int)

# Generate the target-context pairs
targets, contexts = generate_training_data(sequences, window_size, vocab_size)

# c. Train Model
embedding_dim = 50  # Dimension of the embedding vector

# Each training example is a single context word. Feed the model an explicit
# (num_pairs, 1) array that matches the Embedding's input_length=1 rather
# than relying on Keras to expand a 1-D array by itself.
context_inputs = contexts.reshape(-1, 1)

# Define CBOW Model
model = Sequential([
    # Looks up one embedding vector per context-word index
    Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=1),
    # Averages embeddings over the sequence axis; with one context word per
    # example this is simply that word's embedding
    Lambda(lambda x: tf.reduce_mean(x, axis=1)),
    # Scores every vocabulary entry as the candidate target word
    Dense(vocab_size, activation='softmax'),
])

# Sparse loss: targets are integer word indices, not one-hot vectors
model.compile(optimizer=Adam(learning_rate=0.01), loss='sparse_categorical_crossentropy')

# Train the model using context words to predict the target word
history = model.fit(context_inputs, targets, epochs=50, batch_size=16, verbose=2)

# d. Output
# Display embeddings for a sample word
word = "model"  # Word for which we want to see embedding
if word in word2idx:
    word_idx = word2idx[word]
    # The first weight array of the Embedding layer is the lookup table
    embedding = model.layers[0].get_weights()[0][word_idx]
    print(f"Embedding for the word '{word}':\n{embedding}")
else:
    print(f"Word '{word}' not found in vocabulary.")
