<a href="https://colab.research.google.com/github/sujithkumarmp/google-colab/blob/main/gen-ai/cbow.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [10]:
# Import necessary libraries
import torch
import torch.nn as nn
import torch.optim as optim
from collections import Counter
import numpy as np

# Sample corpus
corpus = [
    "the quick brown fox jumps over the lazy dog",
    "the dog barks at the fox",
    "the fox is quick and the dog is lazy"
]

# Preprocessing: Tokenize and build vocabulary
def preprocess(corpus):
    tokenized_corpus = [sentence.lower().split() for sentence in corpus]
    vocab = set(word for sentence in tokenized_corpus for word in sentence)
    word_to_idx = {word: idx for idx, word in enumerate(vocab)}
    idx_to_word = {idx: word for word, idx in word_to_idx.items()}
    return tokenized_corpus, word_to_idx, idx_to_word

tokenized_corpus, word_to_idx, idx_to_word = preprocess(corpus)

# Generate CBOW training data
def generate_cbow_data(tokenized_corpus, window_size=2):
    data = []
    for sentence in tokenized_corpus:
        for idx, word in enumerate(sentence):
            context = []
            for w in range(-window_size, window_size + 1):
                if w != 0 and 0 <= idx + w < len(sentence):
                    context.append(sentence[idx + w])
            target = word
            data.append((context, target))
    return data

cbow_data = generate_cbow_data(tokenized_corpus)

# Convert data to indices
def data_to_indices(cbow_data, word_to_idx):
    indexed_data = []
    for context, target in cbow_data:
        context_indices = [word_to_idx[word] for word in context]
        target_index = word_to_idx[target]
        indexed_data.append((context_indices, target_index))
    return indexed_data

indexed_data = data_to_indices(cbow_data, word_to_idx)

# CBOW Model
class CBOWModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim):
        super(CBOWModel, self).__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.linear = nn.Linear(embedding_dim, vocab_size)

    def forward(self, context):
        embedded = self.embeddings(context).mean(dim=0)
        out = self.linear(embedded)
        return out

# Hyperparameters
embedding_dim = 10
vocab_size = len(word_to_idx)
model = CBOWModel(vocab_size, embedding_dim)

# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.01)

# Training
epochs = 100
for epoch in range(epochs):
    total_loss = 0
    for context_indices, target_index in indexed_data:
        context_tensor = torch.tensor(context_indices, dtype=torch.long)
        target_tensor = torch.tensor([target_index], dtype=torch.long)

        optimizer.zero_grad()
        output = model(context_tensor)
        loss = criterion(output.unsqueeze(0), target_tensor)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
    if (epoch + 1) % 10 == 0:
        print(f"Epoch {epoch + 1}, Loss: {total_loss:.4f}")

# Testing embeddings
word_embeddings = model.embeddings.weight.data
for word, idx in word_to_idx.items():
    print(f"Word: {word}, Embedding: {word_embeddings[idx]}")


Epoch 10, Loss: 55.0607
Epoch 20, Loss: 51.2209
Epoch 30, Loss: 48.4199
Epoch 40, Loss: 45.9719
Epoch 50, Loss: 43.7157
Epoch 60, Loss: 41.5963
Epoch 70, Loss: 39.5877
Epoch 80, Loss: 37.6735
Epoch 90, Loss: 35.8398
Epoch 100, Loss: 34.0738
Word: barks, Embedding: tensor([ 1.1789,  0.5315,  0.5461,  0.3272, -0.0911, -0.0412,  1.0795, -0.1773,
         0.3417, -0.0418])
Word: brown, Embedding: tensor([-0.7878,  1.9682, -0.8545,  0.6659,  0.5404,  0.2591, -1.5156,  0.3505,
         0.6164,  0.5290])
Word: lazy, Embedding: tensor([ 0.8970, -1.6502, -2.2585,  0.7454,  0.9933,  0.0064, -1.0858, -0.3764,
        -0.7918,  1.1950])
Word: over, Embedding: tensor([-0.7783,  1.5325,  0.8425, -0.7505,  0.3283, -0.1816, -0.6441,  1.0973,
         0.2709, -0.0450])
Word: is, Embedding: tensor([ 1.0826, -0.1952,  0.3646, -0.7235, -0.8089, -0.5043, -0.1451,  1.0848,
         0.2767, -0.7324])
Word: jumps, Embedding: tensor([ 0.1541,  1.1651,  0.5360,  1.4104, -2.5237, -2.3134, -0.0139,  0.3956,
     