<a href="https://colab.research.google.com/github/sujithkumarmp/google-colab/blob/main/word2vec.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [6]:
import torch
import torch.nn as nn
import torch.optim as optim
from collections import Counter
from itertools import chain

# Sample corpus
corpus = [
    "the quick brown fox jumps over the lazy dog",
    "the dog barked at the fox",
    "the fox ran away quickly"
]

# Preprocessing: Tokenize and build vocabulary
tokenized_corpus = [sentence.split() for sentence in corpus]
vocab = list(set(chain(*tokenized_corpus)))
word_to_idx = {word: idx for idx, word in enumerate(vocab)}
idx_to_word = {idx: word for word, idx in word_to_idx.items()}

# Generate n-grams (context, target pairs)
def generate_skipgram_data(tokenized_corpus, window_size=2):
    data = []
    for sentence in tokenized_corpus:
        for idx, word in enumerate(sentence):
            for neighbor in range(-window_size, window_size + 1):
                if neighbor != 0 and 0 <= idx + neighbor < len(sentence):
                    data.append((word, sentence[idx + neighbor]))
    return data

data = generate_skipgram_data(tokenized_corpus)

# Convert data to indices
data_idx = [(word_to_idx[target], word_to_idx[context]) for target, context in data]

# Model definition
class Word2Vec(nn.Module):
    def __init__(self, vocab_size, embedding_dim):
        super(Word2Vec, self).__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.output_layer = nn.Linear(embedding_dim, vocab_size)

    def forward(self, target):
        embed = self.embeddings(target)
        out = self.output_layer(embed)
        return out

# Hyperparameters
embedding_dim = 10
learning_rate = 0.01
epochs = 100

# Initialize model, loss, and optimizer
model = Word2Vec(len(vocab), embedding_dim)
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=learning_rate)

# Training loop
for epoch in range(epochs):
    total_loss = 0
    for target, context in data_idx:
        target_tensor = torch.tensor([target], dtype=torch.long)
        context_tensor = torch.tensor([context], dtype=torch.long)

        optimizer.zero_grad()
        output = model(target_tensor)
        loss = criterion(output, context_tensor)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
    if (epoch + 1) % 10 == 0:
        print(f"Epoch {epoch + 1}, Loss: {total_loss:.4f}")

# Extract embeddings
embeddings = model.embeddings.weight.data
print("Word Embeddings:")
for word, idx in word_to_idx.items():
    print(f"{word}: {embeddings[idx]}")


Epoch 10, Loss: 141.2042
Epoch 20, Loss: 133.9104
Epoch 30, Loss: 129.3713
Epoch 40, Loss: 125.7105
Epoch 50, Loss: 122.4689
Epoch 60, Loss: 119.5148
Epoch 70, Loss: 116.8338
Epoch 80, Loss: 114.4493
Epoch 90, Loss: 112.3806
Epoch 100, Loss: 110.6280
Word Embeddings:
barked: tensor([-1.2124, -0.8199,  0.2841,  1.2619, -1.6186, -0.9090, -0.1290, -0.8578,
        -0.2008, -0.9770])
brown: tensor([ 0.6599, -0.3906, -1.4428,  0.0633, -0.8937,  0.6748, -0.6055,  1.1687,
        -1.6470,  0.7958])
lazy: tensor([ 1.3022,  0.1532, -0.7149, -0.0339, -2.4266, -0.9491, -0.9831,  2.0436,
         1.6752,  1.8785])
over: tensor([ 0.7557,  0.5107, -0.9613, -0.5901, -0.6587,  0.8117,  2.6235,  1.2232,
        -0.4878,  1.9021])
away: tensor([-0.5040,  0.0285, -1.3486, -0.8679, -0.2043,  0.8493, -0.0173, -0.1751,
         0.1447, -0.4218])
quickly: tensor([ 1.1940,  0.5059, -1.5792, -1.5513, -0.1756,  0.3794, -0.7068,  0.5551,
        -0.3265,  1.3256])
jumps: tensor([-0.4065,  0.2968,  2.2834, -0.689