In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np

# Sample text data (tokenized)
corpus = "the quick brown fox jumps over the lazy dog".split()

# Create a vocabulary and word-to-index mapping.
# Indices are assigned in sorted order on purpose: iterating a set of strings
# directly follows hash order, which changes from one interpreter process to
# the next (hash randomization). Without sorting, the same word would map to a
# different embedding row on every run, making results irreproducible and
# saved weights unusable.
vocab = set(corpus)
word_to_idx = {word: i for i, word in enumerate(sorted(vocab))}
# Built by inverting word_to_idx so the two mappings can never disagree.
idx_to_word = {i: word for word, i in word_to_idx.items()}
vocab_size = len(vocab)

# Hyperparameters
embedding_dim = 50  # Length of each word vector
context_window = 1  # Range of words to consider before and after the current word
epochs = 100  # Number of full passes over the training pairs
learning_rate = 0.001  # Step size handed to the optimizer

# Create training data in the form of (target, context) index pairs.
# For every position i, each neighbour j within `context_window` positions
# (clipped to the corpus bounds, and excluding i itself) yields one pair.
data = []
for i, target_word in enumerate(corpus):
    window_start = max(0, i - context_window)
    window_end = min(i + context_window + 1, len(corpus))
    for j in range(window_start, window_end):
        if i != j:
            data.append((word_to_idx[target_word], word_to_idx[corpus[j]]))

# Skip-Gram network: embedding lookup followed by a vocabulary-sized projection
class SkipGram(nn.Module):
    """Score every vocabulary word as a possible context of a target word.

    Args:
        vocab_size: Number of distinct words; sets both the number of
            embedding rows and the width of the output layer.
        embedding_dim: Length of each word vector.
    """

    def __init__(self, vocab_size, embedding_dim):
        super().__init__()
        # One trainable vector per vocabulary entry.
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        # Projects a word vector onto one score per vocabulary entry.
        self.linear = nn.Linear(embedding_dim, vocab_size)

    def forward(self, target):
        """Return unnormalised scores over the vocabulary for each index in ``target``."""
        return self.linear(self.embeddings(target))

# Build the network, its Adam optimizer and the cross-entropy objective
model = SkipGram(vocab_size, embedding_dim)
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
loss_fn = nn.CrossEntropyLoss()

# Train on one (target, context) pair at a time
for epoch in range(epochs):
    # Reorder the pairs in place so each epoch visits them differently.
    np.random.shuffle(data)
    total_loss = 0
    for target, context in data:
        optimizer.zero_grad()
        # Batch of one: scores over the vocabulary for this target word,
        # compared against the index of the observed context word.
        loss = loss_fn(
            model(torch.LongTensor([target])),
            torch.LongTensor([context]),
        )
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    # Report every tenth epoch; the figure is the sum of the per-pair losses,
    # not their mean.
    if not (epoch + 1) % 10:
        print(f"Epoch [{epoch + 1}/{epochs}] Loss: {total_loss:.4f}")

# Expose the learned embedding matrix as a NumPy array (one row per word index)
word_embeddings = model.embeddings.weight.detach().numpy()

# Nearest-neighbour lookup over the learned embeddings
def find_similar_words(word, top_n=5):
    """Return the vocabulary words most similar to ``word`` by cosine similarity.

    Reads the module-level ``word_embeddings`` matrix together with the
    ``word_to_idx`` / ``idx_to_word`` mappings.

    Args:
        word: Query word.
        top_n: Maximum number of neighbours to return.

    Returns:
        A list of words ordered from most to least similar. The query word
        itself is never included. An empty list is returned when ``word`` is
        not in the vocabulary.
    """
    query_idx = word_to_idx.get(word)
    if query_idx is None:
        return []

    query_vector = word_embeddings[query_idx]
    dot_products = np.dot(word_embeddings, query_vector)
    norm_products = np.linalg.norm(word_embeddings, axis=1) * np.linalg.norm(query_vector)
    # A zero-length vector has no direction: score it 0 rather than letting
    # 0/0 produce NaN, which would otherwise sort to the top of the ranking.
    cosine_similarities = np.divide(
        dot_products,
        norm_products,
        out=np.zeros_like(dot_products, dtype=float),
        where=norm_products > 0,
    )

    # Stable sort on the negated scores gives descending order with ties
    # resolved deterministically (lowest index first).
    ranked = np.argsort(-cosine_similarities, kind="stable")
    # Drop the query by index, not by position: another word can tie with it
    # at similarity 1.0, in which case the query is not guaranteed to be first.
    neighbours = [idx_to_word[int(idx)] for idx in ranked if idx != query_idx]
    return neighbours[:max(top_n, 0)]

# Example usage: print the words whose embeddings are closest (by cosine
# similarity) to the embedding of "fox".
target_word = "fox"
similar_words = find_similar_words(target_word)  # top_n left at its default of 5
print(f"Words similar to '{target_word}': {similar_words}")


Epoch [10/100] Loss: 25.3676
Epoch [20/100] Loss: 18.4093
Epoch [30/100] Loss: 15.5657
Epoch [40/100] Loss: 14.2045
Epoch [50/100] Loss: 13.6089
Epoch [60/100] Loss: 13.2303
Epoch [70/100] Loss: 12.8657
Epoch [80/100] Loss: 12.6795
Epoch [90/100] Loss: 12.5705
Epoch [100/100] Loss: 12.4006
Words similar to 'fox': ['jumps', 'quick', 'over', 'brown', 'the']
