## Dataset Preparation

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from collections import Counter
import numpy as np

# Example data
text = "I like playing football because playing football is fun."

# Tokenize and preprocess
tokens = text.lower().split()
vocab = set(tokens)
vocab_size = len(vocab)

# Create word-to-index and index-to-word mappings
word_to_idx = {word: i for i, word in enumerate(vocab)}
idx_to_word = {i: word for word, i in word_to_idx.items()}

# Define window size for context
window_size = 2

In [2]:
# CBOW data preparation
def create_cbow_data(tokens, window_size):
    data = []
    for i in range(window_size, len(tokens) - window_size):
        context = tokens[i - window_size:i] + tokens[i + 1:i + window_size + 1]
        target = tokens[i]
        data.append((context, target))
    return data

cbow_data = create_cbow_data(tokens, window_size)

In [3]:
class CBOW(nn.Module):
    def __init__(self, vocab_size, embedding_dim):
        super(CBOW, self).__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.linear = nn.Linear(embedding_dim, vocab_size)

    def forward(self, context_words):
        # Get embeddings for context words and average them
        context_vectors = self.embeddings(context_words)
        context_mean = context_vectors.mean(dim=1)
        output = self.linear(context_mean)
        return output

In [5]:
# Training CBOW model
embedding_dim = 50
cbow_model = CBOW(vocab_size, embedding_dim)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(cbow_model.parameters(), lr=0.01)

epochs = 100
for epoch in range(epochs):
    total_loss = 0
    for context, target in cbow_data:
        context_idx = torch.tensor([word_to_idx[word] for word in context], dtype=torch.long).unsqueeze(0)
        target_idx = torch.tensor([word_to_idx[target]], dtype=torch.long)

        optimizer.zero_grad()
        output = cbow_model(context_idx)
        loss = criterion(output, target_idx)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    
    if (epoch + 1) % 10 == 0:
        print(f"Epoch {epoch + 1}, Loss: {total_loss:.4f}")

Epoch 10, Loss: 0.2272
Epoch 20, Loss: 0.0696
Epoch 30, Loss: 0.0374
Epoch 40, Loss: 0.0237
Epoch 50, Loss: 0.0165
Epoch 60, Loss: 0.0122
Epoch 70, Loss: 0.0094
Epoch 80, Loss: 0.0075
Epoch 90, Loss: 0.0061
Epoch 100, Loss: 0.0051


In [6]:
# Training Skip-gram model
embedding_dim = 50
skipgram_model = SkipGram(vocab_size, embedding_dim)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(skipgram_model.parameters(), lr=0.01)

epochs = 100
for epoch in range(epochs):
    total_loss = 0
    for target, context in skipgram_data:
        target_idx = torch.tensor([word_to_idx[target]], dtype=torch.long)
        context_idx = torch.tensor([word_to_idx[context]], dtype=torch.long)

        optimizer.zero_grad()
        output = skipgram_model(target_idx)
        loss = criterion(output, context_idx)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    
    if (epoch + 1) % 10 == 0:
        print(f"Epoch {epoch + 1}, Loss: {total_loss:.4f}")


Epoch 10, Loss: 29.0170
Epoch 20, Loss: 28.7953
Epoch 30, Loss: 28.6348
Epoch 40, Loss: 28.5015
Epoch 50, Loss: 28.3869
Epoch 60, Loss: 28.2869
Epoch 70, Loss: 28.1987
Epoch 80, Loss: 28.1206
Epoch 90, Loss: 28.0510
Epoch 100, Loss: 27.9887


In [7]:
cbow_model.embeddings.weight.data

tensor([[-1.0180,  0.3982, -0.4570,  0.9488,  3.2252,  0.0494,  0.2571, -1.4554,
          1.9020,  1.1556, -0.1488,  0.4014,  1.3506,  1.9969,  0.4816, -1.1947,
         -1.3340, -0.0808, -0.7911, -0.6684,  0.5220,  1.8570, -0.5459,  0.7342,
          0.9366, -1.3944, -1.4262, -0.7894, -1.3418, -0.1346, -1.8685,  1.4023,
          1.1101,  1.0732,  3.2738,  1.4829,  0.1860, -0.1798,  0.5820, -1.0874,
          0.9004,  1.0829, -0.9059,  0.1669, -1.0915, -0.0370,  1.1727, -0.2422,
          1.2343,  1.2383],
        [-0.2533,  0.6273,  1.0680, -0.1701,  1.1222, -1.4697,  1.8079, -1.1539,
          0.1433,  0.3029,  0.2832,  2.0881,  1.3713, -2.1716,  1.5298,  1.3365,
          1.9066,  0.0168,  1.4105, -0.6676, -0.9336,  1.0564, -2.1765,  2.1596,
         -0.6975, -0.0257, -1.2667, -0.8645,  0.5362,  0.2247,  0.1601,  0.3255,
          0.0267,  1.8353, -2.2286,  1.4602,  1.5984,  1.0063, -0.4191,  0.9502,
          0.3253, -0.6747, -0.8185, -1.5516, -1.5385,  0.5184, -0.7134,  0.4234,


In [8]:
def predict_cbow(context, model, word_to_idx, idx_to_word):
    context_idx = torch.tensor([word_to_idx[word] for word in context], dtype=torch.long).unsqueeze(0)
    with torch.no_grad():
        output = model(context_idx)
        predicted_idx = torch.argmax(output, dim=1).item()
    return idx_to_word[predicted_idx]

# Test CBOW prediction
context = ["i", "like", "football", "because"]
predicted_word = predict_cbow(context, cbow_model, word_to_idx, idx_to_word)
print(f"Context: {context}, Predicted Word: {predicted_word}")

Context: ['i', 'like', 'football', 'because'], Predicted Word: playing


In [11]:
word_embeddings = cbow_model.embeddings.weight.data

##  Finding Similar Words
* Use cosine similarity to find the most similar words in the embedding space.

In [None]:
from scipy.spatial.distance import cosine

def find_similar(word, word_embeddings, word_to_idx, idx_to_word, top_n=5):
    word_idx = word_to_idx[word]
    word_vector = word_embeddings[word_idx]
    similarities = []
    for idx, vec in enumerate(word_embeddings):
        if idx != word_idx:
            similarity = 1 - cosine(word_vector, vec)
            similarities.append((similarity, idx))
    top_similar = sorted(similarities, key=lambda x: x[0], reverse=True)[:top_n]
    return [(idx_to_word[idx], sim) for sim, idx in top_similar]

# Test finding similar words
similar_words = find_similar("football", word_embeddings, word_to_idx, idx_to_word)
print(f"Words similar to 'football': {similar_words}")

## Word Analogies
* Solve analogies like king - man + woman ≈ queen.

In [15]:
def word_analogy(word_a, word_b, word_c, word_embeddings, word_to_idx, idx_to_word):
    vec_a = word_embeddings[word_to_idx[word_a]]
    vec_b = word_embeddings[word_to_idx[word_b]]
    vec_c = word_embeddings[word_to_idx[word_c]]
    target_vector = vec_a - vec_b + vec_c
    similarities = []
    for idx, vec in enumerate(word_embeddings):
        similarity = 1 - cosine(target_vector, vec)
        similarities.append((similarity, idx))
    # Exclude input words from the result
    input_idxs = {word_to_idx[word_a], word_to_idx[word_b], word_to_idx[word_c]}
    top_match = max((sim, idx) for sim, idx in similarities if idx not in input_idxs)
    return idx_to_word[top_match[1]]

# Test word analogy
result = word_analogy("king", "man", "woman", word_embeddings, word_to_idx, idx_to_word)
print(f"'king' - 'man' + 'woman' ≈ '{result}'")

KeyError: 'king'