## Dataset Preparation

In [1]:
import torch 
import torch.nn as nn
import numpy as np
from collections import Counter

text = "I like playing football because playing football is fun."

tokens = text.lower().split()
vocab = set(tokens)
vocab_size = len(vocab)

word_to_idx = {j:i for i, j in enumerate(vocab)}
idx_to_word = {j:i for i, j in word_to_idx.items()}

window_size = 2

In [2]:
def create_skipgram_data(tokens, window_size):
    data = []
    for i in range(window_size, len(vocab) - window_size):
        target = tokens[i]
        context = tokens[i - window_size:i] + tokens[i + 1:i + window_size + 1]
        for word in context:
            data.append((target, word))
    return data

skipgram_data = create_skipgram_data(tokens, window_size)

In [3]:
class SkipGram(nn.Module):
    def __init__(self, vocab_size, embedding_dim):
        super().__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.linear = nn.Linear(embedding_dim, vocab_size)

    def forward(self, target_word):
        target_vector = self.embeddings(target_word)
        output = self.linear(target_vector)
        return output

In [4]:
embedding_dim = 50
skipgram_model = SkipGram(vocab_size, embedding_dim)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(skipgram_model.parameters(), lr=0.01)

In [5]:
epochs = 100
for epoch in range(epochs):
    total_loss = 0
    for target, context in skipgram_data:
        target_idx = torch.tensor([word_to_idx[target]], dtype=torch.long)
        context_idx = torch.tensor([word_to_idx[context]], dtype=torch.long)

        optimizer.zero_grad()
        output = skipgram_model(target_idx)
        loss = criterion(output, context_idx)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    if (epoch + 1) % 10 == 0:
        print(f"Epoch {epoch + 1}, Loss: {total_loss:.4f}")

Epoch 10, Loss: 13.2314
Epoch 20, Loss: 13.1655
Epoch 30, Loss: 13.1361
Epoch 40, Loss: 13.1166
Epoch 50, Loss: 13.1012
Epoch 60, Loss: 13.0878
Epoch 70, Loss: 13.0758
Epoch 80, Loss: 13.0646
Epoch 90, Loss: 13.0542
Epoch 100, Loss: 13.0443


In [6]:
skipgram_model.embeddings.weight.data

tensor([[-6.1812e-01,  4.8279e-01,  5.5118e-01, -1.7796e-01,  2.5179e+00,
          2.5021e-01,  7.2867e-01, -5.7593e-01,  7.7676e-01, -1.3737e+00,
         -3.2198e-01, -7.6998e-01, -5.0072e-01,  1.2330e-01,  4.0705e-01,
         -8.6523e-02,  7.5898e-01,  9.7002e-02, -1.0346e+00, -1.1127e-01,
         -1.8332e-01,  9.8378e-01,  1.6859e-01,  4.4303e-01,  2.2903e-01,
         -1.5132e-01,  2.2155e-01, -6.8202e-01, -3.8771e-01, -1.2499e+00,
          1.5576e+00, -6.7147e-02, -4.1470e-01, -8.1196e-01,  2.8030e-01,
          9.7454e-01,  6.3801e-01,  9.2477e-03, -3.0372e-01,  7.9786e-01,
         -1.1112e+00, -1.7083e+00,  1.2825e+00,  6.1507e-02, -3.0835e-01,
          3.8652e-01,  3.9943e-06,  1.6661e+00,  2.0936e-01, -2.2449e-01],
        [ 6.9256e-01, -1.2926e-01, -2.0364e+00, -4.7599e-01, -7.2472e-01,
         -8.3629e-01,  5.8204e-01, -1.4702e+00,  2.4645e-03, -3.7806e-01,
          8.5085e-01,  7.4890e-01, -7.1798e-01,  1.6889e+00,  3.9534e+00,
          2.6306e-01, -4.6721e-01,  4

In [7]:
def predict_skipgram(target, model, word_to_idx, idx_to_word, top_n=5):
    target_idx = torch.tensor([word_to_idx[target]], dtype=torch.long)
    with torch.no_grad():
        output = model(target_idx)
        predicted_probs = torch.softmax(output, dim=1)
        top_context_idxs = torch.topk(predicted_probs, top_n, dim=1).indices.squeeze(0).tolist()
    return [idx_to_word[idx] for idx in top_context_idxs]

# Test Skip-gram prediction
target_word = "playing"
predicted_context = predict_skipgram(target_word, skipgram_model, word_to_idx, idx_to_word, top_n=3)
print(f"Target Word: {target_word}, Predicted Context Words: {predicted_context}")

Target Word: playing, Predicted Context Words: ['because', 'football', 'like']


In [8]:
word_embeddings = skipgram_model.embeddings.weight.data

##  Finding Similar Words
* Use cosine similarity to find the most similar words in the embedding space.

In [9]:
from scipy.spatial.distance import cosine

def find_similar(word, word_embeddings, word_to_idx, idx_to_word, top_n=5):
    word_idx = word_to_idx[word]
    word_vector = word_embeddings[word_idx]
    similarities = []
    for idx, vec in enumerate(word_embeddings):
        if idx != word_idx:
            similarity = 1 - cosine(word_vector, vec)
            similarities.append((similarity, idx))
    top_similar = sorted(similarities, key=lambda x: x[0], reverse=True)[:top_n]
    return [(idx_to_word[idx], sim) for sim, idx in top_similar]

# Test finding similar words
similar_words = find_similar("football", word_embeddings, word_to_idx, idx_to_word)
print(f"Words similar to 'football': {similar_words}")

Words similar to 'football': [('fun.', 0.18023665558995083), ('because', 0.10099966195637167), ('i', -6.411839770459338e-05), ('like', -0.030076215151108254), ('playing', -0.10613631192153772)]
