In [3]:
# Downloading the Tiny Shakespeare dataset
!wget -q "https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt" -O tiny_shakespeare.txt

import torch
import torch.nn as nn
import torch.optim as optim
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


# Reading and preprocessing the dataset
with open("tiny_shakespeare.txt", "r") as f:
    text = f.read().lower()

# Tokenizing
words = text.split()
print(f"Total tokens: {len(words)}")

#  taking 100k tokens only
words = words[:100000]
print(f"Using {len(words)} tokens for training.")

# Building the vocabulary
vocab = set(words)
word2idx = {word: i for i, word in enumerate(vocab)}
idx2word = {i: word for word, i in word2idx.items()}
vocab_size = len(vocab)
print(f"Vocabulary size: {vocab_size}")

model_type = input("enter the model type cbow or skipgram")

window_size = 2

def generate_cbow_data(words, word2idx, window_size):
    data = []
    for i in range(window_size, len(words) - window_size):
        context = []
        for j in range(i-window_size,i+window_size+1):
          if j!=i:
            context.append(word2idx[words[j]])
        target = word2idx[words[i]]
        data.append((context, target))
    return data

def generate_skipgram_data(words, word2idx, window_size):
    data = []
    for i in range(window_size, len(words) - window_size):
      target = word2idx[words[i]]
      for j in range(i-window_size,i+window_size+1):
        if j!=i:
          context = word2idx[words[j]]
          data.append((context, target))
    return data

#generating the data according to the model selected

if model_type == 'cbow':
    training_data = generate_cbow_data(words, word2idx, window_size)
    print(f"Total CBOW training pairs: {len(training_data)}")
elif model_type == 'skipgram':
    training_data = generate_skipgram_data(words, word2idx, window_size)
    print(f"Total Skip-gram training pairs: {len(training_data)}")
else:
    raise ValueError("Invalid model_type. Choose 'cbow' or 'skipgram'.")

# Defining the CBOW model
class CBOW(nn.Module):
    def __init__(self, vocab_size, embedding_dim):
        super(CBOW, self).__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.linear = nn.Linear(embedding_dim, vocab_size)

    def forward(self, context_words):
        embeds = self.embeddings(context_words)  # [batch_size, context_size, embedding_dim]
        avg_embeds = torch.mean(embeds, dim=1)     # Average over context words: [batch_size, embedding_dim]
        out = self.linear(avg_embeds)             # [batch_size, vocab_size]
        return out

# Defining The SkipGram model
class SkipGram(nn.Module):
    def __init__(self, vocab_size, embedding_dim):
        super(SkipGram, self).__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.linear = nn.Linear(embedding_dim, vocab_size)

    def forward(self, target_word):
        embeds = self.embeddings(target_word)
        out = self.linear(embeds)
        return out

# Hyperparameters
embedding_dim = 100

if model_type == 'cbow':
    model = CBOW(vocab_size, embedding_dim).to(device)
elif model_type == 'skipgram':
    model = SkipGram(vocab_size, embedding_dim).to(device)


# Define loss function and optimizer
loss_function = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.01)

# Training loop
epochs = 10
print("\nStarting training...")
for epoch in range(epochs):
    total_loss = 0
    if model_type == 'cbow':
        # Each training example: (context, target)
        for context, target in training_data:
            # Converting to tensors and moving to device
            context_tensor = torch.tensor([context], dtype=torch.long).to(device)  # [1, context_size]
            target_tensor = torch.tensor([target], dtype=torch.long).to(device)    # [1]

            optimizer.zero_grad()
            logits = model(context_tensor)  # [1, vocab_size]
            loss = loss_function(logits, target_tensor)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
    elif model_type == 'skipgram':
        for target, context in training_data:
            target_tensor = torch.tensor([target], dtype=torch.long).to(device)
            context_tensor = torch.tensor([context], dtype=torch.long).to(device)

            optimizer.zero_grad()
            logits = model(target_tensor)  # [1, vocab_size]
            loss = loss_function(logits, context_tensor)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

    avg_loss = total_loss / len(training_data)
    print(f"Epoch {epoch+1}/{epochs}, Average Loss: {avg_loss:.4f}")

# After training,sample embeddings for a few words
embeddings = model.embeddings.weight.data.cpu()
print("\nSample embeddings for a few words:")
for word in list(word2idx.keys())[:5]:
    print(f"{word}: {embeddings[word2idx[word]]}")

Total tokens: 202651
Using 100000 tokens for training.
Vocabulary size: 14944
enter the model type cbow or skipgramskipgram
Total Skip-gram training pairs: 399984

Starting training...
Epoch 1/10, Average Loss: 7.7010
Epoch 2/10, Average Loss: 7.0525
Epoch 3/10, Average Loss: 6.8368
Epoch 4/10, Average Loss: 6.6935
Epoch 5/10, Average Loss: 6.5843
Epoch 6/10, Average Loss: 6.4964
Epoch 7/10, Average Loss: 6.4237
Epoch 8/10, Average Loss: 6.3623
Epoch 9/10, Average Loss: 6.3098
Epoch 10/10, Average Loss: 6.2644

Sample embeddings for a few words:
yorkshire,: tensor([-0.2072,  0.6668,  1.4050, -1.2013, -0.3773,  1.1720, -0.8441, -1.7788,
        -0.4114,  0.1093,  0.8068,  1.8879, -0.3630, -1.7464, -0.1808, -0.0508,
         0.6056, -1.4174, -0.6790,  0.6428, -2.2759,  0.3765, -0.6289, -1.3949,
         1.5634, -0.1261, -1.5581,  1.2216, -0.6249,  0.2331, -0.1328,  1.2876,
        -0.0466,  0.4289,  0.9631, -0.8077, -0.7650,  0.1858,  1.1256, -1.5491,
         0.2572, -0.6824, -0.7180,  