In [1]:
# Word2Vec Implementation

# Import necessary libraries

import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim

from torch.utils.data import Dataset, DataLoader
from google.colab import drive

In [2]:
# Define the Word2Vec model class

class Word2Vec(nn.Module):
    def __init__(self, vocab_size, embedding_dim):
        super(Word2Vec, self).__init__()
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.in_embed = nn.Embedding(vocab_size, embedding_dim)
        self.out_embed = nn.Embedding(vocab_size, embedding_dim)

    def forward(self, target_word, context_word):
        target_embed = self.in_embed(target_word)
        context_embed = self.out_embed(context_word)
        return target_embed, context_embed

In [4]:
# Define the training function

class Word2VecDataset(Dataset):
    def __init__(self, corpus, window_size):
        self.corpus = corpus
        self.window_size = window_size
        self.word_pairs = self.generate_word_pairs()

    def generate_word_pairs(self):
        word_pairs = []
        for i, center_word in enumerate(self.corpus):
            context_words = self.corpus[max(0, i - self.window_size):i] + self.corpus[i + 1:min(len(self.corpus) - 1, i + self.window_size) + 1]
            for context_word in context_words:
                word_pairs.append((center_word, context_word))
        return word_pairs

    def __len__(self):
        return len(self.word_pairs)

    def __getitem__(self, idx):
        target_word, context_word = self.word_pairs[idx]
        return torch.tensor(target_word), torch.tensor(context_word)

def train_word2vec(corpus, window_size, embedding_dim, num_epochs, learning_rate):
    # Preprocess the corpus and build the vocabulary
    corpus = corpus.lower().split()
    vocab = set(corpus)
    word2idx = {word: i for i, word in enumerate(vocab)}
    idx2word = {i: word for i, word in enumerate(vocab)}
    vocab_size = len(vocab)

    dataset = Word2VecDataset([word2idx[word] for word in corpus], window_size)
    dataloader = DataLoader(dataset, shuffle=True)

    # Create the target-context word pairs


    # Initialize the Word2Vec model
    model = Word2Vec(vocab_size, embedding_dim)

    # Define the loss function and optimizer
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr = learning_rate)

    for epoch in range(num_epochs):
        running_loss = 0.0
        for i, data in enumerate(dataloader):
            # Zero the gradients
            optimizer.zero_grad()

            # Forward pass
            target_word, context_word = data
            target_embed, context_embed = model(torch.LongTensor([target_word]), torch.LongTensor([context_word]))
            context_weights = model.out_embed.weight.data
            logits = torch.matmul(context_weights, target_embed.squeeze()).unsqueeze(0) # (vocab_size)
            softmax = nn.Softmax(dim=1)
            preds = softmax(logits)

            # Compute the loss
            loss = criterion(preds, torch.LongTensor([context_word]))

            # Backward pass
            loss.backward()

            # Update the model parameters
            optimizer.step()

            # Accumulate the loss
            running_loss += loss.item()

        # Print the average loss for the epoch
        print('Epoch [%d/%d], Loss: %.4f' % (epoch+1, num_epochs, running_loss/len(dataloader)))

    # Return the trained Word2Vec model
    return model, word2idx, idx2word, vocab_size, vocab

def cosine_similarity(embedding1, embedding2):
    return np.dot(embedding1, embedding2) / (np.linalg.norm(embedding1) * np.linalg.norm(embedding2))


In [5]:
# Define the main function


def main():
    # Set hyperparameters
    corpus = "I love to learn deep learning. It is fascinating!"
    window_size = 3
    embedding_dim = 10
    num_epochs = 100
    learning_rate = 0.001

    # Load and preprocess the corpus

    # Train the Word2Vec model
    model, word2idx, idx2word, vocab_size, vocab = train_word2vec(corpus, window_size, embedding_dim, num_epochs, learning_rate)

    # Evaluate the trained model using word similarity or analogy tasks
    predicted_scores = []

    for _item in vocab:
      word1_idx = word2idx.get(_item)
      print(_item + " similarities: => ")
      for item in vocab:
        word2_idx = word2idx.get(item)
        embedding1 = model.in_embed(torch.tensor(word1_idx)).detach().numpy()
        embedding2 = model.in_embed(torch.tensor(word2_idx)).detach().numpy()
        predicted_score = cosine_similarity(embedding1, embedding2)
        print(item + " " + str(predicted_score))
      print('\n')



    # Print the learned word embeddings
    embedding_matrix = model.in_embed.weight.data.numpy()
    for i, word in enumerate(model.in_embed.weight.data):
        print("Word: {}, Embedding: {}".format(idx2word[i], word))


    # Save the trained model
    drive.mount('/content/drive')
    torch.save(model, '/content/drive/My Drive/DLAssignments/07/model.pth')
    torch.save(model.state_dict(), '/content/drive/My Drive/DLAssignments/07/model_state_dict.pth')

# Run the main function
if __name__ == "__main__":
    main()


Epoch [1/100], Loss: 2.2536
Epoch [2/100], Loss: 2.2523
Epoch [3/100], Loss: 2.2509
Epoch [4/100], Loss: 2.2493
Epoch [5/100], Loss: 2.2476
Epoch [6/100], Loss: 2.2459
Epoch [7/100], Loss: 2.2442
Epoch [8/100], Loss: 2.2423
Epoch [9/100], Loss: 2.2403
Epoch [10/100], Loss: 2.2382
Epoch [11/100], Loss: 2.2361
Epoch [12/100], Loss: 2.2341
Epoch [13/100], Loss: 2.2321
Epoch [14/100], Loss: 2.2299
Epoch [15/100], Loss: 2.2280
Epoch [16/100], Loss: 2.2257
Epoch [17/100], Loss: 2.2237
Epoch [18/100], Loss: 2.2219
Epoch [19/100], Loss: 2.2197
Epoch [20/100], Loss: 2.2178
Epoch [21/100], Loss: 2.2163
Epoch [22/100], Loss: 2.2143
Epoch [23/100], Loss: 2.2128
Epoch [24/100], Loss: 2.2116
Epoch [25/100], Loss: 2.2100
Epoch [26/100], Loss: 2.2088
Epoch [27/100], Loss: 2.2074
Epoch [28/100], Loss: 2.2063
Epoch [29/100], Loss: 2.2050
Epoch [30/100], Loss: 2.2041
Epoch [31/100], Loss: 2.2029
Epoch [32/100], Loss: 2.2021
Epoch [33/100], Loss: 2.2010
Epoch [34/100], Loss: 2.1999
Epoch [35/100], Loss: 2