In [16]:
import torch
from collections import Counter
import numpy as np

In [17]:
sentences=[
    "I love machine learning",
    "I love deep learning",
    "Natural language processing is fascinating"
]

In [18]:
tokens=[sentences.lower().split() for sentences in sentences]
token_flat=[word for sentences in tokens for word in sentences]


vocab=Counter(token_flat)
vocab_size=len(vocab)
word2dix={word:i for i,(word,_)in enumerate(vocab.items())}
idx2word={i:word for word,i in word2dix.items()}

window_size = 2
training_pairs = []

for sentence in tokens:
    for i, word in enumerate(sentence):
        target = word2dix[word]
        # Context words within the window size
        context_words = sentence[max(0, i - window_size): i] + sentence[i + 1: min(len(sentence), i + window_size + 1)]
        for context in context_words:
            training_pairs.append((target, word2dix[context]))

# Convert to tensor
train_data = torch.LongTensor(training_pairs)

In [19]:
import torch.nn as nn
import torch.optim as optim
class Word2VecSkipGram(nn.Module):
    def __init__(self,vocab_size,embedding_dim):
        super(Word2VecSkipGram,self).__init__()
        self.embeddings=nn.Embedding(vocab_size,embedding_dim)
    def forward(self,target):
        return self.embeddings(target)

embedding_dim=100
model=Word2VecSkipGram(vocab_size,embedding_dim)        

In [22]:
def train(model, train_data, epochs=1000, learning_rate=0.01):
    optimizer = optim.SGD(model.parameters(), lr=learning_rate)
    loss_function = nn.CrossEntropyLoss()
    
    for epoch in range(epochs):
        total_loss = 0
        for target, context in train_data:
            model.zero_grad()
            target_tensor = torch.LongTensor([target])
            context_tensor = torch.LongTensor([context])

            # Forward pass
            target_embedding = model(target_tensor)
            context_embedding = model(context_tensor)

            # Calculate loss
            loss = loss_function(context_embedding, target_tensor)
            total_loss += loss.item()

            # Backward pass
            loss.backward()
            optimizer.step()

        print(f'Epoch {epoch + 1}, Loss: {total_loss / len(train_data)}')

# Train the model
train(model, train_data)


Epoch 1, Loss: 1.1240388225106632
Epoch 2, Loss: 1.1240378513055689
Epoch 3, Loss: 1.1240368204958298
Epoch 4, Loss: 1.124035870327669
Epoch 5, Loss: 1.124034888604108
Epoch 6, Loss: 1.1240339279174805
Epoch 7, Loss: 1.1240329882677864
Epoch 8, Loss: 1.124032010050381
Epoch 9, Loss: 1.1240310143021977
Epoch 10, Loss: 1.1240300150478588
Epoch 11, Loss: 1.1240290508550757
Epoch 12, Loss: 1.1240280849092148
Epoch 13, Loss: 1.1240271364941317
Epoch 14, Loss: 1.1240261372397928
Epoch 15, Loss: 1.124025187071632
Epoch 16, Loss: 1.1240242158665377
Epoch 17, Loss: 1.1240232464145212
Epoch 18, Loss: 1.1240222734563492
Epoch 19, Loss: 1.1240213197820328
Epoch 20, Loss: 1.1240203590954052
Epoch 21, Loss: 1.1240193878903109
Epoch 22, Loss: 1.124018455252928
Epoch 23, Loss: 1.1240175015786116
Epoch 24, Loss: 1.1240165198550505
Epoch 25, Loss: 1.1240155644276564
Epoch 26, Loss: 1.124014559914084
Epoch 27, Loss: 1.12401365181979
Epoch 28, Loss: 1.1240126490592957
Epoch 29, Loss: 1.1240116953849792
Ep

In [23]:
word_vectors = model.embeddings.weight.data.numpy()

# Example: Get vector for a specific word
word_index = word2dix['machine']
print(f"Vector for 'machine': {word_vectors[word_index]}")


Vector for 'machine': [ 8.126439    8.129772    0.7252991   8.133117    0.25418976  0.49002394
 -1.0997905  -0.63741374 -0.6465995  -0.15473226  0.85699666 -1.4416742
 -0.9427264  -0.5531723   0.46146333  0.17594765  0.38562477 -1.7902838
 -1.5103159  -0.26706764 -0.08852424 -0.86723286 -0.3413107  -1.2261378
  0.83645266  0.6808256   0.42835283  0.32362387 -0.03676874  0.78955114
 -0.29805484  0.5212648   0.25227913  1.1987727  -0.6976225  -0.7375602
 -0.5263302  -1.4002227  -1.3707819  -0.6416115  -0.24766174  0.7462435
  0.4246255   0.16425882 -1.1846856   0.7377627  -1.9570324   0.05758411
 -1.3206551   0.16330306  0.11491979  0.8962373  -0.50969404  0.02866176
 -1.9531775  -1.289838    1.1487489  -0.36546615 -0.19983907 -0.2504535
 -0.11959472  0.51396155 -0.8894359  -0.5215879  -0.23827577  0.01499586
  0.97730565 -1.7139233  -0.36816004 -1.0194905   0.15408482  0.26004624
  0.23473375 -0.45649248 -0.93150574 -0.20459637  0.56511635 -0.21108636
 -0.06801974  0.47038355 -1.3199807