In [5]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import re

In [20]:
class GaussianKernelEmbedding(nn.Module):
    def __init__(self, vocab_size, embedding_dim, sigma=1.0):
        super().__init__()
        self.embedding_weights = nn.Parameter(torch.randn(vocab_size, embedding_dim, dtype = torch.float64))
        self.sigma = sigma

    def forward(self, context, center):
        context_vecs = torch.matmul(context, self.embedding_weights) # batch_size * (winlen - 1) * embedding
        center_vec = torch.matmul(center, self.embedding_weights) # batch_size * embedding
        diff = context_vecs - center_vec.unsqueeze(1)  # batch_size * (winlen - 1) * embedding
        dist_sq = torch.sum(diff ** 2, dim=2)  # batch_size * (winlen - 1)
        weights = torch.exp(-dist_sq / (2 * self.sigma ** 2))  # batch_size * (winlen - 1)
        weights = weights / (weights.sum(dim=1, keepdim=True) + 1e-8)  # batch_size * (winlen - 1)
        weighted_context = (weights.unsqueeze(2) * context_vecs).sum(dim=1)  # batch_size * embedding

        return weighted_context
    
    def getEmbedding(self, id):
        return self.embedding_weights[id]

In [13]:
def preprocessing(text):
    _ = re.findall(r"[A-Za-z]+", text)
    words = []
    for word in _:
        words.append(word.lower())
    word2id = {w : i for i, w in enumerate(set(words))}
    id2word = {i : w for _, (w, i) in enumerate(word2id.items())}
    return words, word2id, id2word

def generateData(words, word2id, winlen): # winlen must be odd
    vocab_size = len(word2id)
    word_size = len(words)
    batch_size = word_size - winlen + 1
    context_train = np.zeros((batch_size, winlen - 1, vocab_size))
    center_train = np.zeros((batch_size, vocab_size))
    for _ in range(winlen // 2, word_size - winlen // 2):
        fr = _ - winlen // 2
        center_train[fr][word2id[words[_]]] = 1
        for __ in range(_ - winlen // 2, _):
            context_train[fr][__ - (_ - winlen // 2)][word2id[words[__]]] = 1
        for __ in range(_ + 1, _ + winlen // 2 + 1):
            context_train[fr][__ - (_ - winlen // 2) - 1][word2id[words[__]]] = 1
    return torch.tensor(context_train), torch.tensor(center_train), vocab_size, word_size
        
with open("input.txt", 'r') as f:
    text = f.read()

words, word2id, id2word = preprocessing(text)
context_train, center_train, vocab_size, word_size = generateData(words, word2id, 5)
print(context_train[0])

tensor([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 

In [None]:
def train(context_train, center_train, embedding_dim):
    model = GaussianKernelEmbedding(vocab_size, embedding_dim)
    optimizer = optim.Adam(model.parameters(), lr=0.01)
    criterion = nn.MSELoss()
    criterion1 = nn.CosineEmbeddingLoss()
    num_epoches = 5000
    for epoch in range(num_epoches):
        model.train()
        optimizer.zero_grad()
        output = model(context_train, center_train)
        target = torch.matmul(center_train, model.embedding_weights.detach())
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()
        if epoch % 10 == 0:
            print(f"Epoch {epoch}, Loss: {loss.item()}")
    
    return model

In [None]:
model = train(context_train, center_train, 5)

In [22]:
word2embed = {}
for (word, id) in word2id.items():
    embedding = model.getEmbedding(id).detach().numpy()
    print(word, id, embedding)
    word2embed[word] = embedding

breeze 0 [-14.40703145   8.73204071  12.36670167 -16.03523951 -16.79188714]
tall 1 [-16.97486237   3.38100845   3.27222886 -25.08567909  -8.4387105 ]
sang 2 [ 0.33129743 -1.47820033  0.57870846  0.19120614 -1.19452783]
vendors 3 [ 30.90371273 -38.63527094  26.80847221   0.91745414  10.19197026]
fresh 4 [ 30.42413657 -34.08303314  30.05692052 -13.29498455  13.90778108]
read 5 [ -8.39768939   8.73543871  17.02046513 -13.85261675 -11.54029867]
in 6 [-17.10278034  20.0131912   22.2539325   12.13038196 -22.94085188]
nearby 7 [ -1.56354254   2.1647209   18.64327164 -11.55797691  -4.48906593]
gentle 8 [-16.59218113   6.76377744   9.06480991 -22.47058502 -18.42071867]
park 9 [-12.98913973   5.82543449  10.50638427 -12.13077845 -14.73017129]
birds 10 [ 0.33129743 -1.47820033  0.57870846  0.19120614 -1.19452783]
warm 11 [ -1.56018485   2.16126245  18.63037589 -11.53853298  -4.48225302]
chatted 12 [ 0.6929349   0.50316795  0.90907312 -2.09579454  0.72955842]
shining 13 [-15.37864169  14.07971165 