In [1]:
import torch
import torch.nn as nn
import torch.optim as optim

In [2]:
# 1. Prepare corpus
corpus = "the cat sat on the mat".split()
vocab = list(set(corpus))
word_to_ix = {word: i for i, word in enumerate(vocab)}

In [3]:
# 2. Generate skip-gram pairs
def make_pairs(corpus, window_size=2):
    pairs = []
    for idx, word in enumerate(corpus):
        for j in range(max(0, idx-window_size), min(len(corpus), idx+window_size+1)):
            if j != idx:
                pairs.append((word, corpus[j]))
    return pairs

In [4]:
pairs = make_pairs(corpus)

In [5]:
pairs

[('the', 'cat'),
 ('the', 'sat'),
 ('cat', 'the'),
 ('cat', 'sat'),
 ('cat', 'on'),
 ('sat', 'the'),
 ('sat', 'cat'),
 ('sat', 'on'),
 ('sat', 'the'),
 ('on', 'cat'),
 ('on', 'sat'),
 ('on', 'the'),
 ('on', 'mat'),
 ('the', 'sat'),
 ('the', 'on'),
 ('the', 'mat'),
 ('mat', 'on'),
 ('mat', 'the')]

In [10]:
# 3. Define Skip-Gram model
class SkipGram(nn.Module):
    def __init__(self,vocab_size,embed_dim):
        super().__init__()
        self.embeddings=nn.Embedding(vocab_size,embed_dim)
        self.output = nn.Linear(embed_dim,vocab_size)

    def forward(self,target_word):
        embed= self.embeddings(target_word)
        out = self.output(embed)
        return out

In [11]:
# 4. Training
vocab_size = len(vocab)
embed_dim = 10
model = SkipGram(vocab_size, embed_dim)
loss_fn = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)

for epoch in range(100):
    total_loss = 0
    for target, context in pairs:
        target_idx = torch.tensor([word_to_ix[target]])
        context_idx = torch.tensor([word_to_ix[context]])

        # Forward
        logits = model(target_idx)
        loss = loss_fn(logits, context_idx)

        # Backward
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
    if epoch % 20 == 0:
        print(f"Epoch {epoch}, Loss: {total_loss:.4f}")

Epoch 0, Loss: 34.0285
Epoch 20, Loss: 22.0915
Epoch 40, Loss: 21.7102
Epoch 60, Loss: 21.6059
Epoch 80, Loss: 21.5587


In [12]:
# 5. Inspect embeddings
for word in vocab:
    print(word, model.embeddings.weight[word_to_ix[word]].detach())

on tensor([-0.7967, -0.5410,  0.0294,  2.1963, -1.1211, -0.7904,  0.1418, -0.7822,
        -0.1583, -1.1755])
sat tensor([ 2.7197,  0.3240,  0.3842, -0.4717,  0.1753, -0.8582,  1.2868,  1.5115,
        -1.5871, -0.6233])
cat tensor([ 0.4154,  2.1928, -0.1114,  0.5088, -1.0117,  1.4757,  1.3855, -0.7298,
        -0.4603,  1.4506])
the tensor([ 1.2610, -2.0358, -0.0401,  0.2600,  0.8264, -0.5929, -1.1368, -1.7898,
        -0.3159,  2.3082])
mat tensor([ 2.1416,  0.2538, -3.6436, -1.3006,  1.9636,  0.7277,  0.3724, -1.3204,
        -0.4389, -1.2747])


In [None]:
! pip install gensim

In [None]:
!pip install staticvectors

In [22]:
from staticvectors import StaticVectors

model = StaticVectors("neuml/glove-6B")
model.embeddings(["word"])

config.json:   0%|          | 0.00/149 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/480M [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/7.69M [00:00<?, ?B/s]

array([[-7.42449462e-02, -7.00803548e-02, -3.15346085e-02,
        -2.16475874e-02,  7.50281801e-03, -3.66014689e-02,
        -6.29846826e-02,  5.62374946e-03, -1.86802745e-02,
        -2.33537942e-01, -3.30406912e-02,  3.27370614e-02,
        -7.54853562e-02,  4.19857614e-02,  5.11965081e-02,
        -7.39861699e-03, -8.52999184e-03,  6.55465797e-02,
         9.60564241e-03,  1.85284577e-02, -2.62141451e-02,
         4.81343101e-04, -8.62677395e-03,  2.92383898e-02,
        -3.07410229e-02, -3.01027074e-02,  3.11671440e-02,
        -6.06901906e-02,  7.56302625e-02,  4.12939638e-02,
        -2.10472252e-02, -2.35159602e-03, -4.82861288e-02,
         7.75469467e-02, -1.13447987e-01, -1.33385919e-02,
         1.19461976e-02, -1.33699909e-01, -7.50609562e-02,
         2.79082749e-02, -1.63681861e-02,  4.47995365e-02,
        -6.26862282e-03,  3.47296447e-02,  2.90865730e-02,
        -7.52455518e-02, -3.74364574e-03,  1.15157655e-02,
        -6.09610416e-02,  6.13837093e-02,  7.42587522e-0

In [25]:
import numpy as np

v1 = model.embeddings(["cat"])[0]   # returns array inside a list
v2 = model.embeddings(["dog"])[0]

cosine = np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2))
print(cosine)  # closer to 1 if similar

0.6816748


In [26]:

v3 = model.embeddings(["cat"])[0]   # returns array inside a list
v4 = model.embeddings(["car"])[0]

cosine = np.dot(v3, v4) / (np.linalg.norm(v3) * np.linalg.norm(v4))
print(cosine)  # closer to 1 if similar

0.14687802
