In [1]:
import numpy as np
from collections import Counter
from math import exp
import random

# Seed both RNGs used by this notebook (NumPy and the stdlib `random` module).
np.random.seed(1)
random.seed(1)

# ---------- Toy corpus ----------
sentences = [
    "NLP is fun and exciting",
    "We are learning natural language processing",
    "Machine learning powers modern NLP applications",
    "Natural language processing is a fascinating field",
    "Deep learning improves NLP performance",
    "We enjoy exploring text mining techniques",
    "AI is transforming language understanding"
]

# Whitespace-split, lower-cased tokens of the whole corpus in reading order.
tokens = [word.lower() for word in " ".join(sentences).split()]

# Word frequencies; Counter keeps keys in first-seen order, so a word's
# vocabulary index is the order in which it first appears in the corpus.
vocab_counter = Counter(tokens)
vocab = list(vocab_counter)

# Lookup tables in both directions: word -> index and index -> word.
word2idx = {word: index for index, word in enumerate(vocab)}
idx2word = dict(enumerate(vocab))


In [2]:
# ---------- Build skip-gram training pairs: (center index, context index) ----------
# Every word is paired with each neighbour at most `window` positions away in
# the same sentence; windows are truncated at sentence boundaries.
window = 2
pairs = []
for sentence in sentences:
    ids = [word2idx[token] for token in sentence.lower().split()]
    for pos, center_idx in enumerate(ids):
        first = max(0, pos - window)
        stop = min(len(ids), pos + window + 1)
        # Left-to-right over the window, skipping the center position itself.
        pairs.extend(
            (center_idx, ids[ctx_pos])
            for ctx_pos in range(first, stop)
            if ctx_pos != pos
        )

print("Vocab size:", len(vocab), "training pairs:", len(pairs))



Vocab size: 29 training pairs: 118


In [3]:
# ---------- Noise distribution for negative sampling: count(w)^0.75, normalised ----------
# counts[i] is the corpus frequency of the word with vocabulary index i.
counts = np.array([vocab_counter[word] for word in vocab], dtype=np.float64)
# Raising to 0.75 flattens the unigram distribution before normalising to sum 1.
smoothed = np.power(counts, 0.75)
probs = smoothed / smoothed.sum()

def sigmoid(x):
    """Logistic function 1 / (1 + exp(-x)), elementwise for arrays.

    Parameters
    ----------
    x : scalar or array-like of numbers

    Returns
    -------
    Same shape as `x`, with values in [0, 1].

    Notes
    -----
    For very negative `x`, exp(-x) overflows to +inf. The quotient
    1 / (1 + inf) is then exactly 0.0, which is the correct limit, so the
    overflow is harmless. It is silenced locally so callers do not get a
    RuntimeWarning (or a FloatingPointError when overflow is set to raise).
    """
    with np.errstate(over="ignore"):
        return 1.0 / (1.0 + np.exp(-x))

# ---------- Initialize embeddings ----------
V = len(vocab)
D = 50         # embedding dimension (small for demo)
lr = 0.05      # SGD step size
K = 5          # negatives per positive

# Both tables start as small uniform noise in [-0.5/D, 0.5/D).
W_in = (np.random.rand(V, D) - 0.5) / D   # input embeddings (v)
W_out = (np.random.rand(V, D) - 0.5) / D  # output embeddings (u)

# ---------- Training loop ----------
# Skip-gram with negative sampling, one SGD step per (center, context) pair.
# Per-pair loss: -log sigmoid(u_o . v_c) - sum_n log(1 - sigmoid(u_n . v_c)).
epochs = 5

# Shuffle a private copy so the `pairs` list built by the previous cell is
# never reordered in place; re-running this cell then always starts from the
# same pair order instead of from whatever the last run left behind.
train_pairs = list(pairs)

for epoch in range(epochs):
    random.shuffle(train_pairs)
    loss_epoch = 0.0   # summed (not averaged) loss over all pairs and negatives
    for center_idx, context_idx in train_pairs:
        # Row views into the tables; gradients below are computed from them
        # before the corresponding rows are updated.
        v_c = W_in[center_idx]          # D
        u_o = W_out[context_idx]        # D

        # positive score
        score_pos = sigmoid(np.dot(u_o, v_c))
        loss_epoch += -np.log(score_pos + 1e-10)   # 1e-10 guards against log(0)

        # negative samples, drawn from the unigram^0.75 distribution `probs`
        neg_samples = np.random.choice(V, size=K, p=probs)
        # Negatives are not filtered, so a draw may coincide with the true
        # context word; in practice you filter it.
        # compute gradients
        grad_v = (score_pos - 1.0) * u_o    # start with pos grad contribution
        grad_uo = (score_pos - 1.0) * v_c
        # update output embedding for positive
        W_out[context_idx] -= lr * grad_uo

        for neg in neg_samples:
            u_n = W_out[neg]
            score_neg = sigmoid(np.dot(u_n, v_c))
            loss_epoch += -np.log(1.0 - score_neg + 1e-10)
            grad_un = score_neg * v_c
            # Accumulate into grad_v before W_out[neg] (and thus u_n) is updated.
            grad_v += score_neg * u_n
            W_out[neg] -= lr * grad_un

        # update center embedding after accumulating all contributions
        W_in[center_idx] -= lr * grad_v

    print(f"Epoch {epoch+1}/{epochs}, loss ~ {loss_epoch:.3f}")



Epoch 1/5, loss ~ 490.734
Epoch 2/5, loss ~ 490.699
Epoch 3/5, loss ~ 490.609
Epoch 4/5, loss ~ 490.414
Epoch 5/5, loss ~ 489.912


In [4]:
# ---------- Utility: nearest neighbors by cosine similarity ----------
def cosine_sim_matrix(mat):
    """Return the matrix of cosine similarities between all pairs of rows of `mat`.

    Each row is scaled by 1 / (its Euclidean norm + 1e-10); the small constant
    keeps an all-zero row from dividing by zero (such a row gets similarity 0
    with every row, including itself). Entry [i, j] of the result is the dot
    product of scaled rows i and j.
    """
    row_lengths = np.linalg.norm(mat, axis=1, keepdims=True)
    unit_rows = mat / (row_lengths + 1e-10)
    return np.dot(unit_rows, unit_rows.T)

# Pairwise cosine similarities between the rows of the input embedding table W_in.
# This is a snapshot: it is computed once here and read by topk() below, so it
# must be recomputed after W_in changes (e.g. after retraining).
sim = cosine_sim_matrix(W_in)

def topk(word, k=5):
    """Return the `k` vocabulary words most similar to `word`.

    Reads the module-level `word2idx`, `idx2word` and similarity matrix `sim`.

    Parameters
    ----------
    word : str
        Query word. Words missing from `word2idx` yield an empty list.
    k : int, default 5
        Maximum number of neighbours to return; `k <= 0` yields an empty list.

    Returns
    -------
    list of (str, float)
        (neighbour, similarity) tuples in descending similarity, never
        including the query word itself.
    """
    if word not in word2idx or k <= 0:
        return []
    i = word2idx[word]
    scores = sim[i]
    # Stable sort keeps tied words in vocabulary order, so output is deterministic.
    order = np.argsort(-scores, kind="stable")
    # Drop the query word by index rather than assuming it is ranked first:
    # with ties, duplicate vectors, or a zero-norm embedding (self-similarity 0)
    # another word can outrank it.
    neighbours = [t for t in order if t != i][:k]
    return [(idx2word[t], float(scores[t])) for t in neighbours]

# Example queries: print the five nearest neighbours of a few vocabulary words.
for query in ("nlp", "learning", "language", "deep"):
    nearest = topk(query, k=5)
    print("\nNearest to", query, "->", nearest)



Nearest to nlp -> [('learning', 0.8811321971428834), ('is', 0.864622500598743), ('powers', 0.843675401786075), ('language', 0.8383270172722006), ('text', 0.7946848200413338)]

Nearest to learning -> [('is', 0.9315716235518363), ('nlp', 0.8811321971428834), ('powers', 0.849972921736926), ('language', 0.8485659891260925), ('fun', 0.8311395996525063)]

Nearest to language -> [('is', 0.8656354530129817), ('learning', 0.8485659891260925), ('nlp', 0.8383270172722006), ('natural', 0.8232605197340233), ('a', 0.7699022142049373)]

Nearest to deep -> [('is', 0.5538959677290589), ('learning', 0.5339860678717774), ('understanding', 0.526072785198946), ('we', 0.48552356157918825), ('language', 0.4761385875987834)]
