In [1]:
import numpy as np
from collections import Counter
import random

np.random.seed(1)
random.seed(1)

# ---------- Toy corpus ----------
sentences = [
    "NLP is fun and exciting",
    "We are learning natural language processing",
    "Machine learning powers modern NLP applications",
    "Natural language processing is a fascinating field",
    "Deep learning improves NLP performance",
    "We enjoy exploring text mining techniques",
    "AI is transforming language understanding"
]
tokens = [w.lower() for s in sentences for w in s.split()]
vocab_counter = Counter(tokens)
vocab = list(vocab_counter.keys())
word2idx = {w:i for i,w in enumerate(vocab)}
idx2word = {i:w for w,i in word2idx.items()}



In [2]:
# ---------- Generate (context_list, target) pairs ----------
window = 2
pairs = []
for s in sentences:
    words = s.lower().split()
    for i, target in enumerate(words):
        start = max(0, i - window)
        end   = min(len(words), i + window + 1)
        context_indices = [word2idx[words[j]] for j in range(start, end) if j != i]
        target_idx = word2idx[target]
        pairs.append((context_indices, target_idx))

print("Vocab size:", len(vocab), "training pairs:", len(pairs))



Vocab size: 29 training pairs: 40


In [3]:
# ---------- Negative sampling distribution ----------
counts = np.array([vocab_counter[idx2word[i]] for i in range(len(vocab))], dtype=np.float64)
probs = counts ** 0.75
probs /= probs.sum()

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

# ---------- Initialize embeddings ----------
V = len(vocab)
D = 50
lr = 0.05
K = 5  # negatives per positive

W_in  = (np.random.rand(V, D) - 0.5) / D
W_out = (np.random.rand(V, D) - 0.5) / D

# ---------- Training loop ----------
epochs = 5
for epoch in range(epochs):
    random.shuffle(pairs)
    loss_epoch = 0.0
    for context_indices, target_idx in pairs:
        # Average the context word embeddings
        v_context = np.mean(W_in[context_indices], axis=0)  # shape (D,)

        # Positive sample
        score_pos = sigmoid(np.dot(W_out[target_idx], v_context))
        loss_epoch += -np.log(score_pos + 1e-10)

        # Gradients for positive
        grad_u_t = (score_pos - 1.0) * v_context
        grad_v_context = (score_pos - 1.0) * W_out[target_idx]
        W_out[target_idx] -= lr * grad_u_t

        # Negative samples
        neg_samples = np.random.choice(V, size=K, p=probs)
        for neg in neg_samples:
            score_neg = sigmoid(np.dot(W_out[neg], v_context))
            loss_epoch += -np.log(1.0 - score_neg + 1e-10)
            grad_u_neg = score_neg * v_context
            grad_v_context += score_neg * W_out[neg]
            W_out[neg] -= lr * grad_u_neg

        # Update each context word
        for c_idx in context_indices:
            W_in[c_idx] -= lr * (grad_v_context / len(context_indices))

    print(f"Epoch {epoch+1}/{epochs}, loss ~ {loss_epoch:.3f}")



Epoch 1/5, loss ~ 166.354
Epoch 2/5, loss ~ 166.350
Epoch 3/5, loss ~ 166.348
Epoch 4/5, loss ~ 166.339
Epoch 5/5, loss ~ 166.339


In [4]:
# ---------- Nearest neighbors ----------
def cosine_sim_matrix(mat):
    norms = np.linalg.norm(mat, axis=1, keepdims=True)
    normed = mat / (norms + 1e-10)
    return np.dot(normed, normed.T)

sim = cosine_sim_matrix(W_in)

def topk(word, k=5):
    if word not in word2idx:
        return []
    i = word2idx[word]
    scores = sim[i]
    top = np.argsort(-scores)
    return [(idx2word[t], float(scores[t])) for t in top[1:k+1]]

# Examples
for w in ["nlp", "learning", "language", "deep"]:
    print("\nNearest to", w, "->", topk(w, k=5))



Nearest to nlp -> [('powers', 0.3482710520408113), ('understanding', 0.29657599526704337), ('are', 0.28750537555905237), ('improves', 0.2865388272223649), ('techniques', 0.27809960584268634)]

Nearest to learning -> [('is', 0.4903240842818587), ('fun', 0.40522353027210717), ('powers', 0.347750950763279), ('exciting', 0.31798843211291145), ('language', 0.31298151894059356)]

Nearest to language -> [('learning', 0.31298151894059356), ('natural', 0.2803305050258882), ('we', 0.25962044106488374), ('is', 0.2531892627811632), ('transforming', 0.227953075888446)]

Nearest to deep -> [('understanding', 0.24393558308365818), ('learning', 0.1600076413356034), ('and', 0.13360308293215115), ('is', 0.10950010421780249), ('improves', 0.0991060483339744)]
