In [1]:
import numpy as np

# Toy training corpus: each string is one sentence of space-separated words.
# NOTE(review): "the dog barked at the cat" is listed twice below -- confirm the
# repeat is intentional rather than a copy-paste slip.
corpus = [
    "the cat sat on the mat",
    "the dog sat on the mat",
    "the cat chased the dog",
    "the dog barked at the cat",
    "the dog barked at the cat",
    "the cat meowed back at the dog"
]



# Tokenize

In [2]:
# Whitespace-split every sentence of the corpus into its list of word tokens.
tokens = list(map(str.split, corpus))
tokens

[['the', 'cat', 'sat', 'on', 'the', 'mat'],
 ['the', 'dog', 'sat', 'on', 'the', 'mat'],
 ['the', 'cat', 'chased', 'the', 'dog'],
 ['the', 'dog', 'barked', 'at', 'the', 'cat'],
 ['the', 'dog', 'barked', 'at', 'the', 'cat'],
 ['the', 'cat', 'meowed', 'back', 'at', 'the', 'dog']]

In [3]:
# Sorted unique words: a word's position in `vocab` is its integer id.
vocab = sorted({word for sentence in tokens for word in sentence})
word_to_ix = dict(zip(vocab, range(len(vocab))))   # word -> id
ix_to_word = dict(enumerate(vocab))                # id -> word
vocab_size = len(vocab)

print(vocab)

['at', 'back', 'barked', 'cat', 'chased', 'dog', 'mat', 'meowed', 'on', 'sat', 'the']


# Generate target context pairs

In [4]:
def generate_training_data(tokens, window_size=2, vocab_index=None):
    """Build skip-gram (target, context) index pairs from tokenized sentences.

    For every word position ``i`` in every sentence, each other position at most
    ``window_size`` words to the left or right contributes one
    ``[target_id, context_id]`` row. Windows are clipped at the sentence
    boundaries, so pairs never span two sentences.

    Args:
        tokens: Sentences, each a sequence of word strings.
        window_size: Maximum distance between a target word and a context word.
        vocab_index: Optional word -> integer id mapping. When omitted, the
            notebook-level ``word_to_ix`` mapping is used.

    Returns:
        Integer ``np.ndarray`` of shape ``(n_pairs, 2)``. With no pairs the
        shape is ``(0, 2)``, so callers can always unpack rows as
        ``target, context``.

    Raises:
        KeyError: If a word is missing from the mapping.
    """
    mapping = word_to_ix if vocab_index is None else vocab_index
    training_data = []
    for sentence in tokens:
        for i, word in enumerate(sentence):
            target = mapping[word]
            # Clip the context window to the sentence boundaries.
            first = max(0, i - window_size)
            last = min(len(sentence), i + window_size + 1)
            for j in range(first, last):
                if j != i:  # a word is never its own context
                    training_data.append([target, mapping[sentence[j]]])
    # The explicit dtype and reshape keep an integer two-column array even when
    # there are no pairs (a bare np.array([]) would be float with shape (0,)).
    return np.array(training_data, dtype=int).reshape(-1, 2)


# Build every (target, context) id pair, looking two words to each side.
training_data = generate_training_data(tokens, 2)
# Peek at the first five pairs.
print("Sample Data: ", training_data[:5])

Sample Data:  [[10  3]
 [10  9]
 [ 3 10]
 [ 3  9]
 [ 3  8]]


# Define a model (2-layer NN)

When we train a word2vec skip-gram model, we are training a very small 2-layer neural network:

input (one hot word) -> W1 -> hidden layer -> W2 -> output (soft max probabilities)

input layer: one hot vector of size vocab_size say 5 -> ```[0, 0, 1, 0, 0]```
hidden layer: (embedding layer)

```
hidden = W1.T @ input
```

this gives 10 dimensional embedding (since embedding_dim is 10)

Output layer: 
```
scores = W2.T @ hidden
```

now the softmax step
np.exp(x) means applying the exponential function $e^x$ to each element of `x`
- *e* is a mathematical constant ≈ 2.71828
- It shows up everywhere in growth, decay, and probabilities.

```
x = np.array([1, 2, 3])
np.exp(x)
```

gives `[2.718, 7.389, 20.085]`

Why `x - np.max(x)`? For numerical stability: it avoids huge exponentials that could cause overflow.
It doesn't change the resulting probabilities, because the common factor $e^{-\max(x)}$ cancels between the numerator and the denominator.

-- Normalization ---

```
e_x = np.exp(x - np.max(x))
softmax = e_x / e_x.sum(axis=0)

```
This divides each exponential by the sum of all exponentials, so every output is positive and the outputs sum to 1;
subtracting the max beforehand keeps `np.exp` from overflowing.

We use the exponential function $e^x$ because it has special mathematical properties that make it well suited to converting any real numbers into positive, smooth, and proportionally scaled values that behave well in optimization. Using a different base such as $2^x$ would only rescale the scores by a constant ($2^x = e^{x \ln 2}$) - it wouldn't change which outputs rank highest - so $e^x$ is chosen for mathematical convenience and smoothness in gradients.



In [5]:
embedding_dim = 10

# Seeded generator so a fresh "Restart & Run All" starts from the same weights
# every time; the values are still uniform in [0, 1) as before.
rng = np.random.default_rng(42)
W1 = rng.random((vocab_size, embedding_dim))   # input -> hidden: one embedding row per word
W2 = rng.random((embedding_dim, vocab_size))   # hidden -> output: one score column per word

def softmax(x):
    """Turn a vector of scores into probabilities that sum to 1.

    The largest score is subtracted before exponentiating so ``np.exp`` cannot
    overflow; the shift cancels in the ratio and leaves the result unchanged.
    Normalization is along axis 0.
    """
    shifted = x - np.max(x)
    exps = np.exp(shifted)
    return exps / np.sum(exps, axis=0)


Training Loop

what's inside every training loop?
- forward pass - np.dot, np.matmul, activations like softmax, sigmoid - compute predictions
- loss calculations - np.sum, np.log etc - compare predictions vs target
- backward pass (gradient) - np.dot, np.outer, chain rule - compute how much each weight contributed to error
- weight update W -= lr * dW - Adjust weights to reduce loss
- repeat (epochs) - loop over data




x = one-hot (shape = vocab_size × 1)

W1 = (vocab_size × embedding_dim)

W1.T @ x = (embedding_dim × 1) → gives hidden representation (embedding)

W2.T @ h = (vocab_size × 1) → gives scores for all context words

softmax turns scores into probabilities

In [6]:
learning_rate = 0.01
epochs = 5000

for epoch in range(epochs):
    loss = 0  # cross-entropy summed over every training pair in this epoch
    for target, context in training_data:
        # One-hot vectors: x selects the input word, y_true marks the word to predict.
        x = np.zeros(vocab_size)
        y_true = np.zeros(vocab_size)
        x[target] = 1
        y_true[context] = 1

        # Forward pass: embedding lookup -> vocabulary scores -> probabilities.
        h = np.dot(W1.T, x)
        u = np.dot(W2.T, h)
        y_pred = softmax(u)

        # Cross-entropy for this pair; the 1e-9 keeps log() away from zero.
        loss -= np.sum(y_true * np.log(y_pred + 1e-9))

        # Backward pass: e is the prediction error on the output scores.
        e = y_pred - y_true
        dW1 = np.outer(x, np.dot(W2, e))
        dW2 = np.outer(h, e)

        # Gradient-descent step (both gradients were computed before either update).
        W1 -= learning_rate * dW1
        W2 -= learning_rate * dW2

    # Report the epoch's summed loss once every 1000 epochs.
    if not epoch % 1000:
        print(f"Epoch {epoch}, Loss: {loss:.4f}")

Epoch 0, Loss: 258.1971
Epoch 1000, Loss: 181.4201
Epoch 2000, Loss: 181.4935
Epoch 3000, Loss: 181.5862
Epoch 4000, Loss: 181.6640


In [7]:
def get_embedding(word):
    """Return the embedding of ``word``: its row of the input weight matrix ``W1``.

    Raises ``KeyError`` if ``word`` is not in ``word_to_ix``.
    """
    row = word_to_ix[word]
    return W1[row]

print("Embedding for 'cat':", get_embedding("cat"))


Embedding for 'cat': [ 0.26305422  0.52859225  0.23922563 -0.71310849 -0.1423374  -1.1915748
  1.35098558  1.04178593 -0.55792743  0.11334474]


In [8]:
def cosine_similarity(v1, v2):
    """Return the cosine of the angle between vectors ``v1`` and ``v2``.

    The result lies in [-1, 1]: 1 for the same direction, 0 for orthogonal
    vectors, -1 for opposite directions. If either vector has zero length the
    angle is undefined; 0.0 is returned instead of dividing by zero (which
    would yield NaN plus a RuntimeWarning).
    """
    denom = np.linalg.norm(v1) * np.linalg.norm(v2)
    if denom == 0:
        return 0.0
    return np.dot(v1, v2) / denom

def most_similar(word, top_n=3):
    """Return the ``top_n`` vocabulary words closest to ``word``.

    Every other word in ``vocab`` is scored by the cosine similarity between
    its embedding and the embedding of ``word``. The result is a list of
    ``(word, similarity)`` tuples, highest similarity first.
    """
    query = get_embedding(word)
    scored = [
        (candidate, cosine_similarity(query, get_embedding(candidate)))
        for candidate in vocab
        if candidate != word
    ]
    ranked = sorted(scored, key=lambda pair: pair[1], reverse=True)
    return ranked[:top_n]

print("Most similar to 'cat':", most_similar("cat"))
print("Most similar to 'dog':", most_similar("dog"))


Most similar to 'cat': [('dog', np.float64(0.3713103907085415)), ('back', np.float64(0.36447181396005784)), ('meowed', np.float64(0.3228419668222796))]
Most similar to 'dog': [('cat', np.float64(0.3713103907085415)), ('mat', np.float64(0.22315184031014162)), ('barked', np.float64(0.18621481145375948))]
