#The preprocessing steps, forward pass, and output probabilities.


In [3]:
# implementation of the Word2Vec Skip-Gram model for the given corpus "the weather is windy" in Python using NumPy.

import numpy as np
from collections import defaultdict
from sklearn.preprocessing import OneHotEncoder

# Corpus and preprocessing
corpus = "the weather is windy"
words = corpus.split()

# Create vocabulary and word index mapping
vocab = list(set(words))
word_to_idx = {word: i for i, word in enumerate(vocab)}
idx_to_word = {i: word for i, word in enumerate(vocab)}

# One-hot encoding
onehot_encoder = OneHotEncoder(sparse=False)
onehot_encoder.fit(np.array(vocab).reshape(-1, 1))

# Function to generate training pairs (skip-gram model)
def generate_training_data(corpus, window_size):
    words = corpus.split()
    training_pairs = []
    for i, word in enumerate(words):
        for j in range(max(0, i - window_size), min(len(words), i + window_size + 1)):
            if i != j:
                training_pairs.append((word, words[j]))
    return training_pairs

# Generate training data
window_size = 1
training_pairs = generate_training_data(corpus, window_size)
print("Training pairs:", training_pairs)

# Initialize parameters
vocab_size = len(vocab)
embedding_dim = 2
W = np.random.rand(vocab_size, embedding_dim)
W_prime = np.random.rand(embedding_dim, vocab_size)

# Forward pass function
def forward(input_word):
    x = onehot_encoder.transform([[input_word]])
    h = np.dot(W.T, x.T).reshape(-1)
    u = np.dot(W_prime.T, h)
    y_pred = np.exp(u) / np.sum(np.exp(u))
    return y_pred

# Forward pass for a specific pair (weather, the)
input_word = "weather"
target_word = "the"

y_pred = forward(input_word)
target_idx = word_to_idx[target_word]

print("Input word:", input_word)
print("Target word:", target_word)
print("Predicted probabilities:", y_pred)
print("Target index:", target_idx)
print("Probability of target word being 'the':", y_pred[target_idx])

# Run forward pass for all training pairs and print results
for input_word, target_word in training_pairs:
    y_pred = forward(input_word)
    target_idx = word_to_idx[target_word]
    print(f"Input word: {input_word}, Target word: {target_word}, Probability: {y_pred[target_idx]:.4f}")

Training pairs: [('the', 'weather'), ('weather', 'the'), ('weather', 'is'), ('is', 'weather'), ('is', 'windy'), ('windy', 'is')]
Input word: weather
Target word: the
Predicted probabilities: [0.32459592 0.24121418 0.15888091 0.27530899]
Target index: 0
Probability of target word being 'the': 0.32459592098276485
Input word: the, Target word: weather, Probability: 0.2184
Input word: weather, Target word: the, Probability: 0.3246
Input word: weather, Target word: is, Probability: 0.1589
Input word: is, Target word: weather, Probability: 0.2265
Input word: is, Target word: windy, Probability: 0.2894
Input word: windy, Target word: is, Probability: 0.2358




### Explanation of the Code
1. **Corpus and Preprocessing**: The corpus is split into words. A vocabulary and mapping from words to indices are created.
2. **One-Hot Encoding**: The words are one-hot encoded using `OneHotEncoder` from `sklearn`.
3. **Generate Training Data**: A function generates training pairs using a context window size.
4. **Initialize Parameters**: Random weights for `W` (input-to-hidden) and `W'` (hidden-to-output) matrices are initialized.
5. **Forward Pass Function**: The forward pass calculates the hidden layer activations `h`, output layer pre-activations `u`, and softmax probabilities `y_pred`.
6. **Forward Pass for Specific Pair**: The code performs a forward pass for the pair ("weather", "the") and prints the predicted probabilities.
7. **Loop Over Training Pairs**: The forward pass is run for all training pairs to print the predicted probabilities for each target word.

This code provides a basic implementation of the Skip-Gram model and demonstrates the forward propagation step for the given corpus.

# Forward Pass + Backward Pass

In [5]:
import numpy as np
from sklearn.preprocessing import OneHotEncoder

# Corpus and preprocessing
corpus = "the weather is windy"
words = corpus.split()

# Create vocabulary and word index mapping
vocab = list(set(words))
word_to_idx = {word: i for i, word in enumerate(vocab)}
idx_to_word = {i: word for i, word in enumerate(vocab)}

# One-hot encoding
onehot_encoder = OneHotEncoder(sparse_output=False)
onehot_encoder.fit(np.array(vocab).reshape(-1, 1))

# Function to generate training pairs (skip-gram model)
def generate_training_data(corpus, window_size):
    words = corpus.split()
    training_pairs = []
    for i, word in enumerate(words):
        for j in range(max(0, i - window_size), min(len(words), i + window_size + 1)):
            if i != j:
                training_pairs.append((word, words[j]))
    return training_pairs

# Generate training data
window_size = 1
training_pairs = generate_training_data(corpus, window_size)
print("Training pairs:", training_pairs)

# Initialize parameters
vocab_size = len(vocab)
embedding_dim = 2
W = np.random.randn(vocab_size, embedding_dim)
W_prime = np.random.randn(embedding_dim, vocab_size)
learning_rate = 0.01

# Forward and backward pass functions
def forward(input_word):
    x = onehot_encoder.transform([[input_word]]).reshape(-1)
    h = np.dot(W.T, x)
    u = np.dot(W_prime.T, h)
    y_pred = np.exp(u) / np.sum(np.exp(u))
    return x, h, u, y_pred

def backward(x, h, y_pred, target_word_idx):
    # One-hot encoding of the target word
    y_true = np.zeros(vocab_size)
    y_true[target_word_idx] = 1

    # Calculate error
    e = y_pred - y_true

    # Gradients
    dW_prime = np.outer(h, e)
    dh = np.dot(W_prime, e)
    dW = np.outer(x, dh)

    return dW, dW_prime

# Training the model
for epoch in range(1000):
    for input_word, target_word in training_pairs:
        x, h, u, y_pred = forward(input_word)
        target_word_idx = word_to_idx[target_word]
        dW, dW_prime = backward(x, h, y_pred, target_word_idx)

        # Update weights
        W -= learning_rate * dW
        W_prime -= learning_rate * dW_prime

    if epoch % 100 == 0:
        print(f"Epoch {epoch} completed")

# Check the trained embeddings
for word in vocab:
    idx = word_to_idx[word]
    print(f"Word: {word}, Embedding: {W[idx]}")

# Run forward pass for all training pairs and print results
for input_word, target_word in training_pairs:
    _, _, _, y_pred = forward(input_word)
    target_idx = word_to_idx[target_word]
    print(f"Input word: {input_word}, Target word: {target_word}, Probability: {y_pred[target_idx]:.4f}")


Training pairs: [('the', 'weather'), ('weather', 'the'), ('weather', 'is'), ('is', 'weather'), ('is', 'windy'), ('windy', 'is')]
Epoch 0 completed
Epoch 100 completed
Epoch 200 completed
Epoch 300 completed
Epoch 400 completed
Epoch 500 completed
Epoch 600 completed
Epoch 700 completed
Epoch 800 completed
Epoch 900 completed
Word: the, Embedding: [-0.59357182 -0.75520178]
Word: weather, Embedding: [-2.19560021 -1.1263265 ]
Word: is, Embedding: [0.88790898 1.30578802]
Word: windy, Embedding: [ 2.20447211 -0.01149224]
Input word: the, Target word: weather, Probability: 0.8956
Input word: weather, Target word: the, Probability: 0.4860
Input word: weather, Target word: is, Probability: 0.5087
Input word: is, Target word: weather, Probability: 0.6151
Input word: is, Target word: windy, Probability: 0.3522
Input word: windy, Target word: is, Probability: 0.9825


### Explanation of the Code

1. **Corpus and Preprocessing**: The corpus is split into words. A vocabulary and mappings from words to indices are created.
2. **One-Hot Encoding**: The words are one-hot encoded using `OneHotEncoder` from `sklearn`.
3. **Generate Training Data**: A function generates training pairs using a context window size.
4. **Initialize Parameters**: Random weights for `W` (input-to-hidden) and `W'` (hidden-to-output) matrices are initialized.
5. **Forward Pass Function**: The forward pass calculates the hidden layer activations `h`, output layer pre-activations `u`, and softmax probabilities `y_pred`.
6. **Backward Pass Function**: The backward pass calculates the gradients of the loss with respect to the weights `W` and `W'`.
7. **Training Loop**: The model is trained for a number of epochs using stochastic gradient descent. The weights `W` and `W'` are updated using the calculated gradients.
8. **Check Trained Embeddings**: The trained embeddings for each word in the vocabulary are printed.
9. **Run Forward Pass for All Training Pairs**: The forward pass is run for all training pairs to print the predicted probabilities for each target word.

This code provides a basic implementation of training a Skip-Gram model using forward and backward propagation steps.