In [27]:
import numpy as np

# Define a small corpus
corpus = [
    "travel is good for life",
    "Machine learning is great",
    "I love coding in Python",
    "Python is great for data science",
    "Data science and machine learning"
]

# Preprocessing: Tokenizing and building a vocabulary
def preprocess(corpus):
    sentences = [sentence.lower().split() for sentence in corpus]
    vocabulary = sorted(set(word for sentence in sentences for word in sentence))  # Sorted for consistency
    word2idx = {word: idx for idx, word in enumerate(vocabulary)}
    idx2word = {idx: word for word, idx in word2idx.items()}
    return sentences, word2idx, idx2word

sentences, word2idx, idx2word = preprocess(corpus)
vocab_size = len(word2idx)

# Function to generate training data (Skip-Gram pairs)
def generate_training_data(sentences, word2idx, window_size=2):
    training_pairs = []
    for sentence in sentences:
        indices = [word2idx[word] for word in sentence]
        for center_pos, center_word in enumerate(indices):
            # Define context window range
            start = max(center_pos - window_size, 0)
            end = min(center_pos + window_size + 1, len(indices))
            
            for context_pos in range(start, end):
                if context_pos != center_pos:
                    training_pairs.append((center_word, indices[context_pos]))  # (center, context)
    
    return training_pairs

# Generate training data
window_size = 2
training_data = generate_training_data(sentences, word2idx, window_size)

len(training_data)

70

In [28]:
import numpy as np

class SimpleSkipGram:
    def __init__(self, vocab_size, embedding_dim, learning_rate=0.05):
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.learning_rate = learning_rate

        # Randomly initialize embeddings
        self.U = np.random.randn(vocab_size, embedding_dim) * 0.01  # Context matrix
        self.V = np.random.randn(vocab_size, embedding_dim) * 0.01  # Center matrix

    def softmax(self, x):
        """Compute softmax row-wise with numerical stability."""
        exp_x = np.exp(x - np.max(x, axis=1, keepdims=True))  # Stability trick
        return exp_x / np.sum(exp_x, axis=1, keepdims=True)

    def compute_loss_and_gradients(self, center_word, context_word):
        """
        Compute gradients for a single (center, context) word pair.
        center_word: index of center word
        context_word: index of context word
        """
        # Compute scores (dot product between all word pairs)
        scores = self.U @ self.V.T  # Shape: (vocab_size, vocab_size)
        P = self.softmax(scores)  # Softmax probabilities

        # Compute loss
        loss = -np.log(P[context_word, center_word])  # Negative log-likelihood

        # Compute gradients
        dV = np.zeros_like(self.V)
        dU = np.zeros_like(self.U)

        # Update center word vector gradient
        dV[center_word] = -self.U[context_word] + np.sum(P[:, center_word, np.newaxis] * self.U, axis=0)
        
        # Update context word vector gradient
        dU[context_word] = -self.V[center_word] + np.sum(P[context_word, :, np.newaxis] * self.V, axis=0)

        return loss, dU, dV

    def train(self, dataset, epochs=500):
        """
        Train model on word pairs (center, context).
        dataset: List of (center_word_index, context_word_index) pairs.
        """
        for epoch in range(epochs):
            total_loss = 0
            dU_total = np.zeros_like(self.U)
            dV_total = np.zeros_like(self.V)

            for center, context in dataset:
                loss, dU, dV = self.compute_loss_and_gradients(center, context)
                total_loss += loss
                dU_total += dU
                dV_total += dV

            # Gradient descent updates
            self.U -= self.learning_rate * dU_total
            self.V -= self.learning_rate * dV_total

            if epoch % 500 == 0:
                print(f"Epoch {epoch}, Loss: {total_loss:.4f}")

    def get_word_vectors(self):
        """Return learned word embeddings (average of U and V)."""
        return (self.U + self.V) / 2


embedding_dim = 10
# Initialize and train model
model = SimpleSkipGram(vocab_size, embedding_dim, learning_rate=0.1)
model.train(training_data, epochs=1000)

# Get trained word embeddings
word_vectors = model.get_word_vectors()
print("Final Word Embeddings:\n", word_vectors)

Epoch 0, Loss: 194.0781
Epoch 500, Loss: 139.3015
Final Word Embeddings:
 [[-3.70250586  3.04311431 -2.87079577  0.93378752 -3.37710457  0.30585486
  -7.88808373 -3.40512662  9.34577285 -0.04867573]
 [-1.94157789  4.08769833 -3.1746407   1.27255238 -7.16394152 -0.03277311
  -8.19499915 -1.41634568  7.88671122 -0.35021732]
 [-2.37052118  3.86808067 -2.02327506  0.83384659 -4.98637613  0.90389342
  -8.15112657 -2.80040922  8.62704307 -0.63626952]
 [-3.549503    4.81954789 -2.52086693  0.59116087 -5.80785836  0.28978125
  -7.9971631  -2.88677258  9.42812436 -0.40793367]
 [-3.1768224   4.67606979 -1.91007481  0.44906666 -5.83989496  0.62080776
  -7.58267682 -3.0241386   9.58930094 -0.35760592]
 [-3.21980031  4.64678263 -2.08728947  0.49263938 -5.87115679  0.57630485
  -7.73250517 -2.46385474  9.88980851 -0.58735502]
 [-0.61691286  3.41310629 -3.7676008   0.79689075 -8.11717915  1.14704039
  -8.45846825  0.20033015  8.54508355  0.77397014]
 [-0.71802307  4.73108789 -3.07392455  1.5254088  -

In [30]:
def get_embedding(word):
    """
    Retrieves the learned embedding for a given word.
    """
    return word_vectors[word2idx[word]]

def cosine_similarity(vec1, vec2):
    """
    Computes the cosine similarity between two word vectors.
    """
    return np.dot(vec1, vec2) / (np.linalg.norm(vec1) * np.linalg.norm(vec2))

# Step 7: Test Word Embeddings
print("\nWord Embedding Similarity Results:")
print(f"Similarity between 'machine' and 'python': {cosine_similarity(get_embedding('machine'), get_embedding('learning')):.4f}")
print(f"Similarity between 'travel' and 'learning': {cosine_similarity(get_embedding('travel'), get_embedding('python')):.4f}")
print(f"Word embedding for 'data': {get_embedding('data')}")


Word Embedding Similarity Results:
Similarity between 'machine' and 'python': 0.9994
Similarity between 'travel' and 'learning': 0.9691
Word embedding for 'data': [-2.37052118  3.86808067 -2.02327506  0.83384659 -4.98637613  0.90389342
 -8.15112657 -2.80040922  8.62704307 -0.63626952]


In [31]:
cosine_sim_matrix = lambda embeddings: (embeddings @ embeddings.T) / (np.linalg.norm(embeddings, axis=1, keepdims=True) @ np.linalg.norm(embeddings, axis=1, keepdims=True).T)
cos_sim_matrix = cosine_sim_matrix(model.get_word_vectors())  # Compute cosine similarity matrix
print(cos_sim_matrix)  # Shape: (vocab_size, vocab_size)


[[1.         0.94042464 0.98171998 0.98069883 0.97695953 0.97645927
  0.89392703 0.91509761 0.9749875  0.98571634 0.97246832 0.91492167
  0.98295956 0.95731231 0.98929305 0.95341077]
 [0.94042464 1.         0.97673518 0.97795519 0.97303567 0.97622313
  0.98055748 0.99126807 0.97989985 0.96992354 0.95109866 0.9924161
  0.97054779 0.98850313 0.96834845 0.94502817]
 [0.98171998 0.97673518 1.         0.99365536 0.99374614 0.99355454
  0.94165167 0.9656844  0.99317391 0.99175263 0.9817941  0.95735299
  0.99124326 0.98683114 0.99696885 0.96209684]
 [0.98069883 0.97795519 0.99365536 1.         0.99828126 0.99807437
  0.93676608 0.96174042 0.99724244 0.99874707 0.97417839 0.95787639
  0.99786251 0.99010397 0.99171664 0.97867969]
 [0.97695953 0.97303567 0.99374614 0.99828126 1.         0.9989772
  0.93462841 0.96069924 0.99830544 0.99757484 0.97339091 0.95590927
  0.99816774 0.98855815 0.99219863 0.97769764]
 [0.97645927 0.97622313 0.99355454 0.99807437 0.9989772  1.
  0.94131453 0.96483019 0.9

In [36]:
import numpy as np

class BatchSkipGram:
    def __init__(self, vocab_size, embedding_dim, learning_rate=0.05, batch_size=4):
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.learning_rate = learning_rate
        self.batch_size = batch_size

        # Initialize U (context matrix) and V (center matrix) randomly
        self.U = np.random.randn(vocab_size, embedding_dim) * 0.01
        self.V = np.random.randn(vocab_size, embedding_dim) * 0.01

    def softmax(self, x):
        """Compute softmax row-wise with numerical stability."""
        exp_x = np.exp(x - np.max(x, axis=1, keepdims=True))  # Stability trick
        return exp_x / np.sum(exp_x, axis=1, keepdims=True)

    def compute_loss_and_gradients(self, batch):
        """
        Compute loss and gradients for a batch of (center, context) pairs.
        batch: List of (center_word_index, context_word_index) pairs.
        """
        batch_size = len(batch)
        centers, contexts = zip(*batch)  # Extract word indices
        centers = np.array(centers)
        contexts = np.array(contexts)

        # Compute scores (dot product between context and all center words)
        scores = np.dot(self.U[contexts], self.V.T)  # Shape: (batch_size, vocab_size)

        # Compute softmax probabilities
        P = self.softmax(scores)  # Shape: (batch_size, vocab_size)

        # Compute loss
        loss = -np.sum(np.log(P[np.arange(batch_size), centers])) / batch_size

        # Compute gradients
        dU = np.zeros_like(self.U)
        dV = np.zeros_like(self.V)

        # One-hot encode centers for batch (for efficient subtraction)
        center_one_hot = np.zeros((batch_size, self.vocab_size))
        center_one_hot[np.arange(batch_size), centers] = 1

        # Compute difference between actual and predicted probabilities
        diff = P - center_one_hot  # Shape: (batch_size, vocab_size)

        # Update context word gradients
        dU[contexts] = np.dot(diff, self.V)  # Shape correction

        # Update center word gradients
        dV[centers] -= self.U[contexts]  # Pull correct pairs together
        for i in range(batch_size):  # Accumulate over batch
            dV += P[i, :, np.newaxis] * self.U[contexts[i]]

        # Normalize gradients by batch size
        dU /= batch_size
        dV /= batch_size

        return loss, dU, dV

    def train(self, dataset, epochs=1000):
        """
        Train model using batch gradient descent.
        dataset: List of (center_word_index, context_word_index) pairs.
        """
        for epoch in range(epochs):
            np.random.shuffle(dataset)  # Shuffle dataset each epoch
            total_loss = 0

            for i in range(0, len(dataset), self.batch_size):
                batch = dataset[i:i+self.batch_size]  # Select batch
                loss, dU, dV = self.compute_loss_and_gradients(batch)

                # Update matrices
                self.U -= self.learning_rate * dU
                self.V -= self.learning_rate * dV

                total_loss += loss

            if epoch % 100 == 0:
                print(f"Epoch {epoch}, Loss: {total_loss:.4f}")

    def get_word_vectors(self):
        """Return learned word embeddings (average of U and V)."""
        return (self.U + self.V) / 2


In [37]:
embedding_dim = 10
batch_size = 2  # Process 2 word pairs at a time

# Initialize and train model
model = BatchSkipGram(vocab_size, embedding_dim, learning_rate=0.1, batch_size=batch_size)
model.train(training_data, epochs=500)

# Get trained word embeddings
word_vectors = model.get_word_vectors()
print("Final Word Embeddings:\n", word_vectors)

Epoch 0, Loss: 97.0407
Epoch 100, Loss: 57.8623
Epoch 200, Loss: 57.9518
Epoch 300, Loss: 58.0282
Epoch 400, Loss: 58.1466
Final Word Embeddings:
 [[ 2.39144987e-01 -7.01049044e-01  2.68894793e-01  1.10664816e+00
  -2.25121295e-01 -6.11209245e-01 -6.80401612e-01  5.53725523e-01
   3.78910217e-01  1.29782901e+00]
 [ 1.04369246e+00 -7.62884561e-01 -1.17013089e+00  1.50808783e+00
   1.70200350e+00  7.36227092e-03 -3.37419575e-01 -1.90422540e-01
  -1.04063607e+00  7.46799017e-01]
 [ 8.45473856e-01  3.37794200e-01  3.99465129e-01  1.05383220e+00
  -1.13618163e-01 -2.45329733e-01  3.05108322e-01  2.12893817e-01
   4.13836991e-01  8.77235229e-01]
 [ 5.75995829e-01  3.26653364e-01 -1.80005394e-01  3.52405767e-01
  -1.63688492e-01  4.90782250e-01  2.36014439e-01 -2.02098546e-01
   4.50332346e-01  1.45290984e-01]
 [ 4.95682529e-01  6.09648552e-01 -1.36815463e+00  2.25835374e-01
   1.35363882e-01  1.11608133e+00 -1.26510591e-01 -6.64199688e-02
   4.33796937e-01  9.06313988e-02]
 [-3.35533252e-03 

In [38]:
def get_embedding(word):
    """
    Retrieves the learned embedding for a given word.
    """
    return word_vectors[word2idx[word]]

def cosine_similarity(vec1, vec2):
    """
    Computes the cosine similarity between two word vectors.
    """
    return np.dot(vec1, vec2) / (np.linalg.norm(vec1) * np.linalg.norm(vec2))

# Step 7: Test Word Embeddings
print("\nWord Embedding Similarity Results:")
print(f"Similarity between 'machine' and 'python': {cosine_similarity(get_embedding('machine'), get_embedding('learning')):.4f}")
print(f"Similarity between 'travel' and 'learning': {cosine_similarity(get_embedding('travel'), get_embedding('python')):.4f}")
print(f"Word embedding for 'data': {get_embedding('data')}")


Word Embedding Similarity Results:
Similarity between 'machine' and 'python': 0.8275
Similarity between 'travel' and 'learning': 0.2994
Word embedding for 'data': [ 0.84547386  0.3377942   0.39946513  1.0538322  -0.11361816 -0.24532973
  0.30510832  0.21289382  0.41383699  0.87723523]
