## Imports

In [3]:
import re
import numpy as np
from collections import Counter

## Load Dataset

In [4]:
# Read the whole corpus file into memory as one string (UTF-8 decoded).
with open("Text.txt", "r", encoding="utf-8") as f:
    text = f.read()

# Sanity check: show only the first 300 characters instead of dumping the file.
print(text[:300])

Word2vec is a shallow, two-layer neural network technique developed by Google that converts text into numerical vectors (word embeddings) to capture semantic relationships and meanings. It analyzes large text corpora to represent words in a continuous vector space where similar words are closer toge


## Preprocessing

In [5]:
# Preprocessing (Tokenization, Vocabulary, Encoding)

def tokenize(text):
    """Split raw text into lowercase alphanumeric tokens.

    The text is lowercased, every character that is not an ASCII letter,
    a digit or whitespace is replaced by a single space, and the result is
    split on whitespace.

    Returns:
        list[str]: the tokens in their original order (empty list for
        empty or punctuation-only input).
    """
    cleaned = re.sub(r"[^a-zA-Z0-9\s]", " ", text.lower())
    return cleaned.split()

def build_vocab(tokens, min_freq=1, max_vocab_size=None):
    """Build word <-> id lookup tables ordered by descending frequency.

    Args:
        tokens: iterable of hashable tokens (typically strings).
        min_freq: tokens occurring fewer than this many times are dropped.
        max_vocab_size: if not None, keep only this many of the most
            frequent tokens.

    Returns:
        (word2id, id2word): two dicts that are inverse mappings of each
        other. Id 0 is the most frequent token; tokens with equal counts
        keep their first-occurrence order because the sort is stable.
    """
    frequencies = Counter(tokens)
    kept = sorted(
        ((word, count) for word, count in frequencies.items() if count >= min_freq),
        key=lambda pair: pair[1],
        reverse=True,  # stable: ties stay in first-seen order
    )
    if max_vocab_size is not None:
        kept = kept[:max_vocab_size]

    id2word = dict(enumerate(word for word, _ in kept))
    word2id = {word: idx for idx, word in id2word.items()}
    return word2id, id2word

def encode(tokens, word2id):
    """Map tokens to their integer ids, silently skipping unknown tokens.

    Returns:
        list: ids in the same order as the in-vocabulary tokens.
    """
    ids = []
    for token in tokens:
        if token in word2id:
            ids.append(word2id[token])
    return ids



In [6]:
# Run the preprocessing pipeline: raw text -> tokens -> vocabulary -> id sequence.
tokens = tokenize(text)
# Default arguments: min_freq=1 and no size cap, so every token gets an id.
word2id, id2word = build_vocab(tokens)
encoded = encode(tokens, word2id)

# Quick look at the results; ids are assigned by descending token frequency.
print("Vocab size:", len(word2id))
print("First 20 tokens:", tokens[:20])
print("First 20 encoded:", encoded[:20])

Vocab size: 104
First 20 tokens: ['word2vec', 'is', 'a', 'shallow', 'two', 'layer', 'neural', 'network', 'technique', 'developed', 'by', 'google', 'that', 'converts', 'text', 'into', 'numerical', 'vectors', 'word', 'embeddings']
First 20 encoded: [4, 5, 1, 31, 32, 33, 34, 35, 36, 37, 38, 39, 12, 13, 6, 14, 40, 15, 2, 41]


## Generate Skip-Gram pairs

In [7]:
def generate_skipgram_pairs(encoded_tokens, window_size=2):
    """Enumerate (center, context) id pairs for skip-gram training.

    For every position, each token at most ``window_size`` positions to the
    left or right becomes a context for the token at that position. The
    window is clipped at the sequence boundaries and a position is never
    paired with itself.

    Returns:
        list[tuple]: pairs ordered by center position, then context position.
    """
    length = len(encoded_tokens)
    return [
        (encoded_tokens[pos], encoded_tokens[ctx_pos])
        for pos in range(length)
        for ctx_pos in range(max(0, pos - window_size), min(length, pos + window_size + 1))
        if ctx_pos != pos
    ]

In [8]:
# Build the positive (center, context) training pairs from the encoded corpus,
# using up to two tokens on each side of the center word as context.
pairs = generate_skipgram_pairs(encoded, window_size=2)

print("Number of training pairs:", len(pairs))
print("First 10 pairs:", pairs[:10])

Number of training pairs: 614
First 10 pairs: [(4, 5), (4, 1), (5, 4), (5, 1), (5, 31), (1, 4), (1, 5), (1, 31), (1, 32), (31, 5)]


## Negative Sampling

In [9]:
def sample_negative_words(vocab_size, positive_word, K=5):
    """Draw ``K`` negative word ids uniformly from the vocabulary.

    Ids are drawn with replacement from ``[0, vocab_size)`` using NumPy's
    global RNG; any draw equal to ``positive_word`` is rejected and redrawn,
    so the returned list may contain duplicates but never the positive id.

    Args:
        vocab_size: number of ids in the vocabulary.
        positive_word: id that must not be returned.
        K: number of negatives to draw; ``K <= 0`` yields an empty list.

    Returns:
        list of ``K`` ids.

    Raises:
        ValueError: if the vocabulary is empty, or if its only id is
            ``positive_word`` (rejection sampling could never terminate).
    """
    if K <= 0:
        return []
    if vocab_size < 1:
        raise ValueError("vocab_size must be at least 1 to draw negative samples.")
    if vocab_size == 1 and positive_word == 0:
        # Every draw would be rejected, so the loop below would spin forever.
        raise ValueError("No negative candidates: the vocabulary only contains the positive word.")

    neg_samples = []
    while len(neg_samples) < K:
        neg = np.random.randint(0, vocab_size)
        if neg != positive_word:
            neg_samples.append(neg)
    return neg_samples

In [10]:
# Demo: draw 5 negative ids while excluding id 0; the output changes on every
# run because the sampler uses NumPy's global random state.
print("Example negative samples:", sample_negative_words(len(word2id), positive_word=0, K=5))

Example negative samples: [75, 1, 26, 63, 79]


## Word2Vec Model (Numpy)

In [12]:
def sigmoid(x):
    """Numerically stable logistic function 1 / (1 + exp(-x)).

    The naive formula evaluates ``exp(-x)``, which overflows (with a
    RuntimeWarning) for large negative ``x``. Here only ``exp(-|x|)`` is
    evaluated, which always lies in (0, 1]:

        x >= 0:  1 / (1 + exp(-x))
        x <  0:  exp(x) / (1 + exp(x))

    Args:
        x: scalar or NumPy array.

    Returns:
        NumPy scalar for scalar input, otherwise an array of the same shape.
    """
    x = np.asarray(x)
    decay = np.exp(-np.abs(x))  # in (0, 1], cannot overflow
    result = np.where(x >= 0, 1.0, decay) / (1.0 + decay)
    # Indexing with () unwraps a 0-d array to a scalar and leaves n-d arrays as is.
    return result[()]


class Word2Vec:
    """Skip-gram word2vec with negative sampling, trained by per-pair SGD.

    Attributes:
        vocab_size: number of rows in each embedding matrix.
        embedding_dim: length of each embedding vector.
        lr: SGD learning rate.
        W_in: (vocab_size, embedding_dim) embeddings used for center words.
        W_out: (vocab_size, embedding_dim) embeddings used for context and
            negative words.
    """

    def __init__(self, vocab_size, embedding_dim=50, learning_rate=0.01):
        """Create both embedding matrices with small Gaussian noise (std 0.01).

        Initialisation draws from NumPy's global RNG.
        """
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.lr = learning_rate

        # Input embeddings (center words)
        self.W_in = np.random.randn(vocab_size, embedding_dim) * 0.01

        # Output embeddings (context words)
        self.W_out = np.random.randn(vocab_size, embedding_dim) * 0.01

    def forward(self, center_id, context_id, neg_ids):
        """Compute the negative-sampling loss for one training example.

        loss = -log(sigmoid(v_c . u_o)) - sum_k log(sigmoid(-v_c . u_neg_k))

        Each log-sigmoid term is evaluated as a softplus via ``np.logaddexp``,
        so extreme scores give a large finite loss instead of ``inf`` or an
        overflow warning.

        Args:
            center_id: id of the center word (row of ``W_in``).
            context_id: id of the true context word (row of ``W_out``).
            neg_ids: sequence of negative word ids (rows of ``W_out``).

        Returns:
            (pos_prob, neg_prob, loss, v_c, u_o, u_neg) where
            ``pos_prob = sigmoid(v_c . u_o)`` and
            ``neg_prob[k] = sigmoid(-v_c . u_neg[k])``.
        """
        neg_ids = np.asarray(neg_ids, dtype=np.intp)  # also makes an empty list index cleanly

        v_c = self.W_in[center_id]           # center word vector
        u_o = self.W_out[context_id]         # positive context vector
        u_neg = self.W_out[neg_ids]          # negative sample vectors (a copy)

        pos_score = np.dot(v_c, u_o)
        neg_scores = np.dot(u_neg, v_c)

        # -log(sigmoid(s)) == log(1 + exp(-s)) == logaddexp(0, -s)
        pos_nll = np.logaddexp(0.0, -pos_score)
        # -log(sigmoid(-s)) == log(1 + exp(s)) == logaddexp(0, s)
        neg_nll = np.logaddexp(0.0, neg_scores)

        pos_prob = np.exp(-pos_nll)
        neg_prob = np.exp(-neg_nll)
        loss = pos_nll + np.sum(neg_nll)

        return pos_prob, neg_prob, loss, v_c, u_o, u_neg

    def backward(self, center_id, context_id, neg_ids, pos_prob, neg_prob, v_c, u_o, u_neg):
        """Apply one SGD step using the values returned by ``forward``.

        With s_k = v_c . u_neg_k, the derivative of -log(sigmoid(-s_k)) with
        respect to s_k is sigmoid(s_k) = 1 - neg_prob[k]. The weight on each
        negative is therefore ``1 - neg_prob``, not ``neg_prob`` itself.
        """
        pos_err = pos_prob - 1.0                 # d loss / d (v_c . u_o)
        neg_err = 1.0 - np.asarray(neg_prob)     # d loss / d (v_c . u_neg_k)

        # Compute every gradient before touching the weights: v_c and u_o are
        # views into W_in / W_out and would otherwise change mid-update.
        grad_v = pos_err * u_o + np.sum(neg_err[:, None] * u_neg, axis=0)
        grad_u_o = pos_err * v_c
        grad_u_neg = neg_err[:, None] * v_c

        # Update center word embedding
        self.W_in[center_id] -= self.lr * grad_v

        # Update positive context embedding
        self.W_out[context_id] -= self.lr * grad_u_o

        # Update negative sample embeddings one row at a time so that a
        # repeated negative id receives every one of its updates.
        for neg_id, grad_row in zip(neg_ids, grad_u_neg):
            self.W_out[neg_id] -= self.lr * grad_row

    def train_step(self, center_id, context_id, neg_ids):
        """Run forward and backward for one example; return the pre-update loss."""
        pos_prob, neg_prob, loss, v_c, u_o, u_neg = self.forward(center_id, context_id, neg_ids)
        self.backward(center_id, context_id, neg_ids, pos_prob, neg_prob, v_c, u_o, u_neg)
        return loss

## Training loop

In [13]:
# Seed NumPy's global RNG so that weight initialisation (np.random.randn) and
# negative sampling (np.random.randint) reproduce the same run after
# Restart Kernel -> Run All, and when this cell is re-executed.
SEED = 42
np.random.seed(SEED)

model = Word2Vec(
    vocab_size=len(word2id),
    embedding_dim=50,
    learning_rate=0.01
)

epochs = 5
loss_history = []  # average per-pair loss, one entry per epoch

for epoch in range(epochs):
    total_loss = 0.0

    for center, context in pairs:
        # Fresh negatives for every positive pair; the true context word is excluded.
        neg_ids = sample_negative_words(len(word2id), context, K=5)
        loss = model.train_step(center, context, neg_ids)
        total_loss += loss

    avg_loss = total_loss / len(pairs)
    loss_history.append(avg_loss)

    print(f"Epoch {epoch+1}/{epochs} - Average Loss: {avg_loss:.4f}")

Epoch 1/5 - Average Loss: 4.1588
Epoch 2/5 - Average Loss: 4.1588
Epoch 3/5 - Average Loss: 4.1587
Epoch 4/5 - Average Loss: 4.1586
Epoch 5/5 - Average Loss: 4.1585


## Similarity Queries

In [14]:
def cosine_similarity(a, b):
    """Cosine of the angle between two vectors.

    Args:
        a, b: 1-D NumPy arrays of equal length.

    Returns:
        A value in [-1, 1] (up to rounding). If either vector has zero norm
        the cosine is undefined; 0.0 is returned in that case instead of the
        NaN (plus RuntimeWarning) that a 0/0 division would produce.
    """
    denom = np.linalg.norm(a) * np.linalg.norm(b)
    if denom == 0:
        return np.float64(0.0)
    return np.dot(a, b) / denom

def get_vector(word):
    """Return the input (center-word) embedding of ``word`` from the global ``model``.

    Raises:
        KeyError: if ``word`` is not a key of the global ``word2id``.
    """
    row = word2id[word]
    return model.W_in[row]

def most_similar(word, top_n=5):
    """Rank vocabulary words by cosine similarity to ``word``.

    Similarities are computed between rows of ``model.W_in``; the query word
    itself is left out of the ranking.

    Args:
        word: query word; must be a key of the global ``word2id``.
        top_n: maximum number of neighbours to return.

    Returns:
        list of ``(word, similarity)`` tuples, most similar first.

    Raises:
        ValueError: if ``word`` is not in the vocabulary.
    """
    if word not in word2id:
        raise ValueError(f"'{word}' is not in the vocabulary.")

    query = get_vector(word)
    scored = [
        (other, cosine_similarity(query, model.W_in[row]))
        for other, row in word2id.items()
        if other != word
    ]
    ranked = sorted(scored, key=lambda pair: -pair[1])
    return ranked[:top_n]

In [16]:
# Show the first 100 vocabulary words; dict order follows the assigned ids,
# i.e. most frequent words first.
list(word2id.keys())[:100]

['words',
 'a',
 'word',
 'to',
 'word2vec',
 'is',
 'text',
 'and',
 'it',
 'in',
 'similar',
 'on',
 'that',
 'converts',
 'into',
 'vectors',
 'semantic',
 'relationships',
 'meanings',
 'continuous',
 'used',
 'sentiment',
 'analysis',
 'search',
 'relevance',
 'of',
 'predicts',
 'target',
 'based',
 'surrounding',
 'context',
 'shallow',
 'two',
 'layer',
 'neural',
 'network',
 'technique',
 'developed',
 'by',
 'google',
 'numerical',
 'embeddings',
 'capture',
 'analyzes',
 'large',
 'corpora',
 'represent',
 'vector',
 'space',
 'where',
 'are',
 'closer',
 'together',
 'primarily',
 'for',
 'nlp',
 'tasks',
 'like',
 'machine',
 'translation',
 'key',
 'aspects',
 'purpose',
 'unstructured',
 'high',
 'dimensional',
 'enabling',
 'mathematical',
 'operations',
 'e',
 'g',
 'king',
 'man',
 'woman',
 'queen',
 'core',
 'principle',
 'appear',
 'contexts',
 'tend',
 'have',
 'architectures',
 'bag',
 'cbow',
 'skip',
 'gram',
 'single',
 'applications',
 'content',
 'recommend

In [17]:
# Top-5 neighbours of "understand" by cosine similarity of the input embeddings.
# NOTE(review): the recorded training log shows the average loss barely moving
# over 5 epochs, so these neighbours probably still reflect the random
# initialisation more than learned semantics; confirm after longer training.
most_similar("understand")

[('embeddings', np.float64(0.3337463300789198)),
 ('developed', np.float64(0.2822875520707103)),
 ('because', np.float64(0.26087842740835915)),
 ('semantic', np.float64(0.2554218900462723)),
 ('and', np.float64(0.2547947462950354))]