# Word2Vec (Negative Sampling)

In [62]:
import torch
import numpy as np
import matplotlib.pyplot as plt
import nltk
import torch.nn as nn
import torch.optim as optim
import matplotlib
import time
import nltk.corpus 
from nltk.corpus import brown

In [63]:
np.__version__, torch.__version__

('2.3.5', '2.5.1+cu121')

In [64]:
import matplotlib
matplotlib.__version__

'3.10.8'

## 1. Load data

In [65]:
# Load brown corpus (real-world data for final training)
nltk.download('brown')
nltk.download('punkt')
sentences = brown.sents(categories = "news")
sentences = [[word.lower() for word in sent] for sent in sentences]


[nltk_data] Downloading package brown to
[nltk_data]     C:\Users\Lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [66]:
corpus = []
sentences = brown.sents(categories="news")
corpus = [' '.join(sent).lower() for sent in sentences]

In [67]:
corpus[0][0]

't'

In [68]:
#1. tokenization
corpus
corpus = [sent.split(" ") for sent in corpus]

In [69]:
#2. numeralization
#find unique words
flatten = lambda l: [item for sublist in l for item in sublist]
#assign unique integer
vocabs = list(set(flatten(corpus))) #all the words we have in the system - <UNK>

In [70]:
#numericalization
word2index = {w: i for i, w in enumerate(vocabs)}
print(word2index)



In [71]:
# Step 2: Tokenize brown corpus

tokenized_sentences = sentences

print("Number of tokenized sentences:", len(tokenized_sentences))
print("Sample tokenized sentence (first 30 words):")
print(tokenized_sentences[0][:30])


vocabs.append('<UNK>')
word2index['<UNK>'] = len(vocabs) - 1

Number of tokenized sentences: 4623
Sample tokenized sentence (first 30 words):
['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', 'Friday', 'an', 'investigation', 'of', "Atlanta's", 'recent', 'primary', 'election', 'produced', '``', 'no', 'evidence', "''", 'that', 'any', 'irregularities', 'took', 'place', '.']


In [72]:
word2index 

{'pennsylvania': 0,
 'frustrations': 1,
 "flock's": 2,
 'chandler': 3,
 'precipitated': 4,
 'eliminates': 5,
 'ocean': 6,
 'convenient': 7,
 'zeising': 8,
 'dapper': 9,
 'sofas': 10,
 'movement': 11,
 'booklists': 12,
 'constitutional': 13,
 'educators': 14,
 'destroy': 15,
 'summerdale': 16,
 'long-time': 17,
 'transportation': 18,
 'informal': 19,
 'purpose': 20,
 'correspondents': 21,
 'malmud': 22,
 'particulars': 23,
 'roller': 24,
 'surveys': 25,
 '80': 26,
 'thereby': 27,
 'knoll': 28,
 'collonaded': 29,
 'while': 30,
 'federation': 31,
 'implementation': 32,
 'missiles': 33,
 'beautifully': 34,
 'squared': 35,
 '1,512': 36,
 'coasted': 37,
 'reporters': 38,
 'treaty': 39,
 'details': 40,
 'moore': 41,
 'brocade': 42,
 'north': 43,
 'balance': 44,
 'barber': 45,
 'second-place': 46,
 'forming': 47,
 'compiled': 48,
 'strengthened': 49,
 'leonard': 50,
 '2-baser': 51,
 'spahnie': 52,
 "taxpayer's": 53,
 'hoover': 54,
 'reformatory': 55,
 'sterling': 56,
 'captured': 57,
 '70': 58

In [73]:
def generate_context(sentence, center_idx, window_size=2):
    """
    Generate context words for a given center word index using a dynamic window size.
    """
    start = max(0, center_idx - window_size)
    end = min(len(sentence), center_idx + window_size + 1)
    return [sentence[j] for j in range(start, end) if j != center_idx]


def generate_skipgram_pairs(sentences, window_size=2):
    """
    Generate (center, context) pairs for Skip-gram training.
    """
    pairs = []
    for sentence in sentences:
        for i, center in enumerate(sentence):
            context_words = generate_context(sentence, i, window_size)
            for context in context_words:
                pairs.append((center, context))
    return pairs


# Default window size
window_size = 2

# Generate Skip-gram training pairs
pairs = generate_skipgram_pairs(tokenized_sentences, window_size)
print("Number of (center, context) pairs generated:", len(pairs))
print("Sample (center, context) pairs:", pairs[:10])

# index2word = {v:k for k, v in word2index.items()}
# index2word[5]

Number of (center, context) pairs generated: 374548
Sample (center, context) pairs: [('The', 'Fulton'), ('The', 'County'), ('Fulton', 'The'), ('Fulton', 'County'), ('Fulton', 'Grand'), ('County', 'The'), ('County', 'Fulton'), ('County', 'Grand'), ('County', 'Jury'), ('Grand', 'Fulton')]


In [74]:
corpus = flatten(corpus)

## 2. Prepare train data

In [75]:
#create pairs of center word, and outside word

def random_batch(batch_size, corpus, window_size=2):
    if window_size < 1:
        raise ValueError("window_size must be >= 1")

    corpus_len = len(corpus)
    if corpus_len <= 2 * window_size:
        raise ValueError("corpus too small for given window_size")

    # valid center indices that have full window on both sides
    low = window_size
    high = corpus_len - window_size

    centers = np.random.randint(low, high, size=batch_size)
    inputs, labels = [], []

    for i in centers:
        center_word = corpus[i]
        # choose a random outside within the window (excluding center)
        offsets = list(range(-window_size, 0)) + list(range(1, window_size + 1))
        offset = np.random.choice(offsets)
        outside_word = corpus[i + offset]

        c_idx = word2index.get(center_word, word2index['<UNK>'])
        o_idx = word2index.get(outside_word, word2index['<UNK>'])

        inputs.append([c_idx])
        labels.append([o_idx])

    return np.array(inputs, dtype=np.int64), np.array(labels, dtype=np.int64)
            
x, y = random_batch(2, corpus)



In [76]:
x.shape  #batch_size, 1

(2, 1)

In [77]:
x

array([[11653],
       [ 4959]])

In [78]:
y.shape  #batch_size 1

(2, 1)

## 3. Negative Sampling

### Unigram distribution

$$P(w)=U(w)^{3/4}/Z$$

In [79]:
z = 0.001

In [80]:
#count
from collections import Counter

word_count = Counter(corpus)
word_count

#get the total number of words
num_total_words = sum([c for w, c in word_count.items()])
num_total_words


100554

In [81]:
vocabs

['pennsylvania',
 'frustrations',
 "flock's",
 'chandler',
 'precipitated',
 'eliminates',
 'ocean',
 'convenient',
 'zeising',
 'dapper',
 'sofas',
 'movement',
 'booklists',
 'constitutional',
 'educators',
 'destroy',
 'summerdale',
 'long-time',
 'transportation',
 'informal',
 'purpose',
 'correspondents',
 'malmud',
 'particulars',
 'roller',
 'surveys',
 '80',
 'thereby',
 'knoll',
 'collonaded',
 'while',
 'federation',
 'implementation',
 'missiles',
 'beautifully',
 'squared',
 '1,512',
 'coasted',
 'reporters',
 'treaty',
 'details',
 'moore',
 'brocade',
 'north',
 'balance',
 'barber',
 'second-place',
 'forming',
 'compiled',
 'strengthened',
 'leonard',
 '2-baser',
 'spahnie',
 "taxpayer's",
 'hoover',
 'reformatory',
 'sterling',
 'captured',
 '70',
 'youths',
 'submarine-ball',
 'gen.',
 'karns',
 '17-1/2-inch',
 'weigh',
 'tulsa',
 'future',
 'wholly-owned',
 'historian',
 'prosecutor',
 'genuine',
 'bass',
 'boston',
 'mont.',
 'peculiar',
 'salesman',
 'result',
 'f

$$P(w)=U(w)^{3/4}/Z$$

In [82]:
unigram_table = []

for v in vocabs:
    uw = word_count[v] / num_total_words
    uw_alpha = int((uw ** 0.75) / z)
    unigram_table.extend([v] * uw_alpha)
    
Counter(unigram_table)

Counter({'the': 126,
         ',': 108,
         '.': 89,
         'of': 69,
         'and': 56,
         'a': 55,
         'to': 55,
         'in': 53,
         'for': 30,
         'that': 27,
         'was': 24,
         '``': 24,
         'is': 24,
         "''": 24,
         'on': 23,
         'he': 22,
         'at': 22,
         'with': 20,
         'as': 19,
         'be': 19,
         'by': 18,
         'it': 18,
         'said': 16,
         'his': 16,
         'will': 15,
         'from': 14,
         'are': 13,
         'an': 13,
         'this': 13,
         ';': 13,
         'had': 12,
         '--': 12,
         'but': 12,
         'has': 12,
         'were': 11,
         'not': 11,
         'mrs.': 11,
         'they': 11,
         'would': 11,
         'who': 11,
         'have': 11,
         'their': 10,
         'which': 10,
         'new': 10,
         'there': 9,
         'been': 9,
         'one': 9,
         '(': 8,
         'more': 8,
         'all': 8,
         

## 4. Model

$$\mathbf{J}_{\text{neg-sample}}(\mathbf{v}_c,o,\mathbf{U})=-\log(\sigma(\mathbf{u}_o^T\mathbf{v}_c))-\sum_{k=1}^K\log(\sigma(-\mathbf{u}_k^T\mathbf{v}_c))$$

In [83]:
def prepare_sequence(seq, word2index):
    idxs = list(map(lambda w: word2index[w] if word2index.get(w) is not None else word2index['<UNK>'], seq))
    return torch.LongTensor(idxs)

In [84]:
import random

def negative_sampling(targets, unigram_table, k):
    batch_size = targets.shape[0]
    neg_samples = []
    for i in range(batch_size):  #(1, k)
        target_index = targets[i].item()
        nsample      = []
        while (len(nsample) < k):
            neg = random.choice(unigram_table)
            if word2index[neg] == target_index:
                continue
            nsample.append(neg)
        neg_samples.append(prepare_sequence(nsample, word2index).reshape(1, -1))
        
    return torch.cat(neg_samples) #batch_size, k

In [85]:
batch_size = 8
x, y = random_batch(batch_size, corpus)
x_tensor = torch.LongTensor(x)
y_tensor = torch.LongTensor(y)

In [86]:
k = 5
neg_samples = negative_sampling(y_tensor, unigram_table, k)

In [87]:
y_tensor[1]

tensor([12411])

In [88]:
neg_samples[1]

tensor([ 4822,  7439,  4822, 11578,  2869])

$$\mathbf{J}_{\text{neg-sample}}(\mathbf{v}_c,o,\mathbf{U})=-\log(\sigma(\mathbf{u}_o^T\mathbf{v}_c))-\sum_{k=1}^K\log(\sigma(-\mathbf{u}_k^T\mathbf{v}_c))$$

In [89]:
class SkipgramNeg(nn.Module):
    
    def __init__(self, voc_size, emb_size):
        super(SkipgramNeg, self).__init__()
        self.embedding_center  = nn.Embedding(voc_size, emb_size)
        self.embedding_outside = nn.Embedding(voc_size, emb_size)
        self.logsigmoid        = nn.LogSigmoid()
    
    def forward(self, center, outside, negative):
        #center, outside:  (bs, 1)
        #negative       :  (bs, k)
        
        center_embed   = self.embedding_center(center) #(bs, 1, emb_size)
        outside_embed  = self.embedding_outside(outside) #(bs, 1, emb_size)
        negative_embed = self.embedding_outside(negative) #(bs, k, emb_size)
        
        uovc           = outside_embed.bmm(center_embed.transpose(1, 2)).squeeze(2) #(bs, 1)
        ukvc           = -negative_embed.bmm(center_embed.transpose(1, 2)).squeeze(2) #(bs, k)
        ukvc_sum       = torch.sum(ukvc, 1).reshape(-1, 1) #(bs, 1)
        
        loss           = self.logsigmoid(uovc) + self.logsigmoid(ukvc_sum)
        
        return -torch.mean(loss)

In [90]:
#test your model
emb_size = 10
voc_size = len(vocabs)
model = SkipgramNeg(voc_size, emb_size)

In [91]:
loss = model(x_tensor, y_tensor, neg_samples)

In [92]:
loss

tensor(3.3728, grad_fn=<NegBackward0>)

## 5. Training

In [93]:
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [94]:
import time

losses = []
start_time = time.time()

num_epochs = 1000

for epoch in range(num_epochs):
    
    #get batch
    input_batch, label_batch = random_batch(batch_size, corpus)
    input_tensor = torch.LongTensor(input_batch)
    label_tensor = torch.LongTensor(label_batch)
    
    #predict
    neg_samples = negative_sampling(label_tensor, unigram_table, k)
    loss = model(input_tensor, label_tensor, neg_samples)
    
    #backprogate
    optimizer.zero_grad()
    loss.backward()
    losses.append(loss.item())
    #update alpha
    optimizer.step()
    
    #print the loss
    if (epoch + 1) % 1000 == 0:
        print(f"Epoch {epoch+1:6.0f} | Loss: {loss:2.6f}")

end_time = time.time()
training_time = end_time - start_time




Epoch   1000 | Loss: 3.654001


In [95]:
print("Final loss:", losses[-1])
print("Training time (s):", training_time)

Final loss: 3.654000759124756
Training time (s): 2.1021718978881836


## 6. Plot the embeddings

In [96]:
vocabs

['pennsylvania',
 'frustrations',
 "flock's",
 'chandler',
 'precipitated',
 'eliminates',
 'ocean',
 'convenient',
 'zeising',
 'dapper',
 'sofas',
 'movement',
 'booklists',
 'constitutional',
 'educators',
 'destroy',
 'summerdale',
 'long-time',
 'transportation',
 'informal',
 'purpose',
 'correspondents',
 'malmud',
 'particulars',
 'roller',
 'surveys',
 '80',
 'thereby',
 'knoll',
 'collonaded',
 'while',
 'federation',
 'implementation',
 'missiles',
 'beautifully',
 'squared',
 '1,512',
 'coasted',
 'reporters',
 'treaty',
 'details',
 'moore',
 'brocade',
 'north',
 'balance',
 'barber',
 'second-place',
 'forming',
 'compiled',
 'strengthened',
 'leonard',
 '2-baser',
 'spahnie',
 "taxpayer's",
 'hoover',
 'reformatory',
 'sterling',
 'captured',
 '70',
 'youths',
 'submarine-ball',
 'gen.',
 'karns',
 '17-1/2-inch',
 'weigh',
 'tulsa',
 'future',
 'wholly-owned',
 'historian',
 'prosecutor',
 'genuine',
 'bass',
 'boston',
 'mont.',
 'peculiar',
 'salesman',
 'result',
 'f

In [97]:
jobs = torch.LongTensor([word2index['jobs']])
jobs

tensor([7651])

In [98]:
jobs_embed_c = model.embedding_center(jobs)
jobs_embed_o = model.embedding_outside(jobs)
jobs_embed   = (jobs_embed_c + jobs_embed_o) / 2
jobs_embed

tensor([[-0.6840,  0.4285, -0.1019,  0.5842, -0.2463,  0.5857,  0.2184,  0.7996,
         -0.3702, -1.2857]], grad_fn=<DivBackward0>)

In [99]:
jobs_embed_o

tensor([[-1.6684, -0.4668,  0.1689, -0.4969, -0.1411,  0.1998,  0.9985,  0.4364,
         -0.8788, -1.4734]], grad_fn=<EmbeddingBackward0>)

In [100]:
def get_embed(word):
    try:
        index = word2index[word]
    except:
        index = word2index['<UNK>']
        
    word = torch.LongTensor([word2index[word]])
    
    embed_c = model.embedding_center(word)
    embed_o = model.embedding_outside(word)
    embed   = (embed_c + embed_o) / 2
    
    return embed[0][0].item(), embed[0][1].item()

In [101]:
get_embed('vacancy')

(1.1550911664962769, 0.008116543292999268)

In [102]:
get_embed('death')

(-0.4139266908168793, 0.49589985609054565)

In [103]:
get_embed('architecture')

(-0.8513450622558594, 0.2619168758392334)

In [104]:
get_embed('jobs')

(-0.6839858889579773, 0.42847001552581787)

In [105]:
# plt.figure(figsize=(6, 3))
# for i, word in enumerate(vocabs):
#     x, y = get_embed(word)
#     plt.scatter(x, y)
#     plt.annotate(word, xy=(x, y), xytext=(5, 2), textcoords='offset points')
# plt.show()

## 7. Cosine similarity

In [106]:
jobs = get_embed('jobs')
jobs

(-0.6839858889579773, 0.42847001552581787)

In [107]:
vacancy = get_embed('vacancy')
vacancy

(1.1550911664962769, 0.008116543292999268)

In [108]:
death = get_embed('death')
death

(-0.4139266908168793, 0.49589985609054565)

In [109]:
#more formally is to divide by its norm
def cosine_similarity(A, B):
    dot_product = np.dot(A, B)
    norm_a = np.linalg.norm(A)
    norm_b = np.linalg.norm(B)
    similarity = dot_product / (norm_a * norm_b)
    return similarity

print(cosine_similarity(np.array(jobs), np.array(vacancy)))
print(cosine_similarity(np.array(jobs), np.array(death)))

-0.8437016633192334
0.9506024379657818


In [110]:
def cosine_similarity(a, b):
    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))


# Step 1: build final embedding matrix
W = (model.embedding_center.weight.detach().cpu().numpy() +
     model.embedding_outside.weight.detach().cpu().numpy()) / 2

# Normalize embeddings for fast cosine similarity
W_norm = W / np.linalg.norm(W, axis=1, keepdims=True)



def get_vector(word):
    if word not in word2index:
        return None
    return W_norm[word2index[word]] 


In [111]:
print(W.shape)
print(get_vector("jobs"))
print(len(get_vector("jobs")))


(13113, 10)
[-0.34706408  0.21741173 -0.05169694  0.29644683 -0.12497501  0.29719865
  0.11080109  0.40571707 -0.18783988 -0.6523899 ]
10


In [112]:
index2word = {idx: word for word, idx in word2index.items()}
def predict_analogy(a, b, c, W_norm, word2index, index2word):
    if a not in word2index or b not in word2index or c not in word2index:
        return None

    va = W_norm[word2index[a]]
    vb = W_norm[word2index[b]]
    vc = W_norm[word2index[c]]

    # Vector arithmetic: b - a + c
    target = vb - va + vc
    target = target / np.linalg.norm(target)

    # Cosine similarity with ALL words at once
    similarities = np.dot(W_norm, target)

    # Exclude input words
    for w in (a, b, c):
        similarities[word2index[w]] = -1

    best_index = np.argmax(similarities)
    return index2word[best_index]



In [113]:
predict_analogy("jobs", "vacancy", "death", W, word2index, index2word)


'holds'

In [114]:

def evaluate_analogies(file_path, W_norm, word2index, index2word):
    total = 0
    correct = 0

    with open(file_path, "r") as f:
        for line in f:
            words = line.strip().split()
            if len(words) != 4:
                continue

            a, b, c, d = words
            prediction = predict_analogy(
                a, b, c, W_norm, word2index, index2word
            )

            if prediction is None:
                continue

            total += 1
            if prediction == d:
                correct += 1

    accuracy = correct / total if total > 0 else 0
    return accuracy, correct, total


In [115]:
semantic_acc, sem_correct, sem_total = evaluate_analogies(
    "country-capital.txt",
    W,
    word2index,
    index2word
)


In [116]:
syntactic_acc, syn_correct, syn_total = evaluate_analogies(
    "past-tense.txt",
    W,
    word2index,
    index2word
)

print(f"Semantic accuracy (capital-common-countries): {semantic_acc:.4f} ({sem_correct}/{sem_total})")
print(f"Syntactic accuracy (past-tense): {syntactic_acc:.4f} ({syn_correct}/{syn_total})")


Semantic accuracy (capital-common-countries): 0.0000 (0/0)
Syntactic accuracy (past-tense): 0.0000 (0/648)


In [117]:
import pandas as pd

similarity_df = pd.read_csv("combined.csv")
similarity_df.head()


Unnamed: 0,Word 1,Word 2,Human (mean)
0,love,sex,6.77
1,tiger,cat,7.35
2,tiger,tiger,10.0
3,book,paper,7.46
4,computer,keyboard,7.62


In [118]:
model_scores = []
human_scores = []
skipped = 0

for _, row in similarity_df.iterrows():
    w1 = row["Word 1"]
    w2 = row["Word 2"]
    human_score = row["Human (mean)"]

    if w1 not in word2index or w2 not in word2index:
        skipped += 1
        continue

    v1 = W_norm[word2index[w1]]
    v2 = W_norm[word2index[w2]]

    model_sim = np.dot(v1, v2)  # cosine similarity

    model_scores.append(model_sim)
    human_scores.append(human_score)

print(f"Used pairs: {len(model_scores)}")
print(f"Skipped OOV pairs: {skipped}")


Used pairs: 197
Skipped OOV pairs: 156


In [119]:

from scipy.stats import spearmanr

correlation, p_value = spearmanr(model_scores, human_scores)

print(f"Spearman correlation: {correlation:.4f}")
print(f"P-value: {p_value:.4e}")


Spearman correlation: 0.0276
P-value: 7.0055e-01


In [120]:
print("Example comparisons:")
for i in range(5):
    print(
        similarity_df.iloc[i, 0],
        similarity_df.iloc[i, 1],
        "Human:", human_scores[i],
        "Model:", round(model_scores[i], 3)
    )


Example comparisons:
love sex Human: 6.77 Model: -0.226
tiger cat Human: 7.46 Model: 0.095
tiger tiger Human: 5.77 Model: 0.166
book paper Human: 6.31 Model: -0.799
computer keyboard Human: 7.5 Model: -0.059


## Summary: Word2Vec with Negative Sampling

This notebook implements the **Word2Vec (Skip-gram)** model using **Negative Sampling** to efficiently learn word embeddings from the NLTK Reuters corpus.

### Key Components:
* **Data Preprocessing**: 
    * Loaded the `reuters` dataset and performed tokenization (lowercase, alphabetic filtering).
    * Implemented numeralization and vocabulary building to map tokens to unique integers.
* **Training Strategy**:
    * Utilized **Negative Sampling** to optimize the training process by updating only a small percentage of weights for each iteration rather than the entire vocabulary.
    * Defined a context window for generating positive pairs and sampled "noise" words for negative pairs.
* **Model Architecture**: 
    * Built using `torch.nn`, featuring separate embedding layers for center and context words.
* **Evaluation**:
    * Evaluated the learned embeddings using the **WordSim-353** dataset.
    * Calculated **Spearman Correlation** between model-generated cosine similarities and human-assigned similarity scores.
    * **Results**: The model demonstrates its ability to capture semantic relationships (e.g., `tiger` vs `cat`), though performance varies based on training duration and corpus size.


In [121]:
import json
import pickle

# Export embeddings as JSON
embeddings_dict = {
    "W": W.tolist(),
    "W_norm": W_norm.tolist(),
    "word2index": word2index,
    "index2word": {str(k): v for k, v in index2word.items()}
}

with open("output/skipgram_neg_embeddings.json", "w") as f:
    json.dump(embeddings_dict, f)

# Export corpus
with open("output/corpus.json", "w") as f:
    json.dump(corpus, f)

# Export as pickle for faster loading
with open("output/skipgram_neg_embeddings.pkl", "wb") as f:
    pickle.dump(embeddings_dict, f)

with open("output/corpus.pkl", "wb") as f:
    pickle.dump(corpus, f)

print("Embeddings and corpus exported successfully!")
print(f"Embeddings shape: {W.shape}")
print(f"Corpus size: {len(corpus)} words")

Embeddings and corpus exported successfully!
Embeddings shape: (13113, 10)
Corpus size: 100554 words
