In [9]:
# ============================================================
# TASK 1: Word2Vec (with & without Neg Sampling) + GloVe
# Dataset: NLTK Reuters Corpus
#
# References:
# - Mikolov et al., 2013 (Word2Vec)
# - Pennington et al., 2014 (GloVe)
# - NLTK Reuters Corpus:
#   https://www.nltk.org/book/ch02.html#the-reuters-corpus
# ============================================================

import nltk
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import random
import re
import math
from collections import Counter
from nltk.corpus import reuters

In [10]:
# ------------------------------------------------------------
# 1. DATASET LOADING (FULL REUTERS CORPUS)
# ------------------------------------------------------------
# Request the "reuters" and "punkt" NLTK packages before the corpus is read.
nltk.download("reuters")
nltk.download("punkt")

def load_reuters():
    """Return the Reuters corpus as a list of cleaned token lists.

    Each token is lower-cased and every character outside ``a-z`` is removed;
    tokens that end up empty are discarded. A document is kept only when more
    than 5 tokens survive the cleaning.

    Returns:
        list[list[str]]: one list of cleaned tokens per retained document.
    """
    documents = []
    for fid in reuters.fileids():
        cleaned = []
        for token in reuters.words(fid):
            letters = re.sub(r"[^a-z]", "", token.lower())
            if letters:
                cleaned.append(letters)
        if len(cleaned) > 5:
            documents.append(cleaned)
    return documents

# Build the cleaned corpus once; later cells read this module-level `corpus`.
corpus = load_reuters()

[nltk_data] Downloading package reuters to
[nltk_data]     /Users/prabidhi/nltk_data...
[nltk_data]   Package reuters is already up-to-date!
[nltk_data] Downloading package punkt to /Users/prabidhi/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [11]:
# ------------------------------------------------------------
# 2. VOCABULARY
# ------------------------------------------------------------
def flatten(l):
    """Concatenate a list of lists into a single flat list, preserving order."""
    return [item for sub in l for item in sub]


# Token frequencies over the whole corpus.
word_counts = Counter(flatten(corpus))

# Vocabulary in first-seen order, with the unknown-word marker appended last.
vocabs = [*word_counts, "<UNK>"]

# Lookup tables in both directions between words and integer ids.
word2index = dict(zip(vocabs, range(len(vocabs))))
index2word = {idx: word for word, idx in word2index.items()}
voc_size = len(vocabs)

In [12]:
# Convert corpus to indices once to save time during training.
# Words missing from word2index fall back to the "<UNK>" id.
unk_index = word2index["<UNK>"]
corpus_indices = [
    [word2index.get(token, unk_index) for token in tokens]
    for tokens in corpus
]

In [13]:
# ------------------------------------------------------------
# 3. DYNAMIC SKIPGRAM GENERATION
# ------------------------------------------------------------
def get_batch_dynamic(corpus_indices, batch_size, max_window=2):
    """Sample a batch of skip-gram (center, context) pairs on the fly.

    A random sentence and a random center position are drawn, then a window
    radius is drawn uniformly from ``1..max_window``. Every token inside that
    window (except the center itself) yields one pair. Sampling repeats until
    exactly ``batch_size`` pairs have been collected.

    Args:
        corpus_indices: sequence of sentences, each a sequence of word ids.
        batch_size: number of pairs to return.
        max_window: largest window radius that may be drawn (inclusive).

    Returns:
        tuple: ``(center, context)`` LongTensors of shape ``[batch_size, 1]``
        and ``[batch_size]`` respectively.

    Raises:
        ValueError: if pairs are requested but no sentence has at least two
            tokens (the sampling loop could otherwise never finish).
    """
    # Guard against an endless rejection loop: a sentence with fewer than two
    # tokens can never produce a pair. `any` stops at the first usable sentence.
    if batch_size > 0 and not any(len(sent) > 1 for sent in corpus_indices):
        raise ValueError(
            "corpus_indices must contain at least one sentence with two or more tokens"
        )

    center_batch, context_batch = [], []
    while len(center_batch) < batch_size:
        sent = random.choice(corpus_indices)
        if len(sent) <= 1:
            continue

        pos = random.randint(0, len(sent) - 1)
        center = sent[pos]

        # Dynamic window size: a fresh radius is drawn for every center word.
        radius = random.randint(1, max_window)

        first = max(0, pos - radius)
        last = min(len(sent), pos + radius + 1)
        for j in range(first, last):
            if j == pos:
                continue
            center_batch.append([center])
            context_batch.append(sent[j])
            # Stop mid-window so the batch never exceeds the requested size.
            if len(center_batch) == batch_size:
                break

    return torch.LongTensor(center_batch), torch.LongTensor(context_batch)

In [None]:
# ------------------------------------------------------------
# 4. WORD2VEC — FULL SOFTMAX (NO NEGATIVE SAMPLING)
# ------------------------------------------------------------
class SkipGramSoftmax(nn.Module):
    """Skip-gram model scored with a full softmax over the vocabulary.

    Holds a center-word embedding table and a linear layer that maps an
    embedding to one score per vocabulary word.
    """

    def __init__(self, voc_size, emb_size):
        """Create the embedding table and the output projection.

        Args:
            voc_size: number of words in the vocabulary.
            emb_size: dimensionality of the word embeddings.
        """
        super().__init__()
        self.center = nn.Embedding(voc_size, emb_size)
        self.output = nn.Linear(emb_size, voc_size)

    def forward(self, center_word):
        """Return unnormalized scores (logits) for every vocabulary word.

        Args:
            center_word: LongTensor of word ids shaped ``[batch, 1]``.

        Returns:
            Tensor of shape ``[batch, voc_size]``; no softmax is applied here.
        """
        # Embedding lookup gives [batch, 1, emb]; drop the singleton axis
        # before projecting onto the vocabulary.
        return self.output(self.center(center_word).squeeze(1))

In [8]:
# Training
emb_size = 100
batch_size = 128
epochs = 5
window_size = 2

# Batches are sampled on the fly by get_batch_dynamic, so there is no fixed
# list of pairs to iterate over. An "epoch" is therefore a nominal number of
# steps: enough batches to draw roughly one pair per corpus token.
steps_per_epoch = max(1, sum(len(sent) for sent in corpus_indices) // batch_size)

model_sg = SkipGramSoftmax(voc_size, emb_size)
optimizer = optim.Adam(model_sg.parameters(), lr=0.001)
# SkipGramSoftmax.forward returns raw logits, so the loss must apply the
# log-softmax itself (NLLLoss would expect log-probabilities as input).
criterion = nn.CrossEntropyLoss()

for epoch in range(epochs):
    loss_sum = 0
    for _ in range(steps_per_epoch):
        # center: [batch, 1] word ids, context: [batch] target word ids
        center, context = get_batch_dynamic(corpus_indices, batch_size, window_size)
        logits = model_sg(center)  # [batch, voc_size]
        loss = criterion(logits, context)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        loss_sum += loss.item()
    print(f"Softmax Epoch {epoch+1} | Loss: {loss_sum:.4f}")

KeyboardInterrupt: 

In [None]:
# ------------------------------------------------------------
# 5. WORD2VEC — NEGATIVE SAMPLING
# ------------------------------------------------------------
def build_unigram_table(word_counts, table_size=1_000_000, vocab=None):
    """Build a lookup table for drawing negative samples.

    Each word id appears in the table a number of times proportional to its
    count raised to the power 0.75, so a uniform draw from the table follows
    the smoothed unigram distribution.

    Args:
        word_counts: mapping from word to its corpus frequency. Words absent
            from the mapping are treated as having count 0.
        table_size: target number of entries. The actual length can differ
            slightly because each word's share is rounded independently.
        vocab: ordered list of words; a word's position is the id stored in
            the table. Defaults to the module-level ``vocabs`` list.

    Returns:
        list[int]: word ids, repeated according to their smoothed frequency.

    Raises:
        ValueError: if the vocabulary is non-empty but no word has a positive
            count.
    """
    if vocab is None:
        vocab = vocabs

    # .get keeps plain dicts usable: a word without a count (e.g. "<UNK>")
    # simply receives zero table entries.
    pow_freq = np.array([word_counts.get(w, 0) ** 0.75 for w in vocab], dtype=float)
    if pow_freq.size == 0:
        return []

    total = pow_freq.sum()
    if not total > 0:
        raise ValueError("word_counts has no positive count for any vocabulary word")

    slots = np.round(pow_freq / total * table_size).astype(np.int64)
    # Repeat each word id `slots[id]` times; ids stay in ascending order.
    return np.repeat(np.arange(len(vocab)), slots).tolist()

# Build the sampling table once; negative_samples() draws from this module-level list.
unigram_table = build_unigram_table(word_counts)

In [None]:
class SkipGramNeg(nn.Module):
    """Skip-gram model trained with the negative-sampling objective.

    Keeps two embedding tables: ``center`` for center words and ``outside``
    for context (positive) and sampled negative words.
    """

    def __init__(self, voc_size, emb_size):
        """Create the center and outside embedding tables.

        Args:
            voc_size: number of words in the vocabulary.
            emb_size: dimensionality of the word embeddings.
        """
        super().__init__()
        self.center = nn.Embedding(voc_size, emb_size)
        self.outside = nn.Embedding(voc_size, emb_size)

    def forward(self, center, pos, neg):
        """Return the mean negative-sampling loss for a batch.

        Args:
            center: LongTensor of center word ids, shaped ``[batch, 1]`` or ``[batch]``.
            pos: LongTensor of true context word ids, shaped ``[batch, 1]`` or ``[batch]``.
            neg: LongTensor of sampled negative word ids, shaped ``[batch, k]``.

        Returns:
            Scalar tensor: mean over the batch of
            ``-(log sigmoid(u_pos . v) + sum_k log sigmoid(-u_neg_k . v))``.
        """
        # Flatten the id tensors so both [batch, 1] and [batch] inputs work and
        # no shape-dependent squeeze() is needed (which broke for batch == 1).
        v = self.center(center.reshape(-1))       # [batch, emb]
        u_pos = self.outside(pos.reshape(-1))     # [batch, emb]
        u_neg = self.outside(neg)                 # [batch, k, emb]

        pos_score = (u_pos * v).sum(dim=1)                          # [batch]
        neg_score = -torch.bmm(u_neg, v.unsqueeze(2)).squeeze(2)    # [batch, k]

        # logsigmoid stays finite where log(sigmoid(x)) underflows to -inf.
        log_sigmoid = nn.functional.logsigmoid
        loss = -(log_sigmoid(pos_score) + log_sigmoid(neg_score).sum(dim=1))
        return loss.mean()


In [None]:
def negative_samples(batch_size, k):
    """Draw ``k`` negative word ids for each of ``batch_size`` examples.

    Ids are picked uniformly from the module-level ``unigram_table``.

    Args:
        batch_size: number of rows to generate.
        k: number of ids drawn per row.

    Returns:
        LongTensor built from ``batch_size`` rows of ``k`` ids each.
    """
    rows = []
    for _ in range(batch_size):
        row = []
        for _ in range(k):
            row.append(random.choice(unigram_table))
        rows.append(row)
    return torch.LongTensor(rows)

model_neg = SkipGramNeg(voc_size, emb_size)
optimizer = optim.Adam(model_neg.parameters(), lr=0.001)
k = 5  # negatives drawn per (center, context) pair

# Batches are sampled on the fly by get_batch_dynamic, so an "epoch" is a
# nominal number of steps: roughly one sampled pair per corpus token.
steps_per_epoch = max(1, sum(len(sent) for sent in corpus_indices) // batch_size)

for epoch in range(epochs):
    loss_sum = 0
    for _ in range(steps_per_epoch):
        # center: [batch, 1] word ids, context: [batch] word ids
        center, context = get_batch_dynamic(corpus_indices, batch_size, window_size)
        neg = negative_samples(batch_size, k)  # [batch, k]
        # Give the positives an explicit [batch, 1] shape to match `center`.
        loss = model_neg(center, context.unsqueeze(1), neg)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        loss_sum += loss.item()
    print(f"NEG Epoch {epoch+1} | Loss: {loss_sum:.4f}")

In [None]:
# ------------------------------------------------------------
# 6. GLOVE FROM SCRATCH
# ------------------------------------------------------------
def build_cooccurrence(corpus, window):
    """Count how often each ordered word pair occurs within a window.

    For every position in every sentence, each other token at most ``window``
    positions away contributes 1 to the count of ``(word, neighbour)``.

    Args:
        corpus: iterable of sentences, each an indexable sequence of tokens.
        window: maximum distance between the two tokens of a pair.

    Returns:
        Counter mapping ``(word, neighbour)`` tuples to their counts.
    """
    counts = Counter()
    for tokens in corpus:
        length = len(tokens)
        for pos, target in enumerate(tokens):
            lo = max(0, pos - window)
            hi = min(length, pos + window + 1)
            # Left neighbours first, then right neighbours; the center is skipped.
            neighbour_positions = (*range(lo, pos), *range(pos + 1, hi))
            counts.update((target, tokens[j]) for j in neighbour_positions)
    return counts

class GloVe(nn.Module):
    """GloVe model: two embedding tables plus one scalar bias per word in each."""

    def __init__(self, voc_size, emb_size):
        """Create the word/context embeddings and their bias tables.

        Args:
            voc_size: number of words in the vocabulary.
            emb_size: dimensionality of the word embeddings.
        """
        super().__init__()
        self.wi = nn.Embedding(voc_size, emb_size)
        self.wj = nn.Embedding(voc_size, emb_size)
        self.bi = nn.Embedding(voc_size, 1)
        self.bj = nn.Embedding(voc_size, 1)

    def forward(self, i, j, x, w):
        """Return the mean weighted squared error for a batch of word pairs.

        Args:
            i: LongTensor of ids looked up in ``wi`` / ``bi``.
            j: LongTensor of ids looked up in ``wj`` / ``bj``.
            x: target value for each pair.
            w: weight applied to each pair's squared error.

        Returns:
            Scalar tensor: mean of ``w * (wi.wj + bi + bj - x) ** 2``.
        """
        vec_i = self.wi(i)
        vec_j = self.wj(j)
        similarity = (vec_i * vec_j).sum(1)
        bias_i = self.bi(i).squeeze()
        bias_j = self.bj(j).squeeze()
        residual = similarity + bias_i + bias_j - x
        weighted_sq_err = w * residual ** 2
        return weighted_sq_err.mean()

# Count co-occurrences over the word-level corpus, using the same window as the Word2Vec cells.
cooc = build_cooccurrence(corpus, window_size)
# Materialize ((word_i, word_j), count) items as a list so glove_batch can sample from it.
pairs = list(cooc.items())

model_glove = GloVe(voc_size, emb_size)
# NOTE: rebinds the module-level `optimizer` name, now tracking the GloVe parameters.
optimizer = optim.Adam(model_glove.parameters(), lr=0.001)

def glove_batch(batch_size):
    """Sample a training batch from the module-level co-occurrence ``pairs``.

    Args:
        batch_size: number of distinct pair entries to sample without replacement.

    Returns:
        tuple of four tensors:
            - LongTensor of ids for the first word of each pair,
            - LongTensor of ids for the second word of each pair,
            - FloatTensor of ``log(count)`` targets,
            - FloatTensor of weights ``min(1, (count / 100) ** 0.75)``.
    """
    sampled = random.sample(pairs, batch_size)

    first_ids = [word2index[first] for (first, _), _ in sampled]
    second_ids = [word2index[second] for (_, second), _ in sampled]
    log_counts = [math.log(count) for _, count in sampled]
    # Weighting function capped at 1.0 once the count reaches 100.
    weights = [min(1.0, (count / 100) ** 0.75) for _, count in sampled]

    return (
        torch.LongTensor(first_ids),
        torch.LongTensor(second_ids),
        torch.FloatTensor(log_counts),
        torch.FloatTensor(weights),
    )

# One epoch visits as many batches as fit into the list of co-occurrence pairs.
n_batches = len(pairs) // batch_size

for epoch in range(epochs):
    loss_sum = 0
    for _ in range(n_batches):
        batch = glove_batch(batch_size)  # (ids_i, ids_j, log-counts, weights)
        loss = model_glove(*batch)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        loss_sum += loss.item()
    print(f"GloVe Epoch {epoch+1} | Loss: {loss_sum:.4f}")