# Lab 8_1 Word2Vec

In [None]:
import re
import nltk
nltk.download('brown')
from nltk.corpus import brown
import itertools

# Build the corpus: one list of lowercase, purely alphabetic tokens per
# document of the selected Brown categories.
corpus = []

for cat in ['news']:
    for text_id in brown.fileids(cat):
        # Flatten the document's sentences into a single token stream
        raw_text = list(itertools.chain.from_iterable(brown.sents(text_id)))
        text = ' '.join(raw_text)
        text = text.lower()
        # str.replace returns a new string, so the result has to be re-bound
        # (the call was previously a no-op because its result was discarded)
        text = text.replace('\n', ' ')
        # Drop every character that is not a lowercase letter or a space
        text = re.sub(r'[^a-z ]+', '', text)
        # split() without arguments never produces empty strings
        corpus.append(text.split())

In [None]:
from collections import Counter
import random, math

def subsample_frequent_words(corpus):
    """Randomly drop occurrences of frequent words from every text.

    Each word occurrence is kept when a uniform random draw falls below
    ``(1 + sqrt(f * 1e3)) * 1e-3 / f``, where ``f`` is the word's relative
    frequency over the whole corpus. Words for which this value is at
    least 1 are therefore always kept.

    Args:
        corpus: sequence of texts, each text being a sequence of words.

    Returns:
        A new list with one list per input text, holding the surviving
        words in their original order.
    """
    occurrences = Counter(itertools.chain.from_iterable(corpus))
    total = sum(occurrences.values())

    # Keep-probability per distinct word, computed once up front
    keep_probability = {}
    for token, count in occurrences.items():
        frequency = count / float(total)
        keep_probability[token] = (1 + math.sqrt(frequency * 1e3)) * 1e-3 / float(frequency)

    # One random draw per word occurrence, in reading order
    return [
        [token for token in document if random.random() < keep_probability[token]]
        for document in corpus
    ]

In [None]:
# Thin out frequent words, then collect the distinct words that remain
corpus = subsample_frequent_words(corpus)
vocabulary = set(itertools.chain.from_iterable(corpus))

# Two-way lookup between words and integer ids; ids follow the iteration
# order of the vocabulary set
index_to_word = dict(enumerate(vocabulary))
word_to_index = {word: idx for idx, word in index_to_word.items()}

In [None]:
import numpy as np

# Collect (target, context) word pairs from a symmetric window around
# every word of every text.
context_tuple_list = []
w = 4  # window radius: number of neighbours taken on each side of the target

for text in corpus:
    for i, word in enumerate(text):
        first_context_word_index = max(0, i - w)
        # range() excludes its stop value, so the stop must be i + w + 1 for
        # the window to reach w words to the right (it previously stopped
        # one word short, making the window asymmetric)
        last_context_word_index = min(i + w + 1, len(text))
        for j in range(first_context_word_index, last_context_word_index):
            if i != j:
                context_tuple_list.append((word, text[j]))
print("There are {} pairs of target and context words".format(len(context_tuple_list)))

In [None]:
import torch
# Use CUDA when it is available, otherwise stay on the CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

target_tensor_list = []
context_tensor_list = []

# Encode each (target, context) pair as two one-element index tensors
for target, context in context_tuple_list:
    target_tensor = torch.tensor([word_to_index[target]], dtype=torch.long)
    context_tensor = torch.tensor([word_to_index[context]], dtype=torch.long)
    context_tensor_list.append(context_tensor)
    target_tensor_list.append(target_tensor)

# Join the per-pair tensors into one flat index tensor each and move both
# to the selected device
context_tensor = torch.cat(context_tensor_list).to(device)
target_tensor = torch.cat(target_tensor_list).to(device)

In [None]:
import torch.nn  as  nn
import torch.autograd  as autograd
import torch.optim as optim
import torch.nn.functional as F


class Word2Vec(nn.Module):
    """Predict a distribution over the vocabulary from a single word index.

    The input index is embedded, projected to ``vocab_size`` scores by a
    linear layer, and normalised with a log-softmax.
    """

    def __init__(self, embedding_size, vocab_size):
        """Create the embedding table and the output projection.

        Args:
            embedding_size: dimensionality of each word vector.
            vocab_size: number of distinct word indices.
        """
        super(Word2Vec, self).__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_size)
        self.linear = nn.Linear(embedding_size, vocab_size)

    def forward(self, context_word):
        """Return log-probabilities over the vocabulary.

        Args:
            context_word: integer tensor of word indices, of any shape.

        Returns:
            Tensor of shape ``context_word.shape + (vocab_size,)`` whose last
            axis holds log-probabilities.
        """
        emb = self.embeddings(context_word)
        hidden = self.linear(emb)
        # The vocabulary is always the last axis of `hidden`; dim=-1 keeps the
        # normalisation on it for scalar, 1-D and higher-rank index inputs
        # (dim=1 was only correct for 1-D batches of indices).
        out = F.log_softmax(hidden, dim=-1)
        return out

In [None]:
class EarlyStopping():
    """Decide when to stop training from a sliding window of recent losses.

    The window holds at most ``patience`` losses. Training should stop once
    the relative spread ``(max - min) / max`` of the window drops below
    ``min_percent_gain`` percent.
    """

    def __init__(self, patience=5, min_percent_gain=0.1):
        """Set up an empty loss window.

        Args:
            patience: maximum number of most recent losses to keep.
            min_percent_gain: minimum relative improvement, in percent,
                required to keep training.
        """
        self.patience = patience
        self.loss_list = []
        # Stored as a fraction so it can be compared with `gain` directly
        self.min_percent_gain = min_percent_gain / 100.

    def update_loss(self, loss):
        """Record ``loss`` and discard the oldest entry if the window is full."""
        self.loss_list.append(loss)
        if len(self.loss_list) > self.patience:
            del self.loss_list[0]

    def stop_training(self):
        """Return True when the recorded losses no longer improve enough.

        Prints the current gain as a side effect. Returns False when no loss
        has been recorded yet, or when only one loss is available.
        """
        # Nothing recorded yet (or patience == 0): there is no basis for
        # stopping, and max()/min() would raise on an empty list
        if not self.loss_list:
            return False
        highest = max(self.loss_list)
        lowest = min(self.loss_list)
        # A highest loss of exactly 0 would divide by zero; report no gain
        gain = (highest - lowest) / highest if highest != 0 else 0.0
        print(", Loss gain: {}%".format(round(100*gain,2)))
        if len(self.loss_list) == 1:
            return False
        return bool(gain < self.min_percent_gain)

In [None]:
from torch.utils.data import DataLoader, TensorDataset

# Vocabulary size and initialization
vocabulary_size = len(vocabulary)

# Max no. of epochs
epochs = 100

# Batch size (modify this according to your preference)
batch_size = 1024

# Create the Word2Vec model and move it to the selected device
net = Word2Vec(embedding_size=2, vocab_size=vocabulary_size).to(device)
# The model's forward pass already returns log-probabilities (log_softmax),
# so the matching criterion is NLLLoss. CrossEntropyLoss expects raw logits
# and would apply a second, redundant log-softmax to the model output.
loss_function = nn.NLLLoss()
optimizer = optim.Adam(net.parameters())
early_stopping = EarlyStopping(patience=4, min_percent_gain=0.2)

# Create a DataLoader for batching; each item is an (input index, label index) pair
dataset = TensorDataset(context_tensor, target_tensor)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

In [None]:
import time
# Train for at most `epochs` epochs, leaving early once the loss plateaus
for epoch in range(epochs):
    start_time = time.time()

    losses = []
    for context_batch, target_batch in dataloader:
        # Discard gradients accumulated by the previous step
        net.zero_grad()

        # Forward pass and loss; both batches already live on `device`
        log_probs = net(context_batch)
        loss = loss_function(log_probs, target_batch)

        # Backward pass followed by the parameter update
        loss.backward()
        optimizer.step()

        # Keep a CPU copy of the batch loss, detached from the graph
        losses.append(loss.detach().cpu().numpy())

    epoch_time = time.time() - start_time
    mean_loss = np.mean(losses)

    # No newline here: early_stopping.stop_training() prints the rest of the line
    print(f"Epoch [{epoch + 1}/{epochs}], Time: {epoch_time:.4f}s, Loss: {mean_loss:.5f}", end="")

    # Feed the epoch's mean loss to the early-stopping window and check it
    early_stopping.update_loss(mean_loss)
    if early_stopping.stop_training():
        print(f"Early stopping at epoch {epoch + 1}")
        break


In [None]:
import random

def get_batches(context_tuple_list, batch_size=128):
    """Split the training samples into randomly ordered mini-batches.

    Args:
        context_tuple_list: sequence of ``(target, context, negatives)``
            samples, where ``target`` and ``context`` are words and
            ``negatives`` is a sequence of words. All words must be keys of
            the module-level ``word_to_index``.
        batch_size: maximum number of samples per batch; the last batch may
            be smaller.

    Returns:
        A list of ``(target, context, negative)`` tuples of int64 tensors
        holding the word indices of each batch.

    Raises:
        ValueError: if ``batch_size`` is not positive.
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")

    # Shuffle positions rather than the caller's list, so the argument is
    # left untouched
    order = list(range(len(context_tuple_list)))
    random.shuffle(order)

    batches = []
    for start in range(0, len(order), batch_size):
        chunk = [context_tuple_list[k] for k in order[start:start + batch_size]]
        # Plain tensors replace the deprecated autograd.Variable wrapper
        tensor_target = torch.tensor(
            [word_to_index[sample[0]] for sample in chunk], dtype=torch.long)
        tensor_context = torch.tensor(
            [word_to_index[sample[1]] for sample in chunk], dtype=torch.long)
        tensor_negative = torch.tensor(
            [[word_to_index[neg] for neg in sample[2]] for sample in chunk],
            dtype=torch.long)
        batches.append((tensor_target, tensor_context, tensor_negative))
    return batches

In [None]:
from numpy.random import multinomial
import time

def sample_negative(sample_size):
    """Yield an endless stream of negative-sample word lists.

    Words are drawn from the module-level ``corpus`` with probability
    proportional to their count raised to the power 0.75.

    Args:
        sample_size: number of words in every yielded list.

    Yields:
        A list of ``sample_size`` words (plain ``str``); repeated draws of
        the same word appear next to each other.
    """
    word_counts = Counter(itertools.chain.from_iterable(corpus))
    normalizing_factor = sum(count ** 0.75 for count in word_counts.values())
    words = np.array(list(word_counts))
    # The distribution never changes, so build it once instead of on every draw
    sample_probability = [
        count ** 0.75 / normalizing_factor for count in word_counts.values()
    ]
    while True:
        # sampled_counts[k] is how many times words[k] was drawn
        sampled_counts = multinomial(sample_size, sample_probability)
        # np.repeat expands the counts into the word list without a Python
        # loop over the whole vocabulary
        yield np.repeat(words, sampled_counts).tolist()

In [None]:
import numpy as np

# Collect (target, context, negative samples) triples from a symmetric
# window around every word of every text.
context_tuple_list = []
w = 4  # window radius: number of neighbours taken on each side of the target
negative_samples = sample_negative(8)

for text in corpus:
    for i, word in enumerate(text):
        first_context_word_index = max(0, i - w)
        # range() excludes its stop value, so the stop must be i + w + 1 for
        # the window to reach w words to the right (it previously stopped
        # one word short, making the window asymmetric)
        last_context_word_index = min(i + w + 1, len(text))
        for j in range(first_context_word_index, last_context_word_index):
            if i != j:
                # Every positive pair gets its own fresh draw of negatives
                context_tuple_list.append((word, text[j], next(negative_samples)))
print("There are {} pairs of target and context words".format(len(context_tuple_list)))

In [None]:
import torch
import torch.nn as nn
import torch.autograd as autograd
import torch.optim as optim
import torch.nn.functional as F


class Word2Vec(nn.Module):
    """Word embeddings trained with a negative-sampling objective.

    Two embedding tables are kept, one for target words and one for context
    words. ``forward`` returns the loss for a batch directly.
    """

    def __init__(self, embedding_size, vocab_size):
        """Create the target and context embedding tables.

        Args:
            embedding_size: dimensionality of each word vector.
            vocab_size: number of distinct word indices.
        """
        super(Word2Vec, self).__init__()
        self.embeddings_target = nn.Embedding(vocab_size, embedding_size)
        self.embeddings_context = nn.Embedding(vocab_size, embedding_size)

    def forward(self, target_word, context_word, negative_example):
        """Return the negative-sampling loss summed over the batch.

        Args:
            target_word: 1-D integer tensor of target indices, shape (B,).
            context_word: 1-D integer tensor of context indices, shape (B,).
            negative_example: 2-D integer tensor of negative indices,
                shape (B, K).

        Returns:
            Scalar tensor: minus the sum over the batch of
            ``logsigmoid(t.c)`` plus ``logsigmoid(-t.n_k)`` for every
            negative ``n_k``.
        """
        emb_target = self.embeddings_target(target_word)
        emb_context = self.embeddings_context(context_word)
        # Dot product of each target with its true context word -> (B,)
        positive_score = torch.sum(torch.mul(emb_target, emb_context), dim=1)
        out = torch.sum(F.logsigmoid(positive_score))
        emb_negative = self.embeddings_context(negative_example)
        # Dot product of each target with each of its negatives -> (B, K)
        negative_score = torch.bmm(emb_negative, emb_target.unsqueeze(2)).squeeze(2)
        # logsigmoid is applied per negative and only then summed; summing
        # the scores first would let negatives cancel each other out
        out = out + torch.sum(F.logsigmoid(-negative_score))
        return -out

In [None]:
import time

vocabulary_size = len(vocabulary)

# Not used by the loop below (the model's forward pass returns the loss
# itself); kept so the name stays defined
loss_function = nn.CrossEntropyLoss()
net = Word2Vec(embedding_size=200, vocab_size=vocabulary_size)
optimizer = optim.Adam(net.parameters())
early_stopping = EarlyStopping(patience=5, min_percent_gain=1)

# Train for at most `max_epochs` epochs. A dedicated epoch variable is used
# because the inner batch loop previously reused the epoch counter `i`, which
# made the number of epochs depend on the number of batches.
max_epochs = 5
for epoch in range(max_epochs):
    losses = []
    # Batches are reshuffled at the start of every epoch
    context_tuple_batches = get_batches(context_tuple_list, batch_size=2000)
    for target_tensor, context_tensor, negative_tensor in context_tuple_batches:
        net.zero_grad()
        loss = net(target_tensor, context_tensor, negative_tensor)
        loss.backward()
        optimizer.step()
        # Store a plain float so np.mean works on numbers, not tensors
        losses.append(loss.item())
    mean_loss = np.mean(losses)
    print("Loss: ", mean_loss)
    early_stopping.update_loss(mean_loss)
    if early_stopping.stop_training():
        break

In [None]:
import numpy as np

def get_closest_word(word, topn=5):
    """Return the words whose target embeddings are nearest to ``word``.

    Distances come from ``nn.PairwiseDistance`` with its default settings,
    applied to the module-level ``net.embeddings_target`` table.

    Args:
        word: query word; must be a key of ``word_to_index``.
        topn: maximum number of neighbours to return.

    Returns:
        A list of ``(word, distance)`` tuples sorted by increasing distance,
        excluding the query word itself.

    Raises:
        KeyError: if ``word`` is not in ``word_to_index``.
    """
    i = word_to_index[word]
    pdist = nn.PairwiseDistance()
    # No gradients are needed for a lookup, and a single broadcast call
    # replaces one embedding lookup plus one distance call per vocabulary word
    with torch.no_grad():
        weights = net.embeddings_target.weight
        distances = pdist(weights[i].unsqueeze(0), weights).tolist()
    word_distance = [
        (index_to_word[j], distances[j])
        for j in range(len(vocabulary))
        if j != i
    ]
    word_distance.sort(key=lambda x: x[1])
    return word_distance[:topn]

In [None]:
# Show the nearest neighbours of 'the' according to the target embeddings of `net`
print("Closest words to 'the':", get_closest_word('the'))