# Debiased Word2Vec Model

In [4]:
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn import init
from tqdm.auto import tqdm, trange
from collections import Counter
import random
from torch import optim

from torch.utils.tensorboard import SummaryWriter

# Helpful for computing cosine similarity--Note that this is NOT a similarity!
from scipy.spatial.distance import cosine

# Handy command-line argument parsing
import argparse

# Sort of smart tokenization
from nltk.tokenize import RegexpTokenizer

# We'll use this to save our models
from gensim.models import KeyedVectors

#
# IMPORTANT NOTE: Always set your random seeds when dealing with stochastic
# algorithms as it lets your bugs be reproducible and (more importantly) it lets
# your results be reproducible by others.
#
random.seed(1234)
np.random.seed(1234)
torch.manual_seed(1234)

<torch._C.Generator at 0x7fc262561df0>

## Create a class to hold the data

In [5]:
class Corpus:
    
    def __init__(self):

        self.tokenizer = RegexpTokenizer(r'\w+')
        
        # These state variables become populated with function calls
        #
        # 1. load_data()
        # 2. generate_negative_sampling_table()
        #
        # See those functions for how the various values get filled in

        self.word_to_index = {} # word to unique-id
        self.index_to_word = {} # unique-id to word

        # How many times each word occurs in our data after filtering
        self.word_counts = Counter()

        # A utility data structure that lets us quickly sample "negative"
        # instances in a context. This table contains unique-ids
        self.negative_sampling_table = []
        
        # The dataset we'll use for training, as a sequence of unqiue word
        # ids. This is the sequence across all documents after tokens have been
        # randomly subsampled by the word2vec preprocessing step
        self.full_token_sequence_as_ids = None

        # Optional Task 1: Modeling Multi-word Expressions
        self.mwes = {}

        # Optional Task 4: Incorporating Synonyms
        self.synonyms = {}
        
    def tokenize(self, text):
        '''
        Tokenize the document and returns a list of the tokens
        '''
        return self.tokenizer.tokenize(text)        

    def load_data(self, file_name, min_token_freq, mwes_file, synonyms_file):
        '''
        Reads the data from the specified file as long long sequence of text
        (ignoring line breaks) and populates the data structures of this
        word2vec object.
        '''

        # Step 1: Read in the file and create a long sequence of tokens for
        # all tokens in the file
        all_tokens = []
        print('Reading data and tokenizing')
        with open(file_name, 'r', encoding='UTF-8') as file:
            bios = file.read()
        all_tokens = self.tokenize(bios)
        all_tokens = [x.lower() for x in all_tokens]

        # Step 1.1: Read the Multi-word Expressions file and group 
        # multi-world expressions into a single token
        self.load_mwes(mwes_file)
        for i, token in enumerate(all_tokens):
            if token in self.mwes:
                toGroup = True
                for j in range(len(self.mwes[token])):
                    if i + j + 1 >= len(all_tokens) or all_tokens[i + j + 1] != self.mwes[token][j]:
                        toGroup = False
                        break
                if toGroup:
                    for j in range(len(self.mwes[token])):
                        all_tokens[i] += ' ' + self.mwes[token][j]
                        all_tokens[i + j + 1] = '<REMOVE>'
        all_tokens = [token for token in all_tokens if token != '<REMOVE>']    
    
        # Step 2: Count how many tokens we have of each type
        print('Counting token frequencies')
        self.word_counts = Counter(all_tokens)

        # Step 3: Replace all tokens below the specified frequency with an <UNK>
        # token. 
        #
        # NOTE: You can do this step later if needed
        print("Performing minimum thresholding")
        for i, token in enumerate(all_tokens):
            if self.word_counts[token] < min_token_freq:
                all_tokens[i] = '<UNK>'

        # Step 4: update self.word_counts to be the number of times each word
        # occurs (including <UNK>)
        self.word_counts = Counter(all_tokens)
        
        
        # Step 5: Create the mappings from word to unique integer ID and the
        # reverse mapping.
        for i, token in enumerate(list(set(all_tokens))):
            self.index_to_word[i] = token
            self.word_to_index[token] = i

        # Step 5.1: Read the Synonyms file
        self.load_synonyms(synonyms_file)
        
        # Step 6: Compute the probability of keeping any particular *token* of a
        # word in the training sequence, which we'll use to subsample. This subsampling
        # avoids having the training data be filled with many overly common words
        # as positive examples in the context
        total_count = sum(self.word_counts.values())
        word_to_sample_prob = {word: (np.sqrt((count / total_count) / 0.001) + 1) * 0.001 / (count / total_count) for word, count in self.word_counts.items()}
                        
        # Step 7: process the list of tokens (after min-freq filtering) to fill
        # a new list self.full_token_sequence_as_ids where 
        #
        # (1) we probabilistically choose whether to keep each *token* based on the
        # subsampling probabilities (note that this does not mean we drop
        # an entire word!) and 
        #
        # (2) all tokens are convered to their unique ids for faster training.
        #
        # NOTE: You can skip the subsampling part and just do step 2 to get
        # your model up and running.
            
        # NOTE 2: You will perform token-based subsampling based on the probabilities in
        # word_to_sample_prob. When subsampling, you are modifying the sequence itself 
        # (like deleting an item in a list). This action effectively makes the context
        # window  larger for some target words by removing context words that are common
        # from a particular context before the training occurs (which then would now include
        # other words that were previously just outside the window).
        token_ids = []
        for token in all_tokens:
            if word_to_sample_prob[token] < np.random.random():
                continue
            token_ids.append(self.word_to_index[token])
        self.full_token_sequence_as_ids = token_ids

        # Helpful print statement to verify what you've loaded
        print('Loaded all data from %s; saw %d tokens (%d unique)' \
              % (file_name, len(self.full_token_sequence_as_ids),
                 len(self.word_to_index)))


    def load_mwes(self, mwes_file):
        with open(mwes_file, 'r', encoding='UTF-8') as file:
            mwes = file.read().splitlines()

        for mwe in mwes:
            word_list = self.tokenize(mwe)
            word_list = [x.lower() for x in word_list]
            self.mwes[word_list[0]] = word_list[1:]
    

    def load_synonyms(self, synonyms_file):
        with open(synonyms_file, 'r', encoding='UTF-8') as file:
            synonyms = file.read().splitlines()

        for synonym in synonyms:
            word_list = self.tokenize(synonym)
            word_list = [x.lower() for x in word_list]
            word_list_index = []
            for word in word_list:
                if word in self.word_to_index:
                    word_list_index.append(self.word_to_index[word])
            for index in word_list_index:
                self.synonyms[index] = word_list_index
        

    def generate_negative_sampling_table(self, exp_power=0.75, table_size=1e6):
        '''
        Generates a big list data structure that we can quickly randomly index into
        in order to select a negative training example (i.e., a word that was
        *not* present in the context). 
        '''       
        
        # Step 1: Figure out how many instances of each word need to go into the
        # negative sampling table. 
        #
        # HINT: np.power and np.fill might be useful here        
        print("Generating sampling table")
        powers = np.full(len(self.word_to_index), exp_power, dtype=float)
        counts = [self.word_counts[word] for word in self.word_to_index]
        probs = np.power(counts, powers)
        probs /= sum(probs)

        # Step 2: Create the table to the correct size. You'll want this to be a
        # numpy array of type int
        self.negative_sampling_table = [0] * len(self.word_to_index)

        # Step 3: Fill the table so that each word has a number of IDs
        # proportionate to its probability of being sampled.
        #
        # Example: if we have 3 words "a" "b" and "c" with probabilites 0.5,
        # 0.33, 0.16 and a table size of 6 then our table would look like this
        # (before converting the words to IDs):
        #
        # [ "a", "a", "a", "b", "b", "c" ]
        #
        self.negative_sampling_table = np.random.choice(len(self.word_to_index), int(table_size), p=probs)


    def generate_negative_samples(self, cur_context_word_id, num_samples):
        '''
        Randomly samples the specified number of negative samples from the lookup
        table and returns this list of IDs as a numpy array. As a performance
        improvement, avoid sampling a negative example that has the same ID as
        the current positive context word.
        '''

        results = []

        # Create a list and sample from the negative_sampling_table to
        # grow the list to num_samples, avoiding adding a negative example that
        # has the same ID as the current context_word
        for _ in range(num_samples):
            sample = np.random.choice(self.negative_sampling_table)
            while sample == cur_context_word_id:
                sample = np.random.choice(self.negative_sampling_table)
            results.append(sample)

        return results

## Create the corpus

In [6]:
corpus = Corpus()
corpus.load_data('wiki-bios.med.txt', 5, 'bio-mwes.txt', 'synonyms.txt')
corpus.generate_negative_sampling_table()

Reading data and tokenizing
Counting token frequencies
Performing minimum thresholding
Loaded all data from wiki-bios.med.txt; saw 17453348 tokens (100640 unique)
Generating sampling table


## Generate the training data

In [7]:
window_size = 2
num_negative_samples_per_target = 2

training_data = []
    
# Loop through each token in the corpus and generate an instance for each, 
# adding it to training_data
for target_word_index, target_word_id in enumerate(corpus.full_token_sequence_as_ids):

    if corpus.index_to_word[target_word_id] == '<UNK>':
        continue

    if target_word_id in corpus.synonyms:
        target_word_id = np.random.choice(corpus.synonyms[target_word_id])

    # For exach target word in our dataset, select context words 
    # within +/- the window size in the token sequence
    if target_word_index < window_size:
        window_indice = list(range(-target_word_index, 0)) + list(range(1, window_size + 1))
    elif target_word_index > len(corpus.full_token_sequence_as_ids) - window_size - 1:
        window_indice = list(range(-window_size, 0)) + list(range(1, len(corpus.full_token_sequence_as_ids) - target_word_index))
    else:
        window_indice = list(range(-window_size, 0)) + list(range(1, window_size + 1))

    context_ids = [corpus.full_token_sequence_as_ids[target_word_index + i] for i in window_indice]
    predicted_labels = [1 for _ in window_indice]
    
    # For each positive target, we need to select negative examples of
    # words that were not in the context. Use the num_negative_samples_per_target
    # hyperparameter to generate these, using the generate_negative_samples()
    # method from the Corpus class
    negative_ids = []
    for context_id in context_ids:
        negative_ids += corpus.generate_negative_samples(context_id, num_negative_samples_per_target)
        predicted_labels += [0] * num_negative_samples_per_target

    # NOTE: this part might not make sense until later when you do the training 
    # so feel free to revisit it to see why it happens.
    #
    # Our training will use batches of instances together (compare that 
    # with HW1's SGD that used one item at a time). PyTorch will require
    # that all instances in a batches have the same size, which creates an issue
    # for us here since the target wordss at the very beginning or end of the corpus
    # have shorter contexts. 
    # 
    # To work around these edge-cases, we need to ensure that each instance has
    # the same size, which means it needs to have the same number of positive
    # and negative examples. Since we are short on positive examples here (due
    # to the edge of the corpus), we can just add more negative samples.
    #
    # YOUR TASK: determine what is the maximum number of context words (positive
    # and negative) for any instance and then, for instances that have fewer than
    # this number of context words, add in negative examples.
    #
    # NOTE: The maximum is fixed, so you can precompute this outside the loop
    # ahead of time.
    while len(predicted_labels) < window_size * 6:
        context_id = np.random.choice(context_ids)
        negative_ids += [np.random.choice(corpus.generate_negative_samples(context_id, num_negative_samples_per_target))]
        predicted_labels += [0]
    
    target_word_id = np.array([target_word_id])
    word_ids = np.array(context_ids + negative_ids)
    predicted_labels = np.array(predicted_labels)

    training_data += [(target_word_id, word_ids, predicted_labels)]

## Create the network

In [8]:
class Word2Vec(nn.Module):
    
    def __init__(self, vocab_size, embedding_size):
        super(Word2Vec, self).__init__()

        self.vocab_size = vocab_size
        self.embedding_size = embedding_size
        
        # Save what state you want and create the embeddings for your
        # target and context words
        self.target_embeddings = None
        self.context_embeddings = None
        
        # Once created, let's fill the embeddings with non-zero random
        # numbers. We need to do this to get the training started. 
        #
        # NOTE: Why do this? Think about what happens if all the embeddings
        # are all zeros initially. What would the predictions look like for
        # word2vec with these embeddings and how would the updated work?
        
        self.init_emb(init_range=0.5/self.vocab_size)
        
    def init_emb(self, init_range):
        
        # Fill your two embeddings with random numbers uniformly sampled
        # between +/- init_range
        W = torch.tensor(np.random.uniform(-init_range, init_range, [self.vocab_size, self.embedding_size]), dtype = torch.float32)
        self.target_embeddings = nn.Embedding.from_pretrained(W).requires_grad_(True)
        C = torch.tensor(np.random.uniform(-init_range, init_range, [self.vocab_size, self.embedding_size]), dtype = torch.float32)
        self.context_embeddings = nn.Embedding.from_pretrained(C).requires_grad_(True)
        
    def forward(self, target_word_id, context_word_ids):
        ''' 
        Predicts whether each context word was actually in the context of the target word.
        The input is a tensor with a single target word's id and a tensor containing each
        of the context words' ids (this includes both positive and negative examples).
        '''
        
        # NOTE 1: This is probably the hardest part of the homework, so you'll
        # need to figure out how to do the dot-product between embeddings and return
        # the sigmoid. Be prepared for lots of debugging. For some reference,
        # our implementation is three lines and really the hard part is just
        # the last line. However, it's usually a matter of figuring out what
        # that one line looks like that ends up being the hard part.
        
        # NOTE 2: In this homework you'll be dealing with *batches* of instances
        # rather than a single instance at once. PyTorch mostly handles this
        # seamlessly under the hood for you (which is very nice) but batching
        # can show in weird ways and create challenges in debugging initially.
        # For one, your inputs will get an extra dimension. So, for example,
        # if you have a batch size of 4, your input for target_word_id will
        # really be 4 x 1. If you get the embeddings of those targets,
        # it then becomes 4x50! The same applies to the context_word_ids, except
        # that was alreayd a list so now you have things with shape 
        #
        #    (batch x context_words x embedding_size)
        #
        # One of your tasks will be to figure out how to get things lined up
        # so everything "just works". When it does, the code looks surprisingly
        # simple, but it might take a lot of debugging (or not!) to get there.
        
        # NOTE 3: We *strongly* discourage you from looking for existing 
        # implementations of word2vec online. Sadly, having reviewed most of the
        # highly-visible ones, they are actually wrong (wow!) or are doing
        # inefficient things like computing the full softmax instead of doing
        # the negative sampling. Looking at these will likely leave you more
        # confused than if you just tried to figure it out yourself.
        
        # NOTE 4: There many ways to implement this, some more efficient
        # than others. You will want to get it working first and then
        # test the timing to see how long it takes. As long as the
        # code works (vector comparisons look good) you'll receive full
        # credit. However, very slow implementations may take hours(!)
        # to converge so plan ahead.
        
        
        # Hint 1: You may want to review the mathematical operations on how
        # to compute the dot product to see how to do these
        
        # Hint 2: the "dim" argument for some operations may come in handy,
        # depending on your implementation
        
        # Hint 3: printing the shape of the tensors can come in very handy when
        # debugging to see where things aren't lining up
           
        # TODO: Implement the forward pass of word2vec
        v_t = self.target_embeddings(target_word_id)
        v_c = self.target_embeddings(context_word_ids)
        x = torch.matmul(v_t, torch.transpose(v_c, 1, 2))
        outputs = torch.sigmoid(x)
        return torch.squeeze(outputs, 1)

## Train the network

In [11]:
def compute_cosine_similarity(model, word_to_index, word_one, word_two):
    '''
    Computes the cosine similarity between the two words
    '''
    try:
        word_one_index = word_to_index[word_one]
        word_two_index = word_to_index[word_two]
    except KeyError:
        return 0

    embedding_one = model.target_embeddings(torch.LongTensor([word_one_index]))
    embedding_two = model.target_embeddings(torch.LongTensor([word_two_index]))
    similarity = 1 - abs(float(cosine(embedding_one.detach().numpy(),
                                      embedding_two.detach().numpy())))
    return similarity

def some_bias_measuring_function(model):
    women_men = torch.tensor([compute_cosine_similarity(model, corpus.word_to_index, "men", "women")], dtype = torch.float32, requires_grad=True)
    woman_man = torch.tensor([compute_cosine_similarity(model, corpus.word_to_index, "man", "woman")], dtype = torch.float32, requires_grad=True)

    bias1 = torch.tensor([1]) - torch.max(torch.abs(women_men), torch.tensor([0]))
    bias2 = torch.tensor([1]) - torch.max(torch.abs(woman_man), torch.tensor([0]))

    return torch.max(bias1, bias2)

In [12]:
# TODO: Set your training stuff, hyperparameters, models, tensorboard writer etc. here
model = Word2Vec(len(corpus.word_to_index), 50)
optimizer = optim.AdamW(model.parameters(), lr=5e-5)
loss_function = nn.BCELoss()
writer = SummaryWriter()

# HINT: wrapping the epoch/step loops in nested tqdm calls is a great way
# to keep track of how fast things are and how much longer training will take

min_loss = 100
min_bias = 100

for epoch in range(1):

    loss_sum = 0
    bias_sum = 0

    # TODO: use your DataLoader to iterate over the data
    for step, data in tqdm(enumerate(DataLoader(training_data, batch_size=256, shuffle=True))):

        optimizer.zero_grad()
        
        # NOTE: since you created the data as a tuple of three np.array instances,
        # these have now been converted to Tensor objects for us
        target_ids, context_ids, labels = data
        labels = labels.float()
        
        # TODO: Fill in all the training details here
        outputs = model(target_ids, context_ids)
        loss1 = loss_function(outputs, labels)
        loss2 = some_bias_measuring_function(model)
        loss = loss1 + loss2
        loss.backward()
        optimizer.step()

        # TODO: Based on the details in the Homework PDF, periodically
        # report the running-sum of the loss to tensorboard. Be sure
        # to reset the running sum after reporting it.
        
        loss_sum += loss1.item()
        bias_sum += loss2.item()

        if (step + 1) % 100 == 0:
            if bias_sum < min_bias and loss_sum < min_loss:
                min_loss = loss_sum
                min_bias = bias_sum
                torch.save(model, 'bios_med_batch_256_debiased.pth')
            writer.add_scalar('Debiased/Loss', loss_sum, step + 1)
            writer.add_scalar('Debiased/Bias', bias_sum, step + 1)
            loss_sum = 0
            bias_sum = 0

# once you finish training, it's good practice to switch to eval.
writer.close()
model.eval()

67704it [1:40:50, 11.19it/s]


Word2Vec(
  (target_embeddings): Embedding(100640, 50)
  (context_embeddings): Embedding(100640, 50)
)

In [1]:
!tensorboard --logdir=runs

TensorFlow installation not found - running with reduced feature set.

NOTE: Using experimental fast data loading logic. To disable, pass
    "--load_fast=false" and report issues on GitHub. More details:
    https://github.com/tensorflow/tensorboard/issues/4784

Serving TensorBoard on localhost; to expose to the network, use a proxy or pass --bind_all
TensorBoard 2.8.0 at http://localhost:6006/ (Press CTRL+C to quit)
^C


In [4]:
from IPython.display import Image
img1 = 'img/debiased-bias.png'
Image(url=img1)

In [5]:
img2 = 'img/debiased-loss.png'
Image(url=img2)

## Save your model

In [13]:
def save(model, corpus, filename):
    '''
    Saves the model to the specified filename as a gensim KeyedVectors in the
    text format so you can load it separately.
    '''

    # Creates an empty KeyedVectors with our embedding size
    kv = KeyedVectors(vector_size=model.embedding_size)        
    vectors = []
    words = []
    # Get the list of words/vectors in a consistent order
    for index in trange(model.target_embeddings.num_embeddings):
        word = corpus.index_to_word[index]
        vectors.append(model.target_embeddings(torch.LongTensor([index])).detach().numpy()[0])
        words.append(word.replace(' ', '_'))

    # Fills the KV object with our data in the right order
    kv.add_vectors(words, vectors) 
    kv.save_word2vec_format(filename, binary=False)


In [14]:
save(model, corpus, 'bios_med_batch_256_debiased.kv')

100%|██████████| 100640/100640 [00:37<00:00, 2664.96it/s] 
