# Word2Vec with Negative Sampling Loss

### Preprocess corpus

Map each word to a unique Id and calculate word frequency, unigram distribution, subsampling frequency and negative sampling frequency of each word.

In [111]:
import numpy as np
import tensorflow as tf
from nltk.corpus import reuters
from collections import Counter, defaultdict
import json
import os
from scipy.stats import spearmanr

In [112]:
def preprocess_corpus(corpus_words, ignore_threshold=5):
    # Consider only alphabetic words.
    words = [wrd.lower() for wrd in corpus_words if wrd.isalpha()]
    word_freq = Counter(words)
    
    # Ignore words which occur less than threshold times in all documents.
    # Replace them by a special UNK word.
    if ignore_threshold > 0:
        unk_word = 'UNK'
        unk_cnt = 0
        unique_words = list(word_freq.keys())
        for wrd in unique_words:
            if word_freq[wrd] < ignore_threshold:
                unk_cnt += word_freq[wrd]
                del word_freq[wrd]
    
    # Create a Word to ID map.
    total_words = sum(cnt for wrd, cnt in word_freq.items())
    increment = 1 if ignore_threshold > 0 else 0
    word_to_id = {wrd: (i+increment) for i, wrd in enumerate(word_freq)}
    if ignore_threshold > 0:
        word_to_id[unk_word] = 0
    
    # Generate Pn and Ps probabilities.
    total_pn = sum(np.power(float(word_freq[wrd])/total_words, 0.75) for wrd in word_freq)
    word_metadata = {
        word_to_id[wrd]: {
            'freq': word_freq[wrd],
            'Uw': float(word_freq[wrd])/total_words,
            'Ps': 1 - np.sqrt(1e-5 / (float(word_freq[wrd])/total_words)),
            'Pn': np.power(float(word_freq[wrd])/total_words, 0.75) / total_pn,
        } for wrd in word_freq
    }
    
    if ignore_threshold > 0:
        word_metadata[0] = {
            'freq': unk_cnt, 'Uw': 0, 'Ps': 1 - np.sqrt(1e-5 / unk_cnt) , 'Pn': 0
        }
    
    # For numerical stability, due to insufficient precision in float.
    cdf = 0.0
    for wrd in word_metadata:
        cdf += word_metadata[wrd]['Pn']
        word_metadata[wrd]['cdf'] = cdf
    return word_to_id, word_metadata


### Batch Generator

Batch Generator generates a minibatch to train. Output is (x, y) where x is input word and first word of y is target word whereas rest of the words in y are negative samples.

In [113]:
class BatchGenerator(object):
    def __init__(self, corpus, batch_size, split_ratio=0.8, window_size=3, loaded_data=None):
        self.corpus = corpus
        self.ids = corpus.fileids()
        self.batch_size = batch_size
        self.corpus_words = self.corpus.words()
        if loaded_data is None:
            self.word_to_id, self.word_metadata = preprocess_corpus(self.corpus_words)
        else:
            self.word_to_id = loaded_data['word_to_id']
            self.word_metadata = {int(k): v for k, v in loaded_data['word_metadata'].items()}
        self.n_words = len(self.word_to_id)
        self.split_ratio = split_ratio
        self.window_size = window_size
        self.n_steps = 0
        self.splitted = False
        self._cursor = self.window_size + 1
        self.word_ids = [self.word_to_id[wrd] for wrd in self.corpus_words if wrd in self.word_to_id]
        del self.corpus_words
    
    def set_batch_size(self, new_size):
        self.batch_size = new_size
    
    def split(self):
        self.shuffle_data()
        self.splitted = True
        instances = len(self.ids)
        i = int(self.split_ratio*instances)
        self.train_ids = self.ids[:i]
        self.valid_ids = self.ids[i:]
        self.train_word_ids = [self.word_to_id[wrd] for wrd in self.corpus.words(self.train_ids) if wrd in self.word_to_id]
        self.n_train_words = len(self.train_word_ids)
        self.validation_word_ids = [self.word_to_id[wrd] for wrd in self.corpus.words(self.train_ids) if wrd in self.word_to_id]
        self.n_validation_words = len(self.validation_word_ids)
    
    def get_validation_data(self):
        if self.splitted:
            return self.valid_data, self.valid_labels
        else:
            return None
    
    def __iter__(self):
        self._cursor = self.window_size + 1
        return self
    
    def get_negative_probs(self):
        return [self.word_metadata[wrd]['Pn'] for wrd in sorted(self.word_metadata.keys())]
    
    def _increment_cursor(self, max_len):
        self._cursor = (self._cursor + 1) % (max_len)
        if self._cursor == 0:
            self._cursor = self.window_size + 1
            return True
        return False
    
    @classmethod
    def load_from_metadata(cls, fname, corpus, batch_size, split_ratio=0.8, window_size=3):
        f = open(fname, 'r')
        data = json.loads(f.read())
        f.close()
        return cls(corpus, batch_size, split_ratio=split_ratio, window_size=window_size, loaded_data=data)
        
    def store_metadata(self, fname):
        data = {
            'word_to_id': self.word_to_id,
            'word_metadata': self.word_metadata
        }
        f = open(fname, 'w')
        f.write(json.dumps(data))
        f.close()

    def __next__(self):
        word_ids = self.word_ids if not self.splitted else self.train_word_ids
        batch_input = []
        batch_output = []
        Pds = np.random.uniform(size=(self.batch_size))
        sample_cnt = 0
        while len(batch_input) < self.batch_size:
            wrd = word_ids[self._cursor]
            
            if Pds[sample_cnt] > self.word_metadata[wrd]['Ps']:
                batch_input.append(wrd)
                targets = word_ids[self._cursor - self.window_size: self._cursor]
                epoch = self._increment_cursor(len(word_ids))
                targets += word_ids[self._cursor: self._cursor + self.window_size]
                batch_output.append(targets)
            
            sample_cnt += 1
            if sample_cnt == self.batch_size:
                Pds = np.random.uniform(size=(self.batch_size))
                sample_cnt = 0
        
        return np.array(batch_input).reshape(-1, 1).astype(np.int32), np.array(batch_output), epoch
    
    def next(self):
        return self.__next__()


In [114]:
class PairBatchGenerator(BatchGenerator):
    def __init__(self, corpus, batch_size, split_ratio=0.8, window_size=3, loaded_data=None):
        super(PairBatchGenerator, self).__init__(corpus, batch_size, split_ratio=split_ratio, window_size=window_size, loaded_data=loaded_data)
        self._generate_pairs()
        self.shuffle()
    
    def shuffle(self):
        np.random.shuffle(self.pairs)
        
    def _generate_pairs(self):
        word_pairs = defaultdict(set)
        for idx, wrd in enumerate(self.word_ids):
            p = np.random.uniform()
            if p > self.word_metadata[wrd]['Ps']:
                lidx = max(0, idx - self.window_size)
                ridx = min(len(self.word_ids), idx + 1 + self.window_size)
                word_pairs[wrd].update(self.word_ids[lidx:ridx])
        
        pairs = []
        for wrd in word_pairs:
            for owrd in word_pairs[wrd]:
                pairs.append((wrd, owrd))
        
        self.pairs = pairs
    
    def __iter__(self):
        self._cursor = 0
        return self
    
    def __next__(self):
        if self._cursor + self.batch_size >= len(self.pairs):
            rem = (self._cursor + self.batch_size) - len(self.pairs)
            batch = np.array(self.pairs[self._cursor:] + self.pairs[:rem])
            self._cursor = rem
            return batch[:, 0].reshape(-1, 1), batch[:, 1].reshape(-1, 1), True
        _pcursor = self._cursor
        batch = np.array(self.pairs[self._cursor: self._cursor + self.batch_size])
        self._cursor = (self._cursor + self.batch_size) % len(self.pairs)
        return batch[:, 0].reshape(-1, 1), batch[:, 1].reshape(-1, 1), True if _pcursor >= self._cursor else False
        

In [115]:
bg = PairBatchGenerator(reuters, 256)
#sess = tf.Session()

In [None]:
import time
stime = time.time()
for (x, y, p) in bg:
    if p:
        etime = time.time()
        print(x.shape, y.shape, p, etime-stime)
        stime = etime

## The skip-gram architecture

Skip-Gram architecture to train a word2vec model. Define two embedding matrices, one for input word (x) and other for target words in (y). Define negative sampling loss and optimize network over negative sampling loss.

In [26]:
n_words = len(bg.word_to_id)
embedding_dim = 128
batch_size = bg.batch_size
n_neg_samples = 8
window_size = bg.window_size

sampler = tf.distributions.Categorical(probs=bg.get_negative_probs())
with tf.name_scope('Word2Vec-skipgram'):
    with tf.name_scope('Inputs'):
        inputs = tf.placeholder(dtype=tf.int32, shape=(None, 1), name='input-word')
        targets = tf.placeholder(dtype=tf.int32, shape=(None, 2*window_size), name='output-words')
        neg_samples = sampler.sample((bg.batch_size, n_neg_samples))
    with tf.name_scope('Embeddings'):
        U = tf.Variable(tf.truncated_normal((n_words, embedding_dim)), name='U')
        V = tf.Variable(tf.truncated_normal((n_words, embedding_dim)), name='V')
        B = tf.Variable(tf.constant(0., shape=(n_words,)), name='B')
        hist_u = tf.summary.histogram('U', U)
        hist_v = tf.summary.histogram('V', V)
        hist_b = tf.summary.histogram('B', B)
    with tf.name_scope('Score'):
        E1 = tf.reshape(tf.nn.embedding_lookup(U, inputs), (-1, 1, embedding_dim))
        E2 = tf.transpose((tf.nn.embedding_lookup(V, targets)), perm=[0, 2, 1])
        E3 = tf.transpose(tf.negative(tf.nn.embedding_lookup(V, neg_samples)), perm=[0, 2, 1])
        B2 = tf.expand_dims(tf.nn.embedding_lookup(B, targets), 1)
        B3 = tf.expand_dims(tf.nn.embedding_lookup(B, neg_samples), 1)
    with tf.name_scope('Loss'):
        pos_loss = tf.reduce_sum(tf.log(tf.nn.sigmoid(tf.matmul(E1, E2) + B2)), axis=2)
        neg_loss = tf.reduce_sum(tf.log(tf.nn.sigmoid(tf.matmul(E1, E3) + B3)), axis=2)
        loss = tf.negative(tf.add(tf.reduce_mean(pos_loss, axis=0), tf.reduce_mean(neg_loss, axis=0)))
        loss_summary = tf.summary.scalar('loss', loss[0])
    with tf.name_scope('Optimizer'):
        global_step = tf.Variable(0, name='global_step')
        opt = tf.train.AdamOptimizer().minimize(loss, global_step=global_step)
        merged_op = tf.summary.merge([loss_summary, hist_u, hist_v, hist_b])

### A slightly different architecture

In [423]:
n_words = len(bg.word_to_id)
embedding_dim = 256
batch_size = bg.batch_size
n_neg_samples = 8
window_size = bg.window_size

with tf.name_scope('Word2Vec-skipgram'):
    with tf.name_scope('Inputs'):
        inputs = tf.placeholder(dtype=tf.int64, shape=(None, 1), name='input-word')
        targets = tf.placeholder(dtype=tf.int64, shape=(None, 1), name='output-words')
        neg_samples, _, _ = tf.nn.fixed_unigram_candidate_sampler(
            true_classes=targets, num_true=1, num_sampled=n_neg_samples, unique=True,
            range_max=n_words, unigrams=bg.get_negative_probs(), distortion=0.75)
    with tf.name_scope('Embeddings'):
        U = tf.Variable(tf.truncated_normal((n_words, embedding_dim)), name='U')
        V = tf.Variable(tf.truncated_normal((n_words, embedding_dim)), name='V')
    with tf.name_scope('Score'):
        E1 = tf.reshape(tf.nn.embedding_lookup(U, inputs), (-1, embedding_dim))
        E2 = tf.reshape(tf.nn.embedding_lookup(V, targets), (-1, embedding_dim))
        E3 = tf.negative(tf.nn.embedding_lookup(V, neg_samples))
    with tf.name_scope('Loss'):
        pos_loss = tf.reduce_sum(tf.multiply(E1, E2), axis=1)
        neg_loss = tf.reduce_sum(tf.nn.sigmoid(tf.matmul(E1, tf.transpose(E3))), axis=1)
        loss = tf.negative(tf.add(tf.reduce_mean(pos_loss, axis=0), tf.reduce_mean(neg_loss, axis=0)))
        loss_summary = tf.summary.scalar('loss', loss)
    with tf.name_scope('Optimizer'):
        global_step = tf.Variable(0, name='global_step')
        opt = tf.train.AdamOptimizer().minimize(loss, global_step=global_step)
        merged_op = tf.summary.merge([loss_summary])

In [424]:
#sess = tf.Session()
sess.run(tf.global_variables_initializer())

In [425]:
# Writer has to be initialzed before all variables are initialized.
writer = tf.summary.FileWriter('train/NCE-%d-%d-%d' % (embedding_dim, batch_size, n_neg_samples), sess.graph)
summary_steps = 20
checkpoint_steps = 1
sess.run(tf.global_variables_initializer())
saver = tf.train.Saver()

checkpoint_path = './checkpoint/NCE-%d-%d-%d/' % (embedding_dim, batch_size, n_neg_samples)
if not os.path.exists(checkpoint_path):
    os.makedirs(checkpoint_path)

max_epoch = 6
epoch = 0
step = 0
#try:
for (x, y, p) in bg:
    _, step_loss = sess.run([opt, loss], feed_dict={inputs:x, targets:y})
    if p:
        print("Epoch %d" % epoch)
        epoch += 1
    if step % summary_steps == 0:
        merged = sess.run([merged_op], feed_dict={inputs:x, targets:y})[0]
        print("Step: %d, Loss: %.3f" % (step, step_loss))
        writer.add_summary(merged, step)
    step += 1
    if epoch % checkpoint_steps == 0:
        saver.save(sess, '%s/word2vec-%d.ckpt' % (checkpoint_path, epoch))
    if epoch >= max_epoch:
        saver.save(sess, '%s/word2vec-%d.ckpt' % (checkpoint_path, epoch))
        break
#except Exception:
#    saver.save(sess, '%s/word2vec-%d.ckpt' % (checkpoint_path, epoch))

writer.close()

Step: 0, Loss: -4.112
Epoch %d 0
Epoch %d 1
Epoch %d 2
Epoch %d 3
Epoch %d 4
Epoch %d 5


In [421]:
#(x, y, _) = bg.__next__()
#np.squeeze(x.reshape(1, -1)).reshape(-1, 1)
#x.dtype, x.shape
tf.reset_default_graph()
sess.close()

## SimLex - 999 Testing

### Load the pre-trained model graph

In [293]:
g = tf.Graph()
sess = tf.Session(graph=g)

#best_model = './checkpoint/Trial 3/NCE-300-256-10/word2vec-20.ckpt'
saver = tf.train.import_meta_graph('./checkpoint/Trial 4/NCE-300-256-10/word2vec-10.ckpt.meta', graph=g)

# Restore checkpoint
saver.restore(sess, './checkpoint/Trial 4/NCE-300-256-10/word2vec-20.ckpt')

# Get embeddings matrix.
graph = sess.graph
U = graph.get_tensor_by_name('Word2Vec-skipgram/Embeddings/U:0')
V = graph.get_tensor_by_name('Word2Vec-skipgram/Embeddings/V:0')

Wu = np.array(sess.run([U])[0])
Wv = np.array(sess.run([V])[0])
Wu = Wu / np.linalg.norm(Wu, axis=1).reshape(-1, 1)
Wv = Wv / np.linalg.norm(Wv, axis=1).reshape(-1, 1)

INFO:tensorflow:Restoring parameters from ./checkpoint/Trial 4/NCE-300-256-10/word2vec-20.ckpt


In [238]:
Wu.shape

(10428, 300)

### Load SimLex - 999 data

In [239]:
f = open('./SimLex-999/SimLex-999.txt', 'r')
pairs = [[line.split('\t')[0], line.split('\t')[1], float(line.split('\t')[3])] for line in f.readlines()[1:]]
f.close()

In [240]:
bg = BatchGenerator.load_from_metadata('bg-data-5.json', reuters, batch_size=128)

get_idx = lambda x: bg.word_to_id[x] if x in bg.word_to_id else 0
pair_ids = []
for p in pairs:
    if p[0] in bg.word_to_id and p[1] in bg.word_to_id:
        pair_ids.append([get_idx(p[0]), get_idx(p[1]), p[2]])

In [354]:
word_to_vec_scores = []
for p in pair_ids:
    word_to_vec_scores.append([np.dot(Wu[p[0]], Wu[p[1]]), p[2]])

word_to_vec_scores = np.array(word_to_vec_scores).astype(np.float32)
x = word_to_vec_scores[:,0]
y = word_to_vec_scores[:,1]

l = spearmanr(x, y)

In [356]:
print(l.correlation)

0.15695845562342065


In [236]:
tf.reset_default_graph()
sess.close()

## Word Analogy

Evaluate model accuracy on word analogy task.

In [302]:
id_to_word = dict(zip(bg.word_to_id.values(), bg.word_to_id.keys()))
def top_k_closest_words(wrd, k=8):
    idx = bg.word_to_id[wrd]
    emb = Wu[idx]
    idcs = np.argsort(np.abs(np.dot(emb, Wu.T)))[-k:]
    return [id_to_word[i] for i in idcs[::-1]]

def word_analogy(wrd1, wrd2, wrd3, k=1):
    idx1 = bg.word_to_id[wrd1]
    idx2 = bg.word_to_id[wrd2]
    idx3 = bg.word_to_id[wrd3]
    emb = Wu[idx1] - Wu[idx2] + Wu[idx3]
    idcs = np.argsort(np.abs(np.dot(emb, Wu.T)))[-k:]
    return [id_to_word[i] for i in idcs[::-1]]

def sim(wrd1, wrd2):
    idx1 = bg.word_to_id[wrd1]
    idx2 = bg.word_to_id[wrd2]
    return np.dot(Wu[idx1], Wu[idx2])

In [None]:
f = open('questions-words.txt', 'r')
data = [line.split() for line in f.readlines() if not line.startswith(':')]
syntactic = data[:8869]
semantic = data[8869:]

def get_accuracy(data, bg):
    correct = 0
    cnt = 0
    for d in data:
        if all([d[0].lower() in bg.word_to_id, d[1].lower() in bg.word_to_id, d[2].lower() in bg.word_to_id, d[3].lower() in bg.word_to_id]):
            cnt += 1
            if d[3].lower() in word_analogy(d[0].lower(), d[1].lower(), d[2].lower()):
                correct +=1
    return correct, cnt


get_accuracy(data, bg)

Qualitative results for word analogy taks.

In [353]:
#word_analogy('male', 'female', '')
# Biases transport (cheap vs costly), partnership (male vs female), income (male vs female), professor (banking), reporters (male vs female), nations (developed vs developing)
# Professional (male vs female), engineering (man vs female), producer (male vs female), rich (male vs female), loving (male vs female)
# General male vs female is quite interesting.
#top_k_closest_words('male')
sim('bank', 'money'), sim('bank', 'welfare')
#word_analogy('sports', 'team', 'army')
# Quantitative word analogy
# Man, Engineering, Female, Secretariat
# Male, Loving, Female, Dramatic
# Fianance, Bank, Court, Smuggling
# Banker, Bank, Professor, Science
# Barclays (men), income (tax), engineering (appliances, equipment), court (lawsuite, judge)

(0.20896176, 0.14953859)

## Plotting Graphs for various values of hyperparameters

General function for plotting graphs.

In [107]:
%matplotlib inline

import matplotlib.pyplot as plt
import numpy as np

def parse_file(fname):
    with open(fname, 'r') as f:
        data = [float(line.split()[-1]) for line in f.readlines()]
    return data

def plot(fnames, name, mode=None):
    data = {l: parse_file(fname) for l, fname in fnames.items()}
    if mode == 'batch size':
        data = {d: data[d][::(2-i) * 2 if (2-i) * 2 > 0 else 1] for i, d in enumerate(data)}
    for d in data:
        plt.plot(range(len(data[d])), data[d], label=d)
    plt.legend()
    plt.xlabel('steps')
    plt.ylabel('loss')
    plt.title(name)
    plt.savefig('%s-plot.png' % (name))
    plt.close()

In [93]:
files = {
    'Batch Size': {
        'batch size 128': './Reports/Batch Size/Word2Vec-batch-128.txt',
        'batch size 256': './Reports/Batch Size/Word2Vec-batch-256.txt',
        'batch size 512': './Reports/Batch Size/Word2Vec-batch-512.txt'
    },
    'Skip Window': {
        'windows size 3': './Reports/Skip window/Word2Vec-skip-window-3.txt',
        'windows size 6': './Reports/Skip window/Word2Vec-skip-window-6.txt',
        'windows size 10': './Reports/Skip window/Word2Vec-skip-window-10.txt',
        
    },
    'Negative Samples': {
        'neg samples 8': './Reports/Negative Samples/Word2Vec_neg_samples-8.txt',
        'neg samples 12': './Reports/Negative Samples/Word2Vec_neg_samples-12.txt',
        'neg samples 20': './Reports/Negative Samples/Word2Vec_neg_samples-20.txt',
        
    },
    'Embedding Dims': {
        'Embedding Dim 128': './Reports/Embedding dims/Word2Vec-embedding-dim-128.txt',
        'Embedding Dim 256': './Reports/Embedding dims/Word2Vec-embedding-dim-256.txt',
        'Embedding Dim 300': './Reports/Embedding dims/Word2Vec-embedding-dim-300.txt',
    }
}


In [108]:
plot(files['Batch Size'], 'Batch Size', mode='batch size')
plot(files['Skip Window'], 'Skip window')
plot(files['Negative Samples'], 'Negative Samples')
plot(files['Embedding Dims'], 'Embedding Dims')