In [9]:
import collections
import math
import matplotlib as plt
import os
import random
from tempfile import gettempdir
import zipfile
import numpy as np
import urllib
import tensorflow as tf

<img src="https://www.researchgate.net/profile/Wang_Ling/publication/281812760/figure/fig1/AS:613966665486361@1523392468791/Illustration-of-the-Skip-gram-and-Continuous-Bag-of-Word-CBOW-models.png" width="500">

# Implement Wordvec with `numpy`

## Preparation

Reference: http://nbviewer.jupyter.org/github/fbkarsdorp/doc2vec/blob/master/doc2vec.ipynb

- Define some sample sentences

In [213]:
sentences = ['the king loves the queen', 'the queen loves the king',
             'the dwarf hates the king', 'the queen hates the dwarf',
             'the dwarf poisons the king', 'the dwarf poisons the queen']

- Define vocabulary mapping

In [214]:
from collections import defaultdict

def Vocabulary():
    dictionary = defaultdict()
    dictionary.default_factory = lambda: len(dictionary)
    return dictionary

def docs2bow(docs, dictionary):
    for doc in docs:
        yield [dictionary[word] for word in doc.split()]

- Add index to each **word** in **document**

In [215]:
vocabulary = Vocabulary()
sentences_bow = list(docs2bow(sentences, vocabulary))
sentences_bow

[[0, 1, 2, 0, 3],
 [0, 3, 2, 0, 1],
 [0, 4, 5, 0, 1],
 [0, 3, 5, 0, 4],
 [0, 4, 6, 0, 1],
 [0, 4, 6, 0, 3]]

In [216]:
print('The index of word `queen` is: ',vocabulary['queen'])

The index of word `queen` is:  3


In [217]:
print('There is a total of ', len(vocabulary), ' words in the vocabulary')

There is a total of  7  words in the vocabulary


Ref: http://www.claudiobellei.com/2018/01/06/backprop-word2vec/

## Neural network and Back-propagation

- Define calculation of probability
    - $$ P(u_O|v_I) = \frac{exp(u_I v_O^T)}{\sum_{w}exp(u_I v_j^T)}$$
    - $w$ is entire vocabulary
    - $u$ = Input-hiddern matrix
    - $v$ = hidden-output matrix


In [218]:
def u(word, WI):
    return WI[vocabulary[word]]

def v(word, WO):
    return WO[vocabulary[word]]

def v_T(word, WO):
    return WO.T[vocabulary[word]]

In [219]:
# p(word2|word1), where word 1 is context, word 2 is target
def p(word1, word2, U, V):
    return np.exp(  np.dot(u(word1, U), 
                           v_T(word2, V)
                          )
                 ) / \
                sum(np.exp(
                        np.dot(u(word1, U), 
                               v_T(w, V)
                              )
                          ) 
                    for w in vocabulary
                    )

- Define loss function
    - $i$ is index of sample.$$Loss = -\sum_{i} log (P(w_O|w_I)) = -\sum_{i}log(P_O|I)$$


    
    
- Define update for hidden-output matrix $v$    
    - For words other than $O$: $$\frac{\partial L}{\partial v_j} = -\frac{1}{P} \frac{\partial P}{\partial v_j} = u_I P$$
    - For word $O$:$$\frac{\partial L}{\partial v_O} = -\frac{1}{P} \frac{\partial P}{\partial v_O} = u_I (P-1)$$
    - Update rule: $$v = v - \mu  \frac{\partial L}{\partial v}$$




- Define update for input-hidden matrix $u$    
  - $$\frac{\partial L}{\partial u_I} = -\frac{1}{P} \frac{\partial P}{\partial u_I} = \sum_{j \neq O}{v_j P} + v_O (P-1)$$
  - Update rule:$$u_I = u_I - \mu \frac{\partial L}{\partial u_I}$$

In [224]:
def gradient_WO(input_word, target_word, word, U, V):
    p_word = p(input_word, word, U, V)
    t = 1 if word == target_word else 0
    error = p_word - t
    
    return error * u(input_word, U)

In [225]:
def gradient_WI(input_word, target_word, word, U, V):
    g = 0
    for word in vocabulary:
        p_word = p(input_word, word, U, V)
        t = 1 if word == target_word else 0
        error = p_word - t
        g += error * v_T(word, V)
    
    return g

## Test with a word pair

In [230]:
#test
target_word = 'king'
input_word = 'queen'
learning_rate = 1

In [231]:
#test
V, N = len(vocabulary), 3
WI = (np.random.random((V, N)) - 0.5) / N
WO = (np.random.random((N, V)) - 0.5) / V
p('queen','king',WI, WO)

0.14402139850570186

In [232]:
for i in range(6):
    WO_ = WO
    WI_ = WI
    
    # update v_j for each j
    for word in vocabulary:
        WO.T[vocabulary[word]] = WO.T[vocabulary[word]] - \
                                 learning_rate * gradient_WO(input_word, target_word, word, WI_, WO_)
    
    # update u_I
    WI[vocabulary[input_word]] = WI[vocabulary[input_word]] - \
                                 learning_rate * gradient_WI(input_word, target_word, word, WI_, WO_)
    
    print(p('queen','king',WI, WO))

0.15318926807664865
0.2142539254707067
0.6300275186571097
0.9124661578311665
0.9547324038771844
0.9703354033408761


## Multi-word context

<img src="http://www.claudiobellei.com/2018/01/06/backprop-word2vec/CBOW_backprop.png" width=700>

- The only changes are:
    - For updating $V$: $$h_I = \frac{1}{C}(u_1+...+u_{Ik}+...u_C)$$
    
    - For updating $U$: $$u_{Ik} = u_{Ik} - \mu \frac{1}{C}\frac{\partial L}{\partial h_I}$$
 


## Paragraph vec

<img src="http://nbviewer.jupyter.org/github/fbkarsdorp/doc2vec/blob/master/doc2vec.png" width=300>

- Think of $D_k$ in document $k$ as another word vector to train in a multi-word context.
    $$h_I = \frac{1}{C}(D_k + u_1+...+u_{Ik}+...u_C)$$
    

# Build a `Skip-Gram` model with tensorflow

## Import data as list of string

In [2]:
url = 'http://mattmahoney.net/dc/'
def download(filename):
    local_filename = os.path.join(gettempdir(), filename)
    if not os.path.exists(local_filename):
        local_filename, _ = urllib.request.urlretrieve(url + filename,local_filename)
    return local_filename

In [3]:
filename = download('text8.zip')

## Read the data into Vocabulary list

In [4]:
def read_data(filename):
    with zipfile.ZipFile(filename) as f:
        data = tf.compat.as_str(f.read(f.namelist()[0])).split()
    return data
vocabulary = read_data(filename)
print('Vocabulary size', len(vocabulary))

Vocabulary size 17005207


In [5]:
vocabulary[:5]

['anarchism', 'originated', 'as', 'a', 'term']

## Build `Count`, `Dictionary`, `Data`, etc.

In [6]:
def build_dataset(words, n_words):
    
    #### count --> [['UNK',?], (word1, count1), ....]
    
    count = [['UNK', -1]] #replace rare words with UNK
    count.extend(collections.Counter(words).most_common(n_words - 1)) 
    
    #### dictionary --> {'word1': index1}
    
    dictionary = dict()
    for word, _ in count:
        dictionary[word] = len(dictionary)
        
    # data --> [index1, index2, ...] 
    
    data = []
    unk_count = 0
    for word in words:
        if word in dictionary:
            index = dictionary[word]
        else:
            index = 0  # 'UNK'
            unk_count += 1
        data.append(index)
    
    #### count --> [['UNK',count_0], (word1, count1), ....]
    
    count[0][1] = unk_count
    
    #### reversed_dictionary --> [{index1: word1}]
    
    reversed_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
    return data, count, dictionary, reversed_dictionary

In [7]:
vocabulary_size = 50000
data, count, dictionary, reverse_dictionary = build_dataset(vocabulary, vocabulary_size)
del vocabulary

In [8]:
data[:3]

[5234, 3081, 12]

In [9]:
count[:3]

[['UNK', 418391], ('the', 1061396), ('of', 593677)]

In [10]:
dictionary['the']

1

In [11]:
reverse_dictionary[1]

'the'

## Prepare training batch

In [12]:
data_index = 0
def generate_batch(batch_size, num_skips, skip_window):
    global data_index
    assert batch_size % num_skips == 0
    assert num_skips <= 2 * skip_window
    batch = np.ndarray(shape=(batch_size), dtype=np.int32)
    labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)
    span = 2 * skip_window + 1  # [ skip_window target skip_window ]
    buffer = collections.deque(maxlen=span)
    for _ in range(span):
        buffer.append(data[data_index])
        data_index = (data_index + 1) % len(data)
    for i in range(batch_size // num_skips):
        target = skip_window  # target label at the center of the buffer
        targets_to_avoid = [skip_window]
        for j in range(num_skips):
            while target in targets_to_avoid:
                target = random.randint(0, span - 1)
            targets_to_avoid.append(target)
            batch[i * num_skips + j] = buffer[skip_window]
            labels[i * num_skips + j, 0] = buffer[target]
        buffer.append(data[data_index])
        data_index = (data_index + 1) % len(data)
    # Backtrack a little bit to avoid skipping words in the end of a batch
    data_index = (data_index + len(data) - span) % len(data)
    return batch, labels


In [13]:
batch, labels = generate_batch(batch_size=8, num_skips=2, skip_window=1)

In [14]:
print('Batch: ')
print(batch)
print('Labels: ')
print(labels)

Batch: 
[3081 3081   12   12    6    6  195  195]
Labels: 
[[  12]
 [5234]
 [3081]
 [   6]
 [  12]
 [ 195]
 [   6]
 [   2]]


In [15]:
print('Format: index-middle, word-middle, index-left, word-left\n')
for i in range(8):
    print(batch[i], reverse_dictionary[batch[i]], '->', 
          labels[i, 0], reverse_dictionary[labels[i, 0]])

Format: index-middle, word-middle, index-left, word-left

3081 originated -> 12 as
3081 originated -> 5234 anarchism
12 as -> 3081 originated
12 as -> 6 a
6 a -> 12 as
6 a -> 195 term
195 term -> 6 a
195 term -> 2 of


*Note that every word has 2 neighboring words*

## Build the skip-gram model.

In [16]:
batch_size = 100
embedding_size = 128  # Dimension of the embedding vector.
skip_window = 1       # How many words to consider left and right.
num_skips = 2         # How many times to reuse an input to generate a label.
num_sampled = 64      # Number of negative examples to sample.

In [17]:
valid_size = 16     # Random set of words to evaluate similarity on.
valid_window = 100  # Only pick dev samples in the head of the distribution.
valid_examples = np.random.choice(valid_window, valid_size, replace=False)

In [18]:
valid_examples

array([11, 94, 37, 76, 71, 92, 32, 75, 87, 68, 61, 21, 53,  2, 23, 85])

### Define Graph - 1, Embedding Mapping

In [19]:
# Input data.
train_inputs = tf.placeholder(tf.int32, shape=[batch_size])
train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])
valid_dataset = tf.constant(valid_examples, dtype=tf.int32)

In [20]:
# initialize weight matrix between -1 and 1, size is v_size * embedding size
embeddings = tf.Variable(tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))

In [21]:
embed = tf.nn.embedding_lookup(embeddings, train_inputs)
embed # Note, shape = (batch_size, embedding_size), i.e., word vec for all inputs

<tf.Tensor 'embedding_lookup:0' shape=(100, 128) dtype=float32>

### Define Graph - 2, From hidden layer to output

In [22]:
nce_weights = tf.Variable(tf.truncated_normal([vocabulary_size, embedding_size], stddev=1.0 / math.sqrt(embedding_size)))
nce_biases = tf.Variable(tf.zeros([vocabulary_size]))
loss = tf.reduce_mean( #mean over batch
      tf.nn.nce_loss(weights = nce_weights,
                     biases = nce_biases,
                     labels = train_labels,
                     inputs = embed,
                     num_sampled = num_sampled,
                     num_classes = vocabulary_size))

optimizer = tf.train.GradientDescentOptimizer(1.0).minimize(loss)

### Define Graph - 3, Add validation metrics

In [24]:
norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keepdims=True))
normalized_embeddings = embeddings / norm
valid_embeddings = tf.nn.embedding_lookup(normalized_embeddings, valid_dataset) # find embedding vec for all valid in
similarity = tf.matmul(valid_embeddings, normalized_embeddings, transpose_b=True) # calculate distance w/ all words

## Start Training

In [29]:
init = tf.global_variables_initializer()
num_steps = 50001

In [30]:
with tf.Session() as session:

    init.run()
    average_loss = 0
    
    for step in range(num_steps):
        batch_inputs, batch_labels = generate_batch(batch_size, num_skips, skip_window)
        feed_dict = {train_inputs: batch_inputs, train_labels: batch_labels}
        _, loss_val = session.run([optimizer, loss], feed_dict=feed_dict)
        average_loss += loss_val

        if step % 2000 == 0:
            if step > 0:
                average_loss /= 2000
            print("Average loss at step ", step, ": ", average_loss)
            average_loss = 0

        # Note that this is expensive (~20% slowdown if computed every 500 steps)
        if step % 10000 == 0 and step > 0:
            sim = similarity.eval()
            for i in range(valid_size):
                valid_word = reverse_dictionary[valid_examples[i]]
                top_k = 3  # number of nearest neighbors
                nearest = (-sim[i, :]).argsort()[1:top_k + 1]
                log_str = 'Nearest to %s:' % valid_word
                for k in range(top_k):
                    close_word = reverse_dictionary[nearest[k]]
                    log_str = '%s %s,' % (log_str, close_word)
                print(log_str)
            final_embeddings = normalized_embeddings.eval()

Average loss at step  0 :  252.850860596
Average loss at step  2000 :  112.89615527
Average loss at step  4000 :  54.0413853676
Average loss at step  6000 :  33.5787463242
Average loss at step  8000 :  23.2382255834
Average loss at step  10000 :  17.5596958621
Nearest to is: was, and, fayetteville,
Nearest to state: awards, generator, codes,
Nearest to also: mishnayot, stake, material,
Nearest to no: sourceforge, tracking, and,
Nearest to world: pyruvate, plasma, loss,
Nearest to system: lived, dryness, shakespeare,
Nearest to be: deg, hailstones, pron,
Nearest to d: repetitive, bay, jerseys,
Nearest to known: fastened, rivers, precise,
Nearest to see: annoyed, afternoons, how,
Nearest to after: and, of, libation,
Nearest to four: nine, zero, eight,
Nearest to can: sex, shooter, vera,
Nearest to of: and, in, for,
Nearest to seven: nine, zero, aquila,
Nearest to during: in, aquila, acids,
Average loss at step  12000 :  14.3223680969
Average loss at step  14000 :  11.7460552073
Average l

# Use `Word2Vec` package to train model

Reference: https://github.com/rouseguy/DeepLearning-NLP/blob/master/notebooks/2.%20word2vec.ipynb

In [1]:
import gensim
from gensim.models.word2vec import Word2Vec
from sklearn.manifold import TSNE
import re
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline

## Import data

In [2]:
caesar_file = './data/juliuscaesar.txt'
stopword_file  = './data/long_stopwords.txt'

## Clean sentence

In [3]:
stop_wordsstop_wo  = []
with open(stopword_file,'r') as inpFile:
    lines = inpFile.readlines()
    stop_words_temp = map(lambda x : re.sub('\n','',x),lines)
    stop_words = list(map(lambda x:  re.sub('[^A-Za-z0-9]+', '',x), stop_words_temp))
stop_words[:5]

['a', 'able', 'about', 'above', 'abst']

In [4]:
def clean(word):
    word = word.strip()
    word = word.lower()
    word = re.sub('[^A-Za-z0-9]+', '', word)
    if word not in stop_words:
        return word
    else:
        return ''

In [9]:
print('Raw: ' + r"they'll" + ' --> Cleaned: ' + clean("they'll"))

Raw: they'll --> Cleaned: 


In [10]:
print('Raw: ' + r"King's" + ' --> Cleaned: ' + clean("King's"))

Raw: King's --> Cleaned: kings


## Extract sentence from text file

In [37]:
line_count = 0
sentences = []
          
with open(caesar_file,'r') as inpFile:
    x = inpFile.readlines()
    for line in x:
        if line is not None or line != '\n':
            words = line.split()
            words = map(lambda x: clean(x), words)
            words = list(filter(lambda x:True if len(x) > 0 else False, words))
            sentences.append(words)

In [38]:
sentences[107:110]

[['second', 'commoner', 'sir', 'wear', 'shoes'],
 [],
 ['work', 'sir', 'holiday']]

## Train a Word2Vec model

In [39]:
# reference: https://radimrehurek.com/gensim/models/word2vec.html
model = Word2Vec (sentences, 
                 window=5, 
                 size=500, 
                 workers=4, 
                 min_count=5)

In [40]:
# Example output:
print('The vector length is: ' + str(len(model.wv['second'])))
print('The vector for word "second" is : ')
print(model.wv['second'][:5])

The vector length is: 500
The vector for word "second" is : 
[ -1.04607592e-04   8.00557493e-04   7.22333963e-04  -5.17591388e-06
   1.67458638e-04]


In [41]:
model.wv.most_similar(positive=['rome'])

[('brutus', 0.4171747863292694),
 ('casca', 0.3478987216949463),
 ('caesar', 0.3422013819217682),
 ('will', 0.33033287525177),
 ('thee', 0.3258987069129944),
 ('gods', 0.320721298456192),
 ('antony', 0.3149747848510742),
 ('cassius', 0.31106653809547424),
 ('good', 0.3107137680053711),
 ('lucius', 0.29914504289627075)]