In [1]:
# 使用 tensorflow 实现 word2vec

import tensorflow as tf
import process_data
import numpy as np
import random

In [2]:
# 数据探索
DOWNLOAD_URL = 'http://mattmahoney.net/dc/'
EXPECTED_BYTES = 31344016
DATA_FOLDER = './data/'
FILE_NAME = 'text8.zip'

## parameters
VOCAB_SIZE = 50000
BATCH_SIZE = 128
EMBED_SIZE = 128 # dimension of the word embedding vectors
SKIP_WINDOW = 1 # the context window
NUM_SAMPLED = 64    # Number of negative examples to sample.
LEARNING_RATE = 1.0
NUM_TRAIN_STEPS = 10000
SKIP_STEP = 2000 # how many steps to skip before reporting the loss

process_data.download(FILE_NAME, EXPECTED_BYTES, DATA_FOLDER, DOWNLOAD_URL)

数据集已经存在于: ./data/text8.zip


'./data/text8.zip'

In [3]:
words = process_data.read_data(DATA_FOLDER + FILE_NAME)

 anarchism originated as a term of abuse first used against early working class radicals including the diggers of the english revolution and the sans culottes of the french revolution whilst the term is still used in a pejorative way to describe any act that used violent means to destroy the organization of society it has also been taken up as a positive label by self defined anarchists the word anarchism is derived from the greek without archons ruler chief king anarchism as a political philosophy is the belief that rulers are unnecessary and should be abolished although there are differing interpretations of what this means anarchism also refers to related social movements that advocate the elimination of authoritarian institutions particularly the state the word anarchy as most anarchists use it does not imply chaos nihilism or anomie but rather a harmonious anti authoritarian society in place of what are regarded as authoritarian political structures and coercive economic instituti

In [4]:
len(words)

17005207

In [5]:
vocab_to_int, int_to_vocab = process_data.build_vocab(words, VOCAB_SIZE)

In [6]:
int_vocab = [vocab_to_int[word] if word in vocab_to_int else 0 for word in words]

In [7]:
def generate_sample(index_words, context_window_size):
    """ Form training pairs according to the skip-gram model. """
    for index, center in enumerate(index_words):
        context = random.randint(1, context_window_size)
        # get a random target before the center word
        for target in index_words[max(0, index - context): index]:
            yield center, target
        # get a random target after the center wrod
        for target in index_words[index + 1: index + context + 1]:
            yield center, target
            
            
single_gen = generate_sample(int_vocab, SKIP_WINDOW)
single_gen

<generator object generate_sample at 0x7f58bd39c308>

In [8]:
def get_batch(iterator, batch_size):
    """ Group a numerical stream into batches and yield them as Numpy arrays. """
    while True:
        center_batch = np.zeros(batch_size, dtype=np.int32)
        target_batch = np.zeros([batch_size, 1])
        for index in range(batch_size):
            center_batch[index], target_batch[index] = next(iterator)
        yield center_batch, target_batch
        
batch_gen = get_batch(single_gen, BATCH_SIZE)
batch_gen

<generator object get_batch at 0x7f58bd39cba0>

In [9]:
# 定义图(Graph)

## 1.定义输入和输出
with tf.name_scope('data'):
    center_words = tf.placeholder(tf.int32, shape=[BATCH_SIZE], name='center_words')
    target_words = tf.placeholder(tf.int32, shape=[BATCH_SIZE, 1], name='target_words')

## 2.定义权重
with tf.name_scope('embedding_matrix'):
    embed_matrix = tf.Variable(tf.random_normal([VOCAB_SIZE, BATCH_SIZE], -1.0, 1.0), name='embed_matrix')

with tf.name_scope('loss'):
    ## 3.实现 embedding_lookup()
    embed = tf.nn.embedding_lookup(embed_matrix, center_words)

    ## 4.定义 LOSS 函数

    ### 4.1 定义 weights 和 bias 使得隐藏层可以计算 NCE loss
    nce_weight = tf.Variable(tf.truncated_normal([VOCAB_SIZE, EMBED_SIZE], stddev=1.0 / EMBED_SIZE ** 0.5))
    nce_bias = tf.Variable(tf.zeros([VOCAB_SIZE]))

    ### 4.2 实现 nce_loss
    loss = tf.reduce_mean(tf.nn.nce_loss(
        weights = nce_weight,
        biases = nce_bias,
        labels = target_words,
        inputs = embed,
        num_sampled = NUM_SAMPLED,
        num_classes = VOCAB_SIZE
    ))

## 5.定义优化器
optimizer = tf.train.GradientDescentOptimizer(LEARNING_RATE).minimize(loss)

In [10]:
# 在 SESSION 中计算图（运行模型）
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    
    writer = tf.summary.FileWriter('./data/no_frills/', sess.graph)
    total_loss = 0.0
    
    for index in range(NUM_TRAIN_STEPS):
            centers, targets = next(batch_gen)
            loss_batch, _ = sess.run([loss, optimizer], 
                                    feed_dict={center_words: centers, target_words: targets})
            total_loss += loss_batch
            if (index + 1) % SKIP_STEP == 0:
                print('Average loss at step {}: {:5.1f}'.format(index, total_loss / SKIP_STEP))
                total_loss = 0.0
    
    writer.close()

Average loss at step 1999: 121.4
Average loss at step 3999:  66.5
Average loss at step 5999:  38.3
Average loss at step 7999:  26.9
Average loss at step 9999:  20.4
