In [1]:
import tensorflow as tf
import numpy as np
import collections
import random

In [2]:
sentences=['该 词向量 数据 包含 很多 现有 公开 的 词向量 数据 所 欠缺 的 短语',
           '墨玉 河 和田 河 玉龙喀什 河 白玉 河 喀什 河 叶尔羌 河 克里雅 河 玛纳斯 河',
           '腾讯 此次 公开 的 中文 词向量 数据 包含 中文 词汇 其中 每个 词 对应 一个 向量'
           '该 词向量 数据 包含 很多 现有 公开 的 词向量 数据 所 欠缺 的 短语'
          ]

In [3]:
def build_vocab(sentences,vocab_size):
    count=[['UNK',-1]]
    word_list = ' '.join(sentences).split(' ')
    count.extend(collections.Counter(word_list).most_common(vocab_size-1))
    vocab=dict()
    for word,_ in count:
        vocab[word]=len(vocab)
    data=list()
    unk_count=0
    #假设整个语料都是连贯的
    for word in word_list:
        if word in vocab:
            index=vocab[word]
        else:
            index=0
            unk_count+=1
        data.append(index)
    count[0][1]=unk_count
    index2word=dict(zip(vocab.values(),vocab.keys()))
    return data,count,vocab,index2word

In [4]:
def generate_batch(batch_size,num_skips,window):
    global data_index
    assert batch_size%num_skips==0
    assert num_skips<=2*window
    batch=np.ndarray(shape=(batch_size),dtype=np.int32)
    labels=np.ndarray(shape=(batch_size,1),dtype=np.int32)
    span=2*window+1
    buffer=collections.deque(maxlen=span)
    for _ in range(span):
        buffer.append(data[data_index])
        data_index=(data_index+1)%len(data)
    for i in range(batch_size//num_skips):
        context=window
        targets2avoid=[window]
        for j in range(num_skips):
            while context in targets2avoid:
                context=random.randint(0,span-1)
            targets2avoid.append(context)
            batch[i*num_skips+j]=buffer[window]
            labels[i*num_skips+j,0]=buffer[context]
        #因为最大长度为span，最后插入一个的同时头部也会弹出一个
        buffer.append(data[data_index])
        data_index=(data_index+1)%len(data)
    return batch,labels

In [5]:
data, count, vocab, index2word = build_vocab(sentences,vocab_size=20)
del sentences

In [6]:
data_index = 0
batch, labels = generate_batch(batch_size=12, num_skips=6, window=3)
for i in range(12):
    print(batch[i], index2word[batch[i]],
      '->', labels[i, 0], index2word[labels[i, 0]])

5 包含 -> 2 词向量
5 包含 -> 13 该
5 包含 -> 3 数据
5 包含 -> 7 很多
5 包含 -> 8 现有
5 包含 -> 6 公开
7 很多 -> 5 包含
7 很多 -> 8 现有
7 很多 -> 4 的
7 很多 -> 2 词向量
7 很多 -> 3 数据
7 很多 -> 6 公开


In [8]:
vocabulary_size=20
unigrams = [ c / vocabulary_size for token, c in count ]
batch_size = 12
embedding_size = 10  
window = 2      
num_skips = 4  
valid_size = 16     # Random set of words to evaluate similarity on.
valid_window = 100  # Only pick dev samples in the head of the distribution.
valid_examples = np.random.choice(valid_window, valid_size, replace=False)
num_sampled = 64    # Number of negative examples to sample.

In [11]:
graph = tf.Graph()
with graph.as_default():
    train_inputs=tf.placeholder(tf.int32,shape=[batch_size])
    train_labels=tf.placeholder(tf.int32,shape=[batch_size,1])
    valid_dataset=tf.constant(valid_examples,dtype=tf.int32)
    input_ids=train_inputs
    labels=tf.reshape(train_labels,[batch_size])
    # [vocabulary_size, emb_dim] - input vectors
    input_vectors = tf.Variable(
        tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0),
        name="input_vectors")
    # [vocabulary_size, emb_dim] - output vectors
    output_vectors = tf.Variable(
        tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0),
        name="output_vectors")
    label_matrix=tf.reshape(tf.cast(labels,dtype=tf.int64),[batch_size,1])
    #负采样,其中labels_matrix为正确的输出词，采样的时候会跳过这些词，num_sampled为采样个数，distortion即为公式(3-4)中的幂指数
    sampled_ids,_,_=(tf.nn.fixed_unigram_candidate_sampler(
        true_classes=label_matrix,
        num_true=1,
        num_sampled=6,
        unique=True,
        range_max=vocabulary_size,
        distortion=0.75,
        unigrams=unigrams))
    center_vects=tf.nn.embedding_lookup(input_vectors,input_ids)
    context_vects=tf.nn.embedding_lookup(output_vectors,labels)
    sampled_vects=tf.nn.embedding_lookup(output_vectors,sampled_ids)
    incorpus_logits=tf.reduce_sum(tf.multiply(center_vects,context_vects),1)
    incorpus_probabilities=tf.nn.sigmoid(incorpus_logits)
    sampled_logits=tf.matmul(center_vects,sampled_vects,transpose_b=True)
    outcorpus_probabilities=tf.nn.sigmoid(-sampled_logits)
    #有点疑问
    outcorpus_loss_persample=tf.reduce_sum(tf.log(outcorpus_probabilities),1)
    loss_persample=-tf.log(incorpus_probabilities)-outcorpus_loss_persample
    loss=tf.reduce_sum(loss_persample)/batch_size
    
    optimizer = tf.train.GradientDescentOptimizer(.4).minimize(loss)
    init = tf.initialize_all_variables()
    
    norm = tf.sqrt(tf.reduce_sum(tf.square(input_vectors + output_vectors), 1, keep_dims=True))
    normalized_embeddings = (input_vectors + output_vectors) / norm

Instructions for updating:
Use `tf.global_variables_initializer` instead.
Instructions for updating:
keep_dims is deprecated, use keepdims instead


In [12]:
num_steps = 1000
with tf.Session(graph=graph) as session:
    init.run()
    average_loss=0
    for step in range(num_steps):
        batch_inputs, batch_labels = generate_batch(batch_size, num_skips, window)
        feed_dict = {train_inputs : batch_inputs, train_labels : batch_labels}
        _, loss_val = session.run([optimizer, loss], feed_dict=feed_dict)
        average_loss += loss_val
    final_embeddings = normalized_embeddings.eval()

In [13]:
final_embeddings

array([[ 0.30235264, -0.04517799,  0.59678119,  0.40015423, -0.06895108,
        -0.22758423, -0.27130318, -0.28929746, -0.37225044,  0.1945584 ],
       [-0.17173713,  0.64888752,  0.25322616,  0.24661313,  0.30968028,
        -0.26631936,  0.32117972, -0.00290067, -0.01225195,  0.39289582],
       [ 0.27336422, -0.70834923, -0.15362176, -0.20753181, -0.20081806,
        -0.25580165, -0.42221516, -0.10078827, -0.08484125, -0.23549929],
       [ 0.08434121, -0.55544859, -0.15186371, -0.26071402, -0.50537533,
         0.10399426, -0.08743931, -0.2068812 ,  0.36351934, -0.38015571],
       [ 0.31436932, -0.35464245, -0.49916646, -0.57809579, -0.14422184,
        -0.02473691, -0.16658098,  0.21907637, -0.243789  , -0.18828212],
       [ 0.19494556, -0.39189124,  0.11740357,  0.10454261, -0.49063367,
         0.53153014, -0.19069931, -0.31554052,  0.264925  , -0.23311336],
       [ 0.24521826, -0.20055458,  0.20772356, -0.10521682, -0.37379035,
         0.49444723, -0.34406686,  0.02753356