# jupyter tensorflow实现word2vec

In [2]:
import collections
import os
import random
import urllib
import zipfile

import numpy as np
import tensorflow as tf

## 参数设置

In [3]:
# Training parameters
learning_rate=0.1
batch_size = 128
num_steps=3000000
display_step=10000
eval_step=200000

# Probe words whose nearest neighbours are printed during evaluation
eval_words = ['nine','of','going','hardware','britain']

embedding_size = 300
max_vocabulary = 50000
min_occurrence = 10 # minimum word frequency kept in the vocabulary
skip_window = 3
num_skips = 2 # how many (center, context) pairs are produced per window
num_sampled = 64 # negative sampling: avoids scoring every class, which speeds up training

## 导入维基百科数据集

In [4]:
data_path = '../study-before-work/paddle_NLP/text8.zip'
def read_data(filename):
    """Read the first member of a zip archive and return its tokens.

    params:
    - filename: path to the zip archive

    returns: list of str, the member's text split on whitespace
    """
    with zipfile.ZipFile(filename) as archive:
        first_member = archive.namelist()[0]
        raw_bytes = archive.read(first_member)
    # tf.compat.as_str converts bytes to str
    text = tf.compat.as_str(raw_bytes)
    return text.split()

words = read_data(data_path)
print('Data Size:',len(words)) # Data Size: 17005207

Data Size: 17005207


## 词频计算

In [7]:
# Vocabulary frequency table: index 0 is the 'UNK' placeholder (its count is
# filled in later), followed by the most common words in descending frequency.
count = [['UNK',-1]]
# Keep only words seen at least min_occurrence times. Filtering the
# most_common() output (instead of popping from the tail of `count`) guarantees
# that the 'UNK' placeholder, whose provisional count is -1, is never removed,
# even when every word is rarer than min_occurrence.
count.extend(
    (word, freq)
    for word, freq in collections.Counter(words).most_common(max_vocabulary-1)
    if freq >= min_occurrence
)

## 词-ID映射

In [9]:
# Words rarer than min_occurrence were dropped, so the vocabulary may hold
# fewer than max_vocabulary entries.
vocabulary_size = len(count)
# A word's id is its position in the frequency table.
word2id = {word: i for i, (word, _) in enumerate(count)}

In [10]:
word2id

{'UNK': 0,
 'the': 1,
 'of': 2,
 'and': 3,
 'one': 4,
 'in': 5,
 'a': 6,
 'to': 7,
 'zero': 8,
 'nine': 9,
 'two': 10,
 'is': 11,
 'as': 12,
 'eight': 13,
 'for': 14,
 's': 15,
 'five': 16,
 'three': 17,
 'was': 18,
 'by': 19,
 'that': 20,
 'four': 21,
 'six': 22,
 'seven': 23,
 'with': 24,
 'on': 25,
 'are': 26,
 'it': 27,
 'from': 28,
 'or': 29,
 'his': 30,
 'an': 31,
 'be': 32,
 'this': 33,
 'which': 34,
 'at': 35,
 'he': 36,
 'also': 37,
 'not': 38,
 'have': 39,
 'were': 40,
 'has': 41,
 'but': 42,
 'other': 43,
 'their': 44,
 'its': 45,
 'first': 46,
 'they': 47,
 'some': 48,
 'had': 49,
 'all': 50,
 'more': 51,
 'most': 52,
 'can': 53,
 'been': 54,
 'such': 55,
 'many': 56,
 'who': 57,
 'new': 58,
 'used': 59,
 'there': 60,
 'after': 61,
 'when': 62,
 'into': 63,
 'american': 64,
 'time': 65,
 'these': 66,
 'only': 67,
 'see': 68,
 'may': 69,
 'than': 70,
 'world': 71,
 'i': 72,
 'b': 73,
 'would': 74,
 'd': 75,
 'no': 76,
 'however': 77,
 'between': 78,
 'about': 79,
 'over': 80

## 所有词转换成id

In [12]:
# Replace every corpus word by its id; out-of-vocabulary words map to 0 ('UNK').
data = [word2id.get(word,0) for word in words]
# Id 0 is produced only for unknown words (and a literal 'UNK' token, if any).
unk_count = data.count(0)
count[0][1] = unk_count
# Reverse mapping: id -> word
id2word = {idx: word for word, idx in word2id.items()}

print("Words count:",len(words))
print("Unique words:",len(set(words)))
print("Vocabulary words:",vocabulary_size)
print("Most common words:",count[:10])

Words count: 17005207
Unique words: 253854
Vocabulary words: 47135
Most common words: [['UNK', 444176], ('the', 1061396), ('of', 593677), ('and', 416629), ('one', 411764), ('in', 372201), ('a', 325873), ('to', 316376), ('zero', 264975), ('nine', 250430)]


## 构建训练数据集

In [13]:
data_index = 0
def generate_batch(batch_size,num_skips,skip_window):
    """
    Build one skip-gram training batch from the global id sequence `data`.

    example: "the quick brown fox jumped over the lazy dog"
    With skip_window = 1, "quick" is related to one word on each side, which
    yields two samples: quick->the, quick->brown.

    params:
    - batch_size: must be a multiple of num_skips so that a batch contains
      every sample generated for each of its center words
    - num_skips: number of samples generated per center word, at most
      2*skip_window
    - skip_window: number of context words considered on each side

    returns:
    - batch: int32 array of shape (batch_size,) holding center-word ids
    - labels: int32 array of shape (batch_size, 1) holding context-word ids

    Reads and advances the module-level cursor `data_index`.
    """
    global data_index
    assert batch_size % num_skips == 0
    assert num_skips <= 2 * skip_window
    batch = np.empty(batch_size,dtype=np.int32)
    labels = np.empty((batch_size,1),dtype=np.int32)
    span = 2 * skip_window + 1
    buffer = collections.deque(maxlen=span)
    # Guard against a cursor left at (or past) the end of the data.
    data_index %= len(data)
    for _ in range(span):
        buffer.append(data[data_index])
        data_index = (data_index+1) % len(data) # data holds vocabulary ids; unknown words are id 0
    # Window positions that may serve as context, i.e. everything but the center.
    context_positions = [pos for pos in range(span) if pos != skip_window]
    for i in range(batch_size//num_skips):
        # Draw num_skips distinct context positions for this center word.
        chosen = random.sample(context_positions,num_skips)
        for j, target in enumerate(chosen):
            batch[i*num_skips+j] = buffer[skip_window] # id of the center word
            labels[i*num_skips+j,0] = buffer[target] # id of the context word to predict
        if data_index == len(data):
            # End of the corpus: restart the window at the beginning.
            buffer.extend(data[0:span])
            data_index = span
        else:
            # The deque has a fixed size, so appending drops the oldest id,
            # e.g. 2 43 212 -> 43 212 121
            buffer.append(data[data_index])
            data_index = data_index+1
    # Backtrack by one window so the next call (which refills the buffer from
    # data_index) does not skip center words; the modulo also keeps the cursor
    # a valid index when it stopped exactly at len(data).
    data_index = (data_index + len(data) - span) % len(data)
    return batch,labels

In [14]:
# Trainable parameters used by the model, all created on the CPU:
# the word-embedding table plus the weights and biases consumed by the NCE loss.
with tf.device('/cpu:0'):
    embeddings = tf.Variable(tf.random.normal([vocabulary_size,embedding_size])) # 47135x300
    nce_weights = tf.Variable(tf.random.normal([vocabulary_size,embedding_size]))
    nce_biases = tf.Variable(tf.zeros([vocabulary_size]))

## 通过tf.nn.embedding_lookup函数将索引转换成词向量

In [15]:
def get_embedding(x):
    """Look up the rows of `embeddings` selected by the word ids in `x`."""
    with tf.device('/cpu:0'):
        return tf.nn.embedding_lookup(embeddings,x)

## 损失函数定义

- 先分别计算正样本和采样的负样本对应的output和label
- 再通过sigmoid cross entropy来计算output和label的loss

In [19]:
def nce_loss(x_embed,y):
    """Return the mean NCE loss of a batch.

    params:
    - x_embed: embedded center words, passed as `inputs` to tf.nn.nce_loss
    - y: context-word ids, cast to int64 and passed as `labels`
    """
    with tf.device('/cpu:0'):
        labels_int64 = tf.cast(y,tf.int64)
        per_example = tf.nn.nce_loss(
            weights=nce_weights,
            biases=nce_biases,
            labels=labels_int64,
            inputs=x_embed,
            num_sampled=num_sampled,
            num_classes=vocabulary_size)
        return tf.reduce_mean(per_example)

In [34]:
def evaluation(x_embed):
    """Cosine similarity between query embeddings and every vocabulary embedding.

    params:
    - x_embed: 2-D tensor of query embeddings, one row per query word

    returns: tensor with one row per query and one column per row of
    `embeddings`, holding the cosine similarity of each pair
    """
    with tf.device('/cpu:0'):
        x_embed = tf.cast(x_embed,tf.float32)
        # L2-normalize each query row on its own (axis 1, keepdims so the
        # division broadcasts); a norm over the whole batch would not yield
        # cosine similarities.
        x_embed_norm = x_embed / tf.sqrt(tf.reduce_sum(tf.square(x_embed),1,keepdims=True))
        # Same per-row normalization for the embedding table. tf.sqrt takes
        # only the tensor (its second positional argument is `name`).
        embedding_norm = embeddings / tf.sqrt(tf.reduce_sum(tf.square(embeddings),1,keepdims=True))
        # The dot product of unit vectors is their cosine similarity.
        cosine_sim = tf.matmul(x_embed_norm,embedding_norm,transpose_b=True)
        return cosine_sim
    
# Plain SGD optimizer shared by every training step; uses the configured learning_rate.
optimizer = tf.optimizers.SGD(learning_rate)

In [26]:
def run_optimizer(x,y):
    """Run one optimization step on a batch.

    params:
    - x: center-word ids of the batch
    - y: context-word ids of the batch
    """
    trainable = [embeddings,nce_weights,nce_biases]
    with tf.device('/cpu:0'):
        # Record the forward pass so the loss can be differentiated.
        with tf.GradientTape() as tape:
            loss = nce_loss(get_embedding(x),y)

        # Compute the gradients of the loss w.r.t. the model parameters
        grads = tape.gradient(loss,trainable)

        # Apply the gradients to update the parameters
        optimizer.apply_gradients(zip(grads,trainable))

In [None]:
# Evaluation: ids of the probe words
x_test = np.array([word2id[w] for w in eval_words])

# Number of nearest neighbours printed per probe word
top_k = 8

# Training
for step in range(1,num_steps+1):
    batch_x,batch_y = generate_batch(batch_size,num_skips,skip_window)
    run_optimizer(batch_x,batch_y)

    is_first_step = step == 1

    if is_first_step or step % display_step == 0:
        loss = nce_loss(get_embedding(batch_x),batch_y)
        print("step:%d,loss:%f" % (step,loss))

    if is_first_step or step % eval_step == 0:
        print("Evaluation...")
        sim = evaluation(get_embedding(x_test)).numpy()
        for query, scores in zip(eval_words,sim):
            # Sort by descending similarity and drop the first hit, which is
            # expected to be the query word itself.
            nearest = (-scores).argsort()[1:top_k+1]
            neighbours = " ".join(id2word[idx] for idx in nearest)
            print("%s nearest neighbors: %s" % (query,neighbours))

step:1,loss:619.272095
Evaluation...
nine nearest neighbors: inductors infested notification quirk marble stocks anyone manu
of nearest neighbors: widgets cocker knapp vegetal mile hells wen malaysians
going nearest neighbors: diarist interferon indefinite paste insane enriched crass breton
hardware nearest neighbors: mak mmol tollens infinitives agnostics fried nickel valdez
britain nearest neighbors: chinon tastes stabilise hominid salts li fukuoka fb
