word2vec

In [2]:

import numpy as np
from collections import deque

class InputData:
    def __init__(self,input_file_name,min_count):
        self.input_file_name = input_file_name
        self.index = 0
        self.input_file = open(self.input_file_name,"r",encoding="utf-8")
        self.min_count = min_count
        self.wordid_frequency_dict = dict()
        self.word_count = 0
        self.word_count_sum = 0
        self.sentence_count = 0
        self.id2word_dict = dict()
        self.word2id_dict = dict()
        self._init_dict()  # 初始化字典
        self.sample_table = []
        self._init_sample_table()  # 初始化负采样映射表
        self.get_wordId_list()
        self.word_pairs_queue = deque()
        # 结果展示
        print('Word Count is:', self.word_count)
        print('Word Count Sum is', self.word_count_sum)
        print('Sentence Count is:', self.sentence_count)
    def _init_dict(self):
        word_freq = dict()
        for line in self.input_file:
            line = line.strip().split()
            self.word_count_sum +=len(line)
            self.sentence_count +=1
            for i,word in enumerate(line):
                if i%1000000==0:
                    print (i,len(line))
                if word_freq.get(word)==None:
                    word_freq[word] = 1
                else:
                    word_freq[word] += 1
        for i,word in enumerate(word_freq):
            if i % 100000 == 0:
                print(i, len(word_freq))
            if word_freq[word]<self.min_count:
                self.word_count_sum -= word_freq[word]
                continue
            self.word2id_dict[word] = len(self.word2id_dict)
            self.id2word_dict[len(self.id2word_dict)] = word
            self.wordid_frequency_dict[len(self.word2id_dict)-1] = word_freq[word]
        self.word_count =len(self.word2id_dict)
    def _init_sample_table(self):
        sample_table_size = 1e8
        pow_frequency = np.array(list(self.wordid_frequency_dict.values())) ** 0.75
        word_pow_sum = sum(pow_frequency)
        ratio_array = pow_frequency / word_pow_sum
        word_count_list = np.round(ratio_array * sample_table_size)
        for word_index, word_freq in enumerate(word_count_list):
            self.sample_table += [word_index] * int(word_freq)
        self.sample_table = np.array(self.sample_table)
        np.random.shuffle(self.sample_table)
    def get_wordId_list(self):
        self.input_file = open(self.input_file_name, encoding="utf-8")
        sentence = self.input_file.readline()
        wordId_list = []  # 一句中的所有word 对应的 id
        sentence = sentence.strip().split(' ')
        for i,word in enumerate(sentence):
            if i%1000000==0:
                print (i,len(sentence))
            try:
                word_id = self.word2id_dict[word]
                wordId_list.append(word_id)
            except:
                continue
        self.wordId_list = wordId_list
    def get_batch_pairs(self,batch_size,window_size):
        while len(self.word_pairs_queue) < batch_size:
            for _ in range(1000):
                if self.index == len(self.wordId_list):
                    self.index = 0
                wordId_w = self.wordId_list[self.index]
                for i in range(max(self.index - window_size, 0),
                                         min(self.index + window_size + 1,len(self.wordId_list))):

                    wordId_v = self.wordId_list[i]
                    if self.index == i:  # 上下文=中心词 跳过
                        continue
                    self.word_pairs_queue.append((wordId_w, wordId_v))
                self.index+=1
        result_pairs = []  # 返回mini-batch大小的正采样对
        for _ in range(batch_size):
            result_pairs.append(self.word_pairs_queue.popleft())
        return result_pairs


    # 获取负采样 输入正采样对数组 positive_pairs，以及每个正采样对需要的负采样数 neg_count 从采样表抽取负采样词的id
    # （假设数据够大，不考虑负采样=正采样的小概率情况）
    def get_negative_sampling(self, positive_pairs, neg_count):
        neg_v = np.random.choice(self.sample_table, size=(len(positive_pairs), neg_count)).tolist()
        return neg_v

    # 估计数据中正采样对数，用于设定batch
    def evaluate_pairs_count(self, window_size):
        return self.word_count_sum * (2 * window_size) - self.sentence_count * (
                    1 + window_size) * window_size




In [3]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class SkipGramModel(nn.Module):
    def __init__(self,vocab_size,embed_size):
        super(SkipGramModel,self).__init__()
        self.vocab_size = vocab_size
        self.embed_size = embed_size
        self.w_embeddings = nn.Embedding(vocab_size,embed_size)
        self.v_embeddings = nn.Embedding(vocab_size, embed_size)
        self._init_emb()

    def _init_emb(self):
        initrange = 0.5 / self.embed_size
        self.w_embeddings.weight.data.uniform_(-initrange, initrange)
        self.v_embeddings.weight.data.uniform_(-0, 0)

    def forward(self, pos_w, pos_v, neg_v):
        emb_w = self.w_embeddings(torch.LongTensor(pos_w))  # 转为tensor 大小 [ mini_batch_size * emb_dimension ]
        emb_v = self.v_embeddings(torch.LongTensor(pos_v))
        neg_emb_v = self.v_embeddings(torch.LongTensor(neg_v))  # 转换后大小 [ negative_sampling_number * mini_batch_size * emb_dimension ]
        score = torch.mul(emb_w, emb_v)

        score = torch.sum(score, dim=1)
        score = torch.clamp(score, max=10, min=-10)
        score = F.logsigmoid(score)

        neg_score = torch.bmm(neg_emb_v, emb_w.unsqueeze(2))
        neg_score = torch.clamp(neg_score, max=10, min=-10)
        neg_score = F.logsigmoid(-1 * neg_score)
        # L = log sigmoid (Xw.T * θv) + ∑neg(v) [log sigmoid (-Xw.T * θneg(v))]
        loss = - torch.sum(score) - torch.sum(neg_score)
        return loss


    def save_embedding(self, id2word, file_name):
        embedding_1 = self.w_embeddings.weight.data.cpu().numpy()
        embedding_2 = self.v_embeddings.weight.data.cpu().numpy()
        embedding = (embedding_1+embedding_2)/2
        fout = open(file_name, 'w')
        fout.write('%d %d\n' % (len(id2word), self.embed_size))
        for wid, w in id2word.items():
            e = embedding[wid]
            e = ' '.join(map(lambda x: str(x), e))
            fout.write('%s %s\n' % (w, e))


In [4]:
from skip_gram_nge_model import SkipGramModel
from input_data import InputData
import torch.optim as optim
from tqdm import tqdm


import argparse

def ArgumentParser():
    parser = argparse.ArgumentParser()
    parser.add_argument('--model_name', type=str, default="skip-gram", help="skip-gram or cbow")
    parser.add_argument("--window_size",type=int,default=3,help="window size in word2vec")
    parser.add_argument("--batch_size",type=int,default=256,help="batch size during training phase")
    parser.add_argument("--min_count",type=int,default=3,help="min count of training word")
    parser.add_argument("--embed_dimension",type=int,default=100,help="embedding dimension of word embedding")
    parser.add_argument("--learning_rate",type=float,default=0.02,help="learning rate during training phase")
    parser.add_argument("--neg_count",type=int,default=5,help="neg count of skip-gram")
    return parser.parse_args()

args = ArgumentParser()

WINDOW_SIZE = args.window_size  # 上下文窗口c
BATCH_SIZE = args.batch_size  # mini-batch
MIN_COUNT = args.min_count  # 需要剔除的 低频词 的频
EMB_DIMENSION = args.embed_dimension  # embedding维度
LR = args.learning_rate  # 学习率
NEG_COUNT = args.neg_count  # 负采样数


class Word2Vec:
    def __init__(self, input_file_name, output_file_name):
        self.output_file_name = output_file_name
        self.data = InputData(input_file_name, MIN_COUNT)
        self.model = SkipGramModel(self.data.word_count, EMB_DIMENSION)
        self.lr = LR
        self.optimizer = optim.SGD(self.model.parameters(), lr=self.lr)

    def train(self):
        print("SkipGram Training......")
        pairs_count = self.data.evaluate_pairs_count(WINDOW_SIZE)
        print("pairs_count", pairs_count)
        batch_count = pairs_count / BATCH_SIZE
        print("batch_count", batch_count)
        process_bar = tqdm(range(int(batch_count)))
        for i in process_bar:
            pos_pairs = self.data.get_batch_pairs(BATCH_SIZE, WINDOW_SIZE)
            pos_w = [int(pair[0]) for pair in pos_pairs]
            pos_v = [int(pair[1]) for pair in pos_pairs]
            neg_v = self.data.get_negative_sampling(pos_pairs, NEG_COUNT)

            self.optimizer.zero_grad()
            loss = self.model.forward(pos_w, pos_v, neg_v)
            loss.backward()
            self.optimizer.step()

            if i * BATCH_SIZE % 100000 == 0:
                self.lr = self.lr * (1.0 - 1.0 * i / batch_count)
                for param_group in self.optimizer.param_groups:
                    param_group['lr'] = self.lr

        self.model.save_embedding(self.data.id2word_dict, self.output_file_name)


if __name__ == '__main__':
    w2v = Word2Vec(input_file_name='../data/lxc.txt', output_file_name="skip_gram_neg.txt")
    w2v.train()


ModuleNotFoundError: No module named 'skip_gram_nge_model'

In [None]:
class HuffmanNode:
    def __init__(self, word_id, frequency):
        self.word_id = word_id
        self.frequency = frequency
        self.left_child = None
        self.right_child = None
        self.father = None
        self.Huffman_code = []
        self.path = []



class HuffmanTree:
    def __init__(self, wordid_frequency_dict):
        self.word_count = len(wordid_frequency_dict)
        self.wordid_code = dict()
        self.wordid_path = dict()
        self.root = None
        unmerge_node_list = [HuffmanNode(wordid, frequency) for wordid, frequency in wordid_frequency_dict.items()]
        self.huffman = [HuffmanNode(wordid, frequency) for wordid, frequency in wordid_frequency_dict.items()]
        print("Building huffman tree...")
        self.build_tree(unmerge_node_list)
        print("Building tree finished")
        # 生成huffman code
        print("Generating huffman path...")
        self.generate_huffman_code_and_path()
        print("Generating huffman path finished")

    def merge_node(self, node1, node2):
        sum_frequency = node1.frequency + node2.frequency
        mid_node_id = len(self.huffman)
        father_node = HuffmanNode(mid_node_id, sum_frequency)
        if node1.frequency >= node2.frequency:
            father_node.left_child = node1
            father_node.right_child = node2
        else:
            father_node.left_child = node2
            father_node.right_child = node1
        self.huffman.append(father_node)
        return father_node

    def build_tree(self, node_list):

        while len(node_list) > 1:
            node_list = sorted(node_list, key=lambda x: x.frequency)
            i1 = node_list[0]
            i2 = node_list[1]
            node_list.remove(i1)
            node_list.remove(i2)
            father_node = self.merge_node(i1, i2)  # 合并最小的两个节点
            node_list.append(father_node)  # 插入新节点

        self.root = node_list[0]

    def generate_huffman_code_and_path(self):
        stack = [self.root]
        while len(stack) > 0:
            node = stack.pop()
            # 顺着左子树走
            while node.left_child or node.right_child:
                code = node.Huffman_code
                path = node.path
                node.left_child.Huffman_code = code + [1]
                node.right_child.Huffman_code = code + [0]
                node.left_child.path = path + [node.word_id]
                node.right_child.path = path + [node.word_id]
                # 把没走过的右子树加入栈
                stack.append(node.right_child)
                node = node.left_child
            word_id = node.word_id
            word_code = node.Huffman_code
            word_path = node.path
            self.huffman[word_id].Huffman_code = word_code
            self.huffman[word_id].path = word_path
            # 把节点计算得到的霍夫曼码、路径  写入词典的数值中
            self.wordid_code[word_id] = word_code
            self.wordid_path[word_id] = word_path

    # 获取所有词的正向节点id和负向节点id数组
    def get_all_pos_and_neg_path(self):
        positive = []  # 所有词的正向路径数组
        negative = []  # 所有词的负向路径数组
        for word_id in range(self.word_count):
            pos_id = []  # 存放一个词 路径中的正向节点id
            neg_id = []  # 存放一个词 路径中的负向节点id
            for i, code in enumerate(self.huffman[word_id].Huffman_code):
                if code == 1:
                    pos_id.append(self.huffman[word_id].path[i])
                else:
                    neg_id.append(self.huffman[word_id].path[i])
            positive.append(pos_id)
            negative.append(neg_id)
        return positive, negative


if __name__ == "__main__":
    word_frequency = {0: 7, 1: 8, 2: 3, 3: 2, 4: 2}
    print(word_frequency)
    tree = HuffmanTree(word_frequency)
    print(tree.wordid_code)
    print(tree.wordid_path)
    for i in range(len(word_frequency)):
        print(tree.huffman[i].path)
    print(tree.get_all_pos_and_neg_path())



In [5]:
#! -*- coding:utf-8 -*-
#Keras版的Word2Vec，作者：苏剑林，http://kexue.fm
#Keras 2.0.6 ＋ Tensorflow 测试通过

import numpy as np
from keras.layers import Input,Embedding,Lambda
from keras.models import Model
import keras.backend as K

word_size = 128 #词向量维度
window = 5 #窗口大小
nb_negative = 16 #随机负采样的样本数
min_count = 10 #频数少于min_count的词将会被抛弃
nb_worker = 4 #读取数据的并发数
nb_epoch = 2 #迭代次数，由于使用了adam，迭代次数1～2次效果就相当不错
subsample_t = 1e-5 #词频大于subsample_t的词语，会被降采样，这是提高速度和词向量质量的有效方案
nb_sentence_per_batch = 20
#目前是以句子为单位作为batch，多少个句子作为一个batch（这样才容易估计训练过程中的steps参数，另外注意，样本数是正比于字数的。）

import pymongo
class Sentences: #语料生成器，必须这样写才是可重复使用的
    def __init__(self):
        self.db = pymongo.MongoClient().weixin.text_articles
    def __iter__(self):
        for t in self.db.find(no_cursor_timeout=True).limit(100000):
            yield t['words'] #返回分词后的结果

sentences = Sentences()
words = {} #词频表
nb_sentence = 0 #总句子数
total = 0. #总词频

for d in sentences:
    nb_sentence += 1
    for w in d:
        if w not in words:
            words[w] = 0
        words[w] += 1
        total += 1
    if nb_sentence % 10000 == 0:
        print u'已经找到%s篇文章'%nb_sentence

words = {i:j for i,j in words.items() if j >= min_count} #截断词频
id2word = {i+1:j for i,j in enumerate(words)} #id到词语的映射，0表示UNK
word2id = {j:i for i,j in id2word.items()} #词语到id的映射
nb_word = len(words)+1 #总词数（算上填充符号0）

subsamples = {i:j/total for i,j in words.items() if j/total > subsample_t}
subsamples = {i:subsample_t/j+(subsample_t/j)**0.5 for i,j in subsamples.items()} #这个降采样公式，是按照word2vec的源码来的
subsamples = {word2id[i]:j for i,j in subsamples.items() if j < 1.} #降采样表

def data_generator(): #训练数据生成器
    while True:
        x,y = [],[]
        _ = 0
        for d in sentences:
            d = [0]*window + [word2id[w] for w in d if w in word2id] + [0]*window
            r = np.random.random(len(d))
            for i in range(window, len(d)-window):
                if d[i] in subsamples and r[i] > subsamples[d[i]]: #满足降采样条件的直接跳过
                    continue
                x.append(d[i-window:i]+d[i+1:i+1+window])
                y.append([d[i]])
            _ += 1
            if _ == nb_sentence_per_batch:
                x,y = np.array(x),np.array(y)
                z = np.zeros((len(x), 1))
                yield [x,y],z
                x,y = [],[]
                _ = 0

#CBOW输入
input_words = Input(shape=(window*2,), dtype='int32')
input_vecs = Embedding(nb_word, word_size, name='word2vec')(input_words)
input_vecs_sum = Lambda(lambda x: K.sum(x, axis=1))(input_vecs) #CBOW模型，直接将上下文词向量求和

#构造随机负样本，与目标组成抽样
target_word = Input(shape=(1,), dtype='int32')
negatives = Lambda(lambda x: K.random_uniform((K.shape(x)[0], nb_negative), 0, nb_word, 'int32'))(target_word)
samples = Lambda(lambda x: K.concatenate(x))([target_word,negatives]) #构造抽样，负样本随机抽。负样本也可能抽到正样本，但概率小。

#只在抽样内做Dense和softmax
softmax_weights = Embedding(nb_word, word_size, name='W')(samples)
softmax_biases = Embedding(nb_word, 1, name='b')(samples)
softmax = Lambda(lambda x: 
                    K.softmax((K.batch_dot(x[0], K.expand_dims(x[1],2))+x[2])[:,:,0])
                )([softmax_weights,input_vecs_sum,softmax_biases]) #用Embedding层存参数，用K后端实现矩阵乘法，以此复现Dense层的功能

#留意到，我们构造抽样时，把目标放在了第一位，也就是说，softmax的目标id总是0，这可以从data_generator中的z变量的写法可以看出

model = Model(inputs=[input_words,target_word], outputs=softmax)
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
#请留意用的是sparse_categorical_crossentropy而不是categorical_crossentropy

model.fit_generator(data_generator(), 
                    steps_per_epoch=nb_sentence/nb_sentence_per_batch, 
                    epochs=nb_epoch,
                    workers=nb_worker,
                    use_multiprocessing=True
                   )

model.save_weights('word2vec.model')

#通过词语相似度，检查我们的词向量是不是靠谱的
embeddings = model.get_weights()[0]
normalized_embeddings = embeddings / (embeddings**2).sum(axis=1).reshape((-1,1))**0.5

def most_similar(w):
    v = normalized_embeddings[word2id[w]]
    sims = np.dot(normalized_embeddings, v)
    sort = sims.argsort()[::-1]
    sort = sort[sort > 0]
    return [(id2word[i],sims[i]) for i in sort[:10]]

import pandas as pd
pd.Series(most_similar(u'科学'))


SyntaxError: Missing parentheses in call to 'print'. Did you mean print(u'已经找到%s篇文章'%nb_sentence)? (2295648212.py, line 41)