In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm
import random
import numpy as np
import collections
import math
import torch.nn.functional as F
from torch.autograd import Variable

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

cuda


In [2]:
# 定义一个函数，用于读取语料库文件并将单词转化为对应的索引
def read_corpus(file_path, vocab_size):
    # 打开指定文件路径的语料库文件，并将内容按空格分割为单词列表
    with open(file_path, 'r', encoding='utf-8') as file:
        words = file.read().split()

    # 初始化一个计数列表，用于统计每个单词出现的次数，并将未知词标注为'UNK'
    count = [['UNK', -1]]
    # 使用 Counter 统计单词出现的次数，并取出现次数最多的前 vocab_size - 1 个单词
    count.extend(collections.Counter(words).most_common(vocab_size - 1))

    # 创建一个字典，将单词映射为索引
    word_index = dict()
    for word, _ in count:
        word_index[word] = len(word_index)

    # 将所有单词转化为对应的索引存储在 data 列表中，并统计未知词的数量
    data = list()
    unk_count = 0
    for word in words:
        if word in word_index:
            index = word_index[word]
        else:
            index = 0  # 将未知词标记为索引0，对应'UNK'
            unk_count += 1
        data.append(index)

    # 更新计数列表中'UNK'的出现次数
    count[0][1] = unk_count

    # 创建一个字典，将索引映射为单词
    index_word = dict(zip(word_index.values(), word_index.keys()))

    # 返回转化后的单词索引列表、计数列表、索引到单词的映射字典和单词到索引的映射字典
    return data, count, index_word, word_index


In [3]:
data, count, index_word, word_index = read_corpus('training.txt', 30000)

In [4]:
def subsampling(data, count):
    '''下采样 降低在训练模型时对高频词的过度关注'''
    # 统计每个单词出现的次数
    count = [ele[1] for ele in count]
    # 计算每个单词出现的概率
    frequency = np.array(count) / sum(count)

    # 计算每个单词调整以后的概率，并存储在 P 字典中
    P = dict()
    for idx, x in enumerate(frequency):
        y = (math.sqrt(x / 0.001) + 1) * 0.001 / x
        P[idx] = y

    # 对单词列表进行下采样，并将下采样后的单词存储在 subsampled_data 列表中
    subsampled_data = list()
    for word in data:
        if random.random() < P[word]:
            subsampled_data.append(word)

    # 返回下采样后的单词列表
    return subsampled_data

In [5]:
train_data = subsampling(data, count)

In [6]:
def init_sample_table(count, table_size=1e8):
    '''初始化采样表'''
    # 统计每个单词出现的次数
    count = [ele[1] for ele in count]
    # 计算每个单词出现的概率的 0.75 次方
    pow_frequency = np.array(count)**0.75
    # 计算概率的和
    power = sum(pow_frequency)
    # 计算每个单词在采样表中出现的次数
    ratio = pow_frequency / power
    count = np.round(ratio * table_size)

    # 创建采样表
    sample_table = []
    for idx, x in enumerate(count):
        sample_table += [idx] * int(x)

    # 将采样表转化为 numpy 数组，并返回
    return np.array(sample_table)


In [7]:
sample_table = init_sample_table(count)

In [8]:
# 定义一个函数，用于生成训练批次数据
def generate_batch(train_data, sample_table, neg_sample_num, window_size, batch_size):
    # 将训练数据赋值给变量 data
    data = train_data

    # 初始化全局变量 data_index，表示当前处理的数据索引
    global data_index

    # 计算上下文窗口的大小
    span = 2 * window_size + 1

    # 初始化上下文和标签数组
    context = np.ndarray(shape=(batch_size, 2 * window_size), dtype=np.int64)
    labels = np.ndarray(shape=(batch_size), dtype=np.int64)

    # 如果当前数据索引加上窗口大小超出了数据长度，则重新从数据开头开始
    if data_index + span > len(data):
        data_index = 0

    # 从数据中提取当前窗口的数据
    buffer = data[data_index:data_index + span]
    pos_u = []
    pos_v = []

    # 遍历每个批次
    for i in range(batch_size):
        # 更新数据索引
        data_index += 1
        # 获取上下文单词索引和标签单词索引
        context[i, :] = buffer[:window_size] + buffer[window_size + 1:]
        labels[i] = buffer[window_size]

        # 如果当前数据索引加上窗口大小超出了数据长度，则重新初始化数据索引和缓冲区
        if data_index + span > len(data):
            buffer[:] = data[:span]
            data_index = 0
        else:
            # 更新缓冲区为下一个窗口数据
            buffer = data[data_index:data_index + span]

        # 构建正样本对
        for j in range(span - 1):
            pos_u.append(labels[i])
            pos_v.append(context[i, j])

    # 从采样表中随机选择负样本
    neg_v = np.random.choice(sample_table, size=(batch_size * 2 * window_size, neg_sample_num))

    # 返回正样本和负样本数组
    return np.array(pos_u), np.array(pos_v), neg_v


In [9]:
# 定义一个 Word2Vec 类，用于训练 Word2Vec 模型
class Word2Vec(nn.Module):
    def __init__(self, vocab_size, vector_size):
        super(Word2Vec, self).__init__()
        # 初始化词汇表大小和词向量维度
        self.vocab_size = vocab_size
        self.vector_size = vector_size

        # 初始化目标词和上下文词的嵌入层
        self.target_embeddings = nn.Embedding(vocab_size, vector_size, sparse=True)
        self.context_embeddings = nn.Embedding(vocab_size, vector_size, sparse=True)
        # 初始化嵌入层权重
        self.init_emb()

    # 初始化嵌入层权重的方法
    def init_emb(self):
        # 设置初始化范围
        initrange = 0.5 / self.vector_size
        # 初始化目标词的嵌入权重
        self.target_embeddings.weight.data.uniform_(-initrange, initrange)
        # 初始化上下文词的嵌入权重
        self.context_embeddings.weight.data.uniform_(-0, 0)

    # 前向传播方法，计算损失函数
    def forward(self, u_pos, v_pos, v_neg, batch_size):
        # 获取目标词和上下文词的嵌入表示
        embed_u = self.target_embeddings(u_pos)
        embed_v = self.context_embeddings(v_pos)

        # 计算正样本得分
        score = torch.mul(embed_u, embed_v)
        score = torch.sum(score, dim=1)
        log_target = torch.log(torch.sigmoid(score)).squeeze()

        # 获取负样本的上下文词的嵌入表示
        neg_embed_v = self.context_embeddings(v_neg)

        # 计算负样本得分
        neg_score = torch.bmm(neg_embed_v, embed_u.unsqueeze(2)).squeeze()
        neg_score = torch.sum(neg_score, dim=1)
        sum_log_sampled = torch.log(torch.sigmoid(-1 * neg_score)).squeeze()

        # 计算损失函数
        loss = log_target + sum_log_sampled

        # 返回平均损失
        return -1 * loss.sum() / batch_size


In [10]:
# 定义一个训练函数，用于训练 Word2Vec 模型
def train(train_data, vocabulary_size, embedding_dim, epoch_num, batch_size, window_size, neg_sample_num):
    # 初始化 Word2Vec 模型
    model = Word2Vec(vocabulary_size, embedding_dim)
    # 如果 GPU 可用，则将模型移动到 GPU 上
    if torch.cuda.is_available():
        model.cuda()
    # 定义优化器
    optimizer = optim.SGD(model.parameters(), lr=0.2)
    
    # 计算总的批次数
    total_batches = len(train_data) // batch_size
    
    # 遍历每个 epoch
    for epoch in range(epoch_num):
        batch_num = 0

        # 使用 tqdm 包装 while 循环，用于显示进度条
        with tqdm(total=total_batches,  desc=f"Epoch {epoch}") as pbar:
            i = 0
            epoch_loss = 0.0  # 用于累积每个 epoch 的 loss
            while i < total_batches:
                # 生成训练批次数据
                pos_u, pos_v, neg_v = generate_batch(train_data, sample_table, neg_sample_num, window_size, batch_size)

                # 将数据转换为 PyTorch 张量，并将其移动到 GPU 上
                pos_u = Variable(torch.LongTensor(pos_u))
                pos_v = Variable(torch.LongTensor(pos_v))
                neg_v = Variable(torch.LongTensor(neg_v))
                if torch.cuda.is_available():
                    pos_u = pos_u.cuda()
                    pos_v = pos_v.cuda()
                    neg_v = neg_v.cuda()

                # 将梯度清零，计算损失函数，进行反向传播，更新模型参数
                optimizer.zero_grad()
                loss = model(pos_u, pos_v, neg_v, batch_size)
                loss.backward()
                optimizer.step()

                batch_num += 1
                
                # 每 30000 个批次保存一次模型
                if batch_num % 30000 == 0:
                    torch.save(model.state_dict(), './tmp/sgns.epoch{}.batch{}'.format(epoch, batch_num))

                i += 1
                # 累积每个 batch 的 loss
                epoch_loss += loss

                # 更新进度条并显示当前的 loss
                pbar.set_postfix({'Epoch Loss': epoch_loss / (pbar.n + 1)})  # 计算并显示平均 loss
                pbar.update(1)  # 更新进度条

    # 训练结束，输出提示信息并返回模型
    print("Optimization Finished!")
    return model


In [11]:
vocabulary_size = 30000
embedding_dim = 100
epoch_num = 5
batch_size = 32
windows_size = 2
neg_sample_num = 10

In [12]:
model = train(train_data, vocabulary_size, embedding_dim, epoch_num, batch_size, windows_size, neg_sample_num)

Epoch 0: 100%|██████████| 58093/58093 [05:47<00:00, 167.07it/s, Epoch Loss=tensor(3.8627, device='cuda:0', grad_fn=<DivBackward0>)]
Epoch 1: 100%|██████████| 58093/58093 [05:50<00:00, 165.73it/s, Epoch Loss=tensor(3.3761, device='cuda:0', grad_fn=<DivBackward0>)]
Epoch 2: 100%|██████████| 58093/58093 [06:02<00:00, 160.21it/s, Epoch Loss=tensor(3.1653, device='cuda:0', grad_fn=<DivBackward0>)]
Epoch 3: 100%|██████████| 58093/58093 [06:33<00:00, 147.52it/s, Epoch Loss=tensor(3.0202, device='cuda:0', grad_fn=<DivBackward0>)]
Epoch 4: 100%|██████████| 58093/58093 [06:58<00:00, 138.92it/s, Epoch Loss=tensor(2.9088, device='cuda:0', grad_fn=<DivBackward0>)]


Optimization Finished!


In [13]:
word_embeddings = model.target_embeddings.weight.data.cpu().numpy()

In [14]:
# 定义一个函数，用于将词向量保存到文件中
def save_embedding(embeds, file_name, id2word):
    # 打开文件并写入词向量
    with open(file_name, 'w', encoding='utf-8') as f:
        for idx in range(len(embeds)):
            if idx in id2word:
                word = id2word[idx]
                embed = ' '.join(map(str, embeds[idx]))  # 将浮点数转换为字符串后再连接
                f.write(word+' '+embed+'\n')


In [15]:
save_embedding(word_embeddings, 'embeding/sgns.txt', index_word)

In [16]:
def Cosine_Similarity_test(testpath, vocab, word_to_index, embeddings):
    # 读取文件并解析每一行
    with open(testpath, 'r', encoding='utf-8') as file:
        lines = file.readlines()

    # 初始化一个列表来存储每对子词的余弦相似度
    similarity_list = []

    # 遍历每一行
    for line in lines:
        # 分割每行中的两个子词
        words = line.strip().split()
        
        # 检查两个子词是否都在word_embeddings中
        if len(words) == 2 and words[0] in vocab and words[1] in vocab:
            # 获取两个子词的词向量
            vec1 = embeddings[word_to_index[words[0]]]
            vec2 = embeddings[word_to_index[words[1]]]

            # 归一化向量
            vec1_normalized = vec1 / np.linalg.norm(vec1)
            vec2_normalized = vec2 / np.linalg.norm(vec2)
            
            # 计算余弦相似度
            dot_product = np.dot(vec1_normalized, vec2_normalized)

            sim_svd = dot_product 
            
            # 将余弦相似度添加到列表中
            similarity_list.append(sim_svd)
        else:
            # 如果任一词向量不存在，设置相似度为0
            similarity_list.append(0.0)

    # # 打印或存储余弦相似度结果
    # for words, sim in zip(lines, similarity_list):
    #     print(f'Words: {words.strip()}, Cosine Similarity: {sim}')

    with open('result/sgns.txt', 'w', encoding='utf-8') as file:
        for words, sim in zip(lines, similarity_list):
            str = f'Words: {words.strip()}, Cosine Similarity: {sim}' + '\n'
            file.write(str)

In [17]:
Cosine_Similarity_test('pku_sim_test.txt', word_index, word_index, word_embeddings)