In [1]:
import numpy as np
from scipy.sparse.linalg import svds
from collections import defaultdict, Counter
from scipy.sparse import coo_matrix

In [2]:
# Notebook configuration: input text file, context-window radius, vocabulary cap.
file_path = "ans.txt"
window_size = 5
vocab_size = 20_000

In [3]:
# Build the word-word co-occurrence matrix
def build_cooccurrence_matrix(file_path, window_size=5, vocab_size = 10000):
    """Count windowed word co-occurrences in a text file and return them as a sparse matrix.

    The file is read in full and split on whitespace. The ``vocab_size`` most
    frequent tokens form the vocabulary. For every vocabulary token, each
    vocabulary token found within ``window_size`` positions on either side
    is counted, except tokens identical to the centre word (so the diagonal of
    the matrix stays empty).

    Args:
        file_path: Path of a UTF-8 text file.
        window_size: Number of positions inspected on each side of a word.
        vocab_size: Maximum number of distinct words kept; also the side length
            of the returned matrix.

    Returns:
        A tuple ``(matrix, vocab, word_to_index)``:
        ``matrix`` is a ``(vocab_size, vocab_size)`` CSR matrix of float counts,
        ``vocab`` is the set of kept words, and ``word_to_index`` maps each
        kept word to its row/column index. Index 0 is the most frequent word.
        If the text has fewer than ``vocab_size`` distinct words, the trailing
        rows and columns are empty.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        words = file.read().split()

    # most_common() yields words by descending count, so enumerating that list
    # gives the same word -> index mapping on every run. Enumerating a set of
    # strings would not: its iteration order depends on per-process hashing.
    ranked_words = [word for word, _ in Counter(words).most_common(vocab_size)]
    word_to_index = {word: index for index, word in enumerate(ranked_words)}
    vocab = set(ranked_words)

    # (centre word, context word) -> number of times the pair was seen
    cooccurrence_counts = defaultdict(int)
    total_words = len(words)
    for i, word in enumerate(words):
        if word not in vocab:
            continue
        # Clamp the window to the text boundaries instead of testing every j.
        window_start = max(0, i - window_size)
        window_stop = min(total_words, i + window_size + 1)
        for j in range(window_start, window_stop):
            neighbour = words[j]
            # Skips the centre position and any repeat of the same word.
            if neighbour != word and neighbour in vocab:
                cooccurrence_counts[word, neighbour] += 1

    rows = []
    cols = []
    data = []
    for (word, neighbour), value in cooccurrence_counts.items():
        rows.append(word_to_index[word])
        cols.append(word_to_index[neighbour])
        data.append(value)

    # Build through the constructor with integer indices so scipy validates
    # the coordinates against the declared shape.
    cmatrix = coo_matrix(
        (
            np.asarray(data, dtype=np.float64),
            (np.asarray(rows, dtype=np.int64), np.asarray(cols, dtype=np.int64)),
        ),
        shape=(vocab_size, vocab_size),
    )

    # CSR supports the row slicing and arithmetic used downstream.
    return cmatrix.tocsr(), vocab, word_to_index

In [4]:
# Build the co-occurrence matrix from the configured corpus; keep the vocabulary
# set and the word -> index mapping for later lookups.
csr_matrix, vocab, word_to_index = build_cooccurrence_matrix(
    file_path,
    window_size=window_size,
    vocab_size=vocab_size,
)

In [5]:
# Reduce a sparse matrix to dense, row-normalised vectors with a truncated SVD
def svd_embedding(csr_matrix, n_components=5):
    """Embed each row of a sparse matrix in ``n_components`` dimensions.

    Runs ``scipy.sparse.linalg.svds`` with ``k=n_components``, scales the left
    singular vectors by their singular values, and rescales every row to unit
    Euclidean length.

    Args:
        csr_matrix: Sparse matrix to decompose (one row per item to embed).
        n_components: Number of singular triplets requested from ``svds``.

    Returns:
        A dense array of shape ``(n_rows, n_components)``. Rows whose embedding
        has norm exactly zero are returned as zero vectors rather than NaN.
    """
    U, sigma, _ = svds(csr_matrix, k=n_components)

    # Column-wise scaling; equal to U @ diag(sigma) without building the
    # dense diagonal matrix.
    embeddings = U * sigma

    row_norms = np.linalg.norm(embeddings, axis=1, keepdims=True)

    # A zero-norm row would become 0/0 = NaN; divide it by 1 so it stays zero.
    row_norms[row_norms == 0] = 1.0

    return embeddings / row_norms

In [6]:
# svd_embedding returns one array of word vectors, so bind it to a single name.
# Unpacking it into two names would split the array along its first axis and
# raise ValueError unless it had exactly two rows.
embeddings = svd_embedding(csr_matrix, n_components=10)

In [7]:
# Score every word pair in the test file with the cosine similarity of its embeddings
with open('pku_sim_test.txt', 'r', encoding='utf-8') as file:
    lines = file.readlines()

# One similarity per input line, in file order
similarity_list = []

for line in lines:
    # A usable line holds exactly two whitespace-separated words
    words = line.strip().split()

    if len(words) == 2 and words[0] in vocab and words[1] in vocab:
        vec1 = embeddings[word_to_index[words[0]]]
        vec2 = embeddings[word_to_index[words[1]]]

        # Divide by the norms here so the score is a true cosine even if the
        # embeddings are not unit length. A zero or NaN norm falls through to 0.0.
        norm_product = np.linalg.norm(vec1) * np.linalg.norm(vec2)
        if norm_product > 0:
            sim_svd = float(np.dot(vec1, vec2) / norm_product)
        else:
            sim_svd = 0.0

        similarity_list.append(sim_svd)
    else:
        # Malformed line or out-of-vocabulary word: report similarity 0
        similarity_list.append(0.0)

# Print each input line next to its score
for pair_line, sim in zip(lines, similarity_list):
    print(f'Words: {pair_line.strip()}, Cosine Similarity: {sim}')

Words: 没戏	没辙, Cosine Similarity: 0.0
Words: 只管	尽管, Cosine Similarity: 0.9367185644738738
Words: GDP	生产力, Cosine Similarity: 0.9699678618230898
Words: 包袱	段子, Cosine Similarity: 0.0
Words: 由此	通过, Cosine Similarity: 0.9876394811448563
Words: 日期	时间, Cosine Similarity: 0.9777036838558274
Words: 爱面子	好高骛远, Cosine Similarity: 0.0
Words: 严厉	严谨, Cosine Similarity: 0.9619034018858785
Words: 一方面	一边, Cosine Similarity: 0.9163011574531352
Words: 托福	GRE, Cosine Similarity: 0.0
Words: 亏	幸亏, Cosine Similarity: 0.0
Words: 蹩脚	差强人意, Cosine Similarity: 0.0
Words: 容易	顺利, Cosine Similarity: 0.9244490963214902
Words: 悲喜	大悲大喜, Cosine Similarity: 0.0
Words: 老气	土气, Cosine Similarity: 0.0
Words: 狭隘	狭窄, Cosine Similarity: 0.9698297337290985
Words: 抄袭	克隆, Cosine Similarity: 0.0
Words: 害臊	腼腆, Cosine Similarity: 0.0
Words: 幻境	红楼梦, Cosine Similarity: 0.0
Words: 依稀	清晰, Cosine Similarity: 0.9540591418157502
Words: 权限	权力, Cosine Similarity: 0.9631328655496383
Words: 伟大	壮烈, Cosine Similarity: 0.9753527383550187
Words: 娇艳	

In [68]:
# Compute the full singular-value spectrum of a sparse matrix, save it, and summarise it
def get_sigma(csr_matrix, choose=10):
    """Compute all singular values of ``csr_matrix`` and write them to disk.

    The matrix is converted to a dense array before the decomposition, so
    memory use grows with rows * columns. The values are saved with
    ``numpy.savetxt`` to ``sigma/array_data_vocab_size_<n>.txt`` relative to
    the working directory, where ``<n>`` is the number of singular values.
    The ``sigma`` directory is created if it does not exist.

    Args:
        csr_matrix: Sparse matrix providing ``toarray()``.
        choose: Number of leading singular values summed into ``sum_choose``.

    Returns:
        A tuple ``(sigma, zero_count, sum_sigma, sum_choose)``: the singular
        values as returned by ``numpy.linalg.svd``, how many of them equal
        exactly 0.0, their total, and the total of the first ``choose``.
    """
    import os  # function-scope import: os is not among the notebook's imports

    # compute_uv=False asks for singular values only, not U and Vt
    sigma = np.linalg.svd(csr_matrix.toarray(), compute_uv=False)

    # The number of singular values names the output file
    vocab_size = len(sigma)
    file_path = f"sigma/array_data_vocab_size_{vocab_size}.txt"

    # Create the target directory first, so the write cannot fail on a missing
    # directory after the expensive decomposition has finished.
    os.makedirs(os.path.dirname(file_path), exist_ok=True)
    np.savetxt(file_path, sigma)

    zero_count = int(np.count_nonzero(sigma == 0.0))

    sum_sigma = sigma.sum()
    sum_choose = sigma[:choose].sum()
    return sigma, zero_count, sum_sigma, sum_choose


In [69]:
# Full spectrum of the co-occurrence matrix plus its summary statistics
# (get_sigma's default choose=10 applies).
(sigma, zero_count, sum_sigma, sum_choose) = get_sigma(csr_matrix)

In [None]:
# Display the number of zero singular values, the total, and the leading-values total
(zero_count, sum_sigma, sum_choose)

In [6]:
import numpy as np

def load_sigma(file_path, choose=100):
    """Load saved singular values from a text file and summarise them.

    Args:
        file_path: Text file readable by ``numpy.loadtxt``.
        choose: Number of leading values summed into ``sum_choose``.

    Returns:
        A tuple ``(sigma, one_count, zero_count, sum_sigma, sum_choose, ratio)``:
        the loaded array, how many values are below 0.01, how many equal
        exactly 0.0, the total of all values, the total of the first
        ``choose`` values, and ``sum_choose / sum_sigma``.
    """
    sigma = np.loadtxt(file_path)

    # Vectorised counts: exact zeros, then everything under the 0.01 threshold
    zero_count = int(np.count_nonzero(sigma == 0.0))
    one_count = int(np.count_nonzero(sigma < 0.01))

    sum_sigma = sum(sigma)
    sum_choose = sum(sigma[:choose])
    ratio = sum_choose / sum_sigma
    return sigma, one_count, zero_count, sum_sigma, sum_choose, ratio


# Path of the saved singular values; the name follows the pattern get_sigma writes.
# NOTE: this rebinds file_path, which the configuration cell set to the corpus path.
file_path = "sigma/array_data_vocab_size_20000.txt"

# Reload the spectrum and its summary statistics (load_sigma's default choose=100 applies)
(sigma, one_count, zero_count,
 sum_sigma, sum_choose, bili) = load_sigma(file_path)

In [7]:
# Display the summary: count below 0.01, count of exact zeros, total,
# leading-values total, and their ratio (bili)
(one_count, zero_count, sum_sigma, sum_choose, bili)

(24, 0, 561109.1086706324, 211065.9964873199, 0.3761585638618003)