In [1]:
import os
from collections import defaultdict, Counter

import numpy as np
from scipy.sparse import coo_matrix
from scipy.sparse.linalg import svds
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
import string
from zhon.hanzi import punctuation as pun
 
# Concatenate the ASCII punctuation from ``string`` with the Chinese
# punctuation from ``zhon.hanzi`` into one string of characters to strip.
allPun = string.punctuation + pun
 
def delPunctuation(infile, outfile):
    """Copy ``infile`` to ``outfile`` with every punctuation character removed.

    Both files are handled as UTF-8 text. A character is dropped when it
    occurs in the module-level ``allPun`` string; every other character,
    including whitespace and line breaks, is written through unchanged.

    Args:
        infile: Path of the text file to read.
        outfile: Path of the text file to write (opened in ``'w'`` mode, so
            existing content is replaced).
    """
    # Build the lookup set once so each per-character test is a constant-time
    # membership check instead of a scan over the whole punctuation string.
    punctuation_chars = set(allPun)

    # The ``with`` statement closes both files on exit, so no explicit
    # close() calls are needed afterwards.
    with open(infile, 'r', encoding="utf-8") as readFile, \
            open(outfile, 'w', encoding="utf-8") as writeFile:
        for line in readFile:
            writeFile.write(''.join(ch for ch in line if ch not in punctuation_chars))

# delPunctuation("training.txt", "ans.txt")

In [3]:
def read_corpus(file_path):
    """Load a UTF-8 text corpus and return its lines as a list.

    Args:
        file_path: Path of the corpus file.

    Returns:
        A list with one string per line of the file, in file order. Line
        terminators are kept on each element.
    """
    with open(file_path, 'r', encoding='utf-8') as corpus:
        # Iterating a text file yields the same lines readlines() would.
        return list(corpus)

In [4]:
def build_cooccurrence_matrix(sentences, window_size=5):
    """Build a sparse word-word co-occurrence matrix from tokenized sentences.

    Each sentence is split on whitespace. For every token, each other token
    position at most ``window_size`` places to its left or right within the
    same sentence adds one to the count of the ordered pair
    ``(token, neighbour)``.

    Args:
        sentences: Iterable of strings, each holding whitespace-separated
            tokens.
        window_size: Maximum distance between two token positions that still
            counts as a co-occurrence.

    Returns:
        A tuple ``(matrix, vocab, word_to_index)``:

        * ``matrix`` - float64 CSR matrix of shape
          ``(len(vocab), len(vocab))`` holding the pair counts.
        * ``vocab`` - set of all words that appear in at least one counted
          pair (a word that is alone in its sentence is therefore absent).
        * ``word_to_index`` - dict mapping each word to its row/column index
          in ``matrix``; indices follow the sorted order of the words.
    """
    pair_counts = defaultdict(int)

    for sentence in sentences:
        words = sentence.strip().split()
        n_words = len(words)
        for i, word in enumerate(words):
            # Clamp the window to the sentence bounds up front instead of
            # testing every candidate position.
            start = max(0, i - window_size)
            stop = min(n_words, i + window_size + 1)
            for j in range(start, stop):
                if j != i:
                    pair_counts[word, words[j]] += 1

    # Every word that occurs on either side of a counted pair.
    vocab = {word for pair in pair_counts for word in pair}

    # Number the words in sorted order: iteration order of a set of strings
    # changes between interpreter runs (hash randomization), which would make
    # the row indices irreproducible.
    word_to_index = {word: index for index, word in enumerate(sorted(vocab))}

    # Collect coordinates with an integer dtype; sparse indices must not be
    # floats. Counts are stored as float64 so the matrix can be passed
    # straight to a floating-point solver.
    n_pairs = len(pair_counts)
    rows = np.empty(n_pairs, dtype=np.int64)
    cols = np.empty(n_pairs, dtype=np.int64)
    data = np.empty(n_pairs, dtype=np.float64)
    for k, ((row_word, col_word), count) in enumerate(pair_counts.items()):
        rows[k] = word_to_index[row_word]
        cols[k] = word_to_index[col_word]
        data[k] = count

    # Build the COO matrix in one step so data and coordinates are always
    # consistent, then convert to CSR for downstream sparse operations.
    size = len(vocab)
    cmatrix = coo_matrix((data, (rows, cols)), shape=(size, size))

    return cmatrix.tocsr(), vocab, word_to_index

In [5]:
def svd_embedding(csr_matrix, n_components=5):
    """Derive unit-length row vectors from a truncated SVD of ``csr_matrix``.

    The matrix is factorized with ``scipy.sparse.linalg.svds``. Row ``i`` of
    the result is row ``i`` of ``U`` scaled column-wise by the singular
    values and then divided by its own L2 norm. ``Vt`` is not used.

    Args:
        csr_matrix: Sparse matrix to factorize (in this notebook, the
            co-occurrence matrix).
        n_components: Number of singular values/vectors to compute; passed to
            ``svds`` as ``k``. ``svds`` itself rejects values of ``k`` that
            are too large for the matrix.

    Returns:
        Dense array with one row per row of ``csr_matrix`` and
        ``n_components`` columns. Rows with a non-zero projection have unit
        L2 norm; rows whose projection is exactly zero are returned as zero
        vectors rather than NaN.
    """
    U, sigma, _ = svds(csr_matrix, k=n_components)

    # Scale each column of U by its singular value. Broadcasting gives the
    # same result as multiplying by diag(sigma) without building that matrix.
    embeddings = U * sigma

    # Per-row L2 norms, kept as a column so the division broadcasts row-wise.
    norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
    # A zero row cannot be normalized; dividing by 1 leaves it as zeros
    # instead of producing NaN/inf and a runtime warning.
    norms[norms == 0] = 1.0

    return embeddings / norms


In [6]:
def Cosine_Similarity_test(testpath, vocab, word_to_index, embeddings,
                           outpath='result/svd.txt'):
    """Score word pairs by cosine similarity and write the scores to a file.

    ``testpath`` is read as UTF-8 text. Each line is split on whitespace; a
    line that holds exactly two words which are both in ``vocab`` is scored
    with the cosine similarity of their embedding rows. Any other line
    (wrong number of tokens, or an out-of-vocabulary word) gets a score of
    ``0.0``, as does a pair in which either vector has zero (or NaN) norm.

    One output line is written per input line, in input order, in the form
    ``Words: <stripped input line>, Cosine Similarity: <score>``.

    Args:
        testpath: Path of the word-pair file.
        vocab: Container supporting ``in`` that holds the known words.
        word_to_index: Mapping from word to its row index in ``embeddings``.
        embeddings: 2-D array-like indexed by row; rows do not need to be
            pre-normalized.
        outpath: Path of the result file. Its parent directory is created if
            it does not exist. Defaults to ``'result/svd.txt'``.
    """
    with open(testpath, 'r', encoding='utf-8') as file:
        lines = file.readlines()

    # One score per input line, kept in input order.
    similarity_list = []
    for line in lines:
        words = line.strip().split()

        if len(words) == 2 and words[0] in vocab and words[1] in vocab:
            vec1 = embeddings[word_to_index[words[0]]]
            vec2 = embeddings[word_to_index[words[1]]]

            # Divide by the norms so the score is a true cosine even when the
            # embedding rows are not unit length.
            denominator = np.linalg.norm(vec1) * np.linalg.norm(vec2)
            if denominator > 0:
                sim_svd = np.dot(vec1, vec2) / denominator
            else:
                # Zero-length (or NaN) vector: cosine is undefined, report 0.
                sim_svd = 0.0
            similarity_list.append(sim_svd)
        else:
            # Malformed line or out-of-vocabulary word.
            similarity_list.append(0.0)

    # Make sure the target directory exists before opening the file.
    out_dir = os.path.dirname(outpath)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    with open(outpath, 'w', encoding='utf-8') as file:
        for words, sim in zip(lines, similarity_list):
            file.write(f'Words: {words.strip()}, Cosine Similarity: {sim}' + '\n')

In [9]:
# Load the training corpus as a list of lines.
sentences = read_corpus('training.txt')
# Count co-occurrences within a 5-token window, then reduce the resulting
# matrix to 200-dimensional word vectors with a truncated SVD.
cooccurrence_matrix, vocab, word_to_index = build_cooccurrence_matrix(sentences, 5)
word_embeddings = svd_embedding(cooccurrence_matrix, n_components=200)

In [10]:
# Score the word pairs listed in pku_sim_test.txt with the SVD embeddings;
# the function writes its results to result/svd.txt.
Cosine_Similarity_test('pku_sim_test.txt', vocab, word_to_index, word_embeddings)