In [2]:
import pandas as pd
import jieba
from gensim.models.word2vec import Word2Vec
# Load the training set; the review text lives in the 'comment' column.
data = pd.read_csv('train.csv')
# Drop missing comments before converting to str: astype(str) would otherwise
# turn NaN into the literal text "nan", which would then be tokenized as if it
# were part of a real review.
corpus = data['comment'].dropna().astype(str).to_numpy()
# Translation table that deletes the punctuation marks stripped before
# segmentation (same set of marks as before, removed in a single pass).
punctuation_table = str.maketrans('', '', "，!！。~；？?【】#")
# Segment each cleaned comment with jieba, producing one token list per comment.
corpus = [jieba.lcut(comment.translate(punctuation_table)) for comment in corpus]
# Train the word-vector model (sg=0; the later Skip-Gram cell uses sg=1).
model = Word2Vec(corpus, sg=0, vector_size=300, window=5, min_count=3, workers=4)

Building prefix dict from the default dictionary ...
Loading model from cache C:\Windows\TEMP\jieba.cache
Loading model cost 1.086 seconds.
Prefix dict has been built successfully.


In [3]:
# Display the trained model's summary line.
label = '模型参数：'
print(label, model, '\n')

模型参数： Word2Vec<vocab=4036, vector_size=300, alpha=0.025> 



In [4]:
from gensim.models.word2vec import Word2Vec
# Words closest to the combination 点赞 + 不错 - 难吃.
matches = model.wv.most_similar(positive=['点赞', '不错'], negative=['难吃'])
print('最匹配的词是：', matches, '\n')

最匹配的词是： [('团购', 0.9321186542510986), ('位置', 0.9245730042457581), ('适合', 0.921338677406311), ('高', 0.917202353477478), ('性价比', 0.9157055616378784), ('上餐', 0.9128907918930054), ('好找', 0.9091496467590332), ('口味', 0.9062504172325134), ('足下', 0.9015496373176575), ('不太多', 0.900895357131958)] 



In [5]:
# Odd-one-out query (disabled: the call below is commented out)
#print('最不匹配的词是：',model.wv.doesnt_match("点赞 好吃 支持 难吃".split()),'\n')

In [6]:
# Semantic similarity score between 推荐 and 好吃.
score = model.wv.similarity('推荐', '好吃')
print('相似度为=', score, '\n')

相似度为= 0.82065076 



In [7]:
# Look up the embedding vector for 地道 using normal indexing rather than
# calling the __getitem__ dunder directly (same lookup, idiomatic form).
print(model.wv['地道'])

[-5.31726284e-03  1.17537953e-01  2.76545454e-02  6.71510324e-02
 -7.19701275e-02 -1.10626452e-01  8.89725611e-02  3.31022680e-01
  1.31555442e-02 -3.01972814e-02 -3.53493690e-02 -1.03668116e-01
 -5.02594523e-02 -2.32045501e-02 -1.20835572e-01 -6.50701746e-02
  1.07922904e-01 -1.17752347e-02  3.97074185e-02 -8.77231658e-02
 -4.25220728e-02 -7.12646358e-03  9.38390102e-03  2.76437216e-02
  7.57360607e-02 -3.66829634e-02 -1.72135189e-01  4.06835899e-02
 -2.28455700e-02 -1.16796538e-01  7.04509392e-02 -6.30980283e-02
  3.79613973e-02  2.72261165e-03 -8.35431814e-02  1.92696061e-02
  8.75452608e-02 -1.55263662e-01  3.76794040e-02  5.71523197e-02
 -9.01424140e-02  2.30833497e-02  2.67092437e-02 -1.14419624e-01
  8.35922062e-02  1.19468525e-01  6.02213629e-02 -2.50046458e-02
  1.09325415e-02  1.06330581e-01  3.35309952e-02 -5.98531682e-04
 -1.54230613e-02  4.49417271e-02 -3.15579362e-02  1.26666039e-01
  5.90173863e-02  1.06403669e-02  3.72011028e-02  2.21485589e-02
 -5.24492525e-02 -5.30016

In [9]:
# 任务1：Skip-Gram 模型训练
import pandas as pd
import jieba
from gensim.models import Word2Vec

# Load the training set.
data = pd.read_csv('train.csv')
# Drop missing comments before converting to str: astype(str) would otherwise
# turn NaN into the literal text "nan", which would then be tokenized as if it
# were part of a real review.
corpus = data['comment'].dropna().astype(str).to_numpy()

# Tokenization with punctuation cleanup
def clean_and_tokenize(text):
    """Remove a fixed set of punctuation marks from ``text`` and segment it.

    Args:
        text: The comment string to clean.

    Returns:
        The list of tokens produced by ``jieba.lcut`` on the cleaned text.
    """
    # Each mark is deleted wherever it occurs; the marks are single characters,
    # so the order of removal does not matter.
    for mark in ("，", "!", "！", "。", "~", "；", "？", "?", "【", "】", "#"):
        text = text.replace(mark, "")
    return jieba.lcut(text)

# Clean and tokenize every comment, yielding one token list per comment.
corpus = list(map(clean_and_tokenize, corpus))

# Train the model with Skip-Gram (sg=1).
model = Word2Vec(
    corpus,
    sg=1,
    vector_size=300,
    window=5,
    min_count=3,
    workers=4,
)

# Print the model summary.
print("模型参数：", model)

模型参数： Word2Vec<vocab=4036, vector_size=300, alpha=0.025>


In [10]:
# Task 2: fetch the word vector for 环境
try:
    env_vector = model.wv['环境']
except KeyError:
    # The lookup raises KeyError when the word is not in the vocabulary.
    print("“环境”不在词汇表中！")
else:
    first_dims = env_vector[:5]
    print("“环境”的词向量前5维：", first_dims)
    print("词向量形状：", env_vector.shape)

“环境”的词向量前5维： [ 0.14709553  0.14983015  0.06914461  0.26695785 -0.05911557]
词向量形状： (300,)


In [11]:
# Task 3: find the 3 words closest to 好吃
try:
    similar_words = model.wv.most_similar('好吃', topn=3)
except KeyError:
    # The query raises KeyError when the word is not in the vocabulary.
    print("“好吃”不在词汇表中！")
else:
    print("与“好吃”最接近的3个词：", similar_words)

与“好吃”最接近的3个词： [('入味', 0.864830732345581), ('棒', 0.8447458744049072), ('尤其', 0.8281573057174683)]


In [12]:
# Task 4: similarity between word pairs
def calculate_similarity(word1, word2):
    """Print the similarity of two words according to the global ``model``.

    Args:
        word1: First word to compare.
        word2: Second word to compare.

    Prints the similarity formatted to two decimals, or a message naming the
    word(s) that are absent from the model's vocabulary. Returns None.
    """
    # Check the vocabulary up front so the message names the missing word(s).
    # Interpolating a caught KeyError instead prints the exception's own
    # message text, not just the word.
    missing = [
        word
        for word in dict.fromkeys((word1, word2))  # de-duplicate, keep order
        if word not in model.wv.key_to_index
    ]
    if missing:
        missing_text = "、".join("“" + word + "”" for word in missing)
        print(f"错误：{missing_text} 不在词汇表中！")
        return
    similarity = model.wv.similarity(word1, word2)
    print(f"“{word1}”和“{word2}”的相似度：{similarity:.2f}")

# Compare 好吃 against a related word and an unrelated one.
for left_word, right_word in (('好吃', '美味'), ('好吃', '蟑螂')):
    calculate_similarity(left_word, right_word)

“好吃”和“美味”的相似度：0.81
“好吃”和“蟑螂”的相似度：0.29


In [13]:
# Task 5: vector arithmetic (餐厅 + 聚会 - 安静)
positive_words = ['餐厅', '聚会']
negative_words = ['安静']
# Check the vocabulary up front so the message names the missing word(s).
# Interpolating a caught KeyError instead prints the exception's own message
# text, not just the word.
missing_words = [
    word
    for word in positive_words + negative_words
    if word not in model.wv.key_to_index
]
if missing_words:
    missing_text = "、".join("“" + word + "”" for word in missing_words)
    print(f"错误：{missing_text} 不在词汇表中！")
else:
    result = model.wv.most_similar(
        positive=positive_words,
        negative=negative_words,
        topn=1
    )
    # result is a list of (word, score) pairs; show only the top word.
    print("向量运算结果：“餐厅 + 聚会 - 安静”最接近：", result[0][0])

向量运算结果：“餐厅 + 聚会 - 安静”最接近： 家庭聚会
