# 1. 获取数据

In [1]:
import pandas as pd

# Load the news table and discard every row that has a missing value in any
# column; later cells read the `content` column of this frame.
data = pd.read_csv('news_chinese.csv').dropna()

# 2. 获得所有表示“说”的意思的单词

In [2]:
# -*- coding: utf-8 -*-
import os
# Directory that holds the LTP model files. It is built with os.path.join so
# the separator matches the host OS; the trailing '' keeps a trailing
# separator, which reproduces the original value exactly on Windows.
LTP_DATA_DIR = os.path.join('.', 'ltp_data_v3.4.0', '')
# Path of the word-segmentation model, whose file name is `cws.model`.
cws_model_path = os.path.join(LTP_DATA_DIR, 'cws.model')

from pyltp import Segmentor

# One Segmentor instance is created and its model loaded once here;
# cut_words() reuses this instance on every call.
segmentor = Segmentor()
segmentor.load(cws_model_path)

def cut_words(text):
    """Segment `text` into words using the shared `segmentor` instance.

    Returns the result of `segmentor.segment(text)` unchanged.
    """
    # The segmentor is not released here (the original release() call was
    # commented out), so the loaded model stays available for later calls.
    return segmentor.segment(text)

In [3]:
from pyltp import SentenceSplitter

def split_sentence(text):
    '''
    Split a block of text into a flat list of sentences.

    The text is first cut into pieces on the line-break marker, and each
    piece is then handed to SentenceSplitter.split.

    Returns the list of sentences. If an error occurs (for example `text`
    is not a string), the offending text and the caught exception are
    printed and the sentences collected so far are returned.
    '''
    sentences = []
    try:
        # NOTE(review): the separator is a literal backslash followed by 'n'
        # (two characters), not a newline character. Presumably the source
        # data stores line breaks that way; confirm against the CSV.
        for piece in text.split('\\n'):
            sentences.extend(SentenceSplitter.split(piece))
    except Exception as exc:
        print('split_sentence exception:' + str(text))
        # Report the caught instance; printing the bare `Exception` class
        # only ever showed "<class 'Exception'>" and hid the real error.
        print(repr(exc))
    return sentences

In [4]:
def preprocess_text(text):
    """
    Segment one sentence into words and drop whitespace-only tokens.

    Digit, short-token and stop-word filtering are not applied (those steps
    were commented out in the original).

    Returns a list of word strings. Returns the empty string '' as a
    "nothing usable" marker when `text` is a single character or when
    segmentation fails; the corpus-writing loop compares against ''.
    """
    if len(text) == 1:
        return ''

    try:
        tokens = list(cut_words(text))
        # Keep only tokens that are non-empty after stripping whitespace.
        tokens = [tok for tok in tokens if tok.strip()]
    except Exception as exc:
        print('preprocess_text exception:' + str(text))
        # Report the caught instance rather than the bare Exception class.
        print(repr(exc))
        # The original fell through to `return segs` with `segs` unbound,
        # which raised UnboundLocalError; return the skip marker instead.
        return ''
    return tokens

In [5]:
# Preprocess the corpus and write it to a file for later use: one sentence
# per line, with words separated by single spaces.
with open('corpus.txt', 'w', encoding='utf-8') as f:
    for graph in data['content'].values:
        sentences = split_sentence(graph)  # split the paragraph into sentences
        for sentence in sentences:
            words = preprocess_text(sentence)
            # Skip the '' marker and also an empty token list, so that no
            # blank lines end up in the corpus file.
            if not words:
                continue
            f.write(' '.join(words) + '\n')

In [6]:
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence
# LineSentence(path): streams a plain-text corpus with one sentence per line,
# words already preprocessed and separated by whitespace.
corpus = LineSentence('corpus.txt')

# Word2Vec parameters (names follow the pre-4.0 gensim API used by this call):
#   size      - dimensionality of each word vector.
#   window    - maximum distance between the current word and a context word
#               within a sentence.
#   min_count - words with a total frequency below this are ignored
#               (library default 5; 1 here, so every word is kept).
#   workers   - number of worker threads used for training
#               (library default 3).
#   sg        - training algorithm: 1 = skip-gram, 0 = CBOW.
#   alpha     - initial learning rate.
#   iter      - number of passes over the corpus (library default 5).
model = Word2Vec(sentences=corpus, size=100, window=5, min_count=1, sg=1)

# A bare file name resolves to the current directory on every OS; the
# original ".\\word2vec.model" only meant that on Windows.
model.save("word2vec.model")

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [8]:
# Look up the trained vector for '说' through the KeyedVectors object
# (`model.wv`); indexing the model object directly is deprecated in gensim.
model.wv['说']

  """Entry point for launching an IPython kernel.


array([-0.20866792,  0.7205171 , -1.0260113 , -0.06374749,  0.19556347,
       -0.10682994, -0.5851775 , -0.01481899,  0.3670453 ,  0.24029326,
        0.5949725 ,  0.28039786, -0.06872837, -0.33250424, -0.1844618 ,
       -0.5172169 ,  0.34398785, -0.58655286, -0.1243936 , -0.75491464,
        0.09888597,  0.40916127,  0.10244063,  0.7225233 ,  0.32497236,
        0.56063825,  0.8444974 , -0.3222012 ,  0.34923565,  0.27028653,
       -0.51589775, -0.5600994 ,  0.2264611 , -0.385016  , -0.4579577 ,
        0.08523443,  0.64386386, -0.10020302, -0.01174884, -0.51918715,
       -0.01156036, -0.22992441,  0.77415377,  0.05849807, -0.2401069 ,
       -0.00274309, -0.7530591 ,  0.07110906, -0.13691097,  0.1307326 ,
       -0.49360964, -0.21363465,  0.7133124 ,  0.357119  ,  0.47492936,
       -0.23782545,  1.0265316 , -0.13638072, -0.27368957, -0.03718648,
        0.7090336 , -0.76925814, -0.48834   ,  0.71585935,  0.19875243,
       -0.85786104,  0.28460357, -0.29654166, -0.14113064,  0.87

## 2.1 找相似词

Word Embedding Tutorial: word2vec using Gensim

https://www.guru99.com/word-embedding-word2vec.html

In [12]:
# Collect the vocabulary entries into a list and display how many there are.
vocab = [word for word in model.wv.vocab]
len(vocab)

223587

In [13]:
# Query the nearest neighbours of '说' via `model.wv`; calling most_similar()
# on the model object itself is deprecated in gensim.
similar_words = model.wv.most_similar('说')
print(similar_words)

[('表示', 0.8703572750091553), ('告诉', 0.8542202115058899), ('指出', 0.7788625955581665), ('认为', 0.7690681219100952), ('说道', 0.7643224000930786), ('坦言', 0.7485840916633606), ('介绍', 0.7473160624504089), ('看来', 0.7389413118362427), ('透露', 0.6973441243171692), ('写道', 0.6708030104637146)]


  """Entry point for launching an IPython kernel.


In [15]:
# Similarity between '说' and '讲', queried via `model.wv`; calling
# similarity() on the model object itself is deprecated in gensim.
similarity_two_words = model.wv.similarity('说','讲')
print("Please provide the similarity between these two words:")
print(similarity_two_words)

Please provide the similarity between these two words:
0.4619258715508007


  """Entry point for launching an IPython kernel.


In [16]:
# Same neighbour query through similar_by_word(), called on `model.wv`
# because the method on the model object itself is deprecated in gensim.
similar = model.wv.similar_by_word('说')
print(similar)

[('表示', 0.8703572750091553), ('告诉', 0.8542202115058899), ('指出', 0.7788625955581665), ('认为', 0.7690681219100952), ('说道', 0.7643224000930786), ('坦言', 0.7485840916633606), ('介绍', 0.7473160624504089), ('看来', 0.7389413118362427), ('透露', 0.6973441243171692), ('写道', 0.6708030104637146)]


  """Entry point for launching an IPython kernel.
