# 1. 获取数据

In [37]:
import pandas as pd

# Load the news dataset and discard every row that contains a missing value.
data = pd.read_csv('news_chinese.csv').dropna()

# 2. 获得所有表示“说”的意思的单词

In [26]:
# -*- coding: utf-8 -*-
import os
# NOTE(review): the backslash separators make this a Windows-style relative path.
LTP_DATA_DIR = '.\\ltp_data_v3.4.0\\'  # path to the LTP model directory
cws_model_path = os.path.join(LTP_DATA_DIR, 'cws.model')  # path to the word-segmentation model; the model file is named `cws.model`

from pyltp import Segmentor
# Module-level segmenter shared by cut_words(); the model is loaded once here.
segmentor = Segmentor()  # create the segmenter instance
segmentor.load(cws_model_path)  # load the word-segmentation model

def cut_words(text):
    """Segment ``text`` into words using the module-level ``segmentor``.

    Returns whatever ``segmentor.segment`` produces. The model is not
    released here, so the function can be called repeatedly.
    """
    return segmentor.segment(text)

In [36]:
from pyltp import SentenceSplitter

def split_sentence(text):
    """Split ``text`` into individual sentences.

    The text is first cut on the literal two-character sequence
    backslash + ``n`` (an escaped newline stored as text, not a real newline
    character); each piece is then passed to ``SentenceSplitter.split``.

    This is best-effort: if anything fails (for example ``text`` is not a
    string), the offending input and the exception that was raised are
    printed, and the sentences collected so far are returned.

    Returns:
        list: the sentences in order; empty if nothing could be split.
    """
    sentences = []
    try:
        for chunk in text.split('\\n'):
            sentences.extend(SentenceSplitter.split(chunk))
    except Exception as exc:
        print('split_sentence exception:' + str(text))
        # Print the exception instance; printing the ``Exception`` class
        # itself would only ever show "<class 'Exception'>".
        print(repr(exc))
    return sentences

In [33]:
def preprocess_text(text):
    """Segment ``text`` into words and drop whitespace-only tokens.

    Only blank tokens are removed; digit, single-character and stop-word
    filtering are not applied.

    This is best-effort: if segmentation fails, the offending input and the
    exception that was raised are printed and ``''`` is returned.

    Returns:
        A list of tokens, or the empty string ``''`` when ``text`` has
        length 1 or when segmentation fails. ``''`` is the "nothing to
        keep" sentinel in both cases.
    """
    if len(text) == 1:
        return ''

    try:
        # Keep a token only if something remains after stripping whitespace;
        # the token itself is kept unstripped.
        tokens = [token for token in cut_words(text) if token.strip()]
    except Exception as exc:
        print('preprocess_text exception:' + str(text))
        print(repr(exc))
        # Return the sentinel instead of falling through to an unbound
        # result variable, which would raise UnboundLocalError.
        return ''
    return tokens

In [38]:
'''
preprocess and write corpus to file for future use
'''
# Write one preprocessed sentence per line, tokens separated by single spaces.
with open('corpus.txt', 'w', encoding='utf-8') as corpus_file:
    for article in data['content'].values:
        for sent in split_sentence(article):  # sentence splitting
            tokens = preprocess_text(sent)
            # preprocess_text returns '' to signal "skip this sentence".
            if tokens != '':
                corpus_file.write(' '.join(tokens) + '\n')

In [42]:
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence
# Read corpus.txt as the training corpus.
corpus = LineSentence('corpus.txt')
'''
LineSentence(inp)：格式简单：一句话=一行; 单词已经过预处理并被空格分隔。
size：是每个词的向量维度； 
window：是词向量训练时的上下文扫描窗口大小，窗口为5就是考虑前5个词和后5个词； 
min_count：设置最低频率，默认是5，如果一个词语在文档中出现的次数小于5，那么就会丢弃； 
workers：是训练时使用的工作线程数，默认是3（在多核机器上可以调大以加快训练）。这些参数先记住就可以了。
sg ({0, 1}, optional) – 模型的训练算法: 1: skip-gram; 0: CBOW
alpha (float, optional) – 初始学习率
iter (int, optional) – 迭代次数，默认为5
'''
# Train a skip-gram (sg=1) model with 100-dimensional vectors and a context
# window of 5; min_count=1 keeps every word, including those seen only once.
# NOTE(review): `size` is the gensim 3.x keyword (renamed `vector_size` in
# gensim 4) — confirm against the installed gensim version.
model = Word2Vec(sentences=corpus, size=100, window=5, min_count=1, sg=1)

# Save the trained model to a Windows-style relative path.
model.save(".\\word2vec.model")

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL
