# 1. 获取数据

In [1]:
import pandas as pd

# Read the news corpus and discard every row that contains a missing value.
data = pd.read_csv('news_chinese.csv').dropna()

# 2. 获得所有表示“说”的意思的单词

In [2]:
# -*- coding: utf-8 -*-
import os
LTP_DATA_DIR = '.\\ltp_data_v3.4.0\\'  # path to the LTP model directory
cws_model_path = os.path.join(LTP_DATA_DIR, 'cws.model')  # word-segmentation model path; the model file is named `cws.model`

from pyltp import Segmentor
segmentor = Segmentor()  # create the segmenter instance
segmentor.load(cws_model_path)  # load the segmentation model from cws_model_path

def cut_words(text):
    """Segment ``text`` with the module-level ``segmentor`` and return its result.

    The segmentor is not released here, so it stays loaded between calls.
    """
    return segmentor.segment(text)

In [3]:
from pyltp import SentenceSplitter

def split_sentence(text):
    '''
    Split a document into a flat list of sentences.

    The text is first cut on the literal two-character sequence
    backslash + "n" (not a real newline character), and each piece is then
    handed to ``SentenceSplitter.split``.

    This is best-effort: if anything raises, the offending text and the
    actual exception are printed and whatever sentences were collected so
    far are returned (an empty list if the failure happened immediately).
    '''
    sentences = []
    try:
        for paragraph in text.split('\\n'):
            sentences.extend(list(SentenceSplitter.split(paragraph)))
    except Exception as exc:
        print('split_sentence exception:' + str(text))
        # Report the caught exception instance; printing the `Exception`
        # class itself says nothing about what went wrong.
        print(repr(exc))
    return sentences

In [4]:
def preprocess_text(text):
    """
    Segment one sentence into words and drop whitespace-only tokens.

    Returns the list of remaining tokens. Returns the empty string ``''``
    when ``text`` is exactly one character long or when segmentation fails;
    the corpus-writing cell tests for ``''`` to skip a sentence.
    """
    if len(text) == 1:
        return ''

    try:
        # Keep only tokens that still contain something after stripping.
        segs = [seg for seg in cut_words(text) if seg.strip()]
    except Exception as exc:
        print('preprocess_text exception:' + str(text))
        # Report the caught exception instance rather than the Exception class.
        print(repr(exc))
        # `segs` was never bound on this path; return the skip sentinel
        # instead of falling through to an UnboundLocalError.
        return ''
    return segs

In [5]:
# Preprocess every article and persist the result to corpus.txt for later
# cells: one sentence per line, tokens separated by single spaces.
with open('corpus.txt', 'w', encoding='utf-8') as corpus_file:
    for article in data['content'].values:
        for sent in split_sentence(article):  # sentence splitting
            tokens = preprocess_text(sent)
            # preprocess_text returns '' for sentences that should be skipped.
            if tokens != '':
                corpus_file.write(' '.join(tokens))
                corpus_file.write('\n')

In [6]:
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence
# LineSentence reads a pre-tokenised corpus: one sentence per line, with the
# tokens already preprocessed and separated by spaces.
corpus = LineSentence('corpus.txt')

# Training settings passed below:
#   size=100     dimensionality of each word vector
#   window=5     context window: up to 5 words before and 5 words after
#   min_count=1  minimum frequency a word needs to be kept
#   sg=1         training algorithm: 1 = skip-gram, 0 = CBOW
model = Word2Vec(
    sentences=corpus,
    size=100,
    window=5,
    min_count=1,
    sg=1,
)

model.save(".\\word2vec.model")

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [8]:
# Look the vector up through `model.wv`; indexing the Word2Vec object
# directly is deprecated in gensim 3.x and removed in gensim 4.
model.wv['说']

  """Entry point for launching an IPython kernel.


array([-0.20866792,  0.7205171 , -1.0260113 , -0.06374749,  0.19556347,
       -0.10682994, -0.5851775 , -0.01481899,  0.3670453 ,  0.24029326,
        0.5949725 ,  0.28039786, -0.06872837, -0.33250424, -0.1844618 ,
       -0.5172169 ,  0.34398785, -0.58655286, -0.1243936 , -0.75491464,
        0.09888597,  0.40916127,  0.10244063,  0.7225233 ,  0.32497236,
        0.56063825,  0.8444974 , -0.3222012 ,  0.34923565,  0.27028653,
       -0.51589775, -0.5600994 ,  0.2264611 , -0.385016  , -0.4579577 ,
        0.08523443,  0.64386386, -0.10020302, -0.01174884, -0.51918715,
       -0.01156036, -0.22992441,  0.77415377,  0.05849807, -0.2401069 ,
       -0.00274309, -0.7530591 ,  0.07110906, -0.13691097,  0.1307326 ,
       -0.49360964, -0.21363465,  0.7133124 ,  0.357119  ,  0.47492936,
       -0.23782545,  1.0265316 , -0.13638072, -0.27368957, -0.03718648,
        0.7090336 , -0.76925814, -0.48834   ,  0.71585935,  0.19875243,
       -0.85786104,  0.28460357, -0.29654166, -0.14113064,  0.87

## 2.1 找相似词

Word Embedding Tutorial: word2vec using Gensim

https://www.guru99.com/word-embedding-word2vec.html

In [3]:
'''
load gensim word2vec model
'''
from gensim.models import Word2Vec
# Reload the model from the same path the training cell saved it to.
model = Word2Vec.load(".\\word2vec.model")

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [3]:
# Collect the entries of `model.wv.vocab` into a list and display how many there are.
vocab = list(model.wv.vocab)
len(vocab)

198596

In [4]:
# Query the 20 nearest neighbours of '说' through `model.wv`; calling
# `most_similar` on the Word2Vec object itself is deprecated in gensim 3.x
# and removed in gensim 4.
similar_words = model.wv.most_similar('说', topn=20)
print(similar_words)

  """Entry point for launching an IPython kernel.


[('表示', 0.8741297125816345), ('告诉', 0.8271186351776123), ('指出', 0.788396954536438), ('认为', 0.7727850675582886), ('介绍', 0.7356007695198059), ('说道', 0.7067561745643616), ('看来', 0.6960127353668213), ('写道', 0.6913279891014099), ('话', 0.6870427131652832), ('强调', 0.6705794334411621), ('坦言', 0.6692733764648438), ('提到', 0.6580647230148315), ('称', 0.6515368223190308), ('透露', 0.6492000222206116), ('举例', 0.6212161779403687), ('形容', 0.611984372138977), ('证实', 0.6084652543067932), ('得知', 0.606695294380188), ('说法', 0.6029568910598755), ('问', 0.6021037101745605)]


In [5]:
# Similarity between two words, computed through `model.wv`; the
# model-level `similarity` method is deprecated in gensim 3.x and removed
# in gensim 4.
similarity_two_words = model.wv.similarity('说','讲')
print("Please provide the similarity between these two words:")
print(similarity_two_words)

Please provide the similarity between these two words:
0.3628039284301186


  """Entry point for launching an IPython kernel.


In [6]:
# Same neighbour query via `similar_by_word`, again routed through
# `model.wv` because the model-level method is deprecated in gensim 3.x and
# removed in gensim 4.
similar = model.wv.similar_by_word('说', topn=20)
print(similar)

[('表示', 0.8741297125816345), ('告诉', 0.8271186351776123), ('指出', 0.788396954536438), ('认为', 0.7727850675582886), ('介绍', 0.7356007695198059), ('说道', 0.7067561745643616), ('看来', 0.6960127353668213), ('写道', 0.6913279891014099), ('话', 0.6870427131652832), ('强调', 0.6705794334411621), ('坦言', 0.6692733764648438), ('提到', 0.6580647230148315), ('称', 0.6515368223190308), ('透露', 0.6492000222206116), ('举例', 0.6212161779403687), ('形容', 0.611984372138977), ('证实', 0.6084652543067932), ('得知', 0.606695294380188), ('说法', 0.6029568910598755), ('问', 0.6021037101745605)]


  """Entry point for launching an IPython kernel.


In [4]:
def find_similar_words(words, max_size, model, seed='说', topn=20):
    """
    Collect words related to ``seed`` by breadth-first expansion over
    nearest neighbours in the embedding space.

    Starting from ``seed``, each visited word is added to the result and its
    ``topn`` most similar words are queued for later visits, until
    ``max_size`` words have been collected or the queue runs dry.

    Parameters
    ----------
    words : unused; kept only so existing positional calls keep working.
    max_size : maximum number of words to return.
    model : a trained Word2Vec model (its ``wv`` attribute is used) or any
        object that itself provides ``most_similar(word, topn=...)``.
    seed : word the expansion starts from (defaults to '说', the value that
        used to be hard-coded).
    topn : number of neighbours requested per visited word.

    Returns
    -------
    list of words in visiting order, beginning with ``seed``.
    """
    from collections import deque

    # Prefer the KeyedVectors behind `model.wv`: the model-level
    # `most_similar` is deprecated in gensim 3.x and removed in gensim 4.
    vectors = getattr(model, 'wv', model)

    similar_words = []
    visited = set()        # O(1) membership test instead of scanning the result list
    queue = deque([seed])  # O(1) pops from the front instead of list.pop(0)
    while len(similar_words) < max_size and queue:
        candidate = queue.popleft()
        if candidate in visited:
            continue
        neighbours = [w for w, _ in vectors.most_similar(candidate, topn=topn)]
        # Already-visited words would be skipped when popped anyway, so leaving
        # them out only keeps the queue shorter; the visiting order is unchanged.
        queue.extend(w for w in neighbours if w not in visited)
        similar_words.append(candidate)
        visited.add(candidate)
    return similar_words
    

In [5]:
import jieba

# Re-tokenise the saved corpus with jieba, accumulating every token in one flat list.
words = []
with open('corpus.txt', 'r', encoding='utf-8') as rf:
    # Iterate the file object directly so lines are streamed instead of
    # loading the whole corpus into memory first with readlines().
    for line in rf:
        words.extend(jieba.cut(line))

Building prefix dict from the default dictionary ...
Dumping model to file cache C:\Users\vick_\AppData\Local\Temp\jieba.cache
Loading model cost 0.977 seconds.
Prefix dict has been built succesfully.


In [9]:
# Display the total number of tokens collected in `words`.
len(words)

33172819

In [6]:
# Collect up to 20 words via find_similar_words, then display the list.
similar_words = find_similar_words(words, 20, model)
similar_words

  if __name__ == '__main__':


['说',
 '表示',
 '告诉',
 '指出',
 '认为',
 '说道',
 '坦言',
 '介绍',
 '看来',
 '透露',
 '写道',
 '举例',
 '强调',
 '深有感触',
 '称',
 '话',
 '直言',
 '说法',
 '提到',
 '笑言']

# 3. 使用 NER，Dependency Parsing等对句子形式进行解析

In [40]:
# -*- coding: utf-8 -*-
import os
LTP_DATA_DIR = '.\\ltp_data_v3.4.0\\'  # path to the LTP model directory
pos_model_path = os.path.join(LTP_DATA_DIR, 'pos.model')  # part-of-speech tagging model path; the model file is named `pos.model`
ner_model_path = os.path.join(LTP_DATA_DIR, 'ner.model')  # named-entity recognition model path; the model file is named `ner.model`
par_model_path = os.path.join(LTP_DATA_DIR, 'parser.model')  # dependency-parsing model path; the model file is named `parser.model`
srl_model_path = os.path.join(LTP_DATA_DIR, 'pisrl_win.model')  # semantic-role-labelling model path; this code points at `pisrl_win.model`

from pyltp import Postagger
postagger = Postagger() # create the POS tagger instance
postagger.load(pos_model_path)  # load the POS model

from pyltp import NamedEntityRecognizer
recognizer = NamedEntityRecognizer() # create the NER instance
recognizer.load(ner_model_path)  # load the NER model

from pyltp import Parser
parser = Parser() # create the dependency parser instance
parser.load(par_model_path)  # load the parser model

from pyltp import SementicRoleLabeller
labeller = SementicRoleLabeller() # create the semantic role labeller instance
labeller.load(srl_model_path)  # load the SRL model

In [None]:
maxsize = 30  # stop after this many sentences containing a keyword
count = 0

# NOTE: this cell relies on `similar_words`, the loaded pyltp models
# (`postagger`, `recognizer`, `parser`, `labeller`) and `contain_keywords`
# already existing in the kernel. `contain_keywords` is defined in the cell
# below, so that cell has to be run first.
with open('corpus.txt', 'r', encoding='utf-8') as f:
    # Stream the file: the loop stops after `maxsize` hits, so there is no
    # reason to read the whole corpus up front.
    for line in f:
        # split() with no argument also strips the trailing newline, which
        # split(' ') left glued to the last token of every sentence.
        words = line.split()
        keyword = contain_keywords(similar_words, words)  # does the sentence contain a synonym of "说"?
        if keyword == '':
            continue
        keyword_index = words.index(keyword)  # first occurrence of the keyword
        print('%d:%s' % (keyword_index, keyword))
        postags = postagger.postag(words)  # part-of-speech tagging
        netags = recognizer.recognize(words, postags)  # named-entity recognition
        arcs = parser.parse(words, postags)  # dependency parsing

        # Print POS tags, named-entity tags and dependency arcs (optional output).
        print('序号' + '\t' + '词' + '\t' + '词性' + '\t' + '命名实体' + '\t' + '依存句法')
        print('--------------------------------------------')
        arc_labels = ("%d:%s" % (arc.head, arc.relation) for arc in arcs)
        for word_index, (w, p, n, a) in enumerate(zip(words, postags, netags, arc_labels)):
            print(str(word_index) + '\t' + w + '\t' + p + '\t' + n + '\t' + a)

        # Semantic role labelling takes the dependency arcs as input.
        roles = labeller.label(words, postags, arcs)

        # Print the semantic role labelling result (optional output).
        for role in roles:
            print(role.index, "".join(
                ["%s:(%d,%d)" % (arg.name, arg.range.start, arg.range.end) for arg in role.arguments]))

        # For the predicate that is the keyword, its A0 argument is printed as
        # the holder of the opinion.
        for role in roles:
            if keyword_index == role.index:
                for arg in role.arguments:
                    if arg.name == 'A0':
                        # pyltp reports `range.end` as the index of the last
                        # word of the argument (inclusive), so the slice needs
                        # +1 to keep that word.
                        sub = ''.join(words[arg.range.start:arg.range.end + 1])
                        print("观点主体" + sub)

        print('\n======================================================================\n')
        count += 1
        if count == maxsize:
            break

postagger.release()  # release the POS model
recognizer.release()  # release the NER model
parser.release()  # release the parser model
labeller.release()  # release the SRL model

In [7]:
def contain_keywords(keywords, words):
    '''
    Return the first entry of ``keywords`` (in keyword order) that occurs in
    ``words``; return the empty string when none of them does.
    '''
    return next((candidate for candidate in keywords if candidate in words), '')

## 3.2 命名实体识别

In [2]:
# -*- coding: utf-8 -*-
import os
LTP_DATA_DIR = '.\\ltp_data_v3.4.0\\'  # path to the LTP model directory
ner_model_path = os.path.join(LTP_DATA_DIR, 'ner.model')  # named-entity recognition model path; the model file is named `ner.model`

from pyltp import NamedEntityRecognizer
recognizer = NamedEntityRecognizer() # create the NER instance
recognizer.load(ner_model_path)  # load the NER model

# NOTE(review): `words` and `postags` are not defined in this cell (the sample
# definitions below are commented out), so they must already exist in the
# kernel from an earlier cell -- confirm which values were intended.
# words = ['元芳', '你', '怎么', '看']
# postags = ['nh', 'r', 'r', 'v']
netags = recognizer.recognize(words, postags)  # named-entity recognition

print('\t'.join(netags))
recognizer.release()  # release the NER model

S-Ni	B-Ns	E-Ns	O	O	O	O	O	O	O	O
