# Python文本分析
#### 安装相关库

In [None]:
# IPython shell escapes: install the word-segmentation and word-cloud packages.
!pip install jieba
!pip install wordcloud

## 制作词云

#### 导入`jieba`分词模块：

In [None]:
import jieba

#### 体验一下分词：

In [None]:
# Sample sentence whose segmentation is ambiguous ("和尚/未" vs. "和/尚未").
s = "结婚的和尚未结婚的"

In [None]:
# Segment the sample sentence and materialize the tokens as a list for display.
list(jieba.cut(s))

#### 导入文本文件，读入一个字符串，并进行分词：

In [None]:
# Read the whole text file into one string. No encoding is passed here, so the
# platform default is used.
# NOTE(review): the other open() calls in this notebook pass an explicit
# encoding -- confirm which encoding test.txt uses.
with open('test.txt') as f:
    s = f.read()
    
# Segment the text with jieba and keep the tokens as a list.
cut_s = list(jieba.cut(s))

#### 初始化`wordcloud`模块：

In [None]:
import wordcloud
# Word-cloud renderer with a white background, drawing text with the given TTF.
# NOTE(review): presumably a CJK-capable font so Chinese words render -- confirm
# the font file is present next to the notebook.
wc = wordcloud.WordCloud(background_color = "white", font_path = "TW-Kai-98_1.ttf")

#### 将分词重新拼接为一个字符串，绘制词云：

In [None]:
# Show matplotlib figures inline in the notebook.
%matplotlib inline
import matplotlib.pyplot as plt
# The tokens are re-joined with spaces into a single string for generate().
myword = wc.generate(' '.join(cut_s))
# Display the generated cloud as an image, with the axes hidden.
plt.imshow(myword)
plt.axis("off")
plt.show()

## 基于情感词典的情感分析

#### 导入停用词表

In [86]:
# Load the stop words, one entry per line. They are kept in a set because the
# collection is only used for membership tests (once per token in to_phrases),
# which are hash lookups on a set instead of a linear scan of a list.
with open('stopwords.txt',encoding='utf-8') as f:
    stopwords = set(f.read().split('\n'))

#### 导入情感词汇

In [None]:
# Read the sentiment lexicon; every line is split on whitespace into its fields.
# NOTE(review): the following cell unpacks each row as (word, score), so every
# line is assumed to hold exactly two fields -- confirm against the file.
with open('BosonNLP_sentiment_score.txt',encoding='utf-8') as f:
    kv = [line.split() for line in f ]

In [None]:
# Map each sentiment word to its score, converted from text to float.
sendict = dict((word, float(score)) for word, score in kv)

#### 导入否定词汇

In [None]:
# Load the negation words, one entry per line. A set is used because the
# collection is only queried with `in` (several times per token), which is a
# hash lookup on a set instead of a linear scan of a list.
with open("deny.txt",encoding="utf-8") as f:
    denywords = set(f.read().split("\n"))

#### 导入程度副词

In [None]:
# Load the degree-adverb file (GBK-encoded) as a list of lines. It stays a list
# because the cells below slice it by line position.
with open('level_adv.txt',encoding='gbk') as f:
    level = f.read().split('\n')

In [None]:
# Print every line with its index; presumably used to read off the slice
# boundaries of each degree group below.
for i,w in enumerate(level):
    print(i,w)

#### 按程度分为6组

In [None]:
# Group 1: lines 188-199 of the adverb file (hard-coded positions).
level1 = level[188:200]

In [None]:
# Group 2: lines 157-185 of the adverb file (hard-coded positions).
level2 = level[157:186]

In [None]:
# Group 3: lines 202-231 of the adverb file (hard-coded positions).
level3 = level[202:232]

In [None]:
# Group 4: lines 118-154 of the adverb file (hard-coded positions).
level4 = level[118:155]

In [None]:
# Group 5: lines 74-115 of the adverb file (hard-coded positions).
level5 = level[74:116]

In [None]:
# Group 6: lines 3-71 of the adverb file (hard-coded positions).
level6 = level[3:72]

#### 按程度赋予得分

In [None]:
# Pair each adverb group with its multiplier, from 0.5 for group 1 up to 2 for group 6.
levels = [(level1,0.5),(level2,0.8),(level3,1.1),(level4,1.3),(level5,1.6),(level6,2)]

#### 构造程度词字典

In [None]:
from functools import reduce

In [None]:
# Expand every (group, weight) pair into a list of (adverb, weight) tuples.
levels_with_score = [[(adverb, weight) for adverb in group] for group, weight in levels]

In [None]:
# Flatten the per-group (adverb, weight) lists into one adverb -> weight dict.
# A nested comprehension avoids the repeated list concatenation of a
# reduce(x + y) fold and also works when there are no groups at all.
leveldict = {adverb: weight for group in levels_with_score for adverb, weight in group}

#### 取得程度词和情感词的列表
现在已经有三个列表，分别是否定词、情感词和程度副词列表，可用于判断一个词是不是属于三种词之一。

In [None]:
# Key views over the two dictionaries, used below for membership tests.
levelwords = leveldict.keys()
senwords = sendict.keys()

In [None]:
# Probe: is this word among the degree adverbs?
'不胜' in levelwords

In [None]:
# Probe: is this word among the negation words?
'不' in denywords

In [None]:
# Probe: does this word have an entry in the sentiment lexicon?
'晚餐' in senwords

#### 定义词性判断函数

In [None]:
def is_senword(word: str) -> bool:
    """Return True if *word* is a key of the sentiment lexicon (``senwords``)."""
    return word in senwords

def is_denyword(word: str) -> bool:
    """Return True if *word* is one of the loaded negation words (``denywords``)."""
    return word in denywords

def is_levelword(word: str) -> bool:
    """Return True if *word* is a key of the degree-adverb dict (``levelwords``)."""
    return word in levelwords

#### 定义函数，将字符串分词，去除停用词，并构造情感词词组列表
情感词词组由（否定词+程度副词+情感词）组成。

In [None]:
def append_sen(l,y):
    """Close the current phrase with sentiment word *y* and open a new one.

    *l* is a list of phrases (each a list of words) and is modified in place:
    *y* is added to the last phrase, then a fresh empty phrase is appended.
    The same list object is returned so the function can serve as a fold step.
    """
    current_phrase = l[-1]
    current_phrase.append(y)
    # Start the next (still empty) phrase.
    l.append([])
    return l

def append_other(l,y):
    """Add modifier word *y* to the phrase currently being built.

    *l* is a list of phrases (each a list of words); *y* is appended in place
    to the last phrase and the same list object is returned.
    """
    open_phrase = l[-1]
    open_phrase.append(y)
    return l

In [None]:
def to_phrases(s):
    """Segment *s* and group its tokens into sentiment phrases.

    Stop words and tokens that are neither negation words, degree adverbs nor
    sentiment words are dropped. Each returned phrase is a list of zero or
    more negation/degree words followed by the sentiment word that closes it.
    Trailing negation/degree words with no sentiment word after them are
    discarded.
    """
    phrases = []
    pending = []
    for token in jieba.cut(s):
        if token in stopwords:
            continue
        if is_denyword(token) or is_levelword(token):
            # Modifier: keep collecting until a sentiment word arrives.
            pending.append(token)
        elif is_senword(token):
            # A sentiment word completes the phrase.
            pending.append(token)
            phrases.append(pending)
            pending = []
    return phrases

In [None]:
# Probe: check whether this word has an entry in the sentiment lexicon.
is_senword("完全")

In [None]:
# Try the phrase builder on a sample hotel review.
to_phrases("非常好的酒店,服务也很好.我住的是行政豪华房,里面干净宽敞舒服,对面可以看到锦绣中华的全景,晚上还可以看到那里的烟花,好漂亮.还可以延长到下午的4:30之前退房.下次还会选择住这家酒店")

#### 定义词组评分函数
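为简单起见，我们将一个词组的情感评分定义为：$$(-1)^{n_{\text{deny}}} \times \text{levelscore} \times \text{senscore}$$
其中 $n_{\text{deny}}$ 为词组中否定词的个数，$\text{levelscore}$ 为词组中程度副词的权重之和（没有程度副词时取 1），$\text{senscore}$ 为词组中情感词的得分之和。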


In [None]:
def phrase_score(phrase):
    """Score one sentiment phrase.

    The score is ``(-1) ** n_deny * level * sen`` where ``n_deny`` is the
    number of negation words in *phrase*, ``level`` is the sum of the weights
    of its degree adverbs (1 when it has none) and ``sen`` is the sum of the
    lexicon scores of its sentiment words.

    Args:
        phrase: list of words, as produced by ``to_phrases``.

    Returns:
        The numeric score of the phrase.
    """
    deny_count = sum(1 for word in phrase if is_denyword(word))
    score_deny = (-1) ** deny_count
    score_sen = sum(sendict[word] for word in phrase if is_senword(word))
    # Collect the weights in a list: a filter() iterator would be exhausted by
    # the emptiness check, leaving nothing to sum and forcing the level to 0.
    level_scores = [leveldict[word] for word in phrase if is_levelword(word)]
    score_level = sum(level_scores) if level_scores else 1
    return score_deny * score_level * score_sen

In [None]:
# Try the scorer on a hand-built two-word phrase.
phrase_score(['未', '更换'])

#### 定义评价句子的函数

In [None]:
import numpy as np
def sentence_score(sentence):
    """Return the mean phrase score of *sentence*, or 0 if it has no phrases."""
    phrases = to_phrases(sentence)
    if not phrases:
        # Nothing to average: no sentiment phrase was found.
        return 0
    per_phrase = [phrase_score(p) for p in phrases]
    return np.mean(per_phrase)

In [None]:
# Try the sentence scorer on a sample hotel review.
sentence_score("宾馆虽然比较老，房间感觉还可以，服务很好，早餐品种少了点。打车起步价到海边。走路15分钟到好又多超市。在开发区里面出差住在这里很方便。")

In [None]:
# Try the sentence scorer on a second sample hotel review.
sentence_score("地理位置还不错，到哪里都比较方便，但是服务不象是豪生集团管理的，比较差。下午睡了一觉并洗了一个澡，本来想让酒店再来打扫一下，所以，打开了，请打扫的服务灯，可是到晚上回酒店，发现打扫得服务灯被关掉了，而房间还是没有打扫过。")

### 读取语料库，测试正确率

#### 定义函数读取文件，去除空格和换行，得到待评价字符串。

In [None]:
def file2string(file):
    """Read a GBK-encoded text file and return it with spaces and newlines removed.

    Bytes that cannot be decoded as GBK are silently skipped
    (``errors='ignore'``).
    """
    with open(file, encoding='gbk', errors='ignore') as f:
        raw = f.read()
    # Delete every space and newline character in a single pass.
    return raw.translate(str.maketrans("", "", " \n"))

In [None]:
# Try the reader on one file of the positive corpus.
file2string("pos/pos.1001.txt")

#### 取得所有正负文本的路径

In [None]:
import os

In [None]:
# Relative paths of all entries in the "pos" directory.
pos = [os.path.join("pos",p) for p in os.listdir("pos")]

In [None]:
# Relative paths of all entries in the "neg" directory.
neg = [os.path.join("neg",p) for p in os.listdir("neg")]

#### 正向情感文本的得分情况

In [None]:
# Load every positive-corpus file as one whitespace-free string.
pos_sentences = [file2string(f) for f in pos]

In [None]:
import numpy as np

In [None]:
# Average sentence score over the positive corpus.
np.mean([sentence_score(s) for s in pos_sentences ])

#### 负向情感文本的得分情况

In [None]:
# Load every negative-corpus file as one whitespace-free string.
neg_sentences = [file2string(f) for f in neg]

In [None]:
# Average sentence score over the negative corpus.
np.mean([sentence_score(s) for s in neg_sentences])

#### 比较正向文本与负向文本的得分分布

In [None]:
import seaborn as sns

In [None]:
# Plot the score distributions of both corpora: negative first, then positive.
# NOTE(review): seaborn deprecated distplot in v0.11 in favour of
# histplot/displot -- consider migrating when the seaborn version allows it.
sns.distplot([sentence_score(s) for s in neg_sentences])
sns.distplot([sentence_score(s) for s in pos_sentences])