In [20]:
import seaborn as sns
import matplotlib.pyplot as plt
import jieba
import pandas as pd
import random



class Sentiment():

    def __init__(self, data):
        self.data = data
        self.stopwords = []
        self.posdic = []
        self.negdic = []
        self.mostdict = []
        self.verydict = []
        self.moredict = []
        self.ishdict = []
        self.insufficientdict = []
        self.inversedict = []

#     def read_file(self):
#         """
#         提取csv文件中的研报内容，存入list并返回
#         :param filename:
#         :return:
#         """
#         data = pd.read_csv(self.filename)
#         return list(data['评论'])

    def dict_load(self, path):
        dict = []
        with open(path, encoding='utf-8') as f:
            for line in f:
                if line.strip() != '':  # 养成去空好习惯
                    dict.append(line.strip())
        return dict

    def load_dicts(self):
        stop = 'dict/stop1205.txt'
        pos = 'dict/pos_all_dict.txt'
        neg =  'dict/neg_all_dict.txt'
        most =  'dict/most.txt'
        very = 'dict/very.txt'
        more = 'dict/more.txt'
        ish = 'dict/ish.txt'
        insufficient = 'dict/insufficiently.txt'
        inverse = 'dict/inverse.txt'
        self.stopwords = self.dict_load(stop)
        self.posdict = self.dict_load(pos)
        self.negdict = self.dict_load(neg)
        self.mostdict = self.dict_load(most)  # 权值为2
        self.verydict = self.dict_load(very)  # 权值为1.5
        self.moredict = self.dict_load(more)  # 权值为1.25
        self.ishdict = self.dict_load(ish)  # 权值为0.5
        self.insufficientdict = self.dict_load(insufficient)  # 权值为0.25
        self.inversedict = self.dict_load(inverse)  # 权值为-1

    def seg_sentence(self, sentence):
        """
        输入字符串，返回分词后的列表
        :param sentence:
        :return:
        """
        sentence_seged = jieba.cut(sentence.strip())
        outstr = ''
        for word in sentence_seged:
            if word not in self.stopwords:
                if word != '\t':
                    outstr += word
                    outstr += " "
        return outstr.split(' ')

    def match_adverb(self, word, sentiment_value):
        """
        对不同种类的词赋予不同的权重
        :param sentiment_value:
        :return:
        """
        # 最高级权重为
        if word in self.mostdict:
            sentiment_value *= 2  # 2/8
        # 比较级权重
        elif word in self.verydict:
            sentiment_value *= 1.75  # 1.75/6
        # 比较级权重
        elif word in self.moredict:
            sentiment_value *= 1.5  # 1.5/4
        # 轻微程度词权重
        elif word in self.ishdict:
            sentiment_value *= 1.2  # 1.2/2
        # 相对程度词权重
        elif word in self.insufficientdict:
            sentiment_value *= 0.5
        # 否定词权重
        elif word in self.inversedict:
            sentiment_value *= -1
        else:
            sentiment_value *= 1
        return sentiment_value

    def cal_score(self, words_list):

        # i，s 记录情感词和程度词出现的位置
        i = 0  # 记录扫描到的词位子
        s = 0  # 记录情感词的位置
        poscount = 0  # 记录积极情感词数目
        negcount = 0  # 记录消极情感词数目
        # 逐个查找情感词
        for word in words_list:
            # 如果为积极词
            if word in self.posdict:
                poscount += 1  # 情感词数目加1
                # 在情感词前面寻找程度副词
                for w in words_list[s:i]:
                    poscount = self.match_adverb(w, poscount)
                s = i + 1  # 记录情感词位置
            # 如果是消极情感词
            elif word in self.negdict:
                negcount += 1
                for w in words_list[s:i]:
                    negcount = self.match_adverb(w, negcount)
                s = i + 1
            # 如果结尾为感叹号或者问号，表示句子结束，并且倒序查找感叹号前的情感词，权重+4
            elif word == '!' or word == '！' or word == '?' or word == '？':
                for w2 in words_list[::-1]:
                    # 如果为积极词，poscount+2
                    if w2 in self.posdict:
                        poscount += 4
                        break
                    # 如果是消极词，negcount+2
                    elif w2 in self.negdict:
                        negcount += 4
                        break
            i += 1  # 定位情感词的位置
        # 计算情感值
        sentiment_score = poscount - negcount
        return sentiment_score

    def res(self, sentiment_score):
        # print('情感分值：', sentiment_score)
        if sentiment_score < 0:
            # print('情感倾向：消极')
            res = -1
        elif sentiment_score == 0:
            # print('情感倾向：中性')
            res = 1 # 中性标记为积极
        else:
            # print('情感倾向：积极')
            res = 1
        return res

    def run(self):
        """
        :return: 两个列表，一个列表存放分数，一个列表存放结果
        """
        content_list = list(self.data['评论'])
        self.load_dicts()
        data = []
        for content in content_list:
            data.append(self.seg_sentence(content))
        scores = []
        result = []
        for i in data:
            scores.append(self.cal_score(i))
        for score in scores:
            result.append(self.res(score))
        return scores, result

In [15]:
sent = Sentiment()
scores, result = sent.run()
jnd = pd.read_csv('jiangnandao.csv')
jnd['scores'] = scores
jnd['res'] = result
jnd.to_csv('jnd_sent.csv',encoding="utf_8_sig")

In [21]:
jnd

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,店铺名称,用户名,评论,scores,res
0,28784,0,江南道自助烤肉海鲜涮锅（柳浪湾店）,vZs846144283,在西财那边和新尚天地这里的都吃过，味道真的一绝，特别是里面各种肉类，超级美味，寝室团建我们可...,14.800,1
1,28785,1,江南道自助烤肉海鲜涮锅（柳浪湾店）,HeS153085382,人超级超级多，所以服务完全是马马虎虎，六点左右去都已经没有里面的位置了，只是外面的，位置很打...,11.875,1
2,28786,2,江南道自助烤肉海鲜涮锅（柳浪湾店）,qsl522523970,就是距离太远了，平时也不好聚，真的强烈推荐性价比超级高，就是晚了就要等很久排队，毕竟生意好，...,5.000,1
3,28787,3,江南道自助烤肉海鲜涮锅（柳浪湾店）,qsl522523970,很不错，指着自己喜欢的来吃，非常不错，肥瘦相间配生菜，还有花甲，生蚝扇贝都是最喜欢的，然后还...,9.000,1
4,28788,4,江南道自助烤肉海鲜涮锅（柳浪湾店）,qsl522523970,每次都会来吃，跟室友跟男朋友， 【口味】口味都很nice 【环境】环境也非常不错，特别热闹 ...,5.000,1
...,...,...,...,...,...,...,...
5533,35432,2775,江南道自助烤肉海鲜涮锅（柳浪湾店）,xiao78044434,味道一般！菜品蛮多！环境比较窄⋯⋯感觉挤挤的,0.000,0
5534,35433,2776,江南道自助烤肉海鲜涮锅（柳浪湾店）,rNR362160129,开心，棒棒哒，培根和鸡肉很棒，吃烧烤就是要吃鸡肉，烤得很快还好吃,4.500,1
5535,35434,2777,江南道自助烤肉海鲜涮锅（柳浪湾店）,白莲终结者,还行，价格公道，小吃不错，冰激凌也棒棒哒,1.500,1
5536,35435,2778,江南道自助烤肉海鲜涮锅（柳浪湾店）,rsO675186865,美团便宜几块钱，好！ 江南道就是不美团也挺划算的，味道也好吃。,8.400,1


In [18]:
jnd1 = jnd[jnd['res']==1]
jnd0 = jnd[jnd['res']==0]

In [19]:
jnd1.to_csv('jnd_pos.csv',encoding="utf_8_sig")
jnd_1.to_csv('jnd_neg.csv',encoding="utf_8_sig")
jnd0.to_csv('jnd_zx.csv',encoding="utf_8_sig")

5538