## 数据读入与数据预处理

In [41]:
# 导入包
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import json
import time 
import requests

### 评论数据处理

In [44]:
# 读入数据
df_comment_1 = pd.read_excel('./jinglei1.xlsx')
df_comment_2 = pd.read_excel('./jinglei2.xlsx') 

# 合并数据
df_comment = pd.concat([df_comment_1, df_comment_2])

# 去除重复值
df_comment.drop_duplicates(inplace=True)
print(df_comment.info()) 

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2020 entries, 0 to 1009
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Unnamed: 0    2020 non-null   int64 
 1   userid        2020 non-null   int64 
 2   nick_name     2020 non-null   object
 3   comment_id    2020 non-null   object
 4   content_time  2020 non-null   int64 
 5   liked_Count   2020 non-null   int64 
dtypes: int64(4), object(2)
memory usage: 110.5+ KB
None


content_time：转换为标准时间

In [45]:
def timeStamp(timeNum): 
    '''功能：转换毫秒为标准时间'''
    timeStamp = float(timeNum/1000)  # 转换为秒
    timeArray = time.localtime(timeStamp) 
    otherStyleTime = time.strftime("%Y-%m-%d %H:%M:%S", timeArray) # 转换字符串
    return otherStyleTime

In [46]:
df_comment['content_time'] = df_comment['content_time'].apply(lambda x:timeStamp(x)) 
df_comment.head() 

Unnamed: 0.1,Unnamed: 0,userid,nick_name,comment_id,content_time,liked_Count
0,0,427497354,U-mccord,宝批龙,2020-04-29 23:44:42,0
1,1,2118972649,惟爱青荷,这唱歌的让人锤着蛋了吧？怎么这样……,2020-04-29 23:43:19,0
2,2,67118130,ishidaxwf,说真的，杨坤版本的比这唱的好听多了,2020-04-29 23:36:29,0
3,3,3304592439,做人要低调无涯,猛然一听猛如虎，仔细一听二百五,2020-04-29 23:33:21,0
4,4,575246196,说梦_,好活[强],2020-04-29 23:31:05,1


In [48]:
# 异常值处理
df_comment['content'] = df_comment['content'].replace('🕴🏿🕴🏿⚰️🕴🏿🕴🏿', '黑人抬棺') 

KeyError: 'content'

百度api情感分析

- 计算积极评分和消极评分值
- 计算正负向标签

In [49]:
# 输入API Key和Secret Key
ak = '675d471b26274adcbf59dc899645cbee'
sk = '3781789ed34e4c2a8ef176ab08bb2d79'

host = 'https://aip.baidubce.com/oauth/2.0/token?grant_type=client_credentials&client_id={}&client_secret={}'.format(ak, sk)
https://aip.baidubce.com/oauth/2.0/token?grant_type=client_credentials&client_id=675d471b26274adcbf59dc899645cbee&client_secret=3781789ed34e4c2a8ef176ab08bb2d79
# 发起请求
r = requests.post(host) 
# 获取token
token = r.json()['access_token']

def get_sentiment_score(text):
    """
    输入文本，返回情感倾向得分
    """
    url = 'https://aip.baidubce.com/rpc/2.0/nlp/v1/sentiment_classify?charset=UTF-8&access_token={}'.format(token)
    data = {
        'text': text
    } 
    data = json.dumps(data)
    try:
        res = requests.post(url, data=data, timeout=3)
        items_score = res.json()['items']
    except Exception as e:
        time.sleep(1) 
        res = requests.post(url, data=data, timeout=3)
        items_score = res.json()['items']
    return items_score

KeyError: 'access_token'

In [27]:
score_list = [] 

step = 0
for i in df_comment['content']:
    score = get_sentiment_score(i)
    # 打印进度
    step += 1
    print('我正在获取第{}个评分'.format(step), end='\r') 
    score_list.append(score)  

NameError: name 'df_comment' is not defined

In [28]:
# 提取正负概率
positive_prob = [i[0]['positive_prob'] for i in score_list]
negative_prob = [i[0]['negative_prob'] for i in score_list]

print(len(positive_prob), len(negative_prob)) 

0 0


In [29]:
df_comment['positive_prob'] = positive_prob
df_comment['negative_prob'] = negative_prob
df_comment.head() 

NameError: name 'df_comment' is not defined

In [30]:
df_comment['score_label'] = df_comment['positive_prob'].apply(lambda x:1 if x>0.5 else -1) 
df_comment.head() 

NameError: name 'df_comment' is not defined

## 数据可视化

1. 评论数时间(按小时)分布
4. 用户年龄分布
5. 用户地区分布
6. 评论情感得分-基于百度自然语言处理API
7. 评论赞数分布和情感关系
8. 评论词云分析

### 评论数时间(按小时)分布

In [31]:
df_comment['content_time'] = pd.to_datetime(df_comment['content_time'])
df_comment['content_date'] = df_comment['content_time'].dt.date
df_comment['content_date'].value_counts() 

NameError: name 'df_comment' is not defined

In [32]:
df_comment['content_hour'] = df_comment.content_time.dt.hour
hour_num = df_comment.content_hour.value_counts().sort_index()
hour_num.head()  

NameError: name 'df_comment' is not defined

In [33]:
# 折线图
from pyecharts.charts import Line
from pyecharts import options as opts 

line1 = Line(init_opts=opts.InitOpts(width='1350px', height='750px'))
line1.add_xaxis(hour_num.index.tolist())
line1.add_yaxis('热度', hour_num.values.tolist(),
                label_opts=opts.LabelOpts(is_show=False))
line1.set_global_opts(title_opts=opts.TitleOpts(title='评论数时间(按小时)分布'),
                      visualmap_opts=opts.VisualMapOpts(max_=80))
line1.set_series_opts(linestyle_opts=opts.LineStyleOpts(width=3)) 
line1.render() 

ModuleNotFoundError: No module named 'pyecharts'

### 评论情感得分

In [34]:
label_num = df_comment.score_label.value_counts() / df_comment.score_label.value_counts().sum()
label_perc = np.round(label_num, 2) 
label_perc.index = ['负向', '正向'] 
label_perc

NameError: name 'df_comment' is not defined

In [35]:
# 绘制饼图
pie2 = Pie(init_opts=opts.InitOpts(width='1350px', height='750px'))
pie2.add("",
         [*zip(label_perc.index, label_perc.values)],
         radius=["40%","65%"])
pie2.set_global_opts(title_opts=opts.TitleOpts(title='评论情感标签正负向分布'),
                     legend_opts=opts.LegendOpts(orient="vertical", pos_top="15%", pos_left="2%"),
                     toolbox_opts=opts.ToolboxOpts()) 
pie2.set_series_opts(label_opts=opts.LabelOpts(formatter="{c}%"))
pie2.set_colors(['#3B7BA9', '#EF9050'])
pie2.render() 

NameError: name 'Pie' is not defined

### 情感评论正向得分

In [36]:
# 定义分隔区间
bins = [0, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5,
       0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 1.0]
positive_num = pd.cut(df_comment.positive_prob, bins).value_counts()
positive_num = positive_num.sort_index()
positive_num

NameError: name 'df_comment' is not defined

In [37]:
bar3 = Bar(init_opts=opts.InitOpts(width='1350px', height='750px'))
bar3.add_xaxis(positive_num.index.astype('str').tolist())
bar3.add_yaxis("", positive_num.values.tolist(), category_gap='5%')
bar3.set_global_opts(title_opts=opts.TitleOpts(title="评论情感得分"), 
                     visualmap_opts=opts.VisualMapOpts(max_=500),
                     toolbox_opts=opts.ToolboxOpts()
                    )
bar3.render() 

NameError: name 'Bar' is not defined

In [38]:
# 看一看
df_comment[df_comment.positive_prob>=0.95][['content', 'positive_prob']][15:25]  

NameError: name 'df_comment' is not defined

结论：负向评论估计在85%以上。

### 评论情感得分

In [39]:
import jieba
import jieba.analyse

# 合并为一篇
txt = df_comment['content'].str.cat(sep='。')

# 添加关键词
jieba.add_word('惊雷')
jieba.add_word('MC六道')

# 读入停用词表
stop_words = []
with open(r"C:\Users\wzd\Desktop\CDA\CDA_Learning\Python\Python项目实作\电影\豆瓣电影\stop_words.txt", 'r', encoding='utf-8') as f:
    lines = f.readlines()
    for line in lines:
        stop_words.append(line.strip())

# 添加停用词
stop_words.extend(['00', '18', '惊雷', '一首', '一首歌']) 

# 评论字段分词处理
word_num = jieba.analyse.extract_tags(txt,
                                      topK=100,
                                      withWeight=True,
                                      allowPOS=())

# 去停用词
word_num_selected = []

for i in word_num:
    if i[0] not in stop_words:
        word_num_selected.append(i)

key_words = pd.DataFrame(word_num_selected, columns=['words','num'])
key_words.head() 

ModuleNotFoundError: No module named 'jieba'

In [40]:
from pyecharts.charts import WordCloud
from pyecharts.globals import SymbolType

word1 = WordCloud(init_opts=opts.InitOpts(width='1350px', height='750px'))
word1.add("", [*zip(key_words.words, key_words.num)],
          word_size_range=[20, 100],
          shape=SymbolType.DIAMOND)
word1.set_global_opts(title_opts=opts.TitleOpts('网易云音乐关于惊雷评论词云'),
                      toolbox_opts=opts.ToolboxOpts(),
                     )
word1.render() 

ModuleNotFoundError: No module named 'pyecharts'