## 导入必要的库

In [None]:
import requests
from bs4 import BeautifulSoup
import csv
import re

import numpy as np
import pandas as pd

from matplotlib import pyplot as plt
# 为了在Jupyter Notebook中显示图片的必须配置项
%matplotlib inline
# 用兼容字体解决中文显示为方格的问题
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['font.serif'] = ['SimHei']
plt.rcParams['font.family'] = 'sans-serif'

# #苹果用户 
# plt.rcParams['font.sans-serif'] = ['Arial Unicode MS']

# 解决保存图像是负号'-'显示为方块的问题
plt.rcParams['axes.unicode_minus'] = False

## 爬取图书图片（红楼梦）

In [None]:
# Request headers: present a regular desktop-browser User-Agent.
my_headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.75 Safari/537.36",
}
# Douban page of the book whose cover picture is downloaded.
HLMPageURL = "https://book.douban.com/subject/1007305/"
# The timeout keeps the cell from hanging forever on a stalled connection;
# raise_for_status() stops on an HTTP error instead of parsing an error page.
response = requests.get(HLMPageURL, headers=my_headers, timeout=30)
response.raise_for_status()
HLMPageHTML = BeautifulSoup(response.text, "html.parser")

# Picture URL: the href of the first link inside the first <div id="mainpic">.
HLMPictureURL = HLMPageHTML.find_all("div", id="mainpic")[0].find("a")["href"]
# Fetch the picture; fail loudly so an error body is never saved as the image.
HLMPictureData = requests.get(HLMPictureURL, headers=my_headers, timeout=30)
HLMPictureData.raise_for_status()

HLMPictureFileName = "HLM.jpg"
# Save the raw bytes ("wb" = write + binary, overwriting any existing file).
with open(HLMPictureFileName, mode="wb") as pictureFile:
    pictureFile.write(HLMPictureData.content)

print("图片文件保存成功！")

## 爬取图书评论（红楼梦）

In [None]:
# Request headers: browser User-Agent plus a Referer for the first request.
my_headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.75 Safari/537.36",
    "Referer": "https://book.douban.com/top250?start=0",
}
# Book page that links to the short comments.
FirstPageURL = "https://book.douban.com/subject/1007305/"
# The timeout avoids hanging on a stalled connection; raise_for_status() stops
# on an HTTP error instead of failing later while parsing an error page.
response = requests.get(FirstPageURL, headers=my_headers, timeout=30)
response.raise_for_status()
CommentPageHTML = BeautifulSoup(response.text, "html.parser")

# Total number of comments: the first number found in the link text of the
# first <span class="pl"> inside the first <div class="mod-hd">.
# The pattern is a raw string so that "\d" and "\." reach the regex engine
# without being treated as (invalid) string escape sequences.
AllCommentCount = int(
    re.findall(
        r"\d+(?:\.\d+)?",
        CommentPageHTML.find_all("div", class_="mod-hd")[0]
        .find_all("span", class_="pl")[0]
        .find("a")
        .text,
    )[0]
)
# URLs of all comment pages (start = 0, 20, ..., 200). Douban only serves the
# first 220 comments without logging in.
AllCommentPageURLList = [
    FirstPageURL + "comments/?start={}&limit=20&status=P&sort=new_score".format(PageNumber)
    for PageNumber in range(0, 220, 20)
]

# Update the Referer, and request the first page through the plain
# comments URL instead of the start=0 query form.
my_headers["Referer"] = "https://book.douban.com/subject/1007305/"
AllCommentPageURLList[0] = "https://book.douban.com/subject/1007305/comments/"

AllCommentsContent = []              # comment texts
AllCommentsDateTime = []             # comment timestamps
AllCommenters = []                   # commenter names
filename = "douban_HLM_Comments.csv"     # CSV file the comments are saved to

# Download every comment page; pages are numbered from 1 in the status output.
for PageCount, EachURL in enumerate(AllCommentPageURLList, start=1):
    # The timeout keeps a stalled connection from blocking the loop forever.
    response = requests.get(EachURL, headers=my_headers, timeout=30)

    # Stop at the first failed page, and say so: the collected data is then
    # only partial, which would otherwise go unnoticed.
    if response.status_code != 200:
        print("网页反馈状态：{};第{}页评论下载失败，已停止下载。".format(response.status_code, PageCount))
        break

    my_headers["Referer"] = EachURL        # the next request refers to this page
    CommentPageHTML = BeautifulSoup(response.text, "html.parser")

    # Comment text: <span class="short"> inside each <div class="comment">.
    AllCommentContentList = CommentPageHTML.find_all("div", class_="comment")
    AllCommentsContent.extend(
        [EachElement.find("span", class_="short").text for EachElement in AllCommentContentList]
    )
    # Comment time: text of each <a class="comment-time">.
    AllCommentDateTimeList = CommentPageHTML.find_all("a", class_="comment-time")
    AllCommentsDateTime.extend([EachElement.text for EachElement in AllCommentDateTimeList])

    # Commenter: text of the first link inside each <span class="comment-info">.
    AllCommentersList = CommentPageHTML.find_all("span", class_="comment-info")
    AllCommenters.extend([EachElement.find("a").text for EachElement in AllCommentersList])

    # Progress message.
    print("网页反馈状态：{};第{}页20条评论下载成功。".format(response.status_code, PageCount))

# Write the collected comments to the CSV file
# ("wt" = write + text, overwriting any existing file).
with open(filename, mode="wt", encoding="utf-8", newline="") as bookfile:
    bookwriter = csv.writer(bookfile)
    bookwriter.writerow(["评论人", "评论时间", "评论内容"])
    # One row per comment; the text is stripped and all remaining spaces,
    # line feeds and carriage returns are removed.
    bookwriter.writerows(
        [author, posted_at, text.strip().replace(" ", "").replace("\n", "").replace("\r", "")]
        for text, posted_at, author in zip(AllCommentsContent, AllCommentsDateTime, AllCommenters)
    )
print("评论文件保存成功！")

## 图片词云分析

In [None]:
!pip install wordcloud -i https://pypi.doubanio.com/simple

### 打开评论文件

In [None]:
# Load the comments CSV file (read as UTF-8) into a DataFrame.
filename = "douban_HLM_Comments.csv"
Commentsdata = pd.read_csv(filename, encoding="utf-8")
# Preview the first rows.
Commentsdata.head()

### 查看数据信息

In [None]:
# Number of rows and columns of the comments table.
Commentsdata.shape

In [None]:
# Summary of the comments table (info() reports the non-null count per column).
Commentsdata.info()
# The same summary for the transposed table (one column per original row).
Commentsdata.T.info()

从上面 info() 的输出可以看出，“评论内容”列的 Non-Null 元素只有 219 个，说明该列存在一个 Null 值，下面把它找出来看看。

### 数据清洗

In [None]:
# Which columns contain at least one null value?
Commentsdata.isna().any(axis=0)
# Which rows contain at least one null value?
Commentsdata.isna().any(axis=1)
# Show the rows that contain a null value.
Commentsdata[Commentsdata.isna().any(axis=1)]

In [None]:
# Replace missing comment texts with an empty string. The mask looks only at
# the "评论内容" column, so a null in another column of a row can never wipe
# that row's existing comment text.
Commentsdata.loc[Commentsdata["评论内容"].isnull(), "评论内容"] = ""
# Show rows 90-97 to check the result (presumably the range that contains
# the formerly null row - verify against the data).
Commentsdata.loc[90:97]

### 词云分析

In [None]:
from wordcloud import WordCloud

In [None]:
# Option 1: read the text from a file
# filename = "Migration.txt"
# with open(filename, encoding="utf-8") as fileHandle:
#     AllCommentsText = fileHandle.read()
# Option 2: build the text inline
AllCommentsText = """
文案 文案The 抱抱 Zen of LOVE 抱抱 Python, 快乐 by Tim Peters公众号 公众号 Python 最好的 
语言 语言一辈子 is better LOVE than 一辈子.喵小姐 is 爱你 than implicit.爱你 喵小姐蟹先生 is 爱你 
than complex.一辈子 is 蟹先生 than complicated.二中 is 喵小姐 我想你了 than nested. 二中 蟹先生清湖 
is 胜于 than 清湖.思旺 counts. 想你Special 喵小姐 我想你了 aren"t special enough 思旺 break 思旺 rules.别生气 
practicality beats 厨艺好.Errors should 我想你了 never pass 小龙虾 silently. 运营别生气 explicitly 好不好.
LOVEIn the face of ambiguity, 程序员 the 厨艺好 to guess.龙华 龙华There 快乐 should be one-- 
我想你了 and preferably 红烧肉 only one 小龙虾--obvious way to do it.运营Although 共享单车 way may not 
我想你了 be obvious at first unless you"re Dutch. 新媒体 地铁Now is better 红烧肉 than never.程序员 
Although 共享单车 is often 高铁 than 东莞 now. 高铁 地铁If the implementation 想你 is hard to explain, 
it"s a bad idea. 想你了If 成都 implementation is 想你 easy to explain, it may be a good idea.Namespaces are 
端午one 端午 honking great idea -- 成都 do more of those! 想你了深圳 晚安 深圳 新媒体
"""
# Option 3: join the values of the pandas comment column with spaces.
AllCommentsText = " ".join(Commentsdata["评论内容"].tolist())

# Build the word cloud (generate() fills the object in place and returns it).
# font_path must point to a font file that can draw Chinese glyphs.
CommentsWordCloud = WordCloud(
    collocations=False,
    font_path="msyh.ttc",
    width=2000,
    height=1400,
    margin=2,
)
CommentsWordCloud.generate(AllCommentsText)
# Show the cloud without axes, then save it as an image file.
plt.figure(figsize=(16, 12))
plt.imshow(CommentsWordCloud, interpolation="bilinear")
plt.axis("off")
CommentsWordCloud.to_file("HLM.png")

### 中文分词

In [None]:
!pip install jieba -i https://pypi.doubanio.com/simple

In [None]:
import jieba

In [None]:
# Segment the comment text with jieba and re-join the tokens with spaces,
# so that the word cloud can split the Chinese text into words.
AllCommentsText1 = " ".join(jieba.cut(AllCommentsText))
AllCommentsText1

In [None]:
# Build the word cloud from the segmented text
# (generate() fills the object in place and returns it).
CommentsWordCloud = WordCloud(
    collocations=False,
    font_path="msyh.ttc",
    width=2000,
    height=1400,
    margin=2,
)
CommentsWordCloud.generate(AllCommentsText1)
# Show the cloud without axes, then save it as an image file.
plt.figure(figsize=(16, 12))
plt.imshow(CommentsWordCloud, interpolation="bilinear")
plt.axis("off")
CommentsWordCloud.to_file("HLM1.png")

### 处理停用词

In [None]:
# Load the stop-word list: a GBK-encoded text file split on line feeds.
filename = "中文停用词库.txt"
# Read-only mode is sufficient because the file is never written; it also
# works when the file is write-protected.
with open(filename, mode='r', encoding='gbk') as fileHandle:
    StopWordsList = fileHandle.read().split("\n")
StopWordsList

In [None]:
# Segment the comment text and keep only the tokens that, once stripped of
# surrounding whitespace, are longer than one character and are not stop words.
_StopWordSet = set(StopWordsList)    # set lookup instead of scanning the list per token
_KeptWords = []
for word in jieba.cut(AllCommentsText):
    _Stripped = word.strip()         # strip once, use for both checks
    if len(_Stripped) > 1 and _Stripped not in _StopWordSet:
        _KeptWords.append(word)
# Space-separated text used as input for the word cloud.
AllCommentsText2 = " ".join(_KeptWords)
AllCommentsText2

In [None]:
# Build the word cloud from the stop-word-filtered text
# (generate() fills the object in place and returns it).
CommentsWordCloud = WordCloud(
    collocations=False,
    font_path="msyh.ttc",
    width=2000,
    height=1400,
    margin=2,
)
CommentsWordCloud.generate(AllCommentsText2)
# Show the cloud without axes, then save it as an image file.
plt.figure(figsize=(16, 12))
plt.imshow(CommentsWordCloud, interpolation="bilinear")
plt.axis("off")
CommentsWordCloud.to_file("HLM2.png")

## 情感分析

### 安装包

In [None]:
!pip install snownlp -i https://pypi.tuna.tsinghua.edu.cn/simple

### 导入数据包

In [None]:
from snownlp import SnowNLP, sentiment

### 情感分析

In [None]:
# Two sample texts; only the negative one is analysed below.
negSentences = "疫情依然很严重，又没有抢到菜，心情糟糕透了！"
posSentences = "阿加莎之外的另一位犯罪推理小说的大师级女作家！女作家和男作家的区别，在于擅长在逻辑严密和气氛紧张的犯罪推理之外，对人性的软弱、复杂和彼此之间可堪或不堪的关系有着细腻的观察和丰富的细节表现。小说更对女性在“法医”这一“男性”职业之种种身心压力、人际压力、规则压力的包围下，如何坚定地维护正义、呵护亲情和爱情，进行了细致入微地表现"
# Analyse the negative sample and print, each exactly once, its words, its
# tags and its sentiment value.
EmotionData = SnowNLP(negSentences)
print(EmotionData.words)
print(list(EmotionData.tags))
print(EmotionData.sentiments)

In [None]:
# Demo of extracting only the Chinese text: the sample program below is kept
# as a plain string and is never executed.
item = """
        from snownlp import SnowNLP
        import pandas as pd
        import re
        df = pd.read_excel('评论数据.xlsx')
        content = df['评论内容']
        # 去除一些无用的字符,只提取出中文出来
        content = [' '.join(re.findall('[\u4e00-\u9fa5]+', item, re.S)) for item in content]
        # 对每条评论进行情感打分
        scores = [SnowNLP(i).sentiments for i in content]
        emotions = []
        # 根据分数来划定好评 中评 差评
        for i in scores:
            if i >= 0.75:
                emotions.append('好评')
            elif 0.45 <= i < 0.75:
                emotions.append('中评')
            else:
                emotions.append('差评')
        df['情感分数'] = scores
        df['情感'] = emotions
        df.to_excel('NLP测试后数据.xlsx')
        """
# List every run of consecutive characters in the range U+4E00..U+9FA5.
re.compile('[\u4e00-\u9fa5]+', re.S).findall(item)

**词性解释**  
a/形容词 c/连词 d/副词 n/名词 p/介词 r/代词 v/动词  
np/人名 ns/地名 ni/机构名 nz/其它专名  
m/数词 q/量词 mq/数量词  
t/时间词 f/方位词 s/处所词  
h/前接成分 k/后接成分  
i/习语 j/简称  
y/语气助词 u/助词 e/叹词 o/拟声词 g/语素 w/标点 x/其它 

In [None]:
# Display the comment-text column.
Commentsdata["评论内容"]

In [None]:
# Pick the comment text of the row labelled 3 and display it.
SelectedComment = Commentsdata.loc[3, "评论内容"]
SelectedComment

In [None]:
# Analyse the selected comment and print its words, its tags and its
# sentiment value, one per line.
EmotionData = SnowNLP(SelectedComment)
print(
    EmotionData.words,
    list(EmotionData.tags),
    EmotionData.sentiments,
    sep="\n",
)

In [None]:
# Display the whole comments table.
Commentsdata

In [None]:
# Add a sentiment-index column: each comment text is scored with SnowNLP,
# except that an empty text gets 0 without being passed to SnowNLP.
Commentsdata["情感指数"] = Commentsdata["评论内容"].map(
    lambda text: 0 if text == "" else SnowNLP(text).sentiments
)

# Disabled alternative: a row-wise apply.
# def ComputeSentiments(CommentsData):
#     return SnowNLP(CommentsData["评论内容"]).sentiments

# CommentsData["情感指数"] = CommentsData.apply(ComputeSentiments, axis=1)

In [None]:
# Display the rows labelled 90 through 97 (label slicing includes both ends).
Commentsdata.loc[90:97]

In [None]:
# Histogram of the sentiment index, using 10 bins.
plt.figure(dpi=80, figsize=(20, 8))
plt.hist(
    Commentsdata["情感指数"],
    bins=10,
    color="Steelblue",
    edgecolor="k",
    label="直方图",
)

# Axis captions and figure title.
plt.ylabel("人数", fontsize=20)
plt.xlabel("情感指数", fontsize=20)
plt.title("情感指数分布图", fontsize=20)

### 训练自己的模型

In [None]:
# Train snownlp's sentiment model from two text files; judging by the file
# names, the first presumably holds negative samples and the second positive
# ones - confirm against the snownlp documentation.
sentiment.train("neg.txt","pos.txt")
# Save the trained model. NOTE(review): snownlp may append a version suffix
# to this file name under Python 3 - check the file that is actually written.
sentiment.save("my_model.marshal")

### 装载模型

In [None]:
# Print the path of the installed snownlp package - presumably to find where
# the bundled sentiment model lives so the custom model can be put in place;
# verify.
import snownlp
print(snownlp.__file__)