In [2]:
import pandas as pd
import numpy as np
from sklearn.naive_bayes import GaussianNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.utils import shuffle

In [3]:
# Load the review data set and shuffle the rows.
# A fixed random_state makes the row order (and therefore every downstream
# train/test split and metric) reproducible across kernel restarts.
data = pd.read_csv("data/waimai_10k.csv")
data = shuffle(data, random_state=1)
data.head()

Unnamed: 0,label,review
9975,0,定的辣牛肉汤套餐，那么冷的天·两个小时才到，汤只剩下余温了·我也饿的不行了·太慢太慢太慢了！...
8419,0,今天的饭夹生的…给送餐员加工资！
11867,0,木耳太不新鲜，吃完拉肚子了
1261,1,送餐送的很快，比预计的还早，不错不错
9977,0,"不好吃,都凉了,汤顶上一层大油,送餐时间1个半小时,米饭是夹生的,最难吃的料理,以后再也不定..."


In [4]:
# Class balance: number of reviews per label value.
data["label"].value_counts()

0    7987
1    4000
Name: label, dtype: int64

In [4]:
# Strip ASCII digits and Latin letters from every review.
# regex=True is required: without it, pandas >= 2.0 treats the pattern as a
# literal string and nothing would be removed.
# Note: punctuation/symbols are NOT matched by this character class.
data["review"] = data["review"].str.replace(r'[0-9a-zA-Z]', '', regex=True)
data.head()

Unnamed: 0,label,review
6318,0,说好的石锅拌饭呢，差评
8619,0,现在每次都不是准时送达
2020,1,"好实在的卷饼,够值"
9420,0,每次肯定晚至少一个小时，下单了最后没送到，真是醉了，饭也给的死少。不建议大家订餐，如果非要吃...
7286,0,咖啡到了几乎凉了，还撒了好多，以后不会用百度外卖代购咖啡了


In [5]:
#文本分词
import jieba

# Load the stop words (one per line, surrounding whitespace stripped).
# They are stored in a set so the per-token membership test in word_cut is a
# constant-time lookup instead of a linear scan over a list.
with open("data/stop_words",encoding='utf-8') as words:
    stop_word = {line.strip() for line in words}

    
def word_cut(word_data):
    """Tokenize a piece of text with jieba and drop stop words.

    Parameters
    ----------
    word_data
        The text handed to ``jieba.cut``.

    Returns
    -------
    str
        The tokens that are not in ``stop_word``, in their original order,
        joined by single spaces.
    """
    kept_tokens = []
    for token in jieba.cut(word_data):
        if token in stop_word:
            continue
        kept_tokens.append(token)
    return ' '.join(kept_tokens)

# Tokenize every review; each result is a space-separated token string.
ww = data["review"].apply(word_cut)
# Preview the first five tokenized reviews.
ww.iloc[:5]

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\lenovo\AppData\Local\Temp\jieba.cache
Loading model cost 0.702 seconds.
Prefix dict has been built successfully.


6318                                         说好 石锅 拌 饭 差评
8619                                             每次 准时 送达
2020                                            实在 卷 饼 够值
9420    每次 肯定 晚 至少 小时 下单 没 送到 醉 饭 死 少 建议 订餐 非要 吃 提前 订 ...
7286                                咖啡 凉 撒 好多 百度 外卖 代购 咖啡
Name: review, dtype: object

In [6]:

# Bag-of-words vectorizer; min_df=0.01 is passed as a float document-frequency
# cutoff (see the CountVectorizer documentation for its exact semantics).
counts = CountVectorizer(min_df=0.01)
# Build the dense document-term matrix.
dtm_counts = counts.fit_transform(ww).toarray()
# get_feature_names() was removed in scikit-learn 1.2; prefer the replacement
# and fall back to the old accessor only on releases that lack it.
if hasattr(counts, "get_feature_names_out"):
    columns = counts.get_feature_names_out()
else:
    columns = counts.get_feature_names()
# X gets a fresh default index while Y keeps the shuffled index of `data`;
# rows of X and Y correspond by position, not by index label.
X = pd.DataFrame(dtm_counts,columns=columns)
Y = data.label
X.head()

Unnamed: 0,一个半,一个多,一份,一点,下单,下次,不到,不好,不错,东西,...,送达,送过来,送错,送餐,速度,配送,难吃,餐员,骑士,鸡肉
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [7]:
from sklearn import model_selection
from sklearn import naive_bayes
from sklearn import metrics
# Split the data into training and test sets (25% held out, fixed seed).
X_train,X_test,y_train,y_test = model_selection.train_test_split(X,Y,test_size = 0.25, random_state=1)
# Build the Bernoulli naive Bayes classifier.
# (The unused GaussianNB instance that used to be created here was dropped;
# this cell only trains and evaluates the Bernoulli model.)
bnb = naive_bayes.BernoulliNB()
# Fit the model on the training data.
bnb.fit(X_train,y_train)
# Predict labels for the test data.
bnb_pred = bnb.predict(X_test)
# Report accuracy and the per-class precision/recall/F1 on the test set.
print('伯努利模型的准确率为：\n',metrics.accuracy_score(y_test, bnb_pred))
print('伯努利模型的评估报告：\n',metrics.classification_report(y_test, bnb_pred))

伯努利模型的准确率为：
 0.7997997997997998
伯努利模型的评估报告：
              precision    recall  f1-score   support

          0       0.83      0.88      0.86      2008
          1       0.73      0.63      0.67       989

avg / total       0.80      0.80      0.80      2997



In [8]:
from sklearn import model_selection
from sklearn import naive_bayes
from sklearn import metrics
# Hold out 25% of the rows for testing; the fixed seed keeps the split stable.
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X,
    Y,
    test_size=0.25,
    random_state=1,
)
# Create the Gaussian naive Bayes model and fit it on the training rows
# (fit returns the estimator itself, so the two steps can be chained).
gnb = naive_bayes.GaussianNB().fit(X_train, y_train)
# Predicted labels for the held-out rows.
gnb_pred = gnb.predict(X_test)
# Accuracy first, then the per-class report.
print(
    '高斯模型的准确率为：\n',
    metrics.accuracy_score(y_test, gnb_pred),
)
print(
    '高斯模型的评估报告：\n',
    metrics.classification_report(y_test, gnb_pred),
)

高斯模型的准确率为：
 0.6376376376376376
高斯模型的评估报告：
              precision    recall  f1-score   support

          0       0.92      0.50      0.65      2008
          1       0.47      0.91      0.62       989

avg / total       0.77      0.64      0.64      2997



In [9]:
from sklearn import model_selection
from sklearn import naive_bayes
from sklearn import metrics
# Hold out 25% of the rows for testing; the fixed seed keeps the split stable.
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X,
    Y,
    test_size=0.25,
    random_state=1,
)
# Create the multinomial naive Bayes model and fit it on the training rows
# (fit returns the estimator itself, so the two steps can be chained).
mnb = naive_bayes.MultinomialNB().fit(X_train, y_train)
# Predicted labels for the held-out rows.
mnb_pred = mnb.predict(X_test)
# Accuracy first, then the per-class report.
print(
    '多项式模型的准确率为：\n',
    metrics.accuracy_score(y_test, mnb_pred),
)
print(
    '多项式模型的评估报告：\n',
    metrics.classification_report(y_test, mnb_pred),
)

多项式模型的准确率为：
 0.8014681348014682
多项式模型的评估报告：
              precision    recall  f1-score   support

          0       0.83      0.88      0.86      2008
          1       0.73      0.63      0.68       989

avg / total       0.80      0.80      0.80      2997

