In [1]:
import pandas as pd
import numpy as np
import jieba
from gensim.models.word2vec import Word2Vec
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.metrics import classification_report,roc_auc_score,accuracy_score
from sklearn.model_selection import *
from sklearn.utils import shuffle 
# 导入数据
data = pd.read_csv('data/waimai_10k.csv')
data = shuffle(data)#将数据打乱
data.tail()

Unnamed: 0,label,review
9372,0,少送一个米饭
1909,1,还可以，包装干净卫生
8391,0,十一点40多时说15分钟后到，结构一直等到快一点。
46,1,"态度很好,地址填错了还是给我跑了一趟,没有表现出不愿意的样子,为了这个快递员,我写了评论"
6642,0,忍无可忍，在中国电子大厦送这么久


In [2]:
# !pip install gensim -i https://pypi.doubanio.com/simple/  --trusted-host pypi.doubanio.com

In [3]:
# 去除英文与数字，符号,采用replace方法
data.review = data.review.str.replace('[0-9a-zA-Z]','')
#文本分词

with open("data/stop_words",encoding='utf-8') as words:
    stop_word = [i.strip() for i in words.readlines()]

def word_cut(word_data):
    w = [i for i in jieba.cut(word_data,cut_all=False) if i not in stop_word]
    result = ' '.join(w)
    return result

data['review'] = data.review.apply(word_cut)
data.head()

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\lenovo\AppData\Local\Temp\jieba.cache
Loading model cost 0.715 seconds.
Prefix dict has been built successfully.


Unnamed: 0,label,review
3678,1,送餐 服务 热情 美味
8786,0,总体 送餐 时间 快 干炸 小黄鱼 梧桐 花 鸡蛋 炒 韭菜 味道 东边 拉皮 辣 备注 信...
10530,0,菜 淡 味 就肠 还行
1373,1,送餐 哥们 特
379,1,点 三个 菜 师傅 拿来 六 盒子 量 很大 千叶 豆腐 包菜 咸了 点 鱼香肉丝 咸淡 刚...


In [4]:
# 训练词向量
word2 = Word2Vec(data.review) 
word2.save("model/word2.model")

In [5]:
word2

<gensim.models.word2vec.Word2Vec at 0xb1e6c18>

In [6]:
def total_vector(words):
    vec = np.zeros(100).reshape((1, 100))
    for word in words:
        try:
            vec += word2.wv[word].reshape((1, 100))
        except KeyError:
            continue
    return vec

In [7]:
train_vec = np.concatenate([total_vector(words) for words in data.review])

In [8]:
train_vec

array([[ -0.24263305,   0.61289017,  -0.03686696, ...,  -3.72922665,
         -2.4413017 ,   5.87703164],
       [ 13.18821565, -17.65611641,  15.80567531, ...,  -5.41617816,
        -18.22524045,   6.47902302],
       [  1.67405354,  -0.26685781,   1.07394729, ...,   0.48155849,
         -3.5358381 ,   0.29535586],
       ...,
       [  2.40132083,  -3.79045035,   7.22312622, ...,  -3.42341136,
          4.47148358,   3.39113457],
       [  3.75973833,  -0.94046687,   0.75299377, ...,  -9.34318598,
         -2.8327899 ,  12.05879644],
       [  1.9124626 ,  -0.89869567,   1.82489125, ...,  -4.41543982,
          0.35172408,   4.89323728]])

In [9]:
x_train,x_test,y_train,y_test = train_test_split(train_vec,data.label,test_size=0.3,random_state=0)


In [10]:
svm_model = svm.SVC()
svm_model.fit(x_train,y_train)
y_predict = svm_model.predict(x_test)
y_predict

array([1, 0, 0, ..., 0, 0, 0], dtype=int64)

In [11]:
print('SVM模型的准确率为：\n',accuracy_score(y_test, y_predict))
print('SVM模型的评估报告：\n',classification_report(y_test, y_predict))

SVM模型的准确率为：
 0.7984431470670003
SVM模型的评估报告：
              precision    recall  f1-score   support

          0       0.79      0.94      0.86      2377
          1       0.82      0.52      0.64      1220

avg / total       0.80      0.80      0.78      3597



In [12]:
def svm_predict(query):
    words = jieba.lcut(str(query))
    words_vec = total_vector(words)
    result = svm_model.predict(words_vec)
    if int(result) == 1:
        print('类别：好评')
    elif int(result) == 0:
        print('类别：差评')

In [13]:
svm_predict("外卖，师傅都一级棒。师傅还主动帮忙帮我把垃圾带下去，太贴心了")

类别：差评


In [14]:
svm_predict("好吃，味道很好")

类别：好评
