In [23]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
%matplotlib inline

import sys
sys.path.append("..")
from models import lstm,cnn

# 数据统计

In [24]:
import preprocess

base_path = r"F:\mystyle\git\Sentiment-Analysis\data\ChnSentiCorp_htl_ba_2000"
data, label = preprocess.read_data(base_path)

In [25]:
print("Data Count : {0} ,Label Count : {1}".format(len(data),len(label)))
emotion = dict()
for v in label:
    emotion.setdefault("neg", 0)
    emotion.setdefault("pos", 0)
    emotion[v] += 1
    
    
print("Neg Count : {}".format(emotion["neg"]))
print("Pos Count : {}".format(emotion["pos"]))

Data Count : 1953 ,Label Count : 1953
Neg Count : 964
Pos Count : 989


# 数据预处理

## 替换\n\t

In [26]:
data[0]

'标准间太差 房间还不如3星的 而且设施非常陈旧.建议酒店把老的标准间从新改善.\n\n\n\n'

In [27]:
for i,doc in enumerate(data):
    doc = doc.replace("\n"," ")
    doc = doc.replace("\t"," ")
    data[i] = doc

In [28]:
data[0]

'标准间太差 房间还不如3星的 而且设施非常陈旧.建议酒店把老的标准间从新改善.    '

## 去掉标点符号

In [29]:
import re
from zhon.hanzi import punctuation as ch_p
from string import punctuation as en_p

for i,doc in enumerate(data):
    doc = re.sub(r"[{0}{1}]+".format(ch_p, en_p)," ", doc)
    doc = doc.strip()
    data[i] = doc

In [30]:
data[0]

'标准间太差 房间还不如3星的 而且设施非常陈旧 建议酒店把老的标准间从新改善'

## 分词

In [31]:
import jieba

for i,doc in enumerate(data):
    cut_doc = jieba.cut(doc)
    cut_doc = " ".join(cut_doc)
    data[i] = cut_doc


Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\ADMINI~1\AppData\Local\Temp\jieba.cache
Loading model cost 1.034 seconds.
Prefix dict has been built succesfully.


In [32]:
data[0]

'标准间 太 差   房间 还 不如 3 星 的   而且 设施 非常 陈旧   建议 酒店 把 老 的 标准间 从 新 改善'

## 去掉停用词

In [33]:
filename = r"F:\mystyle\git\Sentiment-Analysis\data\stopWord.txt"
stop_words = list()
with open(filename,"r",encoding="utf-8") as f:
    for line in f:
        line = line.replace("\n", "")
        stop_words.append(line)

In [34]:
for i,doc in enumerate(data):
    words = doc.split(" ")
    temp = []
    for word in words:
        if word in stop_words:
            continue
        temp.append(word)
    data[i] = " ".join(temp)

In [16]:
data[0]

'标准间 太 差 房间 星 设施 陈旧 建议 酒店 标准间 新 改善'

# TF-IDF + RF

## TFIDF

In [89]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_v = TfidfVectorizer(lowercase=False, max_features = 200)
tfidf_matrix = tfidf_v.fit_transform(data)

## 分离训练集和验证集

In [101]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(tfidf_matrix,label,test_size = 0.1)

print("Train Count : {}".format(X_train.shape[0]))
print("Test Count : {}".format(X_test.shape[0]))

Train Count : 1757
Test Count : 196


## RF

In [114]:
from sklearn.ensemble import RandomForestClassifier

n_estimators = 150
max_features = "sqrt"

rfc = RandomForestClassifier(n_estimators=n_estimators,max_features=max_features)
rfc.fit(X_train,y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='sqrt', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=150, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [115]:
from sklearn.metrics import accuracy_score

accuracy_score(y_true = rfc.predict(X_test),y_pred = y_test)

0.8877551020408163

# LSTM

## 建立及转化词典

In [19]:
word2vocab = preprocess.create_vocab(data)

temp = list()
for sentence in data:
    sent_index = list()
    for word in sentence.split():
        index = word2vocab[word]
        sent_index.append(index)
    temp.append(sent_index)
        

## 获取最大长度的句子

In [26]:
# 获取最大长度的句子
max_sentence = max(temp, key = lambda x: len(x))
max_sentence_len = len(max_sentence)
max_sentence_len

394

## 统一长度

In [27]:
sentence_len =  120

for i,sentence_indexes in enumerate(temp):
    sentence_indexes.extend([0] * sentence_len)
    sentence_indexes = sentence_indexes[:sentence_len]

    temp[i] = sentence_indexes

In [29]:
data = np.asarray(temp)
data.shape

(1953, 120)

## 建立模型

In [49]:
from imp import reload
reload(lstm)

voc_size = len(word2vocab)
lstm_model = lstm.create_lstm(voc_size)
lstm_model.summary()

Instructions for updating:
keep_dims is deprecated, use keepdims instead
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, None, 200)         2381000   
_________________________________________________________________
lstm_1 (LSTM)                (None, 120)               154080    
_________________________________________________________________
dense_6 (Dense)              (None, 80)                9680      
_________________________________________________________________
dense_7 (Dense)              (None, 1)                 81        
Total params: 2,544,841
Trainable params: 2,544,841
Non-trainable params: 0
_________________________________________________________________


In [50]:
lstm_model.compile(optimizer=tf.train.AdamOptimizer(),loss="binary_crossentropy",metrics=["accuracy"])

## 分离训练集验证集

In [51]:
import sklearn
from sklearn.model_selection import train_test_split

# pos转化为1，neg为0
temp_label = list()
for sub_label in label:
    v = 1 if sub_label == "pos" else 0
    temp_label.append(v)
label = temp_label
    
X_train, X_test, y_train, y_test = train_test_split(data,label,test_size = 0.1)

## 训练

In [52]:
lstm_model.fit(X_train,y_train,epochs=40,batch_size=512,validation_data=(X_test,y_test))

Train on 1757 samples, validate on 196 samples
Epoch 1/40

Epoch 2/40

Epoch 3/40

Epoch 4/40

Epoch 5/40

Epoch 6/40

Epoch 7/40

Epoch 8/40

Epoch 9/40

Epoch 10/40

Epoch 11/40

Epoch 12/40

Epoch 13/40

Epoch 14/40

Epoch 15/40

Epoch 16/40

Epoch 17/40

Epoch 18/40

Epoch 19/40

Epoch 20/40

Epoch 21/40

Epoch 22/40

Epoch 23/40

Epoch 24/40

Epoch 25/40

Epoch 26/40

Epoch 27/40

Epoch 28/40

Epoch 29/40

Epoch 30/40

Epoch 31/40

Epoch 32/40

Epoch 33/40

Epoch 34/40

Epoch 35/40

Epoch 36/40

Epoch 37/40

Epoch 38/40

Epoch 39/40

Epoch 40/40



<tensorflow.python.keras._impl.keras.callbacks.History at 0x1a4ae320>

# CNN

## 建立词的映射

In [35]:
word2vocab = preprocess.create_vocab(data)

temp = list()
for sentence in data:
    sent_index = list()
    for word in sentence.split():
        index = word2vocab[word]
        sent_index.append(index)
    temp.append(sent_index)

## 规整长度

In [36]:
sentence_len =  120

for i,sentence_indexes in enumerate(temp):
    sentence_indexes.extend([0] * sentence_len)
    sentence_indexes = sentence_indexes[:sentence_len]

    temp[i] = sentence_indexes

In [37]:
data = np.asarray(temp)
data.shape

(1953, 120)

## 建立模型

In [45]:
from imp import reload
reload(cnn)

voca_size = len(word2vocab)
cnn_model = cnn.create_cnn(voca_size)
cnn_model.compile(optimizer=tf.train.AdamOptimizer(),loss="binary_crossentropy",metrics=["accuracy"])
cnn_model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_10 (Embedding)     (None, None, 200)         2381000   
_________________________________________________________________
conv1d_9 (Conv1D)            (None, None, 4)           4004      
_________________________________________________________________
global_max_pooling1d_5 (Glob (None, 4)                 0         
_________________________________________________________________
dense_17 (Dense)             (None, 80)                400       
_________________________________________________________________
dense_18 (Dense)             (None, 1)                 81        
Total params: 2,385,485
Trainable params: 2,385,485
Non-trainable params: 0
_________________________________________________________________


In [43]:
data.shape

(1953, 120)

In [48]:
batch = 128
epoch = 30

# pos转化为1，neg为0
temp_label = list()
for sub_label in label:
    v = 1 if sub_label == "pos" else 0
    temp_label.append(v)
label = temp_label
    
cnn_model.fit(
    x=data,
    y =label,
    batch_size=batch,
    epochs=epoch,
    shuffle=True,
    validation_split=0.1
)

Train on 1757 samples, validate on 196 samples
Epoch 1/30

Epoch 2/30

Epoch 3/30

Epoch 4/30

Epoch 5/30

Epoch 6/30

Epoch 7/30

Epoch 8/30

Epoch 9/30

Epoch 10/30

Epoch 11/30

Epoch 12/30

Epoch 13/30

Epoch 14/30

Epoch 15/30

Epoch 16/30

Epoch 17/30

Epoch 18/30

Epoch 19/30

Epoch 20/30

Epoch 21/30

Epoch 22/30

Epoch 23/30

Epoch 24/30

Epoch 25/30

Epoch 26/30

Epoch 27/30

Epoch 28/30

Epoch 29/30

Epoch 30/30



<tensorflow.python.keras._impl.keras.callbacks.History at 0x17170710>