In [1]:
import numpy as np
# Locations of the pre-saved NumPy arrays: review texts and their labels.
review_path = 'datasets/delivery_review.npy'
label_path = 'datasets/delivery_label.npy'

# X holds the raw reviews, y the matching labels (same order, one per review
# -- presumably; the arrays' contents are not visible from this notebook).
X = np.load(review_path)
y = np.load(label_path)

In [2]:
# Read the stop-word list: one entry per line, surrounding whitespace
# (including the trailing newline) stripped from each line.
with open('datasets/chinese_stop_words.txt', encoding='utf-8') as stopword_file:
    stopwords = list(map(str.strip, stopword_file))

In [3]:
import jieba
# Segment every review into words with jieba, drop the stop words, and
# re-join the surviving words with single spaces so that downstream
# whitespace-based tokenisers can consume the text.
#
# Membership is tested against a set built once up front: `stopwords` is a
# list, so testing each word against it directly is a linear scan per token.
stopword_set = set(stopwords)

X_new = np.array([
    " ".join(word for word in jieba.cut(review) if word not in stopword_set)
    for review in X
])

Building prefix dict from the default dictionary ...
Dumping model to file cache C:\Users\BEIZHO~1\AppData\Local\Temp\jieba.cache
Loading model cost 0.952 seconds.
Prefix dict has been built succesfully.


In [4]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the reviews as a test set. The shuffle seed is pinned so the
# partition is identical on every run; without it each Restart & Run All
# trains and evaluates on a different split and results cannot be compared.
X_train, X_test, y_train, y_test = train_test_split(
    X_new, y, test_size=0.2, shuffle=True, random_state=42)

In [5]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
# Fixed sequence length: the largest number of space-separated tokens in any
# review of the whole corpus, so padding never has to truncate a review that
# is counted this way.
maxlen = max(len(sentence.split(" ")) for sentence in X_new)
# Vocabulary cap handed to the tokenizer (and reused as the embedding's
# input_dim when the model is built).
words_size = 10000

tokenizer = Tokenizer(num_words=words_size)
# Fit the vocabulary on the training reviews only. Fitting on X_new would let
# word statistics from the held-out test reviews shape the word index, which
# leaks test information into preprocessing.
tokenizer.fit_on_texts(X_train)

# Convert the text to integer sequences and left-pad them to `maxlen`.
# NOTE: X_train / X_test are overwritten here, so this cell must not be
# re-run without re-running the train/test split first.
train_sequences = tokenizer.texts_to_sequences(X_train)
X_train = pad_sequences(train_sequences,
                        maxlen=maxlen,
                        padding='pre')
test_sequences = tokenizer.texts_to_sequences(X_test)
X_test = pad_sequences(test_sequences,
                       maxlen=maxlen,
                       padding='pre')

Using TensorFlow backend.


In [6]:
from keras.models import Model
from keras.layers import Input, Embedding, Bidirectional, LSTM, Dense, Flatten, Activation
from keras.layers import RepeatVector, Permute, Multiply, Lambda
from keras.optimizers import Adam
import keras.backend as K
# Length of each word-embedding vector
embedding_dim = 32
# Number of LSTM units per direction (128 forward + 128 backward)
n_units = 128

# Input: one padded sequence of `maxlen` word indices per review
inputs = Input(shape=(maxlen,))
embedding_layer = Embedding(input_dim=words_size,
                            output_dim=embedding_dim,
                            input_length=maxlen)
embeddings = embedding_layer(inputs)

# Bidirectional LSTM layer; return_sequences=True keeps one output vector per
# time step, which the attention block below needs
recurrent_layer = Bidirectional(LSTM(n_units, return_sequences=True))
lstm_outputs = recurrent_layer(embeddings)

# --- Attention ---
# Fully connected layer: one tanh score per time step
attention_scores = Dense(1, activation='tanh')(lstm_outputs)
# Flatten the trailing singleton axis so there is one score per time step
attention_scores = Flatten()(attention_scores)
# Softmax turns the scores into weights over the time steps
attention_weights = Activation('softmax')(attention_scores)
# Copy the weights across the 2 * n_units feature channels, then swap the axes
# so the weights line up element-wise with lstm_outputs
attention_weights = RepeatVector(n_units * 2)(attention_weights)
attention = Permute([2, 1])(attention_weights)

# Apply the attention to the BiLSTM outputs and sum over the time axis,
# leaving a single weighted feature vector per review
weighted_outputs = Multiply()([lstm_outputs, attention])
result = Lambda(lambda t: K.sum(t, axis=-2))(weighted_outputs)

# Model output: a single sigmoid unit
outputs = Dense(1, activation='sigmoid')(result)
model = Model(inputs, outputs)
model.summary()

Instructions for updating:
Colocations handled automatically by placer.
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 135)          0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 135, 32)      320000      input_1[0][0]                    
__________________________________________________________________________________________________
bidirectional_1 (Bidirectional) (None, 135, 256)     164864      embedding_1[0][0]                
__________________________________________________________________________________________________
dense_1 (Dense)                 (None, 135, 1)       257         bidirectional_1[0][0]            
_____________________________________

In [7]:
# Configure training: Adam with its default settings, binary cross-entropy
# loss to match the single sigmoid output, accuracy reported as the metric.
model.compile(loss='binary_crossentropy', optimizer=Adam(), metrics=['accuracy'])

# Train for 5 epochs in mini-batches of 32, with 20% of the training data
# set aside for validation; verbose=2 prints one summary line per epoch.
model.fit(x=X_train, y=y_train,
          batch_size=32, epochs=5,
          validation_split=0.2,
          verbose=2)

Instructions for updating:
Use tf.cast instead.
Train on 5120 samples, validate on 1280 samples
Epoch 1/5
 - 98s - loss: 0.6263 - acc: 0.6123 - val_loss: 0.4949 - val_acc: 0.7844
Epoch 2/5
 - 93s - loss: 0.5041 - acc: 0.7648 - val_loss: 0.5451 - val_acc: 0.7477
Epoch 3/5
 - 94s - loss: 0.4711 - acc: 0.7928 - val_loss: 0.4961 - val_acc: 0.7719
Epoch 4/5
 - 102s - loss: 0.3853 - acc: 0.8396 - val_loss: 0.4758 - val_acc: 0.7773
Epoch 5/5
 - 95s - loss: 0.3199 - acc: 0.8725 - val_loss: 0.4499 - val_acc: 0.7945


<keras.callbacks.History at 0x149010e4c50>