In [1]:
import numpy as np
# Load the two dataset arrays: X holds the reviews that are segmented further
# down, y the matching targets (presumably one label per review - confirm).
X, y = (
    np.load('datasets/delivery_review.npy'),
    np.load('datasets/delivery_label.npy'),
)

In [2]:
# Read the stop-word file as UTF-8: one entry per line, with surrounding
# whitespace (including the trailing newline) stripped from each entry.
with open('datasets/chinese_stop_words.txt', encoding='utf-8') as text:
    stopwords = list(map(str.strip, text))

In [3]:
import jieba
# Segment every review with jieba, drop the stop words and re-join the
# remaining tokens with single spaces, giving one space-separated string per
# review.
#
# Membership is tested against a set built once up front: checking each token
# against the original list rescans the whole stop-word list for every token.
stopword_set = set(stopwords)
X_new = np.array([
    " ".join(word for word in jieba.cut(review) if word not in stopword_set)
    for review in X
])

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\BEIZHO~1\AppData\Local\Temp\jieba.cache
Loading model cost 0.843 seconds.
Prefix dict has been built succesfully.


In [4]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the reviews for testing. The shuffle is seeded through
# random_state so that a fresh "Restart & Run All" yields the same split and
# results stay comparable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X_new, y, test_size=0.2, shuffle=True, random_state=42)

In [5]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
# NOTE: this cell replaces the text arrays X_train / X_test with padded integer
# sequences, so re-run the train/test split cell before re-running this one.

# Upper bound on the vocabulary handed to Tokenizer(num_words=...).
words_size = 10000

# Fit the vocabulary on the training reviews only, so the held-out reviews do
# not leak into preprocessing.
tokenizer = Tokenizer(num_words=words_size)
tokenizer.fit_on_texts(X_train)

# Sequence length = longest training review, counted in space-separated tokens.
# It is derived from the training split for the same reason as the vocabulary;
# any longer held-out review is truncated to this length by pad_sequences.
maxlen = max(len(sentence.split(" ")) for sentence in X_train)


def encode_reviews(texts):
    """Convert space-separated review strings into padded index sequences.

    Each review is mapped to word indices with the fitted ``tokenizer`` and
    then padded at the front (``padding='pre'``) to ``maxlen`` entries.
    """
    return pad_sequences(tokenizer.texts_to_sequences(texts),
                         maxlen=maxlen,
                         padding='pre')


X_train = encode_reviews(X_train)
X_test = encode_reviews(X_test)

Using TensorFlow backend.


In [6]:
from keras.models import Sequential
from keras.layers import Embedding, Bidirectional, LSTM, Dropout, Dense 
from keras.optimizers import Adam
# Size of each learned word vector.
embedding_dim = 32

# Network: embedding -> two stacked bidirectional LSTMs -> dropout -> a single
# sigmoid unit. The first LSTM returns its full output sequence so the second
# one can consume it; the second returns only its final output.
model = Sequential([
    Embedding(input_dim=words_size,
              input_length=maxlen,
              output_dim=embedding_dim),
    Bidirectional(LSTM(units=64,
                       return_sequences=True,
                       recurrent_dropout=0.2)),
    Bidirectional(LSTM(units=128,
                       return_sequences=False,
                       recurrent_dropout=0.2)),
    Dropout(0.2),
    Dense(units=1,
          activation='sigmoid'),
])
model.summary()

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 135, 32)           320000    
_________________________________________________________________
bidirectional_1 (Bidirection (None, 135, 128)          49664     
_________________________________________________________________
bidirectional_2 (Bidirection (None, 256)               263168    
_________________________________________________________________
dropout_1 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 257       
Total params: 633,089
Trainable params: 633,089
Non-trainable params: 0
_______________

In [7]:
from keras.callbacks import EarlyStopping

# Binary cross-entropy matches the single sigmoid output unit of the model.
model.compile(optimizer=Adam(),
              loss='binary_crossentropy',
              metrics=['accuracy'])

# In the recorded run the validation loss was lowest at epoch 3 and rose
# afterwards while the training loss kept falling. Stop once val_loss has not
# improved for two consecutive epochs and roll back to the best weights seen,
# instead of keeping the last (overfitted) ones.
early_stopping = EarlyStopping(monitor='val_loss',
                               patience=2,
                               restore_best_weights=True)

# Keep the returned History object so the per-epoch metrics can be inspected
# or plotted later; assigning it also avoids echoing its repr as cell output.
history = model.fit(X_train,
                    y_train,
                    epochs=5,
                    batch_size=32,
                    verbose=2,
                    validation_split=0.2,
                    callbacks=[early_stopping])

Instructions for updating:
Use tf.cast instead.
Train on 5120 samples, validate on 1280 samples
Epoch 1/5
 - 197s - loss: 0.5023 - acc: 0.7469 - val_loss: 0.3823 - val_acc: 0.8492
Epoch 2/5
 - 187s - loss: 0.3043 - acc: 0.8762 - val_loss: 0.3834 - val_acc: 0.8289
Epoch 3/5
 - 195s - loss: 0.2266 - acc: 0.9139 - val_loss: 0.3796 - val_acc: 0.8422
Epoch 4/5
 - 178s - loss: 0.1762 - acc: 0.9367 - val_loss: 0.4203 - val_acc: 0.8313
Epoch 5/5
 - 193s - loss: 0.1453 - acc: 0.9502 - val_loss: 0.5071 - val_acc: 0.8227


<keras.callbacks.History at 0x1d8ae743a90>