In [None]:

################################
##              ##
## 載入與了解IMDB網路電影資料集 ##
##              ##
################################


In [None]:

##載入tensorflow做使用
##tensorflow的tf.keras.datasets.imdb已內建imdb的資料集


In [None]:
import tensorflow as tf

In [None]:

##load_data函數內的參數num_words設定為變數top_words，表示要取出資料集中前多少個最常出現的單字，以下面指令而言，就是取出前1,000個單字。


In [None]:
# Vocabulary cap passed to the loader as num_words: only the 1,000 most
# frequent words are kept.
top_words = 1000
(train_x, train_y), (test_x, test_y) = tf.keras.datasets.imdb.load_data(
    num_words=top_words
)

In [None]:

##下載後會將資料集的訓練與測試資料分別儲存在(train_x, train_y)、(test_x,test_y)中
##可透過shape指令顯示訓練和測試資料集內各維度的資料數量（也稱為形狀），顯示各訓練與測試資料集的資料數量都是25,000筆：


In [None]:
# Report the shape of every split. The stray apostrophe that used to follow
# the colon has been removed so the output matches the shape report printed
# after padding.
print("train_x's shape:{0}".format(train_x.shape))
print("train_y's shape:{0}".format(train_y.shape))
print("test_x's shape:{0}".format(test_x.shape))
print("test_y's shape:{0}".format(test_y.shape))

train_x's shape:'(25000,)
train_y's shape:'(25000,)
test_x's shape:'(25000,)
test_y's shape:'(25000,)


In [None]:

##也可檢視訓練資料集內第1筆評論資料（矩陣索引值從0起算），及其對應的標籤資料：
##下面data的輸出結果顯示為一個整數值的矩陣，這是因為該評論的單字已置換成「單字 - 索引」（Word-index）
##而該索引對應到單字 - 索引字典。標籤的部分，整數1表示正面評價，0表示負面評價。


In [None]:
# Show the first training review (a list of word indices) and its label.
# The unbalanced apostrophe after the colon was a typo in the format string.
print("data:{0}".format(train_x[0]))
print("label:{0}".format(train_y[0]))

data:'[1, 14, 22, 16, 43, 530, 973, 2, 2, 65, 458, 2, 66, 2, 4, 173, 36, 256, 5, 25, 100, 43, 838, 112, 50, 670, 2, 9, 35, 480, 284, 5, 150, 4, 172, 112, 167, 2, 336, 385, 39, 4, 172, 2, 2, 17, 546, 38, 13, 447, 4, 192, 50, 16, 6, 147, 2, 19, 14, 22, 4, 2, 2, 469, 4, 22, 71, 87, 12, 16, 43, 530, 38, 76, 15, 13, 2, 4, 22, 17, 515, 17, 12, 16, 626, 18, 2, 5, 62, 386, 12, 8, 316, 8, 106, 5, 4, 2, 2, 16, 480, 66, 2, 33, 4, 130, 12, 16, 38, 619, 5, 25, 124, 51, 36, 135, 48, 25, 2, 33, 6, 22, 12, 215, 28, 77, 52, 5, 14, 407, 16, 82, 2, 8, 4, 107, 117, 2, 15, 256, 4, 2, 7, 2, 5, 723, 36, 71, 43, 530, 476, 26, 400, 317, 46, 7, 4, 2, 2, 13, 104, 88, 4, 381, 15, 297, 98, 32, 2, 56, 26, 141, 6, 194, 2, 18, 4, 226, 22, 21, 134, 476, 26, 480, 5, 144, 30, 2, 18, 51, 36, 28, 224, 92, 25, 104, 4, 226, 65, 16, 38, 2, 88, 12, 16, 283, 5, 16, 2, 113, 103, 32, 15, 16, 2, 19, 178, 32]
label:'1


In [None]:

##若欲顯示回原始文字，可透過下面技巧進行解碼，將整數陣列顯示回單字。首先利用imdb模組中的get_word_index()函數取得單字索引字典：
##試看看he這個詞後面接的index以及word是什麼?


In [None]:
# Fetch the IMDB vocabulary as a dict that maps each word to its integer index.
words_mapping = tf.keras.datasets.imdb.get_word_index()

In [None]:
# Look up a sample word to see which index the vocabulary assigns to it.
he_index = words_mapping["he"]
print("word [he]'s index is:{0}".format(he_index))

word [he]'s index is:26


In [None]:

##接著可以用下面技巧製作「索引 - 單字」（Index-words）字典：


In [None]:
# Invert the word -> index vocabulary into an index -> word lookup table.
indice_mapping = {index: word for word, index in words_mapping.items()}

In [None]:
# Spot-check the inverted table with two arbitrary indices.
word_at_400 = indice_mapping[400]
print("index [400]'s word is:{0}".format(word_at_400))
word_at_317 = indice_mapping[317]
print("index [317]'s word is:{0}".format(word_at_317))

index [400]'s word is:name
index [317]'s word is:half


In [None]:

##最後就可以將訓練資料集第 1 筆評論資料顯示回單字
##解碼後的結果中，可看到內容中含有一些「？」，表示該位置是起始標記，或是該單字不在前1,000個最常出現的單字內，因此無法對應回原本的單字。


In [None]:
def decode_review(target):
    """Turn a sequence of encoded word ids back into a space-separated string.

    Each id is shifted down by 3 before it is looked up in ``indice_mapping``;
    ids that have no entry after the shift are rendered as "?".
    """
    words = []
    for token_id in target:
        words.append(indice_mapping.get(token_id - 3, "?"))
    return " ".join(words)
decode_review(train_x[0])

"? this film was just brilliant casting ? ? story direction ? really ? the part they played and you could just imagine being there robert ? is an amazing actor and now the same being director ? father came from the same ? ? as myself so i loved the fact there was a real ? with this film the ? ? throughout the film were great it was just brilliant so much that i ? the film as soon as it was released for ? and would recommend it to everyone to watch and the ? ? was amazing really ? at the end it was so sad and you know what they say if you ? at a film it must have been good and this definitely was also ? to the two little ? that played the ? of ? and paul they were just brilliant children are often left out of the ? ? i think because the stars that play them all ? up are such a big ? for the whole film but these children are amazing and should be ? for what they have done don't you think the whole story was so ? because it was true and was ? life after all that was ? with us all"

In [None]:

###############
##           ##
## 資料預處理 ##
##           ##
###############


In [None]:

##為了便於神經網路的訓練，接下來要先對評論的內容執行預處理。
##由於我們會將評論以批次的方式傳入輸入層進行神經網路訓練，因此必須先將各筆評論的長度（也就是單字數量）填充或剪裁成相同長度。
##首先檢視前10筆評論內容的長度，從輸出結果可以發現各則評論的字數皆不相同：


In [None]:
# Print the token count of the first ten training reviews to show that the
# reviews do not share a common length.
# NOTE(review): `i` is a module-level loop variable, so it stays bound (last
# value 9) after this cell finishes; later cells should not depend on it.
for i in range(10):
    print(len(train_x[i]))

218
189
141
550
147
43
123
562
233
130


In [None]:

##接著利用tensorflow.keras.preprocessing的sequence模組所提供的pad_sequences()函式處理長度，參數的設定如下說明：
##  1. sequences：欲處理的資料集，也就是訓練資料集或測試資料集。
##  2. maxlen：每筆資料的單字數，會將每筆資料依情況填充或剪裁以符合此設定數量，在這次的範例中，我們以變數max_len_word將評論內容長度設定為100。


In [None]:
from tensorflow.keras.preprocessing import sequence

# Every review is padded or truncated to exactly this many tokens.
max_len_word = 100
train_x, test_x = (
    sequence.pad_sequences(sequences=reviews, maxlen=max_len_word)
    for reviews in (train_x, test_x)
)

In [None]:

##執行後可以透過shape查看各維度的資料數量，確認25,000個評論的長度皆為100：


In [None]:
# After padding, both splits should be 2-D: (number of reviews, max_len_word).
train_shape = train_x.shape
test_shape = test_x.shape
print("train_x's shape:{0}".format(train_shape))
print("test_x's shape:{0}".format(test_shape))

train_x's shape:(25000, 100)
test_x's shape:(25000, 100)


In [None]:

#############
##         ##
## 建構RNN ##
##         ##
#############


In [None]:

##首先創建RNN模型，模型利用
## 1.TensorFlow.keras.models
## 2.TensorFlow.keras.layers
## 兩個模組所提供的函式來創建：


In [None]:

#下面指令首先創建一個Sequential模型，然後加入Embedding層。
#這層主要負責將評論中每個單字的整數索引轉換為「固定長度的詞向量」，以範例的設定而言，就是將每一個單字的整數索引以維度32的向量來表示。
#請注意，此層一定是Sequential模型的第一層。
#Embedding()函式第1個參數 input_dim設定為特徵的單字數量，也就是一開始設定最常用的1,000個單字
#第2個參數output_dim為輸出詞向量的維度
#第3個參數input_length 為輸入的評論內容長度，即長度100。
#最後使用print(model.summary())查詢目前定義的模型架構

#接著就加入SimpleRNN層，參數設置為Embedding層的輸出維度，最後再加入Dense層，因為結果是正面評價或負面評價，因此只需單顆神經元即可表示。


In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding,SimpleRNN,Dense

# Embedding -> SimpleRNN -> single-unit classifier.
model = Sequential()
# Map each of the top_words word ids to a 32-dimensional vector.
model.add(Embedding(input_dim=top_words,output_dim=32,input_length=max_len_word))
model.add(SimpleRNN(32))
# The model is trained with binary cross-entropy, which expects a probability
# in [0, 1]. A ReLU output is unbounded above and can be exactly 0, which makes
# that loss ill-behaved, so the output unit uses a sigmoid instead.
model.add(Dense(1, activation='sigmoid'))
print(model.summary())

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 100, 32)           32000     
                                                                 
 simple_rnn (SimpleRNN)      (None, 32)                2080      
                                                                 
 dense (Dense)               (None, 1)                 33        
                                                                 
Total params: 34,113
Trainable params: 34,113
Non-trainable params: 0
_________________________________________________________________
None


In [None]:

#模型建立完成後，接著就是編譯模型、訓練模型，並進行評估的操作：


In [None]:
# Binary classification setup: cross-entropy loss, Adam optimizer, and
# accuracy reported during training.
model.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=["accuracy"],
)

In [None]:
# Train for 20 epochs, holding out 20% of the training data for validation.
# verbose=2 prints one summary line per epoch.
history = model.fit(
    x=train_x,
    y=train_y,
    batch_size=128,
    epochs=20,
    verbose=2,
    validation_split=0.2,
)

Epoch 1/20
157/157 - 3s - loss: 0.9548 - accuracy: 0.5353 - val_loss: 0.6687 - val_accuracy: 0.5870 - 3s/epoch - 19ms/step
Epoch 2/20
157/157 - 2s - loss: 0.6344 - accuracy: 0.6332 - val_loss: 0.6241 - val_accuracy: 0.6590 - 2s/epoch - 14ms/step
Epoch 3/20
157/157 - 2s - loss: 0.5685 - accuracy: 0.7124 - val_loss: 0.6180 - val_accuracy: 0.6906 - 2s/epoch - 15ms/step
Epoch 4/20
157/157 - 2s - loss: 0.5098 - accuracy: 0.7717 - val_loss: 0.7667 - val_accuracy: 0.5894 - 2s/epoch - 14ms/step
Epoch 5/20
157/157 - 2s - loss: 0.6256 - accuracy: 0.6515 - val_loss: 0.6539 - val_accuracy: 0.6116 - 2s/epoch - 14ms/step
Epoch 6/20
157/157 - 2s - loss: 0.5827 - accuracy: 0.6957 - val_loss: 0.6588 - val_accuracy: 0.6408 - 2s/epoch - 14ms/step
Epoch 7/20
157/157 - 2s - loss: 0.5601 - accuracy: 0.7245 - val_loss: 0.6447 - val_accuracy: 0.6734 - 2s/epoch - 14ms/step
Epoch 8/20
157/157 - 2s - loss: 0.4849 - accuracy: 0.7768 - val_loss: 0.5960 - val_accuracy: 0.7484 - 2s/epoch - 15ms/step
Epoch 9/20
157/1

In [None]:

#從評估結果可以看到，驗證準確度為77%，循環神經網路表現不是很好，這是因為RNN的記憶力沒有這麼好，處理太長的序列資料會遺忘早期輸入的資料。


In [None]:
# Score the trained RNN on the held-out test split; evaluate() returns the
# loss followed by the compiled metric (accuracy).
loss, accuracy = model.evaluate(x=test_x, y=test_y)



In [None]:

#############
##         ##
## 建構LSTM ##
##         ##
#############


In [None]:

##接著將模型改為LSTM，實作上只要先載入LSTM層模組，接著仿照RNN建立模型的部分，並將RNN層替換成LSTM層即可，其他部分都以類似的邏輯撰寫：


In [None]:
# Sequential, Embedding and Dense are already imported by the RNN cell above;
# only the LSTM layer is new here.
from tensorflow.keras.layers import LSTM

# Same architecture as the RNN model, with the recurrent layer swapped for an
# LSTM and a sigmoid output unit.
lstm_model = Sequential([
    Embedding(input_dim=top_words, output_dim=32, input_length=max_len_word),
    LSTM(32),
    Dense(1, activation='sigmoid'),
])
print(lstm_model.summary())

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 100, 32)           32000     
                                                                 
 lstm (LSTM)                 (None, 32)                8320      
                                                                 
 dense_1 (Dense)             (None, 1)                 33        
                                                                 
Total params: 40,353
Trainable params: 40,353
Non-trainable params: 0
_________________________________________________________________
None


In [None]:
# Binary cross-entropy loss with the RMSprop optimizer; track accuracy.
lstm_model.compile(
    optimizer="rmsprop",
    loss="binary_crossentropy",
    metrics=["accuracy"],
)

In [None]:
# Same training schedule as the RNN: 20 epochs, batches of 128, and 20% of the
# training data held out for validation.
lstm_history = lstm_model.fit(
    x=train_x,
    y=train_y,
    batch_size=128,
    epochs=20,
    verbose=2,
    validation_split=0.2,
)

Epoch 1/20
157/157 - 7s - loss: 0.5494 - accuracy: 0.7291 - val_loss: 0.4309 - val_accuracy: 0.8046 - 7s/epoch - 45ms/step
Epoch 2/20
157/157 - 5s - loss: 0.4124 - accuracy: 0.8152 - val_loss: 0.4173 - val_accuracy: 0.8118 - 5s/epoch - 35ms/step
Epoch 3/20
157/157 - 5s - loss: 0.3832 - accuracy: 0.8296 - val_loss: 0.4082 - val_accuracy: 0.8122 - 5s/epoch - 34ms/step
Epoch 4/20
157/157 - 5s - loss: 0.3702 - accuracy: 0.8363 - val_loss: 0.4525 - val_accuracy: 0.7976 - 5s/epoch - 35ms/step
Epoch 5/20
157/157 - 5s - loss: 0.3618 - accuracy: 0.8409 - val_loss: 0.3960 - val_accuracy: 0.8194 - 5s/epoch - 34ms/step
Epoch 6/20
157/157 - 5s - loss: 0.3539 - accuracy: 0.8460 - val_loss: 0.4228 - val_accuracy: 0.8048 - 5s/epoch - 35ms/step
Epoch 7/20
157/157 - 5s - loss: 0.3476 - accuracy: 0.8484 - val_loss: 0.4798 - val_accuracy: 0.8042 - 5s/epoch - 35ms/step
Epoch 8/20
157/157 - 5s - loss: 0.3414 - accuracy: 0.8502 - val_loss: 0.3813 - val_accuracy: 0.8286 - 5s/epoch - 35ms/step
Epoch 9/20
157/1

In [None]:
# Score the trained LSTM on the held-out test split (loss, then accuracy).
lstm_loss, lstm_accuracy = lstm_model.evaluate(x=test_x, y=test_y)



In [None]:

########################
##                    ##
## 執行測試集的評論分類 ##
##                    ##
########################


In [None]:

##最後就可用此模型執行測試資料集的評論分類：
##下半部順便定義兩個方便使用的函式，一個是 display_test_sentiment() 函式，裡面首先會呼叫第二個函式get_original_text()
##因此會先顯示目前要分類的評論內容，接著才顯示此評論的分類答案，以及模型分類的結果。


In [None]:
import numpy as np

# The network ends in a single output unit, so predict() returns one score per
# review with shape (n_reviews, 1). Taking np.argmax over that length-1 axis
# always yields 0 and would label every review as negative, so the score is
# thresholded at 0.5 instead.
# NOTE(review): this scores the SimpleRNN model; use lstm_model.predict here
# if the LSTM's predictions are the ones to display.
predict = model.predict(test_x)
predict_class = (predict > 0.5).astype("int32").reshape(len(test_x))


def get_original_text(i):
    """Return the i-th (padded) test review decoded back into words.

    The vocabulary indices are shifted up by 3 so that ids 0, 1 and 2 can be
    mapped to the "<PAD>", "<START>" and "UNK" placeholders.
    """
    vocabulary = tf.keras.datasets.imdb.get_word_index()
    word_to_id = {word: rank + 3 for word, rank in vocabulary.items()}
    word_to_id.update({"<PAD>": 0, "<START>": 1, "UNK": 2})
    # Flip the table so encoded ids can be translated back into words.
    id_to_word = dict(zip(word_to_id.values(), word_to_id.keys()))
    decoded_words = [id_to_word[token_id] for token_id in test_x[i]]
    return ' '.join(decoded_words)

# Human-readable names for the two class labels.
SemtimentDcit={1:'positive',0:'negative'}
def display_test_sentiment(content_index):
    """Print one test review with its true label and the model's prediction.

    Args:
        content_index: position of the review in the test split.

    The body previously read a global ``i`` instead of this parameter, so it
    showed whichever review an earlier loop variable happened to point at
    (or raised NameError on a fresh kernel).
    """
    print(get_original_text(content_index))
    print('label:',SemtimentDcit[test_y[content_index]], ',prediction:',SemtimentDcit[predict_class[content_index]])

In [None]:
# Show the decoded text, true label and predicted label for one test review.
display_test_sentiment(0)

UNK UNK the UNK side of UNK UNK br br there are some UNK UNK in this film about the only UNK i can really point out is a certain to the script in some UNK which i think is due mostly to the way this film is a four UNK fight there simply isn't enough time to UNK UNK what's going on br br UNK this is a UNK good film i highly recommend watching this in UNK with the first and then UNK for how good the series could have been had it UNK under UNK and UNK
label: positive ,prediction: negative
