In [None]:

####################################
##                ##
## 載入與了解IMDB網路電影資料集 ##
##                ##
####################################

In [None]:

##載入tensorflow做使用
##tensorflow的tf.keras.dataset.imdb已內建imdb的資料集

In [1]:
import tensorflow as tf


In [None]:
##load_data函數內的參數num_words設定變數top_words，表示要取出資料集中前多少個最常出現的單字，以上面指令而言，就是取出前1,000個單字。

In [3]:
top_words = 1000
(train_x, train_y), (test_x, test_y) = tf.keras.datasets.imdb.load_data(num_words=top_words)

In [4]:
train_x = tf.keras.preprocessing.sequence.pad_sequences(train_x, maxlen=100)
train_y = tf.keras.utils.to_categorical(train_y)
test_x = tf.keras.preprocessing.sequence.pad_sequences(test_x, maxlen=100)
test_y = tf.keras.utils.to_categorical(test_y)

In [None]:
##下載後會將資料集的訓練與測試資料分別儲存在(train_x, train_y)、(test_x,test_y)中
##可透過shape指令顯示訓練和測試資料集內各維度的資料數量（也稱為形狀），顯示各訓練與測試資料集的資料數量都是25,000筆：

In [5]:
print("train_x's shape:'{0}".format(train_x.shape))
print("train_y's shape:'{0}".format(train_y.shape))
print("test_x's shape:'{0}".format(test_x.shape))
print("test_y's shape:'{0}".format(test_y.shape))

train_x's shape:'(25000, 100)
train_y's shape:'(25000, 2)
test_x's shape:'(25000, 100)
test_y's shape:'(25000, 2)


In [None]:
##也可檢視訓練資料集內第1筆評論資料（矩陣索引值從0起算），及其對應的標籤資料：
##下面data的輸出結果顯示為一個整數值的矩陣，這是因為該評論的單字已置換成「單字 - 索引」（Word-index）
##而該索引對應到單字 - 索引字典。標籤的部分，整數1表示正面評價，0表示負面評價。

In [6]:
print("data:'{0}".format(train_x[0]))
print("label:'{0}".format(train_y[0]))

data:'[  2  33   6  22  12 215  28  77  52   5  14 407  16  82   2   8   4 107
 117   2  15 256   4   2   7   2   5 723  36  71  43 530 476  26 400 317
  46   7   4   2   2  13 104  88   4 381  15 297  98  32   2  56  26 141
   6 194   2  18   4 226  22  21 134 476  26 480   5 144  30   2  18  51
  36  28 224  92  25 104   4 226  65  16  38   2  88  12  16 283   5  16
   2 113 103  32  15  16   2  19 178  32]
label:'[0. 1.]


In [None]:

##若欲顯示回原始文字，可透過下面技巧進行解碼，將整數陣列顯示回單字。首先利用imdb模組中的get_word_index()函數取得單字索引字典：
##試看看he這個詞後面接的index以及word是什麼?


In [7]:
words_mapping = tf.keras.datasets.imdb.get_word_index()

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb_word_index.json
[1m1641221/1641221[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step


In [8]:
print("word [he]'s index is:{0}".format(words_mapping["he"]))

word [he]'s index is:26


In [None]:

##接著可以用下面技巧製作「索引 - 單字」（Index-words）字典：

In [9]:
indice_mapping = dict([(value,key) for (key,value) in words_mapping.items()])

In [10]:
print("index [400]'s word is:{0}".format(indice_mapping[400]))
print("index [317]'s word is:{0}".format(indice_mapping[317]))

index [400]'s word is:name
index [317]'s word is:half


In [None]:

##最後就可以將訓練資料集第 1 筆評論資料顯示回單字
##解碼後的結果中，可看到內容中含有一些「？」，表示該單字沒對應的索引。

In [11]:
def decode_review(target):
    return (" ".join([indice_mapping.get(i-3,"?") for i in target]))
decode_review(train_x[0])


"? at a film it must have been good and this definitely was also ? to the two little ? that played the ? of ? and paul they were just brilliant children are often left out of the ? ? i think because the stars that play them all ? up are such a big ? for the whole film but these children are amazing and should be ? for what they have done don't you think the whole story was so ? because it was true and was ? life after all that was ? with us all"

In [None]:

##################
##       ##
## 資料預處理 ##
##       ##
##################

In [None]:

##為了便於神經網路的訓練，接下來要先對評論的內容執行預處理。
##由於我們會將評論以批次的方式傳入輸入層進行神經網路訓練，因此必須先將各筆評論的長度（也就是單字數量）填充或剪裁成相同長度。
##首先檢視前10筆評論內容的長度，從輸出結果可以發現各則評論的字數皆不相同：


In [12]:
for i in range(10):
    print(len(train_x[i]))

100
100
100
100
100
100
100
100
100
100


In [None]:
##  會將每筆資料依情況填充或剪裁以符合此設定數量，在這次的範例中，我們將評論內容長度設定為150。


In [None]:

##執行後可以透過shape查看各維度的資料數量，確認25,000個評論的長度皆為100：

In [13]:
print("train_x's shape:{0}".format(train_x.shape))
print("test_x's shape:{0}".format(test_x.shape))

train_x's shape:(25000, 100)
test_x's shape:(25000, 100)


In [None]:

################
##      ##
## 建構Model ##
##      ##
################

In [None]:
from keras.models import Sequential
from keras.layers import Dense

In [None]:
##Coding here


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 10)                1010      
                                                                 
 dense_1 (Dense)             (None, 32)                352       
                                                                 
 dense_2 (Dense)             (None, 16)                528       
                                                                 
 dense_3 (Dense)             (None, 1)                 17        
                                                                 
Total params: 1907 (7.45 KB)
Trainable params: 1907 (7.45 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
None


In [None]:
model.compile(loss="binary_crossentropy",optimizer="adam",metrics=["accuracy"])

In [None]:
history = model.fit(train_x,train_y,epochs=100, batch_size=20,verbose=2)

Epoch 1/100
1250/1250 - 3s - loss: 6.6206 - accuracy: 0.5061 - 3s/epoch - 2ms/step
Epoch 2/100
1250/1250 - 2s - loss: 2.1462 - accuracy: 0.5021 - 2s/epoch - 1ms/step
Epoch 3/100
1250/1250 - 2s - loss: 1.3579 - accuracy: 0.5007 - 2s/epoch - 1ms/step
Epoch 4/100
1250/1250 - 2s - loss: 1.0233 - accuracy: 0.4947 - 2s/epoch - 1ms/step
Epoch 5/100
1250/1250 - 2s - loss: 0.8146 - accuracy: 0.5026 - 2s/epoch - 2ms/step
Epoch 6/100
1250/1250 - 2s - loss: 0.7562 - accuracy: 0.5053 - 2s/epoch - 2ms/step
Epoch 7/100
1250/1250 - 2s - loss: 0.7633 - accuracy: 0.5003 - 2s/epoch - 1ms/step
Epoch 8/100
1250/1250 - 2s - loss: 0.7600 - accuracy: 0.5048 - 2s/epoch - 1ms/step
Epoch 9/100
1250/1250 - 2s - loss: 0.7630 - accuracy: 0.4996 - 2s/epoch - 1ms/step
Epoch 10/100
1250/1250 - 2s - loss: 0.7546 - accuracy: 0.4986 - 2s/epoch - 1ms/step
Epoch 11/100
1250/1250 - 2s - loss: 0.7519 - accuracy: 0.5082 - 2s/epoch - 1ms/step
Epoch 12/100
1250/1250 - 3s - loss: 0.7380 - accuracy: 0.5089 - 3s/epoch - 2ms/step
E