In [1]:

####################################
##                ##
## 載入與了解IMDB網路電影資料集 ##
##                ##
####################################

In [2]:

##載入tensorflow做使用
##tensorflow的tf.keras.datasets.imdb已內建imdb的資料集

In [4]:
import tensorflow as tf
# Vocabulary cap handed to the loader through `num_words`.
top_words = 1000
# load_data yields a (train, test) pair; each half is an (inputs, labels) tuple.
imdb_splits = tf.keras.datasets.imdb.load_data(num_words=top_words)
(train_x, train_y), (test_x, test_y) = imdb_splits

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb.npz


In [5]:
# After the download, the training and test data are stored in
# (train_x, train_y) and (test_x, test_y) respectively.
# `.shape` reports how many items each dimension holds; the output shows that
# every training and test array contains 25,000 entries.
raw_shape_reports = (
    ("train_x's shape:'{0}", train_x),
    ("train_y's shape:'{0}", train_y),
    ("test_x's shape:'{0}", test_x),
    ("test_y's shape:'{0}", test_y),
)
for template, split in raw_shape_reports:
    print(template.format(split.shape))

train_x's shape:'(25000,)
train_y's shape:'(25000,)
test_x's shape:'(25000,)
test_y's shape:'(25000,)


In [7]:
# Look at the first review of the training set (indices start at 0) together
# with its label.
# The review prints as a list of integers because every word has already been
# replaced by its index in the word-index dictionary.
# For the label, 1 means a positive review and 0 a negative one.
first_review, first_label = train_x[0], train_y[0]
print("data:'{0}".format(first_review))
print("label:'{0}".format(first_label))

data:'[1, 14, 22, 16, 43, 530, 973, 2, 2, 65, 458, 2, 66, 2, 4, 173, 36, 256, 5, 25, 100, 43, 838, 112, 50, 670, 2, 9, 35, 480, 284, 5, 150, 4, 172, 112, 167, 2, 336, 385, 39, 4, 172, 2, 2, 17, 546, 38, 13, 447, 4, 192, 50, 16, 6, 147, 2, 19, 14, 22, 4, 2, 2, 469, 4, 22, 71, 87, 12, 16, 43, 530, 38, 76, 15, 13, 2, 4, 22, 17, 515, 17, 12, 16, 626, 18, 2, 5, 62, 386, 12, 8, 316, 8, 106, 5, 4, 2, 2, 16, 480, 66, 2, 33, 4, 130, 12, 16, 38, 619, 5, 25, 124, 51, 36, 135, 48, 25, 2, 33, 6, 22, 12, 215, 28, 77, 52, 5, 14, 407, 16, 82, 2, 8, 4, 107, 117, 2, 15, 256, 4, 2, 7, 2, 5, 723, 36, 71, 43, 530, 476, 26, 400, 317, 46, 7, 4, 2, 2, 13, 104, 88, 4, 381, 15, 297, 98, 32, 2, 56, 26, 141, 6, 194, 2, 18, 4, 226, 22, 21, 134, 476, 26, 480, 5, 144, 30, 2, 18, 51, 36, 28, 224, 92, 25, 104, 4, 226, 65, 16, 38, 2, 88, 12, 16, 283, 5, 16, 2, 113, 103, 32, 15, 16, 2, 19, 178, 32]
label:'1


In [8]:

##################
##       ##
## 資料預處理 ##
##       ##
##################

In [9]:
# pad_sequences pads or truncates every review as needed so that each one ends
# up with exactly `max_len_word` entries; this example uses a length of 100.
from tensorflow.keras.preprocessing import sequence
max_len_word = 100
# Both splits get the identical treatment, so build them in one expression.
train_x, test_x = (
    sequence.pad_sequences(sequences=split, maxlen=max_len_word)
    for split in (train_x, test_x)
)

In [10]:

# Check the shapes again to confirm that all 25,000 reviews now have length 100.
padded_shape_reports = (
    ("train_x's shape:{0}", train_x),
    ("test_x's shape:{0}", test_x),
)
for template, split in padded_shape_reports:
    print(template.format(split.shape))

train_x's shape:(25000, 100)
test_x's shape:(25000, 100)


In [11]:

################
##      ##
## 建構Model ##
##      ##
################

In [12]:
from keras.models import Sequential
from keras.layers import Dense

In [13]:
# Fully connected binary classifier that consumes the padded reviews directly.
model = Sequential()
# The input width must equal the padded review length, so reuse `max_len_word`
# (100) instead of repeating the literal.
model.add(Dense(units=128, input_dim=max_len_word, activation='relu'))
# NOTE(review): the next two layers are given no activation, so they stay
# linear by Keras default -- confirm this is intended.
model.add(Dense(units=32))
model.add(Dense(units=8))
# Single sigmoid unit: output is the predicted probability of a positive review.
model.add(Dense(units=1, activation='sigmoid'))
# summary() prints the table itself and returns None; wrapping it in print()
# appended a stray "None" line to the cell output.
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 128)               12928     
                                                                 
 dense_1 (Dense)             (None, 32)                4128      
                                                                 
 dense_2 (Dense)             (None, 8)                 264       
                                                                 
 dense_3 (Dense)             (None, 1)                 9         
                                                                 
Total params: 17329 (67.69 KB)
Trainable params: 17329 (67.69 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
None


In [14]:
# Configure training: Adam optimizer, binary cross-entropy loss, and accuracy
# reported as the tracked metric.
model.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=["accuracy"],
)

In [15]:
# Train on the whole training split; `history` keeps the returned training log.
history = model.fit(
    x=train_x,
    y=train_y,
    batch_size=20,
    epochs=30,
    verbose=2,
)

Epoch 1/30
1250/1250 - 6s - loss: 5.0958 - accuracy: 0.4976 - 6s/epoch - 5ms/step
Epoch 2/30
1250/1250 - 2s - loss: 0.9494 - accuracy: 0.5059 - 2s/epoch - 2ms/step
Epoch 3/30
1250/1250 - 2s - loss: 0.7631 - accuracy: 0.5141 - 2s/epoch - 2ms/step
Epoch 4/30
1250/1250 - 2s - loss: 0.7173 - accuracy: 0.5252 - 2s/epoch - 2ms/step
Epoch 5/30
1250/1250 - 2s - loss: 0.6991 - accuracy: 0.5419 - 2s/epoch - 2ms/step
Epoch 6/30
1250/1250 - 4s - loss: 0.6870 - accuracy: 0.5518 - 4s/epoch - 3ms/step
Epoch 7/30
1250/1250 - 3s - loss: 0.6766 - accuracy: 0.5671 - 3s/epoch - 2ms/step
Epoch 8/30
1250/1250 - 4s - loss: 0.6661 - accuracy: 0.5803 - 4s/epoch - 3ms/step
Epoch 9/30
1250/1250 - 2s - loss: 0.6532 - accuracy: 0.5935 - 2s/epoch - 2ms/step
Epoch 10/30
1250/1250 - 4s - loss: 0.6401 - accuracy: 0.6074 - 4s/epoch - 3ms/step
Epoch 11/30
1250/1250 - 2s - loss: 0.6240 - accuracy: 0.6247 - 2s/epoch - 2ms/step
Epoch 12/30
1250/1250 - 2s - loss: 0.6040 - accuracy: 0.6405 - 2s/epoch - 2ms/step
Epoch 13/30
1

In [16]:

#######################

# 建構Autoencoder_dnn

#######################

In [None]:
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.models import Model

# Hyperparameters
max_length = 100      # maximum sentence length (matches the padded width of train_x)
encoding_dim = 16     # target dimensionality of the encoded representation

# Define the autoencoder model
input_layer = Input(shape=(max_length,))  # input width is the sentence length

# Encoder: compress the sentence down to `encoding_dim` values
encoder_hidden = Dense(64, activation='relu')(input_layer)
encoded = Dense(encoding_dim, activation='relu')(encoder_hidden)

# Decoder: expand the code back to the original sentence length
decoder_hidden = Dense(64, activation='relu')(encoded)
# Linear output because the reconstruction targets are raw word indices,
# not values squashed into [0, 1].
decoded = Dense(max_length, activation='linear')(decoder_hidden)

autoencoder_dnn = Model(input_layer, decoded)
# Encoder-only model sharing the same layers, so the 16-dimensional codes can
# be extracted once the autoencoder has been trained.
encoder_dnn = Model(input_layer, encoded)
autoencoder_dnn.compile(optimizer='adam', loss='mse')  # mean squared error as the loss
autoencoder_dnn.summary()
# Input and target are both train_x: the network learns to reproduce its input.
autoencoder_dnn.fit(train_x, train_x, epochs=10, batch_size=32, validation_split=0.2)

In [17]:
#有什麼問題呢? 該怎麼修改?

In [18]:
#利用encoder的句子進行類似word2vec的轉換，最終將結果丟給簡易的深度學習模型進行預測

In [19]:

#######################

# 建構Autoencoder_lstm

#######################

In [20]:
#利用lstm進行autoencoder的訓練，效果會不會比dnn來的好?
#模型設計在encoder與decoder裡面各有一層lstm，其中在什麼時候我需要使用many to many的傳遞參數?
#sparse_categorical_crossentropy與categorical_crossentropy的差別是什麼?