In [1]:
#匯入所需模組
from keras.datasets import imdb
from keras.preprocessing import sequence
from keras.utils import pad_sequences
from keras.preprocessing.text import Tokenizer
import numpy as np
np.random.seed(10)

In [2]:
#使用正規表示式Regular Expression
#建立rm_tag函數移除文字中html tag
import re                          #匯入Regular Expression模組
def rm_tags(text):                 #建立rm_tags函數，輸入參數text文字
    re_tag = re.compile(r'<[^>]+>')#建立rm_tag正規表示式變數為'<[^>]+>'
    return re_tag.sub('',text)     #使用re_tag將text文字中，符合正規表示式條件的字替換成空字串

In [3]:
#建立read_files函數讀取IMDb檔案目錄
import os
def read_files(filetype):             #建立read_files函數，輸入參數filetype 讀取[訓練/測試]資料會傳入"[train/test]" 
    path = "C:/Users/USER/aclImdb/"   #設定存取路徑
    file_list=[]                      #建立檔案list
    
    positive_path = path + filetype + "/pos/"  #設定正面評價的檔案目錄為positive_path
    for f in os.listdir(positive_path):         #使用for將positive_path目錄下所有的檔案加入file_list
        file_list +=[positive_path + f]
    
    negative_path = path + filetype + "/neg/"  #設定正面評價的檔案目錄為negative_path
    for f in os.listdir(negative_path):         #使用for將negative_path目錄下所有的檔案加入file_list
        file_list +=[negative_path + f]
    
    print('read',filetype,'files:',len(file_list)) #顯示目前讀取的filetype("train或test")，目錄下的檔案個數
    
    all_labels = ([1]*12500+[0]*12500)             #產生all_label因為前12500筆是正面，所以產生12500筆1的list，0則為負面
    
    all_texts = []            #設定all_text為空list
    for fi in file_list:   #fi讀取所有file_list檔案
        with open(fi,encoding='utf-8') as file_input: #使用open(fi,encoding='utf-8')開啟檔案為file_input
#使用使用 file_input.readlines()讀取檔案，並使用join連接所有檔案內容，然後使用rm_tags移除tag，最後加入 all_texts list
            all_texts += [rm_tags(" ".join(file_input.readlines()))]
  
    return all_labels,all_texts

In [4]:
#讀取IMDb資料及目錄
y_train,train_text = read_files("train")

read train files: 25000


In [5]:
#讀取測試資料
y_test,test_text = read_files("test")

read test files: 25000


In [6]:
#建立Token
#建立Token
token = Tokenizer(num_words=2000) #使用Tokenizer建立token，輸入參數num_words=2000，也就是建立2000個字的字典
token.fit_on_texts(train_text)    #讀取所有的訓練資料影評，依照每個英文字，在影評中出現的次數進行排序，排序的前2000個會列入字典中

In [7]:
#使用token.texts_to_sequences將訓練資料與測試資料的影評文字轉換籌數字list。
x_train_seq = token.texts_to_sequences(train_text)
x_test_seq = token.texts_to_sequences(test_text)

In [8]:
#使用sequence.pad_sequences()方法截長補短
x_train = pad_sequences(x_train_seq,maxlen=100)
x_test = pad_sequences(x_test_seq,maxlen=100)

In [9]:
from keras.models import Sequential
from keras.layers.core import Dense,Dropout,Activation,Flatten
from keras.layers import Embedding 

In [10]:
#建立模型
model  = Sequential()

In [11]:
#將Embedding加入模型
model.add(Embedding(output_dim = 32,    #輸出的維度32 希望把「數字list」換為32維度的向量
                    input_dim = 2000,   #輸入的維度是2000，因為之前建立的字典是2000字
                    input_length = 100  #「數字list」每一筆有100個數字 所以長度為100
                   ))
model.add(Dropout(0.2)) #每次訓練迭代時，隨機在神經往綠中放棄20%的神經元

In [12]:
#建立多層感知器模型
model.add(Flatten())

In [13]:
#加入隱藏層
model.add(Dense(
    units=256,        #隱藏層共有256個神經元
    activation='relu'
))

model.add(Dropout(0.35))

In [14]:
#加入輸出層
model.add(Dense(
    units = 1,    #輸出層只有一個神經元，1代表正面評價，0代表負面評價
    activation = 'sigmoid'
))

In [15]:
#查看摘要模型
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 100, 32)           64000     
                                                                 
 dropout (Dropout)           (None, 100, 32)           0         
                                                                 
 flatten (Flatten)           (None, 3200)              0         
                                                                 
 dense (Dense)               (None, 256)               819456    
                                                                 
 dropout_1 (Dropout)         (None, 256)               0         
                                                                 
 dense_1 (Dense)             (None, 1)                 257       
                                                                 
Total params: 883,713
Trainable params: 883,713
Non-trai

In [16]:
#定義訓練方式
model.compile(
    loss='binary_crossentropy', #在深度學習使用cross_entropy，訓練效果比較好
    optimizer = 'adam',
    metrics=['accuracy']
)

In [17]:
y_train = np.array(y_train)
y_test=np.array(y_test)

In [18]:
#開始訓練

train_history = model.fit(x_train,              #feature(「數字list」)
                          y_train,              #測試資料的標籤label(正向:1、負向:0)
                          batch_size = 100,
                          epochs = 10,
                          verbose = 2,
                          validation_split = 0.2
                         )

Epoch 1/10
200/200 - 3s - loss: 0.4814 - accuracy: 0.7556 - val_loss: 0.4091 - val_accuracy: 0.8204 - 3s/epoch - 17ms/step
Epoch 2/10
200/200 - 3s - loss: 0.2724 - accuracy: 0.8878 - val_loss: 0.4835 - val_accuracy: 0.7926 - 3s/epoch - 13ms/step
Epoch 3/10
200/200 - 3s - loss: 0.1611 - accuracy: 0.9420 - val_loss: 0.5777 - val_accuracy: 0.7792 - 3s/epoch - 13ms/step
Epoch 4/10
200/200 - 3s - loss: 0.0821 - accuracy: 0.9732 - val_loss: 0.8944 - val_accuracy: 0.7376 - 3s/epoch - 13ms/step
Epoch 5/10
200/200 - 3s - loss: 0.0490 - accuracy: 0.9829 - val_loss: 1.0648 - val_accuracy: 0.7452 - 3s/epoch - 13ms/step
Epoch 6/10
200/200 - 3s - loss: 0.0354 - accuracy: 0.9871 - val_loss: 1.1671 - val_accuracy: 0.7456 - 3s/epoch - 13ms/step
Epoch 7/10
200/200 - 3s - loss: 0.0308 - accuracy: 0.9901 - val_loss: 1.0585 - val_accuracy: 0.7780 - 3s/epoch - 13ms/step
Epoch 8/10
200/200 - 3s - loss: 0.0279 - accuracy: 0.9902 - val_loss: 1.3919 - val_accuracy: 0.7350 - 3s/epoch - 13ms/step
Epoch 9/10
200/2

In [19]:
#評估模型準確率
scores = model.evaluate(x_test,y_test,verbose = 1)
scores[1]



0.8153600096702576

In [91]:
#進行預測
prediction = model.predict(x_test) #使用model.predict_classes進行預測
classes_x=np.argmax(prediction,axis=1)



In [94]:
classes_x[:10]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int64)

In [96]:
classes_x = classes_x.reshape(-1)
classes_x[:10]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int64)

In [32]:
SentimentDict={1:'正面的',0:'負面的'}
def display_test_Sentiment(i):
    print(test_text[i])
    print('label真實值:',SentimentDict[y_test[i]],'預測結果',SentimentDict[classes_x[i]])

In [33]:
display_test_Sentiment(2)

As a recreational golfer with some knowledge of the sport's history, I was pleased with Disney's sensitivity to the issues of class in golf in the early twentieth century. The movie depicted well the psychological battles that Harry Vardon fought within himself, from his childhood trauma of being evicted to his own inability to break that glass ceiling that prevents him from being accepted as an equal in English golf society. Likewise, the young Ouimet goes through his own class struggles, being a mere caddie in the eyes of the upper crust Americans who scoff at his attempts to rise above his standing. What I loved best, however, is how this theme of class is manifested in the characters of Ouimet's parents. His father is a working-class drone who sees the value of hard work but is intimidated by the upper class; his mother, however, recognizes her son's talent and desire and encourages him to pursue his dream of competing against those who think he is inferior.Finally, the golf scenes

In [34]:
display_test_Sentiment(12502)

First of all I hate those moronic rappers, who could'nt act if they had a gun pressed against their foreheads. All they do is curse and shoot each other and acting like cliché'e version of gangsters.The movie doesn't take more than five minutes to explain what is going on before we're already at the warehouse There is not a single sympathetic character in this movie, except for the homeless guy, who is also the only one with half a brain.Bill Paxton and William Sadler are both hill billies and Sadlers character is just as much a villain as the gangsters. I did'nt like him right from the start.The movie is filled with pointless violence and Walter Hills specialty: people falling through windows with glass flying everywhere. There is pretty much no plot and it is a big problem when you root for no-one. Everybody dies, except from Paxton and the homeless guy and everybody get what they deserve.The only two black people that can act is the homeless guy and the junkie but they're actors by 

查看美女與野獸的影評

In [46]:
input_text ='''A fabulous movie, I enjoyed every moment. So beautifully done that I would watch it again. It's a true musical as they used to be. I cried and laughed, it brought out many emotions. It's a great family film. The artistry and special effects make a great Disney style fantasy come to life. The music and songs were very pleasant in typical Disney fashion.'''

In [48]:
#將影評轉換成數字list
input_seq = token.texts_to_sequences([input_text])

In [49]:
#查看數字list
print(input_seq[0])

[3, 16, 9, 506, 171, 557, 34, 1289, 220, 11, 9, 58, 102, 8, 170, 41, 3, 278, 618, 13, 32, 338, 5, 25, 9, 2, 1505, 8, 833, 42, 107, 1432, 41, 3, 84, 219, 18, 1, 2, 314, 297, 93, 3, 84, 909, 401, 935, 212, 5, 109, 1, 224, 2, 686, 67, 51, 7, 795, 909, 1596]


In [52]:
#查看list長度
len(input_seq[0])

60

In [55]:
#將list截取長度為100
pad_input_seq = pad_sequences(input_seq,maxlen=100)

In [56]:
#查看長度
len(pad_input_seq[0])

100

In [103]:
predict_result = model.predict(pad_input_seq)
classes_x = np.round(predict_result).astype(int)
#classes_x=np.argmax(prediction,axis=-1)



In [104]:
classes_x

array([[1]])

In [105]:
classes_x[0][0]

1

In [106]:
SentimentDict[classes_x[0][0]]

'正面的'

In [107]:
#把命令全部整理成predict_review()函數
def predict_review(input_text):
    input_seq = token.texts_to_sequences([input_text])
    pad_input_seq = pad_sequences(input_seq,maxlen=100)
    predict_result = model.predict(pad_input_seq)
    classes_x = np.round(predict_result).astype(int)
    print(SentimentDict[classes_x[0][0]])

In [108]:
predict_review('''I must say, Disney seems to be putting out live action movies of their successful cartoons just for the sake of putting out live action movies! I guess corporate thinking is the initial buzz will make it money on the first weekend! This is the worst of the live actions, the acting is boring, the story mundane, I do not care about the characters at all, it seems it's just made to show how cool Disney can use special effects! If you like the story, stick to the original cartoon!''')

負面的


In [109]:
predict_review('''I think this movie was watchable but I also think that this movie was unnecessary because the original was so much better. Disney was smart to cast Emma Watson as Belle, not because she was perfect for the role but they knew she was popular. I don't think she is a good actor, she works well as Hermione in the Harry Potter movie series but that's because she was just playing herself. Here she needed to be kind, humble and sweet and I don't think her version of Belle was any of that. Her singing wasn't up to par either, there was a lot of auto tuning going on with her songs.

But Disney is the master of making beautiful movies and this was no exception. But like I said, this movie was unnecessary.''')

正面的
