# 自然語言處理&RNN預測情緒 on IMDb影評

## 載入IMDb資料集

In [8]:
import urllib.request
import os
import tarfile

In [9]:
url = "http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"
path = "aclImdb_v1.tar.gz"
if not os.path.isfile(path):
  result = urllib.request.urlretrieve(url,path)
  print("downloaded")

In [10]:
if not os.path.exists("aclImdb"):
  tfile = tarfile.open("aclImdb_v1.tar.gz" , "r:gz")
  result = tfile.extractall('')

## 資料前處理

In [11]:
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer

In [12]:
import re
def remove_tag(text):
  re_tag = re.compile(r'(<[^>]+>|\.|\,)')
  return re_tag.sub('',text)

In [13]:
import os
def read_files(filetype):
  path = "aclImdb/"
  file_list =[]
  postive_path = path + filetype + "/pos/"
  for f in os.listdir(postive_path):
    file_list +=[postive_path+f]
    
  negtive_path = path + filetype + "/neg/"
  for f in os.listdir(negtive_path):
    file_list +=[negtive_path+f]
  all_labels = ([1]*12500 + [0]*12500)
  all_texts =[]
  #print(file_list) 
  for fi in file_list:
    with open(fi , encoding='utf-8') as file_input:
      all_texts += [remove_tag(" ".join(file_input.readlines()))]
      #print (file_input.readlines())
  return all_labels , all_texts

y_train , train_text = read_files("train")
y_test , test_text = read_files("test")
o_train_text = train_text
o_test_text = test_text

觀察資料格式與**type**

In [14]:
print("training data size:%d"%(len(train_text)))
print("testing data size:%d"%(len(test_text)))
print("feature :%s"%(train_text[0]))
print("label :%s"%(y_train[0]))

training data size:25000
testing data size:25000
feature :For a movie that gets no respect there sure are a lot of memorable quotes listed for this gem Imagine a movie where Joe Piscopo is actually funny! Maureen Stapleton is a scene stealer The Moroni character is an absolute scream Watch for Alan "The Skipper" Hale jr as a police Sgt
label :1


### 建立token 
對於每個word建立一個index

In [232]:
token = Tokenizer(num_words = 2000)
token.fit_on_texts(train_text)

In [233]:
#display word index table
print(token.word_index)



In [234]:
#convert text to vector
x_train_seq = token.texts_to_sequences(train_text)
x_test_seq = token.texts_to_sequences(test_text)

#compare the text and vector
print(train_text[0])
print(x_train_seq[0])

#padding 
x_train = sequence.pad_sequences(x_train_seq , maxlen=100)
x_test = sequence.pad_sequences(x_test_seq , maxlen=100)

#show padding result
print("before length: %d"%(len(x_train_seq[0])))
print("before squence: %s"%(x_train_seq[0]))

print("after length: %d"%(len(x_train[0])))
print("after squence: %s"%(x_train[0]))

For a movie that gets no respect there sure are a lot of memorable quotes listed for this gem Imagine a movie where Joe Piscopo is actually funny! Maureen Stapleton is a scene stealer The Moroni character is an absolute scream Watch for Alan "The Skipper" Hale jr as a police Sgt
[14, 3, 16, 11, 207, 54, 1166, 47, 248, 22, 3, 172, 4, 905, 14, 9, 1538, 838, 3, 16, 116, 907, 6, 157, 160, 6, 3, 131, 1, 105, 6, 31, 1533, 1999, 103, 14, 1599, 1, 1824, 13, 3, 557]
before length: 42
before squence: [14, 3, 16, 11, 207, 54, 1166, 47, 248, 22, 3, 172, 4, 905, 14, 9, 1538, 838, 3, 16, 116, 907, 6, 157, 160, 6, 3, 131, 1, 105, 6, 31, 1533, 1999, 103, 14, 1599, 1, 1824, 13, 3, 557]
after length: 100
after squence: [   0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0

## build RNN model

In [247]:
from keras.models import Sequential
from keras.layers.core import Dense , Dropout , Activation
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import SimpleRNN, LSTM, GRU,RNN
from keras.layers import Bidirectional

In [280]:
model = Sequential()
model.add(Embedding(output_dim=32,
                   input_dim=2000,
                   input_length=100))
model.add(Dropout(0.35))
#model.add(SimpleRNN(units=16)) #32*16 + 16 +16*16
#model.add(LSTM(units=16)) #4 * (RNN number)
model.add(Bidirectional(GRU(units=16), input_shape=(100, 32)))
#model.add(Bidirectional(GRU(units=16)))
#model.add(GRU(units=16))
model.add(Dense(units=256,activation='relu')) 
model.add(Dropout(0.35))
model.add(Dense(units=1,activation='sigmoid'))

In [281]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_46 (Embedding)     (None, 100, 32)           64000     
_________________________________________________________________
dropout_78 (Dropout)         (None, 100, 32)           0         
_________________________________________________________________
bidirectional_38 (Bidirectio (None, 32)                4704      
_________________________________________________________________
dense_67 (Dense)             (None, 256)               8448      
_________________________________________________________________
dropout_79 (Dropout)         (None, 256)               0         
_________________________________________________________________
dense_68 (Dense)             (None, 1)                 257       
Total params: 77,409
Trainable params: 77,409
Non-trainable params: 0
_________________________________________________________________


In [282]:
model.compile(loss='binary_crossentropy',
             optimizer="adam",
             metrics=['accuracy'])

## train model

In [283]:
train_history = model.fit(x_train , y_train , batch_size=100 ,epochs=10 , verbose=2,validation_split=0.2)

Train on 20000 samples, validate on 5000 samples
Epoch 1/10
 - 92s - loss: 0.5518 - acc: 0.7083 - val_loss: 0.4263 - val_acc: 0.7972
Epoch 2/10
 - 80s - loss: 0.3492 - acc: 0.8484 - val_loss: 0.5099 - val_acc: 0.7384
Epoch 3/10
 - 80s - loss: 0.3137 - acc: 0.8679 - val_loss: 0.3292 - val_acc: 0.8578
Epoch 4/10
 - 79s - loss: 0.2920 - acc: 0.8776 - val_loss: 0.6559 - val_acc: 0.7090
Epoch 5/10
 - 80s - loss: 0.2737 - acc: 0.8874 - val_loss: 0.4180 - val_acc: 0.8096
Epoch 6/10
 - 80s - loss: 0.2614 - acc: 0.8931 - val_loss: 0.6897 - val_acc: 0.7174
Epoch 7/10
 - 81s - loss: 0.2428 - acc: 0.9013 - val_loss: 0.4589 - val_acc: 0.8110
Epoch 8/10
 - 80s - loss: 0.2378 - acc: 0.9030 - val_loss: 0.4671 - val_acc: 0.8100
Epoch 9/10
 - 80s - loss: 0.2194 - acc: 0.9120 - val_loss: 0.4708 - val_acc: 0.8244
Epoch 10/10
 - 80s - loss: 0.2104 - acc: 0.9158 - val_loss: 0.5800 - val_acc: 0.7658


## evaluate & check result

In [284]:
scores = model.evaluate(x_test , y_test ,verbose=1)
scores[1]



0.83088

In [285]:
predict = model.predict_classes(x_test).reshape(-1)

In [214]:
SentimentDict = {1:"postive" , 0:"negtive"}
def display_test_Sentiment(i):
  print(test_text[i])
  print("truth:",SentimentDict[y_test[i]],"predict:",SentimentDict[predict[i]])

In [215]:
display_test_Sentiment(1)

This is a gem As a Film Four production - the anticipated quality was indeed delivered Shot with great style that reminded me some Errol Morris films well arranged and simply gripping It's long yet horrifying to the point it's excruciating We know something bad happened (one can guess by the lack of participation of a person in the interviews) but we are compelled to see it a bit like a car accident in slow motion The story spans most conceivable aspects and unlike some documentaries did not try and refrain from showing the grimmer sides of the stories as also dealing with the guilt of the people Don left behind him wondering why they didn't stop him in time It took me a few hours to get out of the melancholy that gripped me after seeing this very-well made documentary
truth: postive predict: postive


## predict on real data

In [216]:
review = ("This film has been saved by its stars: Ryan Reynolds first and Kevin Costner few minutes later will make you forget most of plot's unbelievable logical holes, awful editing (what's Alice Eve's end?), repetitive scenes and complete lack of bad guy's motive (Jordi Mollà)."+

"And Tommy Lee Jones, Gal Gadot, Gary Oldman and Michael Pitt (Alice Eve here is little more than an extra) strive to fill with workmanship and dignity their otherwise gaunt supporting roles."+

"They all succeed in saving the day and in the end you won't fall asleep or leave theater in contempt. But on your way home you could probably comment that this is one of the most useless (or unconvincing) films you ever watched.")

In [217]:
def preprocessor(text):
  x_test_1_seq = token.texts_to_sequences([text])
  x_test_1_seq_pad = sequence.pad_sequences( x_test_1_seq , maxlen=100)
  return  x_test_1_seq_pad

In [218]:
x_test_1 = preprocessor(review)

In [219]:
predict = model.predict_classes(x_test_1)

In [220]:
print(review)
print("predict:",SentimentDict[predict[0][0]])

This film has been saved by its stars: Ryan Reynolds first and Kevin Costner few minutes later will make you forget most of plot's unbelievable logical holes, awful editing (what's Alice Eve's end?), repetitive scenes and complete lack of bad guy's motive (Jordi Mollà).And Tommy Lee Jones, Gal Gadot, Gary Oldman and Michael Pitt (Alice Eve here is little more than an extra) strive to fill with workmanship and dignity their otherwise gaunt supporting roles.They all succeed in saving the day and in the end you won't fall asleep or leave theater in contempt. But on your way home you could probably comment that this is one of the most useless (or unconvincing) films you ever watched.
predict: negtive


## 課堂練習
1.修改Tokenizer(num_words = 2000)
<p>
2.修改padding的maxlen 
<p>
3.embedding的output size
<p>
4.用LSTM取代RNN
<p>
5.更改模型的參數及架構 example: RNN units , dropout , epochs , ect