# Machine Learning - Sentiment Analysis IMDb Dataset (using LSTM, GRU)

In [1]:
import pandas as pd
import numpy as np

df = pd.DataFrame()
df = pd.read_csv('movie_data.csv', encoding='utf-8')
df.head(3)

Unnamed: 0,review,sentiment
0,I went and saw this movie last night after bei...,1
1,Actor turned director Bill Paxton follows up h...,1
2,As a recreational golfer with some knowledge o...,1


In [2]:
X_train = df.loc[:24999, 'review'].values
y_train = df.loc[:24999, 'sentiment'].values
X_test = df.loc[25000:, 'review'].values
y_test = df.loc[25000:, 'sentiment'].values

In [3]:
X = np.concatenate((X_train, X_test), axis=0)
y = np.concatenate((y_train, y_test), axis=0)

In [5]:
# summarize size
print("Training data: ")
print(X.shape)
print(y.shape)

Training data: 
(50000,)
(50000,)


In [10]:
from tensorflow.python.keras.preprocessing.text import Tokenizer
from tensorflow.python.keras.preprocessing.sequence import pad_sequences


tokenizer_obj = Tokenizer()
total_reviews = X_train + X_test
tokenizer_obj.fit_on_texts(total_reviews) 

# pad sequences
max_length = 100 # try other options like mean
# define vocabulary size
vocab_size = len(tokenizer_obj.word_index) + 1

X_train_tokens =  tokenizer_obj.texts_to_sequences(X_train)
X_test_tokens = tokenizer_obj.texts_to_sequences(X_test)


X_train_pad = pad_sequences(X_train_tokens, maxlen=max_length, padding='post')
X_test_pad = pad_sequences(X_test_tokens, maxlen=max_length, padding='post')

In [15]:
print(vocab_size)

125602


In [12]:
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, GRU
from keras.layers.embeddings import Embedding

EMBEDDING_DIM = 100

print('Build model...')

model = Sequential()
model.add(Embedding(vocab_size, EMBEDDING_DIM, input_length=max_length))
model.add(GRU(units=32,  dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1, activation='sigmoid'))

# try using different optimizers and different optimizer configs
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

print('Summary of the built model...')
print(model.summary())

Build model...
Summary of the built model...
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 2678, 100)         12560200  
_________________________________________________________________
gru_1 (GRU)                  (None, 32)                12768     
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 33        
Total params: 12,573,001
Trainable params: 12,573,001
Non-trainable params: 0
_________________________________________________________________
None


In [9]:
print('Train...')

model.fit(X_train_pad, y_train, batch_size=128, epochs=25, validation_data=(X_test_pad, y_test), verbose=2)

Train...
Train on 25000 samples, validate on 25000 samples
Epoch 1/25
 - 627s - loss: 0.5720 - acc: 0.7023 - val_loss: 0.4527 - val_acc: 0.7915
Epoch 2/25
 - 528s - loss: 0.3987 - acc: 0.8283 - val_loss: 0.3977 - val_acc: 0.8248
Epoch 3/25
 - 310s - loss: 0.3556 - acc: 0.8517 - val_loss: 0.3921 - val_acc: 0.8291
Epoch 4/25
 - 314s - loss: 0.3321 - acc: 0.8627 - val_loss: 0.4128 - val_acc: 0.8163
Epoch 5/25
 - 315s - loss: 0.3081 - acc: 0.8730 - val_loss: 0.3972 - val_acc: 0.8252
Epoch 6/25
 - 318s - loss: 0.2893 - acc: 0.8848 - val_loss: 0.3822 - val_acc: 0.8337
Epoch 7/25
 - 319s - loss: 0.2732 - acc: 0.8910 - val_loss: 0.3745 - val_acc: 0.8438
Epoch 8/25
 - 331s - loss: 0.2539 - acc: 0.9012 - val_loss: 0.5966 - val_acc: 0.7738
Epoch 9/25
 - 326s - loss: 0.2607 - acc: 0.8956 - val_loss: 0.4067 - val_acc: 0.8280
Epoch 10/25
 - 337s - loss: 0.2308 - acc: 0.9115 - val_loss: 0.3723 - val_acc: 0.8512
Epoch 11/25
 - 336s - loss: 0.2213 - acc: 0.9152 - val_loss: 0.3656 - val_acc: 0.8543
Epoc

<keras.callbacks.History at 0x2ad8cc90208>

In [10]:
print('Testing...')
score, acc = model.evaluate(X_test_pad, y_test, batch_size=128)

print('Test score:', score)
print('Test accuracy:', acc)

print("Accuracy: {0:.2%}".format(acc))

Testing...
Test score: 0.44936159363269806
Test accuracy: 0.8537999999809265
Accuracy: 85.38%


In [11]:
#Let us test some  samples
test_sample_1 = "This movie is fantastic! I really like it because it is so good!"
test_sample_2 = "Good movie!"
test_sample_3 = "Maybe I like this movie."
test_sample_4 = "Not to my taste, will skip and watch another movie"
test_sample_5 = "if you like action, then this movie might be good for you."
test_sample_6 = "Bad movie!"
test_sample_7 = "Not a good movie!"
test_sample_8 = "This movie really sucks! Can I get my money back please?"
test_samples = [test_sample_1, test_sample_2, test_sample_3, test_sample_4, test_sample_5, test_sample_6, test_sample_7, test_sample_8]

test_samples_tokens = tokenizer_obj.texts_to_sequences(test_samples)
test_samples_tokens_pad = pad_sequences(test_samples_tokens, maxlen=max_length)

#predict
model.predict(x=test_samples_tokens_pad)

array([[0.9916264 ],
       [0.29879633],
       [0.0971105 ],
       [0.00360127],
       [0.16327332],
       [0.0138877 ],
       [0.16362512],
       [0.00134153]], dtype=float32)

In [12]:
#let us check how the model predicts
classes = model.predict(X_test_pad[:10], batch_size=128)
for i in range (0,10):
    if(classes[i] > 0.5 and y_test[i] == 1 or (classes[i] <= 0.5 and y_test[i] == 0)):
        print( classes[i], y_test[i], " Right prdiction")
    else :
        print( classes[i], y_test[i], " Wrong prdiction")

[0.1360948] 1  Wrong prdiction
[0.29261437] 1  Wrong prdiction
[0.9970799] 1  Right prdiction
[0.87384164] 1  Right prdiction
[0.99331176] 1  Right prdiction
[0.97243744] 1  Right prdiction
[0.85155106] 1  Right prdiction
[0.7417877] 1  Right prdiction
[0.9908635] 1  Right prdiction
[0.9956833] 1  Right prdiction


In [13]:
from keras.datasets import imdb
from keras.models import Sequential
from tensorflow.python.keras.preprocessing import sequence
from keras.layers import Dense, Embedding, LSTM, GRU
from keras.layers.embeddings import Embedding

# load the dataset but only keep the top n words, zero the rest
top_words = 5000
(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=top_words)

max_words = 500
X_train = sequence.pad_sequences(X_train, maxlen=max_words)
X_test = sequence.pad_sequences(X_test, maxlen=max_words)

print('Build model...')

model = Sequential()
model.add(Embedding(top_words, 100, input_length=max_words))
model.add(LSTM(32, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1, activation='sigmoid'))

# try using different optimizers and different optimizer configs
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

print(model.summary())

Build model...
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 500, 32)           160000    
_________________________________________________________________
lstm_1 (LSTM)                (None, 32)                8320      
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 33        
Total params: 168,353
Trainable params: 168,353
Non-trainable params: 0
_________________________________________________________________
None


In [16]:
print('Train...')

model.fit(X_train, y_train, batch_size=128, epochs=25, validation_data=(X_test, y_test), verbose=2)

Train...
Train on 25000 samples, validate on 25000 samples
Epoch 1/25
 - 513s - loss: 0.1798 - acc: 0.9324 - val_loss: 0.4092 - val_acc: 0.8448
Epoch 2/25
 - 556s - loss: 0.1890 - acc: 0.9283 - val_loss: 0.4166 - val_acc: 0.8473
Epoch 3/25
 - 946s - loss: 0.1736 - acc: 0.9345 - val_loss: 0.4416 - val_acc: 0.8446
Epoch 4/25
 - 651s - loss: 0.1599 - acc: 0.9400 - val_loss: 0.4562 - val_acc: 0.8464
Epoch 5/25
 - 688s - loss: 0.1635 - acc: 0.9390 - val_loss: 0.4382 - val_acc: 0.8432
Epoch 6/25
 - 1021s - loss: 0.1507 - acc: 0.9440 - val_loss: 0.4438 - val_acc: 0.8478
Epoch 7/25
 - 518s - loss: 0.1392 - acc: 0.9476 - val_loss: 0.4582 - val_acc: 0.8507
Epoch 8/25
 - 516s - loss: 0.1573 - acc: 0.9391 - val_loss: 0.4841 - val_acc: 0.8454
Epoch 9/25
 - 785s - loss: 0.1377 - acc: 0.9500 - val_loss: 0.4811 - val_acc: 0.8469
Epoch 10/25
 - 1003s - loss: 0.1353 - acc: 0.9508 - val_loss: 0.4742 - val_acc: 0.8494
Epoch 11/25
 - 630s - loss: 0.1753 - acc: 0.9315 - val_loss: 0.4758 - val_acc: 0.8385
Ep

<keras.callbacks.History at 0x2ad8e3ad080>

In [18]:
score, acc = model.evaluate(X_test, y_test, batch_size=128)

print('Test score:', score)
print('Test accuracy:', acc)
print("Accuracy: %.2f%%" % (acc*100))

Test score: 0.5993069805335999
Test accuracy: 0.839160000038147
Accuracy: 83.92%


The time to train a GRU is less than LSTM network.