In [1]:
import keras
import numpy as np
from preprocess import *
from string import punctuation
from collections import Counter

Using TensorFlow backend.


In [58]:
# Read the raw corpus: one file with the review text, one with the labels.
# Both are loaded as single strings; splitting/encoding happens in a later cell.
with open('data/reviews.txt', 'r') as reviews_file:
    reviews = reviews_file.read()

with open('data/labels.txt', 'r') as labels_file:
    labels = labels_file.read()

In [59]:
# Turn the raw strings into model-ready collections using the project helpers.
# `reviews_int` is a sequence of per-review sequences (their lengths are
# inspected below) and `vocab_size` later sizes the Embedding layer.
# NOTE(review): `labels` is rebound from the raw string to the converted
# labels, so this cell cannot be re-run without re-running the load cell first.
reviews_int, vocab_size = text_preprocess(reviews)
labels = convert_labels(labels)

# Sanity check: both collections should report the same number of entries.
for caption, collection in (("number of reviews:", reviews_int),
                            ("number of labels:", labels)):
    print(caption, len(collection))

number of reviews: 25001
number of labels: 25001


In [4]:
# Histogram of review lengths: key = number of tokens, value = how many
# reviews have that length.
review_lens = Counter(map(len, reviews_int))

# A missing key in a Counter reads as 0, so this is safe even with no empty reviews.
zero_length_count = review_lens[0]
# max() over a Counter iterates its keys, i.e. the observed lengths.
longest_review = max(review_lens)

print("Zero-length reviews: {}".format(zero_length_count))
print("Maximum review length: {}".format(longest_review))

Zero-length reviews: 1
Maximum review length: 2380


In [5]:
# Drop every zero-length review together with its label.
#
# The previous version remembered a single index (defaulting to 0) and popped
# it unconditionally. That silently deleted the first *valid* sample when no
# empty review existed, removed only the last one when several existed, and
# threw away one more sample every time the cell was re-run. Filtering the
# (review, label) pairs handles zero, one or many empty reviews and makes the
# cell idempotent.
if len(reviews_int) != len(labels):
    # zip() would silently truncate to the shorter input, hiding a misalignment.
    raise ValueError(
        "reviews and labels are misaligned: {} reviews vs {} labels".format(
            len(reviews_int), len(labels)))

non_empty_pairs = [
    (review, label)
    for review, label in zip(reviews_int, labels)
    if len(review) > 0
]
reviews_int = [review for review, _ in non_empty_pairs]
labels = [label for _, label in non_empty_pairs]

print("number of reviews:", len(reviews_int))
print("number of labels:", len(labels))

number of reviews: 25000
number of labels: 25000


In [6]:
# Bring every review to a fixed length of 200 tokens.
# Per the original note, reviews shorter than 200 are left-padded with zeros;
# longer ones are presumably cut down to 200 -- confirm against preprocess.truncate.

seq_len = 200
features = truncate(reviews_int, seq_len)
# Convert the labels to a NumPy array so they can be sliced alongside `features`.
labels = np.array(labels)

In [7]:
# Positional 80/10/10 split into train / validation / test sets.
# NOTE(review): the data is split in file order with no shuffling; if the
# source files are ordered by class the splits may be unbalanced -- confirm.

split_frac = 0.8

# First cut: the leading `split_frac` share is training data, the rest is held out.
n_train = int(split_frac * len(features))
train_x, train_y = features[:n_train], labels[:n_train]
holdout_x, holdout_y = features[n_train:], labels[n_train:]

# Second cut: the held-out part is halved into validation (first) and test (second).
n_val = len(holdout_x) // 2
val_x, val_y = holdout_x[:n_val], holdout_y[:n_val]
test_x, test_y = holdout_x[n_val:], holdout_y[n_val:]

print("training data:", train_x.shape)
print("testing data:", test_x.shape)
print("validation data:", val_x.shape)

training data: (20000, 200)
testing data: (2500, 200)
validation data: (2500, 200)


In [8]:
# ------------parameters--------------

# Number of tokens per review fed to the model (passed as the Embedding
# layer's input_length). Repeats the value already used when building `features`.
seq_len = 200
# Size of each learned word vector (passed as the Embedding layer's output_dim).
embedding = 300

In [20]:
# --------creating LSTM model---------
#
# Embedding -> three stacked LSTMs (128 -> 256 -> 128 units) with 50% dropout
# after the first two, then a 2-way softmax over the sentiment classes.
# The first two LSTMs return the full sequence so the next LSTM receives one
# vector per timestep; the last LSTM returns only its final state.
# NOTE(review): input_dim must exceed the largest token id. If index 0 is
# reserved for padding and vocab_size does not already count it, this needs
# vocab_size + 1 -- confirm against preprocess.text_preprocess.

model = keras.models.Sequential([
    keras.layers.Embedding(
        input_dim=vocab_size,
        output_dim=embedding,
        input_length=seq_len,
    ),
    keras.layers.LSTM(128, return_sequences=True),
    keras.layers.Dropout(0.5),
    keras.layers.LSTM(256, return_sequences=True),
    keras.layers.Dropout(0.5),
    keras.layers.LSTM(128),
    keras.layers.Dense(2, activation='softmax'),
])

In [21]:
# Print the layer-by-layer output shapes and parameter counts of the model.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 200, 300)          5651700   
_________________________________________________________________
lstm_7 (LSTM)                (None, 200, 128)          219648    
_________________________________________________________________
dropout_5 (Dropout)          (None, 200, 128)          0         
_________________________________________________________________
lstm_8 (LSTM)                (None, 200, 256)          394240    
_________________________________________________________________
dropout_6 (Dropout)          (None, 200, 256)          0         
_________________________________________________________________
lstm_9 (LSTM)                (None, 128)               197120    
_________________________________________________________________
dense_3 (Dense)              (None, 2)                 258       
Total para

In [22]:
# Configure training. The sparse categorical cross-entropy loss pairs with the
# 2-unit softmax output and takes integer class ids as targets (not one-hot).
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

In [23]:
# Train for 2 epochs in mini-batches of 500, scoring the validation split
# after each epoch. The returned History object is the cell's output.
model.fit(
    train_x,
    train_y,
    batch_size=500,
    epochs=2,
    validation_data=(val_x, val_y),
)

Train on 20000 samples, validate on 2500 samples
Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7f078f7c7748>

In [24]:
# Score the trained model on the held-out test split (test_x / test_y).
# NOTE: despite the `val_` prefix these hold *test* metrics; the names are
# kept unchanged so any cell that reads them keeps working.
val_loss, val_acc = model.evaluate(test_x, test_y, batch_size=500)

for caption, metric in (("test loss:", val_loss), ("test acc:", val_acc)):
    print(caption, metric)

test loss: 0.43344812393188475
test acc: 0.826800012588501


In [25]:
# Persist the trained model to 'lstm.model' (path is relative to the working directory).
model.save('lstm.model')

In [75]:
# --------------testing manually--------------
#
# Encode one hand-written review with the project helper, run it through the
# model and map the highest-scoring output index to a readable label.
# NOTE(review): index 0 -> "Negative", 1 -> "Positive" assumes the same class
# encoding that convert_labels produced for training -- confirm.

my_review = np.array(convert_new_review("this movie inspired me a lot"))

class_scores = model.predict(my_review)

# np.argmax flattens its input, so this expects scores for a single review.
["Negative", "Positive"][np.argmax(class_scores)]

'Positive'