In [1]:
import numpy as np
from matplotlib import pyplot as plt
%matplotlib inline
import keras
import pydot
from keras.layers import Dense, Flatten, Convolution2D, Dropout, LSTM
from keras.models import Sequential
from keras.utils import np_utils
from keras.utils.vis_utils import plot_model
import spacy

Using TensorFlow backend.


In [2]:
# Load the preprocessed movie reviews, one .npy tensor per sentiment class.
d_pos = np.load('movie_review/positive.npy')
d_neg = np.load('movie_review/negative.npy')

# Stack along the sample axis: positive reviews first, negative reviews after.
data = np.concatenate([d_pos, d_neg], axis=0)
print(data.shape)

(8107, 34, 300)


In [3]:
# Binary sentiment labels: 1 for positive reviews, 0 for negative ones.
# The positive reviews occupy the leading rows of `data`, so only that
# slice has to be switched on.
labels = np.zeros((len(data), 1))
labels[:len(d_pos), :] = 1

# One-hot encode the labels (two columns: negative, positive).
target = np_utils.to_categorical(labels)
print(target.shape)

(8107, 2)


In [4]:
# Peek at the sample index range: one index per review in `data`.
ind = range(len(data))
print(ind)

range(0, 8107)


In [5]:
# Split into train (90%) and test (10%) sets.
# A dedicated, seeded generator makes the split -- and therefore every metric
# computed downstream -- reproducible across kernel restarts.
SPLIT_SEED = 42
rng = np.random.RandomState(SPLIT_SEED)

n_samples = data.shape[0]
ind = rng.permutation(n_samples)
split = int(0.9 * n_samples)
train_idx, test_idx = ind[:split], ind[split:]

# Indexing with an index array already yields (n, seq_len, emb_dim) arrays, so
# no reshape is needed for the LSTM model. The Conv model is declared with a
# trailing channel axis; to train it, feed X_train[..., np.newaxis] and
# X_test[..., np.newaxis] instead.
X_train = data[train_idx]
X_test = data[test_idx]

y_train = target[train_idx]
y_test = target[test_idx]

print(X_train.shape, X_test.shape)
print(y_train.shape, y_test.shape)

(7296, 34, 300) (811, 34, 300)
(7296, 2) (811, 2)


In [6]:
# Conv model: 2D convolutions that slide along the token axis.
# The first kernel spans the full embedding width (300), so its output has
# width 1 and the following (k, 1) kernels only convolve over the sequence.
# NOTE: the declared input carries a trailing channel axis: (seq_len, 300, 1).
model = Sequential([
    Convolution2D(32, (7, 300), activation='tanh',
                  input_shape=(data.shape[1], 300, 1)),
    Convolution2D(16, (5, 1), activation='tanh'),
    Convolution2D(16, (3, 1), activation='tanh'),

    # Collapse the feature maps into a vector for the dense classifier head.
    Flatten(),
    Dropout(0.2),

    Dense(128, activation='tanh'),
    Dropout(0.4),

    # Two-way softmax matching the one-hot `target` encoding.
    Dense(2, activation='softmax'),
])

model.summary()
model.compile(
    optimizer='rmsprop',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_1 (Conv2D)            (None, 28, 1, 32)         67232     
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 24, 1, 16)         2576      
_________________________________________________________________
conv2d_3 (Conv2D)            (None, 22, 1, 16)         784       
_________________________________________________________________
flatten_1 (Flatten)          (None, 352)               0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 352)               0         
_________________________________________________________________
dense_1 (Dense)      

In [7]:
# Write a diagram of the current `model` (with layer names and tensor shapes)
# to model_plot.png.
plot_model(
    model,
    to_file='model_plot.png',
    show_layer_names=True,
    show_shapes=True,
)

In [8]:
# LSTM model: two stacked LSTM layers followed by a two-way softmax.
# This rebinds `model`, replacing the Conv model defined earlier.
model = Sequential([
    # The first LSTM emits the full sequence so the second LSTM can consume it.
    LSTM(128, activation='tanh', return_sequences=True,
         input_shape=(data.shape[1], 300)),
    Dropout(0.4),
    # The second LSTM keeps only its final state as the review representation.
    LSTM(128, activation='tanh'),
    Dense(2, activation='softmax'),
])

model.summary()
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_1 (LSTM)                (None, 34, 128)           219648    
_________________________________________________________________
dropout_3 (Dropout)          (None, 34, 128)           0         
_________________________________________________________________
lstm_2 (LSTM)                (None, 128)               131584    
_________________________________________________________________
dense_3 (Dense)              (None, 2)                 258       
Total params: 351,490
Trainable params: 351,490
Non-trainable params: 0
_________________________________________________________________


In [9]:
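# Encoder-decoder (seq2seq-style) model sketch -- intentionally left disabled.
# If uncommented, the lines below would rebind `model`, replacing the LSTM classifier.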

#model = Sequential()

#model.add(LSTM(128, input_shape=(data.shape[1], 300), return_sequences=True, activation='tanh'))
#model.add(LSTM(128, return_sequences=False))
#model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
#model.summary()

In [10]:
# Train whichever network `model` currently refers to. The held-out split is
# passed as validation data, so it is evaluated after every epoch.
hist = model.fit(
    X_train,
    y_train,
    batch_size=100,
    epochs=20,
    shuffle=True,
    validation_data=(X_test, y_test),
)

Instructions for updating:
Use tf.cast instead.
Train on 7296 samples, validate on 811 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [11]:
# Load the spaCy 'en_vectors_web_lg' package. `word_vec(text)` is called by
# sequence_to_mat, which reads a `.vector` from each element of the result.
# NOTE(review): the package must be installed separately -- confirm it is
# available for the spaCy version in use.
word_vec = spacy.load('en_vectors_web_lg')
def sequence_to_mat(seq, lower_limit=10, upper_limit=35):
    """Convert a sentence into a fixed-size matrix of word vectors.

    Args:
        seq: The sentence; it is passed through ``str()`` before tokenising
            with the module-level ``word_vec`` pipeline.
        lower_limit: Exclusive lower bound on the number of tokens.
        upper_limit: Exclusive upper bound on the number of tokens; the
            returned matrix has ``upper_limit - 1`` rows.

    Returns:
        An array of shape ``(upper_limit - 1, 300)`` whose leading rows hold
        the token vectors and whose remaining rows are filled with 5.0, or
        ``None`` when the token count is not strictly between the two limits.
    """
    tokens = word_vec(str(seq))
    n_tokens = len(tokens)

    # Reject sentences that are too short or too long for the fixed-size input.
    if not (lower_limit < n_tokens < upper_limit):
        return None

    # Start from the padding value (5.0) and overwrite one row per token.
    mat = np.full((upper_limit - 1, 300), 5.0)
    for row, token in enumerate(tokens):
        mat[row, :] = token.vector
    return mat

In [13]:
w = 'this has been an amazing movie definitely recommend it for watching'
#w = 'this was not a decent experience, I had to sit there for a while'
mat = sequence_to_mat(w)

# sequence_to_mat returns None when the sentence has too few or too many
# tokens. np.expand_dims(None, axis=0) would silently build a (1,) object
# array, so stop here with a clear message instead.
if mat is None:
    raise ValueError(
        'sequence_to_mat rejected the sentence: its token count must lie '
        'strictly between lower_limit and upper_limit'
    )

# Conv-model variant (adds the trailing channel axis as well):
# example = np.expand_dims(np.expand_dims(mat, axis=0), axis=-1)
# Add a leading batch axis so the single sentence forms a batch of one.
example = np.expand_dims(mat, axis=0)
print(example.shape)

(1, 34, 300)


In [14]:
# Class scores for `example` from the current `model`.
# Column 0 = negative, column 1 = positive: positive reviews were labelled 1
# before one-hot encoding.
model.predict(example)

array([[0.01550869, 0.9844913 ]], dtype=float32)