# Preprocessing

In [1]:
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Dropout, GlobalMaxPooling1D, Conv1D, Activation


Using TensorFlow backend.


In [2]:
import glob
import os
import random
from random import shuffle
import numpy as np

# Seed BOTH generators. `shuffle` (used by get_data) draws from the stdlib
# `random` module, which `np.random.seed` does not touch, so seeding numpy
# alone leaves the data order and train/test split different on every run.
random.seed(42)
np.random.seed(42)

In [3]:
def get_data(filepath):
    """Load labelled reviews from ``<filepath>/pos`` and ``<filepath>/neg``.

    Every ``*.txt`` file under ``pos`` is paired with label 1 and every
    ``*.txt`` file under ``neg`` with label 0.

    Args:
        filepath: Directory containing the ``pos`` and ``neg`` sub-directories.

    Returns:
        A list of ``(label, text)`` tuples in shuffled order. The list is
        empty when no matching files exist.
    """
    labeled_dirs = (('pos', 1), ('neg', 0))
    data = []
    for dirname, label in labeled_dirs:
        pattern = os.path.join(filepath, dirname, '*.txt')
        # glob returns files in filesystem order; sort so that a seeded
        # shuffle yields the same sequence on every machine.
        for filename in sorted(glob.glob(pattern)):
            # Decode explicitly as UTF-8 instead of the locale default,
            # which differs between platforms.
            with open(filename, 'r', encoding='utf-8') as file:
                data.append((label, file.read()))
    shuffle(data)
    return data

In [4]:
# Training split of the dataset, resolved relative to the current working directory.
imdb_path = os.path.join(os.getcwd(), 'aclImdb/train')

In [5]:
# Read every pos/neg review under imdb_path as shuffled (label, text) tuples.
labeled_data_long = get_data(imdb_path)

In [6]:
# Keep only the first 10240 shuffled examples for the rest of the notebook.
labeled_data = labeled_data_long [:10240]

In [7]:
# Peek at one (label, review text) pair.
labeled_data[0]

(0,
 "How many times do we have to see bad horror movies with a killer in a Scream-ripoff mask? The plot of Bleed is kinda original but the movie itself is a complete failure. For one thing the dialogue is poorly developed, it's boring and wastes too much time on trivial details, the acting is bad, et cetera. I want my money back, this rental stinks worse than a skunk.")

# Word vectors

In [8]:
from nltk.tokenize import TreebankWordTokenizer
from gensim.models import KeyedVectors
import copy
import numpy as np
import pandas as pd

In [9]:
# Load pretrained word2vec vectors from the gzipped binary GoogleNews file.
# NOTE(review): `limit=500000` presumably caps loading at the first 500k entries — confirm against the gensim docs.
word_vectors = KeyedVectors.load_word2vec_format('../word vectors/word2vec/GoogleNews-vectors-negative300.bin.gz', binary=True, limit=500000)

In [10]:
def vectorize(dataset):
    """Turn each text in ``dataset`` into a list of word vectors.

    Each entry is tokenized with NLTK's ``TreebankWordTokenizer`` and every
    token is looked up in the module-level ``word_vectors``. Tokens whose
    lookup raises ``KeyError`` are skipped.

    Args:
        dataset: Iterable of strings.

    Returns:
        A list with one list of vectors per entry, in input order.
    """
    treebank = TreebankWordTokenizer()

    def _known_vectors(text):
        # Yield the vector of every token that `word_vectors` knows, in order.
        for word in treebank.tokenize(text):
            try:
                yield word_vectors[word]
            except KeyError:
                continue

    return [list(_known_vectors(text)) for text in dataset]
        
        

In [11]:
# !pwd

In [12]:
# labeled_data[0][1]

In [13]:
# sample = vectorize(labeled_data[:3])

In [14]:
# len(sample[2])

In [15]:
# Text half of every (label, text) pair.
reviews = [text for _label, text in labeled_data]

In [16]:
# Word vectors per review, plus the matching labels in the same order.
vectorized_reviews = vectorize(reviews)
labels = [label for label, _text in labeled_data]

In [17]:
# len(vectorized_reviews[0])
# Vectorize one hand-written sample. The literal opens with four quote
# characters, so the text itself begins with a `"` character.
vectorized_sample = vectorize([( """"I hate that the dismal weather had me down for so long, when will it break! Ugh, when does happiness return? The sun is blinding and the puffy clouds are too thin. I can't wait for the weekend.""")])

In [18]:
# Number of tokens in the sample for which a word vector was found.
len(vectorized_sample[0])

37

In [19]:

# np.mean(labels)

In [20]:
# Index separating the first 80% of the examples from the remaining 20%.
split_point = int(len(labels)*.8)

In [21]:
# Everything before split_point is training data; the rest is held out for testing.
x_train = vectorized_reviews[:split_point]
x_test = vectorized_reviews[split_point:]
y_train = labels[:split_point]
y_test = labels[split_point:]

# Padding

In [22]:
def pad_truncate(x1, token_size=300, max_len=400):
    """Force every sequence in ``x1`` to exactly ``max_len`` vectors.

    Sequences longer than ``max_len`` are cut to their first ``max_len``
    items; shorter ones are right-padded with a zero vector of length
    ``token_size``. ``x1`` is shallow-copied first, so neither the outer
    container nor the original inner sequences are modified.

    Args:
        x1: Mutable sequence (e.g. list) of sequences of vectors.
        token_size: Length of the zero vector used for padding.
        max_len: Target number of vectors per sequence.

    Returns:
        A shallow copy of ``x1`` whose items are lists of length ``max_len``.
    """
    padded = copy.copy(x1)
    filler = np.zeros(token_size)
    for idx, vectors in enumerate(padded):
        clipped = list(vectors)[:max_len]
        # The same zero-vector object is reused for every padding slot.
        clipped.extend([filler] * (max_len - len(clipped)))
        padded[idx] = clipped
    return padded

In [23]:
# vectorized_sample = pad_truncate(vectorized_sample[0])

In [24]:
# len(vectorized_sample)

In [25]:
# t = [np.array([1, 2, 3]), np.array([1, 4, 9])]

In [26]:
# len(t)

In [27]:
# t2 = pad_truncate(t)

In [28]:
# len(t2[1])

In [29]:
# Pad/truncate both splits using pad_truncate's defaults (token_size=300, max_len=400).
x_train_padded = pad_truncate(x_train)
x_test_padded = pad_truncate(x_test)

In [30]:
# len(x_train_padded[0])

In [31]:
# x_train[117]

In [32]:
# x_train_padded[0][117]

In [33]:
# type(x_train_padded)

In [34]:
# Shape and training hyperparameters.
batch_size = 32  # passed to model.fit
maxlen = 400  # time steps per review; used by the reshape calls and the LSTM input_shape
token_size = 300  # length of one word vector; used together with maxlen
filters = 250  # not referenced anywhere else in the visible notebook
hidden_dims = 250  # not referenced anywhere else in the visible notebook
kernel_length = 3  # not referenced anywhere else in the visible notebook
epochs = 2  # passed to model.fit

In [35]:
# Stack the padded reviews into dense (n_reviews, maxlen, token_size) arrays.
x_train_padded = np.asarray(x_train_padded).reshape(len(x_train_padded), maxlen, token_size)
x_test_padded = np.asarray(x_test_padded).reshape(len(x_test_padded), maxlen, token_size)

In [36]:
# x_train_padded = np.reshape(x_train, (len(x_train_padded), maxlen, embedding_dims))
# x_test_padded = np.reshape(x_test, (len(x_test_padded), maxlen, embedding_dims))

In [37]:
# print(x_train_padded.shape)
# x_test_padded.shape

In [38]:
# Convert the training labels from a Python list to a NumPy array.
y_train = np.array(y_train)

In [39]:
# Convert the test labels from a Python list to a NumPy array.
y_test = np.array(y_test)

In [40]:
# Sanity check: (number of training reviews, maxlen, token_size).
x_train_padded.shape

(8192, 400, 300)

In [41]:
# Sanity check for the held-out split: (number of test reviews, maxlen, token_size).
x_test_padded.shape

(2048, 400, 300)

## Building the model

In [42]:
from keras.models import Sequential
from keras.layers import Dense, Flatten, Dropout, LSTM

In [43]:
# Start from an empty layer stack; the following cells add layers to it one by one.
model = Sequential()

In [44]:
# Size of the LSTM layer's output vector at each time step.
num_neurons = 50

In [45]:
# Each input is one review: maxlen time steps of token_size-long word vectors.
model.add(
    LSTM(
        num_neurons,
        input_shape=(maxlen, token_size),
        # Emit the output at every time step, not only the last one (the default).
        return_sequences=True,
    )
)

In [46]:
# (Leftover `??LSTM` source lookup removed: it is IPython-only syntax and opens a pager on Run All.)

In [47]:
# Classification head: dropout at rate 0.3, flatten the per-step outputs,
# then a single sigmoid unit producing one value per review.
model.add(Dropout(.3))
model.add(Flatten()) # from 400 50-long vectors to a single 400X50 long vector
model.add(Dense(1, activation='sigmoid'))

In [48]:
# Print layer output shapes and parameter counts.
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_1 (LSTM)                (None, 400, 50)           70200     
_________________________________________________________________
dropout_1 (Dropout)          (None, 400, 50)           0         
_________________________________________________________________
flatten_1 (Flatten)          (None, 20000)             0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 20001     
Total params: 90,201
Trainable params: 90,201
Non-trainable params: 0
_________________________________________________________________


In [49]:
# Binary classification: cross-entropy loss on the sigmoid output, tracking accuracy.
model.compile(
    optimizer='rmsprop',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [50]:
# Train on the padded training split and report metrics on the held-out split after each epoch.
model.fit(
    x_train_padded,
    y_train,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=(x_test_padded, y_test),
)

Train on 8192 samples, validate on 2048 samples
Epoch 1/2
Epoch 2/2


<keras.callbacks.callbacks.History at 0x7f3cc861f9e8>

In [51]:
# free up memory
# x_train_padded = []
# x_test_padded = x_train_padded

In [52]:
# x_train = []

# Save model

In [53]:
# Persist the model in two files: the architecture as JSON text and the weights as HDF5.
# NOTE(review): `json` is imported but never used in this cell; the string
# returned by `model.to_json()` is written to the file as-is.
import json
model_structure = model.to_json()
with open('lstm_model_structure.json', 'w') as model_json:
    model_json.write(model_structure)
model.save_weights('lstm_model_weights.h5')

In [54]:
# model = 0

# Load model

In [55]:
# Rebuild the architecture from the saved JSON, then restore the saved weights.
# This rebinds `model`; the reloaded model is not compiled in this cell, and
# the cells below only call its predict methods.
from keras.models import model_from_json
with open("lstm_model_structure.json", "r") as json_file:
    json_string = json_file.read()
model = model_from_json(json_string)
model.load_weights('lstm_model_weights.h5')

In [56]:
# ## stateful
# stateful: SimpleRNN(stateful=True) lets the RNN remember the last state FROM THE PREVIOUS SAMPLE. Only use it if the samples are related.
# ## bidirectional
# from keras.models import Sequential
# from keras.layers import SimpleRNN
# from keras.layers.wrappers import Bidirectional
#
# num_neurons = 10
# maxlen = 100
# embedding_dims = 300
# model = Sequential()
# model.add(Bidirectional(SimpleRNN(
#     num_neurons, return_sequences=True),
#     input_shape=(maxlen, embedding_dims)))

# Prediction

In [57]:
# One hand-written review to classify. The literal opens with four quote
# characters, so the text itself begins with a `"` character.
sample = [""""I hate that the dismal weather had me down for so long, when will it break! Ugh, when does happiness return? The sun is blinding and the puffy clouds are too thin. I can't wait for the weekend."""]

In [58]:
# ??pad_truncate

In [59]:
# Apply the same preprocessing as the training data: word vectors, then
# pad/truncate to maxlen vectors of length token_size.
# Rebinds `sample`, so re-running this cell without re-running the previous one fails.
sample = vectorize(sample)
sample = pad_truncate(sample, token_size, maxlen)

In [60]:
# type(vectorized_sample)

In [61]:
# Convert to an array of shape (number of samples, maxlen, token_size), matching the training input.
sample = np.reshape(sample, (len(sample), maxlen, token_size))

In [62]:
# Raw sigmoid output for the sample.
# NOTE(review): this shows the same value as `model.predict` in the next cell;
# `predict_proba` is reportedly absent from newer Keras releases — confirm before upgrading.
model.predict_proba(sample)

array([[0.25276127]], dtype=float32)

In [63]:
# Raw sigmoid output of the final Dense unit for the sample.
model.predict(sample)

array([[0.25276127]], dtype=float32)

In [64]:
# Threshold the sigmoid output at 0.5 to get the class label. This uses
# `predict` instead of the `predict_classes` convenience wrapper, which newer
# Keras/TensorFlow releases no longer provide; the result is the same int32 array.
(model.predict(sample) > 0.5).astype('int32')

array([[0]], dtype=int32)