In [1]:
from collections import Counter
from nltk.corpus import stopwords
from string import punctuation
from os import listdir
from numpy import array
from numpy import asarray
from numpy import zeros
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import Embedding
from keras.layers import GRU
from keras.layers import LSTM, Dropout 
from keras.initializers import Constant
from keras.callbacks import EarlyStopping
import csv
from keras.layers import Bidirectional, GlobalMaxPool1D
import tensorflow as tf

Using TensorFlow backend.


In [2]:
# Raise TensorFlow's logging threshold to ERROR to keep cell outputs readable.
tf.logging.set_verbosity(tf.logging.ERROR)

In [3]:
# load doc into memory
def load_doc(filename):
    """Read a UTF-8 text file and return its whole content as one string.

    Args:
        filename: path of the file to read.

    Returns:
        The complete text of the file.
    """
    # the context manager closes the handle even when read() raises
    with open(filename, 'r', encoding='utf-8') as file:
        return file.read()

In [4]:
# Read the tab-separated training file and bucket the text in column 2
# by the value of column 1: '1' goes to pos_l, anything else to neg_l.
path = './Data/labeledTrainData.tsv'
pos_l, neg_l = [], []
with open(path, encoding='utf-8') as tsvfile:
    reader = csv.reader(tsvfile, delimiter='\t')
    # the first row is consumed and discarded before the loop
    next(reader)
    for row in reader:
        bucket = pos_l if row[1] == '1' else neg_l
        bucket.append(row[2])

In [5]:
# The first 11250 entries of each class are used for training + validation;
# whatever follows is held out as the test set.
n_train_val = 11250
train_val_pos, test_pos = pos_l[:n_train_val], pos_l[n_train_val:]
train_val_neg, test_neg = neg_l[:n_train_val], neg_l[n_train_val:]

# train_val_pos = pos_l[:900]
# test_pos = pos_l[900:1000]
# train_val_neg = neg_l[:900]
# test_neg = neg_l[900:1000]

In [6]:
# print(len(train_pos))
# print(len(val_pos))
# print(len(test_pos))
# print(len(train_neg))
# print(len(val_neg))
# print(len(test_neg))

In [7]:
# turn a doc into clean tokens
def clean_doc(doc):
    """Tokenise a document and return the list of tokens worth counting.

    A whitespace-separated token is kept when, after all ASCII punctuation
    has been stripped from it, it is purely alphabetic, is not an NLTK
    English stop word, and is longer than one character.

    NOTE(review): the stop-word comparison is case-sensitive, so capitalised
    forms such as 'The' are not filtered (they appear in the vocabulary
    printed further down in this notebook).

    Args:
        doc: raw document text.

    Returns:
        List of cleaned tokens, in document order.
    """
    strip_punct = str.maketrans('', '', punctuation)
    stop_words = set(stopwords.words('english'))
    cleaned = []
    for raw in doc.split():
        word = raw.translate(strip_punct)
        # drop anything that is not purely alphabetic (also drops empty strings)
        if not word.isalpha():
            continue
        if word in stop_words:
            continue
        # single-character tokens are discarded
        if len(word) <= 1:
            continue
        cleaned.append(word)
    return cleaned

# count the cleaned tokens of every document in a list
def process_docs(l, vocab):
    """Add the tokens of each document in `l` to the counter `vocab`.

    Args:
        l: iterable of raw document strings.
        vocab: object with an `update` method (a Counter in this notebook);
            it is modified in place.
    """
    for text in l:
        vocab.update(clean_doc(text))

In [8]:
# token-frequency counter built from the train+validation documents only
vocab = Counter()
for docs in (train_val_pos, train_val_neg):
    process_docs(docs, vocab)
# number of distinct tokens
print(len(vocab))
# the 50 most frequent tokens with their counts
print(vocab.most_common(50))

130812
[('br', 51650), ('The', 40001), ('movie', 37025), ('film', 33460), ('one', 20844), ('like', 17041), ('This', 13262), ('good', 12532), ('It', 10845), ('would', 10836), ('time', 10404), ('really', 10183), ('story', 9966), ('even', 9777), ('see', 9773), ('much', 8385), ('get', 8117), ('bad', 7630), ('people', 7616), ('great', 7444), ('made', 7100), ('first', 7072), ('well', 7021), ('also', 6918), ('films', 6863), ('make', 6836), ('movies', 6823), ('could', 6820), ('way', 6720), ('dont', 6593), ('characters', 6508), ('But', 6483), ('think', 6418), ('Its', 6051), ('And', 5996), ('seen', 5842), ('character', 5823), ('watch', 5656), ('many', 5639), ('two', 5570), ('never', 5544), ('acting', 5530), ('plot', 5466), ('little', 5389), ('know', 5353), ('In', 5320), ('best', 5146), ('show', 5141), ('love', 5132), ('life', 5117)]


In [9]:
# keep only the tokens that occur at least `min_occurane` times
min_occurane = 2
tokens = [word for word, count in vocab.items() if count >= min_occurane]
print(len(tokens))

62420


In [10]:
def save_list(lines, filename):
    """Write an iterable of strings to a UTF-8 file, one item per line.

    The items are joined with a newline, so no trailing newline is written
    after the last item. An existing file is overwritten.

    Args:
        lines: iterable of strings.
        filename: path of the file to create.
    """
    data = '\n'.join(lines)
    # the context manager flushes and closes the file even if write() raises
    with open(filename, 'w', encoding='utf-8') as file:
        file.write(data)
 
# save the filtered tokens to a vocabulary file, one token per line
save_list(tokens, './Data/vocab.txt')

In [11]:
# turn a doc into clean tokens
def clean_doc_2(doc, vocab):
    """Reduce a document to its in-vocabulary tokens.

    The text is split on whitespace, ASCII punctuation is stripped from each
    token, and only tokens contained in `vocab` are kept.

    Args:
        doc: raw document text.
        vocab: container of allowed tokens (membership is tested with `in`).

    Returns:
        The surviving tokens joined by single spaces, in document order.
    """
    strip_punct = str.maketrans('', '', punctuation)
    kept = []
    for raw in doc.split():
        word = raw.translate(strip_punct)
        if word in vocab:
            kept.append(word)
    return ' '.join(kept)
 
# clean every document in a list against the vocabulary
def process_docs_2(l, vocab):
    """Apply `clean_doc_2` to each document in `l`.

    Args:
        l: iterable of raw document strings.
        vocab: container of allowed tokens, passed through to `clean_doc_2`.

    Returns:
        List of cleaned document strings, in the same order as `l`.
    """
    return [clean_doc_2(text, vocab) for text in l]

In [12]:
# reload the saved vocabulary as a set of tokens (replaces the Counter named `vocab`)
vocab_filename = './Data/vocab.txt'
vocab = set(load_doc(vocab_filename).split())

In [13]:
# load embedding as a dict
def load_embedding(filename):
    """Load a whitespace-separated text embedding file into a dict.

    Each non-blank line is expected to be `<word> <v1> <v2> ...`. Every line
    is parsed (no header line is skipped); blank lines are ignored.

    Args:
        filename: path of the embedding text file (UTF-8).

    Returns:
        dict mapping each word to a float32 numpy array of its vector.
    """
    embedding = dict()
    # stream the file line by line instead of holding every line in memory;
    # the context manager closes the handle even if parsing raises
    with open(filename, 'r', encoding="utf8") as file:
        for line in file:
            parts = line.split()
            if not parts:
                # blank line: nothing to index
                continue
            # key is string word, value is numpy array for vector
            embedding[parts[0]] = asarray(parts[1:], dtype='float32')
    return embedding
 
# create a weight matrix for the Embedding layer from a loaded embedding
def get_weight_matrix(embedding, vocab, embedding_dim=100):
    """Build an embedding weight matrix whose rows follow `vocab`'s indices.

    Args:
        embedding: dict mapping word -> vector (as returned by load_embedding).
        vocab: dict mapping word -> integer row index.
        embedding_dim: length of each vector; defaults to 100, the value
            that was previously hard-coded.

    Returns:
        Array of shape (len(vocab) + 1, embedding_dim). Rows for words that
        have no entry in `embedding` are left as zeros.
    """
    # one row more than the number of words, so that an index equal to
    # len(vocab) is still a valid row
    vocab_size = len(vocab) + 1
    # define weight matrix dimensions with all 0
    weight_matrix = zeros((vocab_size, embedding_dim))
    # copy each known vector into the row given by the word's index
    for word, i in vocab.items():
        vector = embedding.get(word)
        if vector is not None:
            weight_matrix[i] = vector
    return weight_matrix

In [14]:
# clean the train+validation documents of each class, keeping only in-vocabulary tokens
positive_docs = process_docs_2(train_val_pos, vocab)
negative_docs = process_docs_2(train_val_neg, vocab)
train_val_docs = positive_docs + negative_docs

# create the tokenizer
tokenizer = Tokenizer()
# fit the tokenizer on the documents
# NOTE(review): the validation documents are part of train_val_docs, so the
# word index is built from training and validation text together - confirm intended
tokenizer.fit_on_texts(train_val_docs)

In [15]:
## Splitting into Train and Validation Dataset
# the first 10000 cleaned documents of each class are for training, the rest for validation
n_train = 10000
train_pos, val_pos = positive_docs[:n_train], positive_docs[n_train:]
train_neg, val_neg = negative_docs[:n_train], negative_docs[n_train:]

# train_pos = positive_docs[:800]
# val_pos = positive_docs[800:]
# train_neg = negative_docs[:800]
# val_neg = negative_docs[800:]

# train_pos = positive_docs[:4000]
# val_pos = positive_docs[4000:]
# train_neg = negative_docs[:4000]
# val_neg = negative_docs[4000:]

In [16]:
# Generating Xtrain and ytrain
train_docs = train_pos + train_neg
# sequence encode
encoded_docs = tokenizer.texts_to_sequences(train_docs)
# pad every sequence to the length (in whitespace tokens) of the longest training document
max_length = max(len(s.split()) for s in train_docs)
Xtrain = pad_sequences(encoded_docs, maxlen=max_length, padding='post')
# Labels follow the concatenation order of train_docs: 0 for every document
# in train_pos, 1 for every document in train_neg. The counts are taken from
# the lists themselves so the labels cannot drift out of step with the data
# when the split sizes change.
# NOTE(review): under this convention the reviews collected in pos_l
# (dataset label '1') are assigned class 0.
ytrain = array([0] * len(train_pos) + [1] * len(train_neg))

In [17]:
# Generating Xval and yval
val_docs = val_pos + val_neg
# sequence encode
encoded_docs = tokenizer.texts_to_sequences(val_docs)
# pad sequences to the same length used for the training data
Xval = pad_sequences(encoded_docs, maxlen=max_length, padding='post')
# Validation labels follow the order of val_docs: 0 for each document in
# val_pos, 1 for each in val_neg. Counts come from the lists so they always
# match the number of rows in Xval.
yval = array([0] * len(val_pos) + [1] * len(val_neg))

In [18]:
# Generating Xtest and ytest
# Dedicated names are used for the cleaned test documents so that
# positive_docs / negative_docs keep holding the train+validation documents;
# overwriting them would corrupt a later re-run of the train/validation split.
test_positive_docs = process_docs_2(test_pos, vocab)
test_negative_docs = process_docs_2(test_neg, vocab)
test_docs = test_positive_docs + test_negative_docs
# sequence encode
encoded_docs = tokenizer.texts_to_sequences(test_docs)
# pad sequences to the same length used for the training data
Xtest = pad_sequences(encoded_docs, maxlen=max_length, padding='post')
# Test labels follow the order of test_docs: 0 for each positive-list
# document, 1 for each negative-list document. Counts come from the lists so
# they always match the number of rows in Xtest.
ytest = array([0] * len(test_positive_docs) + [1] * len(test_negative_docs))

In [19]:
# define vocabulary size: number of words known to the tokenizer plus one,
# which matches the row count produced by get_weight_matrix
vocab_size = len(tokenizer.word_index) + 1

In [20]:
# load the pre-trained word vectors from file into a dict of word -> vector
raw_embedding = load_embedding('./Data/glove_6B/glove_6B_100d.txt')

In [21]:
# arrange the vectors into a matrix whose row i belongs to the word with tokenizer index i
embedding_vectors = get_weight_matrix(raw_embedding, tokenizer.word_index)

In [22]:
# create the embedding layer, initialised from the pre-trained weight matrix;
# trainable=True means the vectors are updated further during fitting
embedding_layer = Embedding(vocab_size, 100, embeddings_initializer=Constant(embedding_vectors), input_length=max_length, trainable=True)

In [23]:
def get_RNN_model(lstm_units, dense_units, n_dropout, learning_rate):
    """Build and compile a bidirectional-LSTM binary classifier.

    Architecture: Embedding -> Bidirectional(LSTM) -> GlobalMaxPool1D ->
    Dense(relu) -> Dropout -> Dense(1, sigmoid).

    Args:
        lstm_units: number of units of the LSTM wrapped by Bidirectional.
        dense_units: width of the hidden Dense layer.
        n_dropout: dropout rate applied after the hidden Dense layer.
        learning_rate: learning rate handed to the Adam optimizer.

    Returns:
        A compiled keras Sequential model (binary cross-entropy loss,
        accuracy metric).
    """
    # local import: the optimizer class is only needed here
    from keras.optimizers import Adam

    model = Sequential()
    # A fresh Embedding is created for every model. Adding one shared,
    # trainable layer instance to several models would make them share its
    # weights, so each new configuration would start from vectors already
    # fine-tuned by the previous fits instead of from the pre-trained matrix.
    model.add(Embedding(vocab_size, embedding_vectors.shape[1],
                        embeddings_initializer=Constant(embedding_vectors),
                        input_length=max_length, trainable=True))
    model.add(Bidirectional(LSTM(lstm_units, return_sequences = True)))
    model.add(GlobalMaxPool1D())
    model.add(Dense(dense_units, activation="relu"))
    model.add(Dropout(n_dropout))
    model.add(Dense(1, activation="sigmoid"))
    # compile network; pass an Adam instance so that `learning_rate` is
    # actually applied (the string 'adam' always uses the default rate).
    # `lr` is the keyword used by the Keras 2.x releases this notebook targets.
    model.compile(loss='binary_crossentropy', optimizer=Adam(lr=learning_rate), metrics=['accuracy'])
    return model

In [24]:
# early-stopping callback: monitors the validation loss with a patience of 3 epochs
es = EarlyStopping(monitor='val_loss', patience=3, verbose=0, mode='auto')

In [25]:
# Grid search: train one model per combination of the hyper-parameter values
# listed below and save each trained model to its own .h5 file.
n_epochs = [10]
learning_rate = [0.001] #[0.1, 0.001, 0.0001]
n_lstm_units = [16, 32, 64]
n_dense_units = [20, 40, 60]
n_dropout_rate = [0.2, 0.5, 0.8]
for epoch in n_epochs:
    for l_rate in learning_rate:
        for lstm_units in n_lstm_units:
            for dense_units in n_dense_units:
                for dropout_rate in n_dropout_rate:
                    print("Current Model: Epochs = {0}, l_rate = {1}, lstm_units = {2}, dense_units = {3}, dropout_rate = {4}".format(epoch, l_rate, lstm_units, dense_units, dropout_rate))
                    model = get_RNN_model(lstm_units, dense_units, dropout_rate, l_rate)
                    # fit network, stopping early when the validation loss stops improving
                    model.fit(Xtrain, ytrain, epochs=epoch, verbose=1, validation_data = (Xval, yval), callbacks = [es])
                    # Save the model. The prefix is argument 5 ("RNN"); the
                    # previous "{4}" prefix repeated the dropout rate and the
                    # "RNN" tag never appeared in the file name.
                    model_name = "{5}_epochs_{0}_lrate_{1}_lstm_units_{2}_dense_units_{3}_dropout_rate_{4}".format(epoch, l_rate, lstm_units, dense_units, dropout_rate, "RNN")
                    model.save("./RNN_models_large/" + model_name + ".h5")

In [29]:
# Best Model: rebuild the selected hyper-parameter configuration
l_rate = 0.001
lstm_units = 16
dense_units = 20
dropout_rate = 0.2
model = get_RNN_model(lstm_units, dense_units, dropout_rate, l_rate)
# fit network for the full 10 epochs (no early-stopping callback is passed here)
model.fit(Xtrain, ytrain, epochs=10, verbose=1, validation_data = (Xval, yval))

Train on 20000 samples, validate on 2500 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x13ba97c18d0>

In [None]:
# evaluate the most recently fitted `model` on the held-out test set
loss, acc = model.evaluate(Xtest, ytest, verbose=0)
# the accuracy is multiplied by 100 so that it is printed as a percentage
print('Test Accuracy: %f' % (acc*100))