In [1]:
from collections import Counter
from nltk.corpus import stopwords
from string import punctuation
from os import listdir
from numpy import array
from numpy import asarray
from numpy import zeros
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import Embedding
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D
from keras.layers import LeakyReLU
from keras.callbacks import EarlyStopping
import csv
import tensorflow as tf
from keras.models import load_model

Using TensorFlow backend.


In [2]:
tf.logging.set_verbosity(tf.logging.ERROR)

In [3]:
# load doc into memory
def load_doc(filename):
    # open the file as read only
    file = open(filename, 'r', encoding='utf-8')
    # read all text
    text = file.read()
    # close the file
    file.close()
    return text

In [4]:
path = './Data/labeledTrainData.tsv'
with open(path, encoding='utf-8') as tsvfile:
    reader = csv.reader(tsvfile, delimiter='\t')
    next(reader)
    pos_l = []
    neg_l = []
    for row in reader:
        if row[1] == '1':
            pos_l.append(row[2])
        else:
            neg_l.append(row[2])

In [5]:
train_val_pos = pos_l[:4500]
test_pos = pos_l[4500:5000]
train_val_neg = neg_l[:4500]
test_neg = neg_l[4500:5000]

# train_val_pos = pos_l[:900]
# test_pos = pos_l[900:1000]
# train_val_neg = neg_l[:900]
# test_neg = neg_l[900:1000]

# train_val_pos = pos_l[:11250]
# test_pos = pos_l[11250:]
# train_val_neg = neg_l[:11250]
# test_neg = neg_l[11250:]

In [6]:
# print(len(train_val_pos))
# print(len(test_pos))
# print(len(train_val_neg))
# print(len(test_neg))

In [7]:
# turn a doc into clean tokens
def clean_doc(doc):
    # split into tokens by white space
    tokens = doc.split()
    # remove punctuation from each token
    table = str.maketrans('', '', punctuation)
    tokens = [w.translate(table) for w in tokens]
    # remove remaining tokens that are not alphabetic
    tokens = [word for word in tokens if word.isalpha()]
    # filter out stop words
    stop_words = set(stopwords.words('english'))
    tokens = [w for w in tokens if not w in stop_words]
    # filter out short tokens
    tokens = [word for word in tokens if len(word) > 1]
    return tokens

# load all docs in a directory
def process_docs(l, vocab):
    for doc in l:
        tokens = clean_doc(doc)
        vocab.update(tokens)

In [8]:
# define vocab
vocab = Counter()
# add all docs to vocab
process_docs(train_val_pos, vocab)
process_docs(train_val_neg, vocab)
# print the size of the vocab
print(len(vocab))
# print the top words in the vocab
print(vocab.most_common(50))

80049
[('br', 20759), ('The', 16056), ('movie', 14711), ('film', 13926), ('one', 8278), ('like', 6927), ('This', 5381), ('good', 5086), ('It', 4339), ('would', 4294), ('time', 4175), ('really', 4069), ('even', 3940), ('see', 3936), ('story', 3883), ('much', 3308), ('get', 3259), ('people', 3119), ('bad', 3021), ('great', 2988), ('made', 2841), ('well', 2835), ('films', 2810), ('make', 2803), ('movies', 2799), ('first', 2789), ('also', 2777), ('way', 2713), ('could', 2708), ('dont', 2669), ('think', 2630), ('But', 2596), ('characters', 2592), ('And', 2489), ('Its', 2430), ('character', 2359), ('seen', 2348), ('many', 2323), ('watch', 2256), ('never', 2252), ('two', 2251), ('plot', 2222), ('acting', 2207), ('little', 2127), ('In', 2118), ('know', 2116), ('best', 2073), ('life', 2064), ('ever', 2046), ('love', 2015)]


In [9]:
min_occurane = 2
tokens = [k for k,c in vocab.items() if c >= min_occurane]
print(len(tokens))

39193


In [10]:
def save_list(lines, filename):
    # convert lines to a single blob of text
    data = '\n'.join(lines)
    # open file
    file = open(filename, 'w', encoding='utf-8')
    # write text
    file.write(data)
    # close file
    file.close()
 
# save tokens to a vocabulary file
save_list(tokens, './Data/vocab.txt')

In [11]:
# turn a doc into clean tokens
def clean_doc_2(doc, vocab):
    # split into tokens by white space
    tokens = doc.split()
    # remove punctuation from each token
    table = str.maketrans('', '', punctuation)
    tokens = [w.translate(table) for w in tokens]
    # filter out tokens not in vocab
    tokens = [w for w in tokens if w in vocab]
    tokens = ' '.join(tokens)
    return tokens
 
# load all docs in a directory
def process_docs_2(l, vocab):
    documents = list()
    for doc in l:
        tokens = clean_doc_2(doc, vocab)
        documents.append(tokens)
    return documents

In [12]:
# load the vocabulary
vocab_filename = './Data/vocab.txt'
vocab = load_doc(vocab_filename)
vocab = vocab.split()
vocab = set(vocab)

In [13]:
# load embedding as a dict
def load_embedding(filename):
    # load embedding into memory, skip first line
    file = open(filename,'r', encoding="utf8")
    lines = file.readlines()
    file.close()
    # create a map of words to vectors
    embedding = dict()
    for line in lines:
        parts = line.split()
        # key is string word, value is numpy array for vector
        embedding[parts[0]] = asarray(parts[1:], dtype='float32')
    return embedding
 
# create a weight matrix for the Embedding layer from a loaded embedding
def get_weight_matrix(embedding, vocab):
    # total vocabulary size plus 0 for unknown words
    vocab_size = len(vocab) + 1
    # define weight matrix dimensions with all 0
    weight_matrix = zeros((vocab_size, 100))
    # step vocab, store vectors using the Tokenizer's integer mapping
    for word, i in vocab.items():
        vector = embedding.get(word)
        if vector is not None:
            weight_matrix[i] = vector
    return weight_matrix

In [14]:
# load all training reviews
positive_docs = process_docs_2(train_val_pos, vocab)
negative_docs = process_docs_2(train_val_neg, vocab)
train_val_docs = positive_docs + negative_docs

# create the tokenizer
tokenizer = Tokenizer()
# fit the tokenizer on the documents
tokenizer.fit_on_texts(train_val_docs)

In [15]:
## Spliting into Train and Validation Dataset
train_pos = positive_docs[:4000]
val_pos = positive_docs[4000:]

train_neg = negative_docs[:4000]
val_neg = negative_docs[4000:]

In [16]:
# print(len(train_pos))
# print(len(val_pos))
# print(len(train_neg))
# print(len(val_neg))

In [17]:
# Generating Xtrain and ytrain
train_docs = train_pos + train_neg
# sequence encode
encoded_docs = tokenizer.texts_to_sequences(train_docs)
# pad sequences
max_length = max([len(s.split()) for s in train_docs])
Xtrain = pad_sequences(encoded_docs, maxlen=max_length, padding='post')
# define training labels
ytrain = array([0 for _ in range(4000)] + [1 for _ in range(4000)])

In [18]:
# Generating Xval and yval
val_docs = val_pos + val_neg
# sequence encode
encoded_docs = tokenizer.texts_to_sequences(val_docs)
# pad sequences
Xval = pad_sequences(encoded_docs, maxlen=max_length, padding='post')
# define test labels
yval = array([0 for _ in range(500)] + [1 for _ in range(500)])

In [19]:
# Generating Xtest and ytest
positive_docs = process_docs_2(test_pos, vocab)
negative_docs = process_docs_2(test_neg, vocab)
test_docs = positive_docs + negative_docs
# sequence encode
encoded_docs = tokenizer.texts_to_sequences(test_docs)
# pad sequences
Xtest = pad_sequences(encoded_docs, maxlen=max_length, padding='post')
# define test labels
ytest = array([0 for _ in range(500)] + [1 for _ in range(500)])

In [20]:
# define vocabulary size (largest integer value)
vocab_size = len(tokenizer.word_index) + 1

In [21]:
# load embedding from file
raw_embedding = load_embedding('./Data/glove_6B/glove_6B_100d.txt')

In [22]:
# get vectors in the right order
embedding_vectors = get_weight_matrix(raw_embedding, tokenizer.word_index)

In [23]:
# embedding_vectors.shape

In [24]:
# max_length

In [25]:
# create the embedding layer
embedding_layer = Embedding(vocab_size, 100, weights=[embedding_vectors], input_length=max_length, trainable=True)

In [26]:
def get_CNN_model(n_filters, n_kernel_size, n_pool_size, learning_rate, act_fn):
    model = Sequential()
    model.add(embedding_layer)
    if act_fn == 'leakyRelu':
        model.add(Conv1D(filters=n_filters, kernel_size=n_kernel_size))
        model.add(LeakyReLU(alpha=0.05))
    else:
        model.add(Conv1D(filters=n_filters, kernel_size=n_kernel_size, activation='relu'))
    model.add(MaxPooling1D(pool_size=n_pool_size))
    model.add(Flatten())
    model.add(Dense(1, activation='sigmoid'))
#     print(model.summary())
    return model

In [27]:
es = EarlyStopping(monitor='val_loss', patience=3, verbose=0, mode='auto')

In [28]:
# fit network
n_epochs = [10]
learning_rate = [0.001] #[0.1, 0.001, 0.0001]
act_fn = ['relu', 'leakyRelu']
n_pool_size = [2, 3, 4]
n_filters = [64, 128, 256]
n_kernel_size = [2, 3, 5]
for epoch in n_epochs:
    for a_fn in act_fn:
        for l_rate in learning_rate:
            for pool_size in n_pool_size:
                for filters in n_filters:
                    for kernel_size in n_kernel_size:
                        print("Current Model: Epochs = {0}, a_fn = {1}, l_rate = {2}, pool_size = {3}, filter = {4}, kernel = {5}".format(epoch, a_fn, l_rate, pool_size, filters, kernel_size))
                        model = get_CNN_model(filters, kernel_size, pool_size, l_rate, a_fn)
                        model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
                        model.fit(Xtrain, ytrain, epochs=epoch, verbose=2, validation_data = (Xval, yval), callbacks = [es])
                        # Save the model
                        model_name = "{6}_epochs_{0}_a_fn_{1}_lrate_{2}_pool_size_{3}_filter_{4}_kernel_{5}".format(epoch, a_fn, l_rate, pool_size, filters, kernel_size, "CNN")
                        model.save("./CNN_models_medium/" + model_name + ".h5")
                        # Save model config as json
                        model_json = model.to_json()
                        with open("./CNN_models_medium/json/" + model_name + ".json", "w") as json_file:
                            json_file.write(model_json)

In [29]:
# best model parameters
pool_size = 2
kernel_size = 2
filters = 64
l_rate = 0.001
act_fn = 'leakyRelu'
# Load the model
large_model = load_model('./CNN_models_medium/CNN_epochs_10_a_fn_relu_lrate_0.001_pool_size_2_filter_256_kernel_2.h5')
loss, acc = large_model.evaluate(Xtest, ytest, verbose=0)
print('Test Accuracy: %f' % (acc*100))

Test Accuracy: 86.600000
