# Preprocessing

In [1]:
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Dropout, GlobalMaxPooling1D, Conv1D, Activation


Using TensorFlow backend.


In [2]:
import glob
import os
from random import shuffle
import numpy as np
import random

# get_data() shuffles with the stdlib `random` module, which has its own
# generator: seeding NumPy alone leaves that shuffle different on every run.
random.seed(42)
np.random.seed(42)

In [3]:
def get_data(filepath):
    """Load labeled review texts from ``filepath/pos`` and ``filepath/neg``.

    Every ``*.txt`` file under ``pos`` is paired with label 1 and every
    ``*.txt`` file under ``neg`` with label 0.

    Args:
        filepath: Directory that contains the ``pos`` and ``neg`` subfolders.

    Returns:
        A list of ``(label, text)`` tuples in shuffled order. The list is
        empty when no matching files are found.
    """
    data = []
    for subfolder, label in (('pos', 1), ('neg', 0)):
        pattern = os.path.join(filepath, subfolder, '*.txt')
        # glob returns files in an OS-dependent order; sorting first makes the
        # shuffle below reproducible whenever the `random` module is seeded.
        for filename in sorted(glob.glob(pattern)):
            # Decode explicitly instead of relying on the platform default
            # encoding (assumes the review files are UTF-8 -- confirm for
            # other corpora).
            with open(filename, 'r', encoding='utf-8') as file:
                data.append((label, file.read()))
    shuffle(data)
    return data

In [4]:
# Training split of the IMDB dataset, resolved against the current working directory.
imdb_path = os.path.join(
    os.getcwd(),
    'aclImdb/train',
)

In [5]:
# Read every labeled review under imdb_path into a shuffled list of (label, text) tuples.
labeled_data_long = get_data(imdb_path)

In [6]:
# Work with only the first 10,240 entries of the shuffled reviews.
labeled_data = labeled_data_long[:10240]

In [7]:
# Peek at the first (label, text) pair.
labeled_data[0]

(0,
 "I thought before starting with these movie that it might be a good one, but when i started with it i found it really awful. They said movie is being made in Afghanistan but i think 95% of the movie is shot in India. you can see Indian made cars. you can see lars drinking bisleri(an Indian water brand), Hindi written on the road, you can also see temples in Afghanistan *hahah* its really funny and many more stuff which proves its not shot in Afghanistan. I think one should not waste his/her time watching this movie.. pure time waste.. i would recommend to do something else instead of watching this movie or may be might heart is better idea but don't watch this waste of time")

# Word vectors

In [8]:
from nltk.tokenize import TreebankWordTokenizer
from gensim.models import KeyedVectors
import copy
import numpy as np
import pandas as pd

In [None]:
# Load the pretrained Google News word2vec embeddings (binary format),
# capped at 500,000 vectors via `limit`.
word_vectors = KeyedVectors.load_word2vec_format(
    '../word vectors/word2vec/GoogleNews-vectors-negative300.bin.gz',
    binary=True,
    limit=500000,
)

In [None]:
def vectorize(dataset):
    """Turn each text in ``dataset`` into a list of word vectors.

    Each entry is tokenized with ``TreebankWordTokenizer`` and every token is
    looked up in the module-level ``word_vectors``. Tokens whose lookup raises
    ``KeyError`` are skipped.

    Args:
        dataset: Iterable of texts (the callers in this notebook pass strings).

    Returns:
        A list with one item per entry; each item is the list of vectors found
        for that entry's tokens, in token order.
    """
    tokenizer = TreebankWordTokenizer()

    def embed(text):
        vectors = []
        for word in tokenizer.tokenize(text):
            try:
                vector = word_vectors[word]
            except KeyError:
                continue  # no vector for this token: leave it out
            vectors.append(vector)
        return vectors

    return [embed(text) for text in dataset]
        
        

In [None]:
# !pwd

In [None]:
# labeled_data[0][1]

In [None]:
# sample = vectorize(labeled_data[:3])

In [None]:
# len(sample[2])

In [None]:
# Text half of every (label, text) pair.
reviews = [text for _label, text in labeled_data]

In [None]:
# Word-vector sequences for every review, plus the matching labels in the same order.
vectorized_reviews = vectorize(reviews)
labels = [label for label, _text in labeled_data]

In [None]:
# len(vectorized_reviews[0])
# Vectorize one hand-written review as a quick check of `vectorize`.
# The literal previously opened with four quote characters, which put a stray
# leading '"' into the text; a plain triple-quoted string avoids that.
vectorized_sample = vectorize(["""I hate that the dismal weather had me down for so long, when will it break! Ugh, when does happiness return? The sun is blinding and the puffy clouds are too thin. I can't wait for the weekend."""])

In [None]:
# Number of tokens in the sample review for which a word vector was found.
len(vectorized_sample[0])

In [None]:

# np.mean(labels)

In [None]:
# Index separating the first 80% of the examples from the remainder.
split_point = int(0.8 * len(labels))

In [None]:
# Everything before split_point is used for training; the rest is held out.
x_train = vectorized_reviews[:split_point]
x_test = vectorized_reviews[split_point:]
y_train = labels[:split_point]
y_test = labels[split_point:]

# Padding

In [None]:
def pad_truncate(x1, token_size=300, max_len=400):
    """Return a copy of ``x1`` in which every review has exactly ``max_len`` entries.

    Args:
        x1: Sequence of reviews; each review is an iterable of token vectors.
        token_size: Length of the all-zero vector used as padding.
        max_len: Number of entries every returned review will have.

    Returns:
        A shallow copy of ``x1`` whose items are new lists: longer reviews are
        cut to their first ``max_len`` entries, shorter ones are right-padded
        with a zero vector. ``x1`` itself is left unchanged.
    """
    padded = copy.copy(x1)
    filler = np.zeros(token_size)
    for idx in range(len(padded)):
        tokens = list(padded[idx])[:max_len]
        # Every padding slot refers to the same `filler` array object.
        tokens.extend([filler] * (max_len - len(tokens)))
        padded[idx] = tokens
    return padded

In [None]:
# vectorized_sample = pad_truncate(vectorized_sample[0])

In [None]:
# len(vectorized_sample)

In [None]:
# t = [np.array([1, 2, 3]), np.array([1, 4, 9])]

In [None]:
# len(t)

In [None]:
# t2 = pad_truncate(t)

In [None]:
# len(t2[1])

In [None]:
# Bring every review to the same length using pad_truncate's defaults
# (max_len=400 token vectors, zero-padding of size 300).
x_train_padded = pad_truncate(x_train)
x_test_padded = pad_truncate(x_test)

In [None]:
# len(x_train_padded[0])

In [None]:
# x_train[117]

In [None]:
# x_train_padded[0][117]

In [None]:
# type(x_train_padded)

In [None]:
# Training-loop settings (passed to model.fit).
epochs = 2
batch_size = 32

# Input dimensions: token vectors per review and values per token vector.
maxlen = 400
token_size = 300

# NOTE(review): not referenced by any other cell visible in this notebook.
filters = 250
hidden_dims = 250
kernel_length = 3

In [None]:
# Convert the nested lists into (n_reviews, maxlen, token_size) arrays.
x_train_padded = np.reshape(
    x_train_padded,
    (len(x_train_padded), maxlen, token_size),
)
x_test_padded = np.reshape(
    x_test_padded,
    (len(x_test_padded), maxlen, token_size),
)

In [None]:
# x_train_padded = np.reshape(x_train, (len(x_train_padded), maxlen, embedding_dims))
# x_test_padded = np.reshape(x_test, (len(x_test_padded), maxlen, embedding_dims))

In [None]:
# print(x_train_padded.shape)
# x_test_padded.shape

In [None]:
# Convert the training labels from a Python list to a NumPy array.
y_train = np.array(y_train)

In [None]:
# Convert the held-out labels from a Python list to a NumPy array.
y_test = np.array(y_test)

In [None]:
# Check the training input shape; after the reshape it is (n_reviews, maxlen, token_size).
x_train_padded.shape

In [None]:
# NOTE(review): repeats the shape check from the previous cell.
x_train_padded.shape

## Building the model

In [None]:
from keras.models import Sequential
from keras.layers import Dense, Flatten, Dropout, LSTM

In [None]:
# Start from an empty Sequential model; layers are added in the following cells.
model = Sequential()

In [None]:
# Passed as the first argument of the LSTM layer below (its number of units).
num_neurons = 50

In [None]:
# Recurrent layer. return_sequences=True makes it return the output of every
# time step instead of only the last one (the default).
model.add(
    LSTM(
        num_neurons,
        return_sequences=True,
        input_shape=(maxlen, token_size),
    )
)

In [None]:
# Development-time introspection (IPython `??` shows an object's source); kept
# commented out, like the `# ??pad_truncate` cell, so a full run opens no pager.
# ??LSTM

In [None]:
# Classifier head: dropout (rate 0.3) on the LSTM outputs, flatten the
# per-time-step outputs into one vector, then a single sigmoid unit that
# yields one value per review.
model.add(Dropout(.3))
model.add(Flatten()) # from 400 50-long vectors to a single 400X50 long vector
model.add(Dense(1, activation='sigmoid'))

In [None]:
# Print the layer-by-layer summary of the assembled model.
model.summary()

In [None]:
# Binary classification setup: RMSprop optimizer, binary cross-entropy loss,
# accuracy reported as the metric.
model.compile(
    optimizer='rmsprop',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Train on the padded training set; the held-out split is evaluated after each epoch.
model.fit(
    x_train_padded,
    y_train,
    epochs=epochs,
    batch_size=batch_size,
    validation_data=(x_test_padded, y_test),
)

In [None]:
# free up memory
# x_train_padded = []
# x_test_padded = x_train_padded

In [None]:
# x_train = []

# Save model

In [None]:
# Persist the trained model in two files: the architecture as a JSON string
# ('lstm_model_structure.json') and the weights ('lstm_model_weights.h5').
# NOTE(review): `json` is imported here but not referenced in this cell.
import json
model_structure = model.to_json()
with open('lstm_model_structure.json', 'w') as model_json:
    model_json.write(model_structure)
model.save_weights('lstm_model_weights.h5')

In [None]:
# model = 0

# Load model

In [None]:
# Rebuild the model from the two saved files: recreate the architecture from
# the JSON string, then load the stored weights into it. This rebinds `model`.
from keras.models import model_from_json
with open("lstm_model_structure.json", "r") as json_file:
    json_string = json_file.read()
model = model_from_json(json_string)
model.load_weights('lstm_model_weights.h5')

In [None]:
# ## Stateful
# SimpleRNN(stateful=True) lets the RNN remember the last state from the previous sample. Only use it if the samples are related.
#
# ## Bidirectional
# from keras.models import Sequential
# from keras.layers import SimpleRNN
# from keras.layers.wrappers import Bidirectional
#
# num_neurons = 10
# maxlen = 100
# embedding_dims = 300
#
# model = Sequential()
# model.add(Bidirectional(SimpleRNN(
#     num_neurons, return_sequences=True),
#     input_shape=(maxlen, embedding_dims)))

# Prediction

In [None]:
# Raw text to classify. The literal previously opened with four quote
# characters, which put a stray leading '"' into the text.
sample = ["""I hate that the dismal weather had me down for so long, when will it break! Ugh, when does happiness return? The sun is blinding and the puffy clouds are too thin. I can't wait for the weekend."""]

In [None]:
# ??pad_truncate

In [None]:
# Apply the same preprocessing as the training data: vectorize, then pad/truncate.
# NOTE(review): `sample` is overwritten, so this cell cannot be re-run without
# first re-running the cell that defines the raw text.
sample = vectorize(sample)
sample = pad_truncate(sample, token_size, maxlen)

In [None]:
# type(vectorized_sample)

In [None]:
# Convert to a (n_samples, maxlen, token_size) array, matching the LSTM's input_shape.
sample = np.reshape(sample, (len(sample), maxlen, token_size))

In [None]:
# Sequential.predict_proba is deprecated and absent from newer Keras releases;
# for this sigmoid-output model, predict() already returns the probability.
model.predict(sample)

In [None]:
# Raw model output for the sample (one sigmoid value per input).
model.predict(sample)

In [None]:
# Sequential.predict_classes is deprecated and absent from newer Keras releases.
# For a single sigmoid output, thresholding the predicted probability at 0.5
# yields the same 0/1 class labels.
(model.predict(sample) > 0.5).astype('int32')