# Sentiment Analysis on IMDb Reviews

In [1]:
from __future__ import print_function, division

Download and extract the IMDB sentiment dataset:
    
    wget http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
    tar xfz aclImdb_v1.tar.gz

In [2]:
# Vocabulary: All words used, starting by the most frequent
with open('aclImdb/imdb.vocab') as f:
    vocab = [word.rstrip() for word in f]
    # Keep only most frequent 5000 words rather than all 90000
    # Just saving memory - the long tail occurs too few times
    # for the model to learn anything anyway
    vocab = vocab[:5000]
    print('%d words in vocabulary' % (len(vocab),))

5000 words in vocabulary


In [3]:
import re

def text_tokens(text):
    text = text.lower()
    text = re.sub("\\s", " ", text)
    text = re.sub("[^a-zA-Z' ]", "", text)
    tokens = text.split(' ')
    return tokens

In [4]:
import os

def load_dataset(dirname):
    X, y = [], []
    # Review files: neg/0_3.txt neg/10000_4.txt neg/10001_4.txt ...
    for y_val, y_label in enumerate(['neg', 'pos']):
        y_dir = os.path.join(dirname, y_label)
        for fname in os.listdir(y_dir):
            fpath = os.path.join(y_dir, fname)
            print('\r' + fpath + '   ', end='')
            with open(fpath) as f:
                tokens = text_tokens(f.read())
            X.append(tokens)
            y.append(y_val)  # 0 for 'neg', 1 for 'pos'
    print()
    return X, y

In [5]:
X_train, y_train = load_dataset('aclImdb/train/')
X_val, y_val = load_dataset('aclImdb/test/')

aclImdb/train/pos/437_9.txt      
aclImdb/test/pos/1917_10.txt    


In [10]:
len(X_train), len(X_val)

(25000, 25000)

## Bag-of-words Linear Model

In [5]:
from keras.layers import Activation, Dense, Input
from keras.models import Model
import numpy as np

class BOWSentimentModel(object):
    def __init__(self):
        bow = Input(shape=(len(vocab),), name='bow_input')
        # weights of all inputs
        sentiment = Dense(1)(bow)
        # normalize to [0, 1] range
        sentiment = Activation('sigmoid')(sentiment)

        self.model = Model(input=bow, output=sentiment)
        self.model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
        
    def review_vector(self, tokens):
        vector = [0] * len(vocab)
        for t in tokens:
            try:
                vector[vocab.index(t)] = 1
            except:
                pass  # ignore missing words
        return vector

    def train(self, X, y, X_val, y_val):
        print('Vectorizing...')
        X = np.array([self.review_vector(r) for r in X])
        X_val = np.array([self.review_vector(r) for r in X_val])
        print('Fitting...')
        self.model.fit(X, y, validation_data=(X_val, y_val), epochs=25, verbose=1)

    def predict(self, X):
        X = np.array([self.review_vector(r) for r in X])
        return self.model.predict(X)

Using TensorFlow backend.


In [7]:
sentiment = BOWSentimentModel()
sentiment.train(X_train, y_train, X_val, y_val)

test_text = 'Good story about a backwoods community in the Ozarks around the turn of the century. Moonshine is the leading industry, fighting and funning the major form of entertainment. One day a stranger enters the community and causes a shake-up among the locals. Beautiful scenery adds much to the story.'
test_tokens = text_tokens(test_text)
print(test_text, sentiment.predict([test_tokens])[0])

  del sys.path[0]


Vectorizing...
Fitting...
Train on 25000 samples, validate on 25000 samples
Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25
Good story about a backwoods community in the Ozarks around the turn of the century. Moonshine is the leading industry, fighting and funning the major form of entertainment. One day a stranger enters the community and causes a shake-up among the locals. Beautiful scenery adds much to the story. [ 0.86290973]


## Bag-of-Words with Hidden Layer

In [6]:
from keras.layers import Activation, Dense, Input
from keras.models import Model
import numpy as np

class BOWHiddenSentimentModel(object):
    def __init__(self, N=64):
        bow = Input(shape=(len(vocab),), name='bow_input')
        # weights of all inputs
        hidden = Dense(N, activation='tanh')(bow)
        sentiment = Dense(1)(hidden)
        # normalize to [0, 1] range
        sentiment = Activation('sigmoid')(sentiment)

        self.model = Model(input=bow, output=sentiment)
        self.model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
        
    def review_vector(self, tokens):
        vector = [0] * len(vocab)
        for t in tokens:
            try:
                vector[vocab.index(t)] = 1
            except:
                pass  # ignore missing words
        return vector

    def train(self, X, y, X_val, y_val):
        print('Vectorizing...')
        X = np.array([self.review_vector(r) for r in X])
        X_val = np.array([self.review_vector(r) for r in X_val])
        print('Fitting...')
        self.model.fit(X, y, validation_data=(X_val, y_val), epochs=25, verbose=1)

    def predict(self, X):
        X = np.array([self.review_vector(r) for r in X])
        return self.model.predict(X)

Using TensorFlow backend.


In [7]:
sentiment = BOWHiddenSentimentModel()
sentiment.train(X_train, y_train, X_val, y_val)

test_text = 'Good story about a backwoods community in the Ozarks around the turn of the century. Moonshine is the leading industry, fighting and funning the major form of entertainment. One day a stranger enters the community and causes a shake-up among the locals. Beautiful scenery adds much to the story.'
test_tokens = text_tokens(test_text)
print(test_text, sentiment.predict([test_tokens])[0])

  


Vectorizing...
Fitting...
Train on 25000 samples, validate on 25000 samples
Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25
Good story about a backwoods community in the Ozarks around the turn of the century. Moonshine is the leading industry, fighting and funning the major form of entertainment. One day a stranger enters the community and causes a shake-up among the locals. Beautiful scenery adds much to the story. [ 1.]


## GloVe-based Model (let's start with averaging)

### Loading GloVe vocabulary

In [119]:
EMBEDDING_DIM = 50

In [120]:
# let's create a dictionary of each word in the pre-trained GloVe embeddings, saving its location indexes 
import os
GLOVE_DIR = "." # ./glove.6B/"
embeddings_index = {}
f = open(os.path.join(GLOVE_DIR, 'glove.6B.%dd.txt' % EMBEDDING_DIM))
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

print('Found %s word vectors.' % len(embeddings_index))

Found 400000 word vectors.


In [121]:
embedding_matrix = np.zeros((len(vocab) + 1, EMBEDDING_DIM))
for i, word in enumerate(vocab):
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        # also, [0] is reserved for padding
        embedding_matrix[i + 1] = embedding_vector

Checking how many words have no pre-trained GloVe word embeddings:

In [20]:
1. * np.count_nonzero(np.all(embedding_matrix == 0, axis=1)) / len(vocab)  # OOV portion

0.0076

Representing reviews as sequences: What's a good compromise for sequence length?

In [40]:
lengths = sorted([len(X) for X in X_train])
percentile = 0.66
seq_cutoff = lengths[int(len(lengths)*percentile)]
print('Average: %f, Median: %d, %d%% percentile: %d tokens' % (np.mean(lengths), lengths[int(len(lengths)*0.5)], percentile*100, seq_cutoff))

Average: 233.778560, Median: 174, 66% percentile: 232 tokens


### GloVe averaging model

Questions:
  * Predict sentiment from mean embedding, or mean sentiment from each embedding?
  * `trainable=True`?
  * Projection to a "sentiment predictive" space first?

In [48]:
from keras.layers import Activation, GlobalAveragePooling1D, Dense, Embedding, Input
from keras.models import Model
import numpy as np

class GloveAvgSentimentModel(object):
    def __init__(self, seq_len=seq_cutoff, N=64):
        self.seq_len = seq_len
        
        self.model = self.create(N)
        self.model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
        
    def create(self, N):
        seq_indices = Input(shape=(self.seq_len,), name='seq_input')                    
        seq_embedded = Embedding(input_dim=len(vocab) + 1, output_dim=EMBEDDING_DIM, weights=[embedding_matrix],
                                 input_length=self.seq_len, trainable=False)(seq_indices)
        avg_embedded = GlobalAveragePooling1D()(seq_embedded)
        sentiment = Dense(1)(avg_embedded)
        # normalize to [0, 1] range
        sentiment = Activation('sigmoid')(sentiment)

        return Model(input=seq_indices, output=sentiment)
        
    def review_vector(self, tokens):
        vector = [0] * self.seq_len
        if len(tokens) > self.seq_len:
            # Remove the middle
            tokens = tokens[: self.seq_len // 2] + ['SINGLE_PADDING_IN_THE_MIDDLE'] + tokens[-self.seq_len // 2 :]
        for i, t in enumerate(tokens):
            try:
                vector[i] = vocab.index(t) + 1  # reserving 0 for padding
            except:
                pass  # ignore missing words
        return vector

    def train(self, X, y, X_val, y_val):
        print('Vectorizing...')
        X = np.array([self.review_vector(r) for r in X], dtype='int32')
        X_val = np.array([self.review_vector(r) for r in X_val], dtype='int32')
        print('Fitting...')
        self.model.fit(X, y, validation_data=(X_val, y_val), epochs=25, verbose=1)

    def predict(self, X):
        X = np.array([self.review_vector(r) for r in X], dtype='int32')
        return self.model.predict(X)

In [49]:
sentiment = GloveAvgSentimentModel()
sentiment.train(X_train, y_train, X_val, y_val)

test_text = 'Good story about a backwoods community in the Ozarks around the turn of the century. Moonshine is the leading industry, fighting and funning the major form of entertainment. One day a stranger enters the community and causes a shake-up among the locals. Beautiful scenery adds much to the story.'
test_tokens = text_tokens(test_text)
print(test_text, sentiment.predict([test_tokens])[0])



Vectorizing...
Fitting...
Train on 25000 samples, validate on 25000 samples
Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25
Good story about a backwoods community in the Ozarks around the turn of the century. Moonshine is the leading industry, fighting and funning the major form of entertainment. One day a stranger enters the community and causes a shake-up among the locals. Beautiful scenery adds much to the story. [ 0.66820341]


In [50]:
from keras.layers import Activation, GlobalAveragePooling1D, Dense, Embedding, Input
from keras.models import Model
import numpy as np

class GloveSentimentAvgModel(object):
    def __init__(self, seq_len=seq_cutoff, N=64):
        self.seq_len = seq_len
        
        self.model = self.create(N)
        self.model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
        
    def create(self, N):
        seq_indices = Input(shape=(self.seq_len,), name='seq_input')                    
        seq_embedded = Embedding(input_dim=len(vocab) + 1, output_dim=EMBEDDING_DIM, weights=[embedding_matrix],
                                 input_length=self.seq_len, trainable=False)(seq_indices)
        seq_sentiment = Dense(1)(seq_embedded)
        # normalize to [0, 1] range
        seq_sentiment = Activation('sigmoid')(seq_sentiment)
        sentiment = GlobalAveragePooling1D()(seq_sentiment)

        return Model(input=seq_indices, output=sentiment)
        
    def review_vector(self, tokens):
        vector = [0] * self.seq_len
        if len(tokens) > self.seq_len:
            # Remove the middle
            tokens = tokens[: self.seq_len // 2] + ['SINGLE_PADDING_IN_THE_MIDDLE'] + tokens[-self.seq_len // 2 :]
        for i, t in enumerate(tokens):
            try:
                vector[i] = vocab.index(t) + 1  # reserving 0 for padding
            except:
                pass  # ignore missing words
        return vector

    def train(self, X, y, X_val, y_val):
        print('Vectorizing...')
        X = np.array([self.review_vector(r) for r in X], dtype='int32')
        X_val = np.array([self.review_vector(r) for r in X_val], dtype='int32')
        print('Fitting...')
        self.model.fit(X, y, validation_data=(X_val, y_val), epochs=25, verbose=1)

    def predict(self, X):
        X = np.array([self.review_vector(r) for r in X], dtype='int32')
        return self.model.predict(X)

In [51]:
sentiment = GloveSentimentAvgModel()
sentiment.train(X_train, y_train, X_val, y_val)

test_text = 'Good story about a backwoods community in the Ozarks around the turn of the century. Moonshine is the leading industry, fighting and funning the major form of entertainment. One day a stranger enters the community and causes a shake-up among the locals. Beautiful scenery adds much to the story.'
test_tokens = text_tokens(test_text)
print(test_text, sentiment.predict([test_tokens])[0])



Vectorizing...
Fitting...
Train on 25000 samples, validate on 25000 samples
Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25
Good story about a backwoods community in the Ozarks around the turn of the century. Moonshine is the leading industry, fighting and funning the major form of entertainment. One day a stranger enters the community and causes a shake-up among the locals. Beautiful scenery adds much to the story. [ 0.57322681]


In [55]:
from keras.layers import Activation, GlobalMaxPooling1D, Dense, Embedding, Input
from keras.models import Model
import numpy as np

class GloveHiddenMaxSentimentModel(object):
    def __init__(self, seq_len=seq_cutoff, N=64):
        self.seq_len = seq_len
        
        self.model = self.create(N)
        self.model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
        
    def create(self, N):
        seq_indices = Input(shape=(self.seq_len,), name='seq_input')                    
        seq_embedded = Embedding(input_dim=len(vocab) + 1, output_dim=EMBEDDING_DIM, weights=[embedding_matrix],
                                 input_length=self.seq_len, trainable=False)(seq_indices)
        seq_hidden = Dense(N, activation='tanh')(seq_embedded)
        max_hidden = GlobalMaxPooling1D()(seq_hidden)
        sentiment = Dense(1)(max_hidden)
        # normalize to [0, 1] range
        sentiment = Activation('sigmoid')(sentiment)

        return Model(input=seq_indices, output=sentiment)
        
    def review_vector(self, tokens):
        vector = [0] * self.seq_len
        if len(tokens) > self.seq_len:
            # Remove the middle
            tokens = tokens[: self.seq_len // 2] + ['SINGLE_PADDING_IN_THE_MIDDLE'] + tokens[-self.seq_len // 2 :]
        for i, t in enumerate(tokens):
            try:
                vector[i] = vocab.index(t) + 1  # reserving 0 for padding
            except:
                pass  # ignore missing words
        return vector

    def train(self, X, y, X_val, y_val):
        print('Vectorizing...')
        X = np.array([self.review_vector(r) for r in X], dtype='int32')
        X_val = np.array([self.review_vector(r) for r in X_val], dtype='int32')
        print('Fitting...')
        self.model.fit(X, y, validation_data=(X_val, y_val), epochs=25, verbose=1)

    def predict(self, X):
        X = np.array([self.review_vector(r) for r in X], dtype='int32')
        return self.model.predict(X)

In [56]:
sentiment = GloveHiddenMaxSentimentModel()
sentiment.train(X_train, y_train, X_val, y_val)

test_text = 'Good story about a backwoods community in the Ozarks around the turn of the century. Moonshine is the leading industry, fighting and funning the major form of entertainment. One day a stranger enters the community and causes a shake-up among the locals. Beautiful scenery adds much to the story.'
test_tokens = text_tokens(test_text)
print(test_text, sentiment.predict([test_tokens])[0])



Vectorizing...
Fitting...
Train on 25000 samples, validate on 25000 samples
Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25

KeyboardInterrupt: 

In [58]:
from keras.layers import Activation, Dense, Embedding, GRU, Input
from keras.models import Model
import numpy as np

class GloveHiddenGRUSentimentModel(object):
    def __init__(self, seq_len=seq_cutoff, N=64):
        self.seq_len = seq_len
        
        self.model = self.create(N)
        self.model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
        
    def create(self, N):
        seq_indices = Input(shape=(self.seq_len,), name='seq_input')                    
        seq_embedded = Embedding(input_dim=len(vocab) + 1, output_dim=EMBEDDING_DIM, weights=[embedding_matrix],
                                 input_length=self.seq_len, trainable=False)(seq_indices)
        seq_hidden = Dense(N, activation='tanh')(seq_embedded)
        recursive_repr = GRU(N)(seq_hidden)
        sentiment = Dense(1)(recursive_repr)
        # normalize to [0, 1] range
        sentiment = Activation('sigmoid')(sentiment)

        return Model(input=seq_indices, output=sentiment)
        
    def review_vector(self, tokens):
        vector = [0] * self.seq_len
        if len(tokens) > self.seq_len:
            # Remove the middle
            tokens = tokens[: self.seq_len // 2] + ['SINGLE_PADDING_IN_THE_MIDDLE'] + tokens[-self.seq_len // 2 :]
        for i, t in enumerate(tokens):
            try:
                vector[i] = vocab.index(t) + 1  # reserving 0 for padding
            except:
                pass  # ignore missing words
        return vector

    def train(self, X, y, X_val, y_val):
        print('Vectorizing...')
        X = np.array([self.review_vector(r) for r in X], dtype='int32')
        X_val = np.array([self.review_vector(r) for r in X_val], dtype='int32')
        print('Fitting...')
        self.model.fit(X, y, validation_data=(X_val, y_val), epochs=5, verbose=1)

    def predict(self, X):
        X = np.array([self.review_vector(r) for r in X], dtype='int32')
        return self.model.predict(X)

In [59]:
sentiment = GloveHiddenGRUSentimentModel()
sentiment.train(X_train, y_train, X_val, y_val)

test_text = 'Good story about a backwoods community in the Ozarks around the turn of the century. Moonshine is the leading industry, fighting and funning the major form of entertainment. One day a stranger enters the community and causes a shake-up among the locals. Beautiful scenery adds much to the story.'
test_tokens = text_tokens(test_text)
print(test_text, sentiment.predict([test_tokens])[0])



Vectorizing...
Fitting...
Train on 25000 samples, validate on 25000 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Good story about a backwoods community in the Ozarks around the turn of the century. Moonshine is the leading industry, fighting and funning the major form of entertainment. One day a stranger enters the community and causes a shake-up among the locals. Beautiful scenery adds much to the story. [ 0.94238299]


In [71]:
from keras.layers import Activation, Conv1D, Dense, Embedding, GlobalMaxPooling1D, Input
from keras.models import Model
import numpy as np

class GloveHiddenCNNSentimentModel(object):
    def __init__(self, seq_len=seq_cutoff, N=64, size=3):
        self.seq_len = seq_len
        
        self.model = self.create(N, size)
        self.model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
        
    def create(self, N, size):
        seq_indices = Input(shape=(self.seq_len,), name='seq_input')                    
        seq_embedded = Embedding(input_dim=len(vocab) + 1, output_dim=EMBEDDING_DIM, weights=[embedding_matrix],
                                 input_length=self.seq_len, trainable=False)(seq_indices)
        seq_hidden = Dense(N, activation='tanh')(seq_embedded)
        seq_conv = Conv1D(N, size)(seq_hidden)
        max_conv = GlobalMaxPooling1D()(seq_conv)
        sentiment = Dense(1)(max_conv)
        # normalize to [0, 1] range
        sentiment = Activation('sigmoid')(sentiment)

        return Model(input=seq_indices, output=sentiment)
        
    def review_vector(self, tokens):
        vector = [0] * self.seq_len
        if len(tokens) > self.seq_len:
            # Remove the middle
            tokens = tokens[: self.seq_len // 2] + ['SINGLE_PADDING_IN_THE_MIDDLE'] + tokens[-self.seq_len // 2 :]
        for i, t in enumerate(tokens):
            try:
                vector[i] = vocab.index(t) + 1  # reserving 0 for padding
            except:
                pass  # ignore missing words
        return vector

    def train(self, X, y, X_val, y_val):
        print('Vectorizing...')
        X = np.array([self.review_vector(r) for r in X], dtype='int32')
        X_val = np.array([self.review_vector(r) for r in X_val], dtype='int32')
        print('Fitting...')
        self.model.fit(X, y, validation_data=(X_val, y_val), epochs=10, verbose=1)

    def predict(self, X):
        X = np.array([self.review_vector(r) for r in X], dtype='int32')
        return self.model.predict(X)

In [72]:
sentiment = GloveHiddenCNNSentimentModel()
sentiment.train(X_train, y_train, X_val, y_val)

test_text = 'Good story about a backwoods community in the Ozarks around the turn of the century. Moonshine is the leading industry, fighting and funning the major form of entertainment. One day a stranger enters the community and causes a shake-up among the locals. Beautiful scenery adds much to the story.'
test_tokens = text_tokens(test_text)
print(test_text, sentiment.predict([test_tokens])[0])



Vectorizing...
Fitting...
Train on 25000 samples, validate on 25000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Good story about a backwoods community in the Ozarks around the turn of the century. Moonshine is the leading industry, fighting and funning the major form of entertainment. One day a stranger enters the community and causes a shake-up among the locals. Beautiful scenery adds much to the story. [ 0.98872828]


In [77]:
from keras.layers import Activation, Conv1D, Dense, Embedding, GRU, Input
from keras.models import Model
import numpy as np

class GloveCNNGRUSentimentModel(object):
    def __init__(self, seq_len=seq_cutoff, N=64, size=3):
        self.seq_len = seq_len
        
        self.model = self.create(N, size)
        self.model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
        
    def create(self, N, size):
        seq_indices = Input(shape=(self.seq_len,), name='seq_input')                    
        seq_embedded = Embedding(input_dim=len(vocab) + 1, output_dim=EMBEDDING_DIM, weights=[embedding_matrix],
                                 input_length=self.seq_len, trainable=False)(seq_indices)
        seq_hidden = seq_embedded  # Dense(N, activation='tanh')(seq_embedded)
        seq_conv = Conv1D(N, size, activation='tanh')(seq_hidden)
        recursive_repr = GRU(N)(seq_conv)
        sentiment = Dense(1)(recursive_repr)
        # normalize to [0, 1] range
        sentiment = Activation('sigmoid')(sentiment)

        return Model(input=seq_indices, output=sentiment)
        
    def review_vector(self, tokens):
        vector = [0] * self.seq_len
        if len(tokens) > self.seq_len:
            # Remove the middle
            tokens = tokens[: self.seq_len // 2] + ['SINGLE_PADDING_IN_THE_MIDDLE'] + tokens[-self.seq_len // 2 :]
        for i, t in enumerate(tokens):
            try:
                vector[i] = vocab.index(t) + 1  # reserving 0 for padding
            except:
                pass  # ignore missing words
        return vector

    def train(self, X, y, X_val, y_val):
        print('Vectorizing...')
        X = np.array([self.review_vector(r) for r in X], dtype='int32')
        X_val = np.array([self.review_vector(r) for r in X_val], dtype='int32')
        print('Fitting...')
        self.model.fit(X, y, validation_data=(X_val, y_val), epochs=10, verbose=1)

    def predict(self, X):
        X = np.array([self.review_vector(r) for r in X], dtype='int32')
        return self.model.predict(X)

In [None]:
sentiment = GloveCNNGRUSentimentModel()
sentiment.train(X_train, y_train, X_val, y_val)

test_text = 'Good story about a backwoods community in the Ozarks around the turn of the century. Moonshine is the leading industry, fighting and funning the major form of entertainment. One day a stranger enters the community and causes a shake-up among the locals. Beautiful scenery adds much to the story.'
test_tokens = text_tokens(test_text)
print(test_text, sentiment.predict([test_tokens])[0])

In [122]:
from keras.layers import Activation, Conv1D, Dense, Dropout, Embedding, GlobalAveragePooling1D, GlobalMaxPooling1D, GRU, Input, concatenate, add
from keras.models import Model
import numpy as np

class GloveCNNGRUSentimentModel(object):
    def __init__(self, seq_len=seq_cutoff, N=64, size=3):
        self.seq_len = seq_len
        
        self.model = self.create(N, size)
        self.model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
        
    def create(self, N, size):
        seq_indices = Input(shape=(self.seq_len,), name='seq_input')                    
        seq_embedded = Embedding(input_dim=len(vocab) + 1, output_dim=EMBEDDING_DIM, weights=[embedding_matrix],
                                 input_length=self.seq_len, trainable=False)(seq_indices)
        seq_hidden = Dense(N, activation='tanh')(Dropout(0.2)(seq_embedded))
        seq_conv = Conv1D(N, size, activation='tanh', padding='same')(Dropout(0.2)(seq_hidden))
        seq_hidden = add([seq_hidden, seq_conv])
        recursive_repr = GRU(N)(Dropout(0.2)(seq_hidden))
        sentiment = Dense(1)(Dropout(0.2)(recursive_repr))
        # normalize to [0, 1] range
        sentiment = Activation('sigmoid')(sentiment)

        return Model(input=seq_indices, output=sentiment)
        
    def review_vector(self, tokens):
        vector = [0] * self.seq_len
        if len(tokens) > self.seq_len:
            # Remove the middle
            tokens = tokens[: self.seq_len // 2] + ['SINGLE_PADDING_IN_THE_MIDDLE'] + tokens[-self.seq_len // 2 :]
        for i, t in enumerate(tokens):
            try:
                vector[i] = vocab.index(t) + 1  # reserving 0 for padding
            except:
                pass  # ignore missing words
        return vector

    def train(self, X, y, X_val, y_val):
        print('Vectorizing...')
        X = np.array([self.review_vector(r) for r in X], dtype='int32')
        X_val = np.array([self.review_vector(r) for r in X_val], dtype='int32')
        print('Fitting...')
        self.model.fit(X, y, validation_data=(X_val, y_val), epochs=15, verbose=1)

    def predict(self, X):
        X = np.array([self.review_vector(r) for r in X], dtype='int32')
        return self.model.predict(X)

In [123]:
sentiment = GloveCNNGRUSentimentModel()
sentiment.train(X_train, y_train, X_val, y_val)

test_text = 'Good story about a backwoods community in the Ozarks around the turn of the century. Moonshine is the leading industry, fighting and funning the major form of entertainment. One day a stranger enters the community and causes a shake-up among the locals. Beautiful scenery adds much to the story.'
test_tokens = text_tokens(test_text)
print(test_text, sentiment.predict([test_tokens])[0])



Vectorizing...
Fitting...
Train on 25000 samples, validate on 25000 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
Good story about a backwoods community in the Ozarks around the turn of the century. Moonshine is the leading industry, fighting and funning the major form of entertainment. One day a stranger enters the community and causes a shake-up among the locals. Beautiful scenery adds much to the story. [ 0.96565175]


In [124]:
from keras.layers import Activation, Conv1D, Dense, Dropout, Embedding, GlobalAveragePooling1D, GlobalMaxPooling1D, GRU, Input, concatenate, add
from keras.models import Model
import numpy as np

class GloveCNNGRUSentimentModel(object):
    def __init__(self, seq_len=seq_cutoff, N=96, size=5):
        self.seq_len = seq_len
        
        self.model = self.create(N, size)
        self.model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
        
    def create(self, N, size):
        seq_indices = Input(shape=(self.seq_len,), name='seq_input')                    
        seq_embedded = Embedding(input_dim=len(vocab) + 1, output_dim=EMBEDDING_DIM, weights=[embedding_matrix],
                                 input_length=self.seq_len, trainable=False)(seq_indices)
        seq_hidden = Dense(N, activation='tanh')(Dropout(0.2)(seq_embedded))
        seq_conv = Conv1D(N, size, activation='tanh', padding='same')(Dropout(0.2)(seq_hidden))
        seq_hidden = add([seq_hidden, seq_conv])
        recursive_repr = GRU(N)(Dropout(0.2)(seq_hidden))
        sentiment = Dense(1)(Dropout(0.2)(recursive_repr))
        # normalize to [0, 1] range
        sentiment = Activation('sigmoid')(sentiment)

        return Model(input=seq_indices, output=sentiment)
        
    def review_vector(self, tokens):
        vector = [0] * self.seq_len
        if len(tokens) > self.seq_len:
            # Remove the middle
            tokens = tokens[: self.seq_len // 2] + ['SINGLE_PADDING_IN_THE_MIDDLE'] + tokens[-self.seq_len // 2 :]
        for i, t in enumerate(tokens):
            try:
                vector[i] = vocab.index(t) + 1  # reserving 0 for padding
            except:
                pass  # ignore missing words
        return vector

    def train(self, X, y, X_val, y_val):
        print('Vectorizing...')
        X = np.array([self.review_vector(r) for r in X], dtype='int32')
        X_val = np.array([self.review_vector(r) for r in X_val], dtype='int32')
        print('Fitting...')
        self.model.fit(X, y, validation_data=(X_val, y_val), epochs=15, verbose=1)

    def predict(self, X):
        X = np.array([self.review_vector(r) for r in X], dtype='int32')
        return self.model.predict(X)

In [125]:
sentiment = GloveCNNGRUSentimentModel()
sentiment.train(X_train, y_train, X_val, y_val)

test_text = 'Good story about a backwoods community in the Ozarks around the turn of the century. Moonshine is the leading industry, fighting and funning the major form of entertainment. One day a stranger enters the community and causes a shake-up among the locals. Beautiful scenery adds much to the story.'
test_tokens = text_tokens(test_text)
print(test_text, sentiment.predict([test_tokens])[0])



Vectorizing...
Fitting...
Train on 25000 samples, validate on 25000 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
Good story about a backwoods community in the Ozarks around the turn of the century. Moonshine is the leading industry, fighting and funning the major form of entertainment. One day a stranger enters the community and causes a shake-up among the locals. Beautiful scenery adds much to the story. [ 0.96519709]


In [79]:
EMBEDDING_DIM = 300

In [80]:
# let's create a dictionary of each word in the pre-trained GloVe embeddings, saving its location indexes 
import os
GLOVE_DIR = "." # ./glove.6B/"
embeddings_index = {}
f = open(os.path.join(GLOVE_DIR, 'glove.6B.%dd.txt' % EMBEDDING_DIM))
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

print('Found %s word vectors.' % len(embeddings_index))

Found 400000 word vectors.


In [81]:
embedding_matrix = np.zeros((len(vocab) + 1, EMBEDDING_DIM))
for i, word in enumerate(vocab):
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        # also, [0] is reserved for padding
        embedding_matrix[i + 1] = embedding_vector

In [97]:
from keras.layers import Activation, Conv1D, Dense, Embedding, GRU, Input
from keras.models import Model
import numpy as np

class GloveCNNGRUSentimentModel(object):
    def __init__(self, seq_len=seq_cutoff, N=64, size=3):
        self.seq_len = seq_len
        
        self.model = self.create(N, size)
        self.model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
        
    def create(self, N, size):
        seq_indices = Input(shape=(self.seq_len,), name='seq_input')                    
        seq_embedded = Embedding(input_dim=len(vocab) + 1, output_dim=EMBEDDING_DIM, weights=[embedding_matrix],
                                 input_length=self.seq_len, trainable=False)(seq_indices)
        seq_hidden = seq_embedded  # Dense(N, activation='tanh')(seq_embedded)
        seq_conv = Conv1D(N, size, activation='tanh')(seq_hidden)
        recursive_repr = GRU(N)(seq_conv)
        sentiment = Dense(1)(recursive_repr)
        # normalize to [0, 1] range
        sentiment = Activation('sigmoid')(sentiment)

        return Model(input=seq_indices, output=sentiment)
        
    def review_vector(self, tokens):
        vector = [0] * self.seq_len
        if len(tokens) > self.seq_len:
            # Remove the middle
            tokens = tokens[: self.seq_len // 2] + ['SINGLE_PADDING_IN_THE_MIDDLE'] + tokens[-self.seq_len // 2 :]
        for i, t in enumerate(tokens):
            try:
                vector[i] = vocab.index(t) + 1  # reserving 0 for padding
            except:
                pass  # ignore missing words
        return vector

    def train(self, X, y, X_val, y_val):
        print('Vectorizing...')
        X = np.array([self.review_vector(r) for r in X], dtype='int32')
        X_val = np.array([self.review_vector(r) for r in X_val], dtype='int32')
        print('Fitting...')
        self.model.fit(X, y, validation_data=(X_val, y_val), epochs=10, verbose=1)

    def predict(self, X):
        X = np.array([self.review_vector(r) for r in X], dtype='int32')
        return self.model.predict(X)

In [98]:
sentiment = GloveCNNGRUSentimentModel()
sentiment.train(X_train, y_train, X_val, y_val)

test_text = 'Good story about a backwoods community in the Ozarks around the turn of the century. Moonshine is the leading industry, fighting and funning the major form of entertainment. One day a stranger enters the community and causes a shake-up among the locals. Beautiful scenery adds much to the story.'
test_tokens = text_tokens(test_text)
print(test_text, sentiment.predict([test_tokens])[0])



Vectorizing...
Fitting...
Train on 25000 samples, validate on 25000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Good story about a backwoods community in the Ozarks around the turn of the century. Moonshine is the leading industry, fighting and funning the major form of entertainment. One day a stranger enters the community and causes a shake-up among the locals. Beautiful scenery adds much to the story. [ 0.98546731]


In [99]:
from keras.layers import Activation, Conv1D, Dense, Dropout, Embedding, GRU, Input
from keras.models import Model
import numpy as np

class GloveCNNGRUSentimentModel(object):
    def __init__(self, seq_len=seq_cutoff, N=64, size=3):
        self.seq_len = seq_len
        
        self.model = self.create(N, size)
        self.model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
        
    def create(self, N, size):
        seq_indices = Input(shape=(self.seq_len,), name='seq_input')                    
        seq_embedded = Embedding(input_dim=len(vocab) + 1, output_dim=EMBEDDING_DIM, weights=[embedding_matrix],
                                 input_length=self.seq_len, trainable=False)(seq_indices)
        seq_hidden = Dense(N, activation='tanh')(seq_embedded)
        seq_conv = Conv1D(N, size, activation='tanh')(seq_hidden)
        recursive_repr = GRU(N)(seq_conv)
        sentiment = Dense(1)(recursive_repr)
        # normalize to [0, 1] range
        sentiment = Activation('sigmoid')(sentiment)

        return Model(input=seq_indices, output=sentiment)
        
    def review_vector(self, tokens):
        vector = [0] * self.seq_len
        if len(tokens) > self.seq_len:
            # Remove the middle
            tokens = tokens[: self.seq_len // 2] + ['SINGLE_PADDING_IN_THE_MIDDLE'] + tokens[-self.seq_len // 2 :]
        for i, t in enumerate(tokens):
            try:
                vector[i] = vocab.index(t) + 1  # reserving 0 for padding
            except:
                pass  # ignore missing words
        return vector

    def train(self, X, y, X_val, y_val):
        print('Vectorizing...')
        X = np.array([self.review_vector(r) for r in X], dtype='int32')
        X_val = np.array([self.review_vector(r) for r in X_val], dtype='int32')
        print('Fitting...')
        self.model.fit(X, y, validation_data=(X_val, y_val), epochs=10, verbose=1)

    def predict(self, X):
        X = np.array([self.review_vector(r) for r in X], dtype='int32')
        return self.model.predict(X)

In [100]:
sentiment = GloveCNNGRUSentimentModel()
sentiment.train(X_train, y_train, X_val, y_val)

test_text = 'Good story about a backwoods community in the Ozarks around the turn of the century. Moonshine is the leading industry, fighting and funning the major form of entertainment. One day a stranger enters the community and causes a shake-up among the locals. Beautiful scenery adds much to the story.'
test_tokens = text_tokens(test_text)
print(test_text, sentiment.predict([test_tokens])[0])



Vectorizing...
Fitting...
Train on 25000 samples, validate on 25000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Good story about a backwoods community in the Ozarks around the turn of the century. Moonshine is the leading industry, fighting and funning the major form of entertainment. One day a stranger enters the community and causes a shake-up among the locals. Beautiful scenery adds much to the story. [ 0.93981749]


In [101]:
from keras.layers import Activation, Conv1D, Dense, Dropout, Embedding, GRU, Input
from keras.models import Model
import numpy as np

class GloveCNNGRUSentimentModel(object):
    def __init__(self, seq_len=seq_cutoff, N=64, size=3):
        self.seq_len = seq_len
        
        self.model = self.create(N, size)
        self.model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
        
    def create(self, N, size):
        seq_indices = Input(shape=(self.seq_len,), name='seq_input')                    
        seq_embedded = Embedding(input_dim=len(vocab) + 1, output_dim=EMBEDDING_DIM, weights=[embedding_matrix],
                                 input_length=self.seq_len, trainable=False)(seq_indices)
        seq_hidden = Dense(N, activation='tanh')(Dropout(0.2)(seq_embedded))
        seq_conv = Conv1D(N, size, activation='tanh')(Dropout(0.2)(seq_hidden))
        recursive_repr = GRU(N)(Dropout(0.2)(seq_conv))
        sentiment = Dense(1)(Dropout(0.2)(recursive_repr))
        # normalize to [0, 1] range
        sentiment = Activation('sigmoid')(sentiment)

        return Model(input=seq_indices, output=sentiment)
        
    def review_vector(self, tokens):
        vector = [0] * self.seq_len
        if len(tokens) > self.seq_len:
            # Remove the middle
            tokens = tokens[: self.seq_len // 2] + ['SINGLE_PADDING_IN_THE_MIDDLE'] + tokens[-self.seq_len // 2 :]
        for i, t in enumerate(tokens):
            try:
                vector[i] = vocab.index(t) + 1  # reserving 0 for padding
            except:
                pass  # ignore missing words
        return vector

    def train(self, X, y, X_val, y_val):
        print('Vectorizing...')
        X = np.array([self.review_vector(r) for r in X], dtype='int32')
        X_val = np.array([self.review_vector(r) for r in X_val], dtype='int32')
        print('Fitting...')
        self.model.fit(X, y, validation_data=(X_val, y_val), epochs=15, verbose=1)

    def predict(self, X):
        X = np.array([self.review_vector(r) for r in X], dtype='int32')
        return self.model.predict(X)

In [102]:
sentiment = GloveCNNGRUSentimentModel()
sentiment.train(X_train, y_train, X_val, y_val)

test_text = 'Good story about a backwoods community in the Ozarks around the turn of the century. Moonshine is the leading industry, fighting and funning the major form of entertainment. One day a stranger enters the community and causes a shake-up among the locals. Beautiful scenery adds much to the story.'
test_tokens = text_tokens(test_text)
print(test_text, sentiment.predict([test_tokens])[0])



Vectorizing...
Fitting...
Train on 25000 samples, validate on 25000 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
Good story about a backwoods community in the Ozarks around the turn of the century. Moonshine is the leading industry, fighting and funning the major form of entertainment. One day a stranger enters the community and causes a shake-up among the locals. Beautiful scenery adds much to the story. [ 0.91673148]


In [103]:
from keras.layers import Activation, Conv1D, Dense, Dropout, Embedding, GRU, Input
from keras.models import Model
import numpy as np

class GloveCNNGRUSentimentModel(object):
    def __init__(self, seq_len=seq_cutoff, N=64, size=3):
        self.seq_len = seq_len
        
        self.model = self.create(N, size)
        self.model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
        
    def create(self, N, size):
        seq_indices = Input(shape=(self.seq_len,), name='seq_input')                    
        seq_embedded = Embedding(input_dim=len(vocab) + 1, output_dim=EMBEDDING_DIM, weights=[embedding_matrix],
                                 input_length=self.seq_len, trainable=True)(seq_indices)
        seq_hidden = Dense(N, activation='tanh')(Dropout(0.2)(seq_embedded))
        seq_conv = Conv1D(N, size, activation='tanh')(Dropout(0.2)(seq_hidden))
        recursive_repr = GRU(N)(Dropout(0.2)(seq_conv))
        sentiment = Dense(1)(Dropout(0.2)(recursive_repr))
        # normalize to [0, 1] range
        sentiment = Activation('sigmoid')(sentiment)

        return Model(input=seq_indices, output=sentiment)
        
    def review_vector(self, tokens):
        vector = [0] * self.seq_len
        if len(tokens) > self.seq_len:
            # Remove the middle
            tokens = tokens[: self.seq_len // 2] + ['SINGLE_PADDING_IN_THE_MIDDLE'] + tokens[-self.seq_len // 2 :]
        for i, t in enumerate(tokens):
            try:
                vector[i] = vocab.index(t) + 1  # reserving 0 for padding
            except:
                pass  # ignore missing words
        return vector

    def train(self, X, y, X_val, y_val):
        print('Vectorizing...')
        X = np.array([self.review_vector(r) for r in X], dtype='int32')
        X_val = np.array([self.review_vector(r) for r in X_val], dtype='int32')
        print('Fitting...')
        self.model.fit(X, y, validation_data=(X_val, y_val), epochs=15, verbose=1)

    def predict(self, X):
        X = np.array([self.review_vector(r) for r in X], dtype='int32')
        return self.model.predict(X)

In [104]:
sentiment = GloveCNNGRUSentimentModel()
sentiment.train(X_train, y_train, X_val, y_val)

test_text = 'Good story about a backwoods community in the Ozarks around the turn of the century. Moonshine is the leading industry, fighting and funning the major form of entertainment. One day a stranger enters the community and causes a shake-up among the locals. Beautiful scenery adds much to the story.'
test_tokens = text_tokens(test_text)
print(test_text, sentiment.predict([test_tokens])[0])



Vectorizing...
Fitting...
Train on 25000 samples, validate on 25000 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
Good story about a backwoods community in the Ozarks around the turn of the century. Moonshine is the leading industry, fighting and funning the major form of entertainment. One day a stranger enters the community and causes a shake-up among the locals. Beautiful scenery adds much to the story. [ 0.99751067]


In [105]:
from keras.layers import Activation, Conv1D, Dense, Dropout, Embedding, GRU, Input
from keras.models import Model
from keras import regularizers
import numpy as np

class GloveCNNGRUSentimentModel(object):
    def __init__(self, seq_len=seq_cutoff, N=64, size=3):
        self.seq_len = seq_len
        
        self.model = self.create(N, size)
        self.model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
        
    def create(self, N, size):
        seq_indices = Input(shape=(self.seq_len,), name='seq_input')                    
        seq_embedded = Embedding(input_dim=len(vocab) + 1, output_dim=EMBEDDING_DIM, weights=[embedding_matrix],
                                 input_length=self.seq_len, trainable=True,
                                 embeddings_regularizer=regularizers.l1(1e-4))(seq_indices)
        seq_hidden = Dense(N, activation='tanh')(Dropout(0.2)(seq_embedded))
        seq_conv = Conv1D(N, size, activation='tanh')(Dropout(0.2)(seq_hidden))
        recursive_repr = GRU(N)(Dropout(0.2)(seq_conv))
        sentiment = Dense(1)(Dropout(0.2)(recursive_repr))
        # normalize to [0, 1] range
        sentiment = Activation('sigmoid')(sentiment)

        return Model(input=seq_indices, output=sentiment)
        
    def review_vector(self, tokens):
        vector = [0] * self.seq_len
        if len(tokens) > self.seq_len:
            # Remove the middle
            tokens = tokens[: self.seq_len // 2] + ['SINGLE_PADDING_IN_THE_MIDDLE'] + tokens[-self.seq_len // 2 :]
        for i, t in enumerate(tokens):
            try:
                vector[i] = vocab.index(t) + 1  # reserving 0 for padding
            except:
                pass  # ignore missing words
        return vector

    def train(self, X, y, X_val, y_val):
        print('Vectorizing...')
        X = np.array([self.review_vector(r) for r in X], dtype='int32')
        X_val = np.array([self.review_vector(r) for r in X_val], dtype='int32')
        print('Fitting...')
        self.model.fit(X, y, validation_data=(X_val, y_val), epochs=15, verbose=1)

    def predict(self, X):
        X = np.array([self.review_vector(r) for r in X], dtype='int32')
        return self.model.predict(X)

In [106]:
sentiment = GloveCNNGRUSentimentModel()
sentiment.train(X_train, y_train, X_val, y_val)

test_text = 'Good story about a backwoods community in the Ozarks around the turn of the century. Moonshine is the leading industry, fighting and funning the major form of entertainment. One day a stranger enters the community and causes a shake-up among the locals. Beautiful scenery adds much to the story.'
test_tokens = text_tokens(test_text)
print(test_text, sentiment.predict([test_tokens])[0])



Vectorizing...
Fitting...
Train on 25000 samples, validate on 25000 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
Good story about a backwoods community in the Ozarks around the turn of the century. Moonshine is the leading industry, fighting and funning the major form of entertainment. One day a stranger enters the community and causes a shake-up among the locals. Beautiful scenery adds much to the story. [ 0.57911283]


In [107]:
from keras.layers import Activation, Conv1D, Dense, Dropout, Embedding, GRU, Input
from keras.models import Model
from keras import regularizers
import numpy as np

class GloveCNNGRUSentimentModel(object):
    def __init__(self, seq_len=seq_cutoff, N=64, size=3):
        self.seq_len = seq_len
        
        self.model = self.create(N, size)
        self.model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
        
    def create(self, N, size):
        seq_indices = Input(shape=(self.seq_len,), name='seq_input')                    
        seq_embedded = Embedding(input_dim=len(vocab) + 1, output_dim=EMBEDDING_DIM, weights=[embedding_matrix],
                                 input_length=self.seq_len, trainable=True)(seq_indices)
        seq_hidden = Dense(N, activation='tanh', kernel_regularizer=regularizers.l2(1e-4))(Dropout(0.5)(seq_embedded))
        seq_conv = Conv1D(N, size, activation='tanh', kernel_regularizer=regularizers.l2(1e-4))(Dropout(0.5)(seq_hidden))
        recursive_repr = GRU(N)(Dropout(0.5)(seq_conv))
        sentiment = Dense(1)(Dropout(0.5)(recursive_repr))
        # normalize to [0, 1] range
        sentiment = Activation('sigmoid')(sentiment)

        return Model(input=seq_indices, output=sentiment)
        
    def review_vector(self, tokens):
        vector = [0] * self.seq_len
        if len(tokens) > self.seq_len:
            # Remove the middle
            tokens = tokens[: self.seq_len // 2] + ['SINGLE_PADDING_IN_THE_MIDDLE'] + tokens[-self.seq_len // 2 :]
        for i, t in enumerate(tokens):
            try:
                vector[i] = vocab.index(t) + 1  # reserving 0 for padding
            except:
                pass  # ignore missing words
        return vector

    def train(self, X, y, X_val, y_val):
        print('Vectorizing...')
        X = np.array([self.review_vector(r) for r in X], dtype='int32')
        X_val = np.array([self.review_vector(r) for r in X_val], dtype='int32')
        print('Fitting...')
        self.model.fit(X, y, validation_data=(X_val, y_val), epochs=15, verbose=1)

    def predict(self, X):
        X = np.array([self.review_vector(r) for r in X], dtype='int32')
        return self.model.predict(X)

In [108]:
sentiment = GloveCNNGRUSentimentModel()
sentiment.train(X_train, y_train, X_val, y_val)

test_text = 'Good story about a backwoods community in the Ozarks around the turn of the century. Moonshine is the leading industry, fighting and funning the major form of entertainment. One day a stranger enters the community and causes a shake-up among the locals. Beautiful scenery adds much to the story.'
test_tokens = text_tokens(test_text)
print(test_text, sentiment.predict([test_tokens])[0])



Vectorizing...
Fitting...
Train on 25000 samples, validate on 25000 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
Good story about a backwoods community in the Ozarks around the turn of the century. Moonshine is the leading industry, fighting and funning the major form of entertainment. One day a stranger enters the community and causes a shake-up among the locals. Beautiful scenery adds much to the story. [ 0.49985394]


In [111]:
from keras.layers import Activation, Conv1D, Dense, Dropout, Embedding, GlobalAveragePooling1D, GlobalMaxPooling1D, GRU, Input, concatenate, add
from keras.models import Model
import numpy as np

class GloveCNNGRUSentimentModel(object):
    def __init__(self, seq_len=seq_cutoff, N=64, size=3):
        self.seq_len = seq_len
        
        self.model = self.create(N, size)
        self.model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
        
    def create(self, N, size):
        seq_indices = Input(shape=(self.seq_len,), name='seq_input')                    
        seq_embedded = Embedding(input_dim=len(vocab) + 1, output_dim=EMBEDDING_DIM, weights=[embedding_matrix],
                                 input_length=self.seq_len, trainable=False)(seq_indices)
        seq_hidden = Dense(N, activation='tanh')(Dropout(0.2)(seq_embedded))
        seq_conv = Conv1D(N, size, activation='tanh', padding='same')(Dropout(0.2)(seq_hidden))
        recursive_repr = GRU(N)(Dropout(0.2)(seq_hidden))  # !!!!!!!!!!!!!!
        full_repr = concatenate([GlobalAveragePooling1D()(seq_hidden), GlobalMaxPooling1D()(seq_conv), recursive_repr])
        sentiment = Dense(1)(Dropout(0.2)(full_repr))
        # normalize to [0, 1] range
        sentiment = Activation('sigmoid')(sentiment)

        return Model(input=seq_indices, output=sentiment)
        
    def review_vector(self, tokens):
        vector = [0] * self.seq_len
        if len(tokens) > self.seq_len:
            # Remove the middle
            tokens = tokens[: self.seq_len // 2] + ['SINGLE_PADDING_IN_THE_MIDDLE'] + tokens[-self.seq_len // 2 :]
        for i, t in enumerate(tokens):
            try:
                vector[i] = vocab.index(t) + 1  # reserving 0 for padding
            except:
                pass  # ignore missing words
        return vector

    def train(self, X, y, X_val, y_val):
        print('Vectorizing...')
        X = np.array([self.review_vector(r) for r in X], dtype='int32')
        X_val = np.array([self.review_vector(r) for r in X_val], dtype='int32')
        print('Fitting...')
        self.model.fit(X, y, validation_data=(X_val, y_val), epochs=15, verbose=1)

    def predict(self, X):
        X = np.array([self.review_vector(r) for r in X], dtype='int32')
        return self.model.predict(X)

In [112]:
sentiment = GloveCNNGRUSentimentModel()
sentiment.train(X_train, y_train, X_val, y_val)

test_text = 'Good story about a backwoods community in the Ozarks around the turn of the century. Moonshine is the leading industry, fighting and funning the major form of entertainment. One day a stranger enters the community and causes a shake-up among the locals. Beautiful scenery adds much to the story.'
test_tokens = text_tokens(test_text)
print(test_text, sentiment.predict([test_tokens])[0])



Vectorizing...
Fitting...
Train on 25000 samples, validate on 25000 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
Good story about a backwoods community in the Ozarks around the turn of the century. Moonshine is the leading industry, fighting and funning the major form of entertainment. One day a stranger enters the community and causes a shake-up among the locals. Beautiful scenery adds much to the story. [ 0.9398315]


In [113]:
from keras.layers import Activation, Conv1D, Dense, Dropout, Embedding, GlobalAveragePooling1D, GlobalMaxPooling1D, GRU, Input, concatenate, add
from keras.models import Model
import numpy as np

class GloveCNNGRUSentimentModel(object):
    def __init__(self, seq_len=seq_cutoff, N=64, size=3):
        self.seq_len = seq_len
        
        self.model = self.create(N, size)
        self.model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
        
    def create(self, N, size):
        seq_indices = Input(shape=(self.seq_len,), name='seq_input')                    
        seq_embedded = Embedding(input_dim=len(vocab) + 1, output_dim=EMBEDDING_DIM, weights=[embedding_matrix],
                                 input_length=self.seq_len, trainable=False)(seq_indices)
        seq_hidden = Dense(N, activation='tanh')(Dropout(0.2)(seq_embedded))
        seq_conv = Conv1D(N, size, activation='tanh', padding='same')(Dropout(0.2)(seq_hidden))
        recursive_repr = GRU(N)(Dropout(0.2)(seq_hidden))  # !!!!!!!!!!!!!!
        full_repr = add([GlobalAveragePooling1D()(seq_hidden), GlobalMaxPooling1D()(seq_conv), recursive_repr])
        sentiment = Dense(1)(Dropout(0.2)(full_repr))
        # normalize to [0, 1] range
        sentiment = Activation('sigmoid')(sentiment)

        return Model(input=seq_indices, output=sentiment)
        
    def review_vector(self, tokens):
        vector = [0] * self.seq_len
        if len(tokens) > self.seq_len:
            # Remove the middle
            tokens = tokens[: self.seq_len // 2] + ['SINGLE_PADDING_IN_THE_MIDDLE'] + tokens[-self.seq_len // 2 :]
        for i, t in enumerate(tokens):
            try:
                vector[i] = vocab.index(t) + 1  # reserving 0 for padding
            except:
                pass  # ignore missing words
        return vector

    def train(self, X, y, X_val, y_val):
        print('Vectorizing...')
        X = np.array([self.review_vector(r) for r in X], dtype='int32')
        X_val = np.array([self.review_vector(r) for r in X_val], dtype='int32')
        print('Fitting...')
        self.model.fit(X, y, validation_data=(X_val, y_val), epochs=15, verbose=1)

    def predict(self, X):
        X = np.array([self.review_vector(r) for r in X], dtype='int32')
        return self.model.predict(X)

In [114]:
sentiment = GloveCNNGRUSentimentModel()
sentiment.train(X_train, y_train, X_val, y_val)

test_text = 'Good story about a backwoods community in the Ozarks around the turn of the century. Moonshine is the leading industry, fighting and funning the major form of entertainment. One day a stranger enters the community and causes a shake-up among the locals. Beautiful scenery adds much to the story.'
test_tokens = text_tokens(test_text)
print(test_text, sentiment.predict([test_tokens])[0])



Vectorizing...
Fitting...
Train on 25000 samples, validate on 25000 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
Good story about a backwoods community in the Ozarks around the turn of the century. Moonshine is the leading industry, fighting and funning the major form of entertainment. One day a stranger enters the community and causes a shake-up among the locals. Beautiful scenery adds much to the story. [ 0.98081231]


In [115]:
from keras.layers import Activation, Conv1D, Dense, Dropout, Embedding, GlobalAveragePooling1D, GlobalMaxPooling1D, GRU, Input, concatenate, add
from keras.models import Model
import numpy as np

class GloveCNNGRUSentimentModel(object):
    def __init__(self, seq_len=seq_cutoff, N=64, size=3):
        self.seq_len = seq_len
        
        self.model = self.create(N, size)
        self.model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
        
    def create(self, N, size):
        seq_indices = Input(shape=(self.seq_len,), name='seq_input')                    
        seq_embedded = Embedding(input_dim=len(vocab) + 1, output_dim=EMBEDDING_DIM, weights=[embedding_matrix],
                                 input_length=self.seq_len, trainable=False)(seq_indices)
        seq_hidden = Dense(N, activation='tanh')(Dropout(0.2)(seq_embedded))
        seq_conv = Conv1D(N, size, activation='tanh', padding='same')(Dropout(0.2)(seq_hidden))
        seq_hidden = add([seq_hidden, seq_conv])
        recursive_repr = GRU(N)(Dropout(0.2)(seq_hidden))
        sentiment = Dense(1)(Dropout(0.2)(recursive_repr))
        # normalize to [0, 1] range
        sentiment = Activation('sigmoid')(sentiment)

        return Model(input=seq_indices, output=sentiment)
        
    def review_vector(self, tokens):
        vector = [0] * self.seq_len
        if len(tokens) > self.seq_len:
            # Remove the middle
            tokens = tokens[: self.seq_len // 2] + ['SINGLE_PADDING_IN_THE_MIDDLE'] + tokens[-self.seq_len // 2 :]
        for i, t in enumerate(tokens):
            try:
                vector[i] = vocab.index(t) + 1  # reserving 0 for padding
            except:
                pass  # ignore missing words
        return vector

    def train(self, X, y, X_val, y_val):
        print('Vectorizing...')
        X = np.array([self.review_vector(r) for r in X], dtype='int32')
        X_val = np.array([self.review_vector(r) for r in X_val], dtype='int32')
        print('Fitting...')
        self.model.fit(X, y, validation_data=(X_val, y_val), epochs=15, verbose=1)

    def predict(self, X):
        X = np.array([self.review_vector(r) for r in X], dtype='int32')
        return self.model.predict(X)

In [116]:
sentiment = GloveCNNGRUSentimentModel()
sentiment.train(X_train, y_train, X_val, y_val)

test_text = 'Good story about a backwoods community in the Ozarks around the turn of the century. Moonshine is the leading industry, fighting and funning the major form of entertainment. One day a stranger enters the community and causes a shake-up among the locals. Beautiful scenery adds much to the story.'
test_tokens = text_tokens(test_text)
print(test_text, sentiment.predict([test_tokens])[0])



Vectorizing...
Fitting...
Train on 25000 samples, validate on 25000 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
Good story about a backwoods community in the Ozarks around the turn of the century. Moonshine is the leading industry, fighting and funning the major form of entertainment. One day a stranger enters the community and causes a shake-up among the locals. Beautiful scenery adds much to the story. [ 0.90629309]


In [117]:
from keras.layers import Activation, Conv1D, Dense, Dropout, Embedding, GlobalAveragePooling1D, GlobalMaxPooling1D, GRU, Input, concatenate, add
from keras.models import Model
import numpy as np

class GloveCNNGRUSentimentModel(object):
    def __init__(self, seq_len=seq_cutoff, N=96, size=5):
        self.seq_len = seq_len
        
        self.model = self.create(N, size)
        self.model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
        
    def create(self, N, size):
        seq_indices = Input(shape=(self.seq_len,), name='seq_input')                    
        seq_embedded = Embedding(input_dim=len(vocab) + 1, output_dim=EMBEDDING_DIM, weights=[embedding_matrix],
                                 input_length=self.seq_len, trainable=False)(seq_indices)
        seq_hidden = Dense(N, activation='tanh')(Dropout(0.2)(seq_embedded))
        seq_conv = Conv1D(N, size, activation='tanh', padding='same')(Dropout(0.2)(seq_hidden))
        seq_hidden = add([seq_hidden, seq_conv])
        recursive_repr = GRU(N)(Dropout(0.2)(seq_hidden))
        sentiment = Dense(1)(Dropout(0.2)(recursive_repr))
        # normalize to [0, 1] range
        sentiment = Activation('sigmoid')(sentiment)

        return Model(input=seq_indices, output=sentiment)
        
    def review_vector(self, tokens):
        vector = [0] * self.seq_len
        if len(tokens) > self.seq_len:
            # Remove the middle
            tokens = tokens[: self.seq_len // 2] + ['SINGLE_PADDING_IN_THE_MIDDLE'] + tokens[-self.seq_len // 2 :]
        for i, t in enumerate(tokens):
            try:
                vector[i] = vocab.index(t) + 1  # reserving 0 for padding
            except:
                pass  # ignore missing words
        return vector

    def train(self, X, y, X_val, y_val):
        print('Vectorizing...')
        X = np.array([self.review_vector(r) for r in X], dtype='int32')
        X_val = np.array([self.review_vector(r) for r in X_val], dtype='int32')
        print('Fitting...')
        self.model.fit(X, y, validation_data=(X_val, y_val), epochs=15, verbose=1)

    def predict(self, X):
        X = np.array([self.review_vector(r) for r in X], dtype='int32')
        return self.model.predict(X)

In [118]:
sentiment = GloveCNNGRUSentimentModel()
sentiment.train(X_train, y_train, X_val, y_val)

test_text = 'Good story about a backwoods community in the Ozarks around the turn of the century. Moonshine is the leading industry, fighting and funning the major form of entertainment. One day a stranger enters the community and causes a shake-up among the locals. Beautiful scenery adds much to the story.'
test_tokens = text_tokens(test_text)
print(test_text, sentiment.predict([test_tokens])[0])



Vectorizing...
Fitting...
Train on 25000 samples, validate on 25000 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
Good story about a backwoods community in the Ozarks around the turn of the century. Moonshine is the leading industry, fighting and funning the major form of entertainment. One day a stranger enters the community and causes a shake-up among the locals. Beautiful scenery adds much to the story. [ 0.89099598]


### 