# Movie Review Sentiment Word Vector Performance
This notebook uses word vectors to create a binary sentiment classification model.  The data for this example comes from [Andrew Maas](http://ai.stanford.edu/~amaas/data/sentiment/).  The word vectors come from Google or Stanford/Glove

### Download Text Data

In [1]:
import os, urllib, tarfile

In [2]:
DATA_URL = 'http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz'
DATA_DIR = './data'

if not os.path.exists(DATA_DIR):
    os.makedirs(DATA_DIR)

if not os.path.isfile(os.path.join(DATA_DIR,'movie_data.tar.gz')):
    urllib.request.urlretrieve(DATA_URL, os.path.join(DATA_DIR,'movie_data.tar.gz'))
else:
    print("Data already downloaded.")

if os.path.isfile(os.path.join(DATA_DIR,'movie_data.tar.gz')) and not os.path.exists(os.path.join(DATA_DIR,'aclImdb')):
    f = tarfile.open(os.path.join(DATA_DIR,'movie_data.tar.gz'))
    f.extractall(path=DATA_DIR)
    f.close()
else:
    print("Tar file already extracted.")

Data already downloaded.
Tar file already extracted.


### Download Pretrained Embeddings
* [Google News](https://code.google.com/archive/p/word2vec/): Download the GoogleNews-vectors-negative300.bin file and extract it to the local ./wordvectors directory
* [Glove Vectors](https://nlp.stanford.edu/projects/glove/): Download any of the pre-trained word vectors and unzip into the local ./wordvectors directory
* Other models and data can be found at the [Gensim github page](https://github.com/RaRe-Technologies/gensim-data).

### Create Train/Test Dataframes

In [2]:
import numpy as np
import pandas as pd

In [4]:
TRAIN_DATA_FOLDER = 'data/aclImdb/train/'
TEST_DATA_FOLDER = 'data/aclImdb/test/'

In [5]:
def create_dataframe_from_files(data_folder):
    examples = list()
    for d in ['pos','neg']:
        for f in os.listdir(os.path.join(data_folder,d)):
            _tmp = open(os.path.join(data_folder,d,f),'r', encoding='utf-8')
            if d=='pos':
                examples += [(_tmp.read(),f,1)]
            else:
                examples += [(_tmp.read(),f,0)]
    df_tmp = pd.DataFrame(examples, columns=['text','file','target'])
    df_tmp = df_tmp.sample(frac=1)
    df_tmp = df_tmp.reset_index(drop=True)
    return df_tmp
                
df_train = create_dataframe_from_files(TRAIN_DATA_FOLDER)
df_test = create_dataframe_from_files(TEST_DATA_FOLDER)

print(df_train.shape)
print(df_test.shape)

(25000, 3)
(25000, 3)


### Create Tokenizer

In [3]:
import spacy, re

In [7]:
nlp = spacy.load('en_core_web_sm')

In [46]:
def tokenize_review(review_text):
    review_text = re.sub('[^A-Za-z0-9.?!\', ]+', ' ', review_text)
    d = nlp(review_text)
    return [token.text.lower() for sentence in d.sents for token in sentence]

#### Alternative Faster, Simpler Tokenizer Option

In [None]:
from keras.preprocessing import text, sequence

In [None]:
tokenizer = text.Tokenizer(lower=True, split=' ')
tokenizer.fit_on_texts(df_train['text'].apply(lambda x: x.replace("\'","")).values)
tokenized_sequence_train = tokenizer.texts_to_sequences(df_train['text'].apply(lambda x: x.replace("\'","")).values)
tokenized_sequence_test = tokenizer.texts_to_sequences(df_test['text'].apply(lambda x: x.replace("\'","")).values)

### Tokenize and Pad Text

In [4]:
from keras.preprocessing import text, sequence
from collections import Counter

Using TensorFlow backend.


In [5]:
MAX_LEN = 200

In [49]:
df_train['tokenized_text'] = df_train['text'].apply(lambda x: tokenize_review(x))
df_test['tokenized_text'] = df_test['text'].apply(lambda x: tokenize_review(x))

In [50]:
df_train.to_pickle('df_train.pkl')
df_test.to_pickle('df_test.pkl')

In [6]:
df_train = pd.read_pickle('df_train.pkl')
df_test = pd.read_pickle('df_test.pkl')

In [7]:
all_tokens = df_train['tokenized_text'].values

In [8]:
vocab_count = freq = Counter(p for o in all_tokens for p in o)
print('Vocab length: {}'.format(len(vocab_count)))
vocab_count.most_common(25)

Vocab length: 77035


[('the', 336600),
 (',', 275749),
 ('.', 275389),
 (' ', 229497),
 ('and', 164100),
 ('a', 162748),
 ('of', 145856),
 ('to', 135714),
 ('is', 110498),
 ('br', 101872),
 ('it', 96337),
 ('in', 93954),
 ('i', 87180),
 ('this', 75946),
 ('that', 73612),
 ("'s", 62289),
 ('was', 50514),
 ('as', 46934),
 ('for', 44331),
 ('with', 44125),
 ('movie', 43987),
 ('but', 42573),
 ('film', 40140),
 ('you', 34210),
 ('on', 34199)]

In [9]:
VOCAB_LIMIT = 100000
VOCAB_FREQ_MIN = 2

vocab_list = [o for o, c in vocab_count.most_common(VOCAB_LIMIT) if c > VOCAB_FREQ_MIN]
vocab_list.insert(0, '<unk>')
vocab_list.insert(0, '<pad>')
int2word_hash = {i:w for i, w in enumerate(vocab_list)}
word2int_hash = {v:k for k, v in int2word_hash.items()}

In [10]:
len(int2word_hash)

38212

In [11]:
tokenized_sequence_train = list()

for item in df_train['tokenized_text'].values:
    tokenized_sequence_train.append([word2int_hash.get(x,1) for x in item])

In [12]:
tokenized_sequence_test = list()

for item in df_test['tokenized_text'].values:
    tokenized_sequence_test.append([word2int_hash.get(x,1) for x in item])

In [13]:
np.random.seed(seed=42)
train_index = np.random.choice(range(0,len(tokenized_sequence_train)), size=int(0.95*len(tokenized_sequence_train)), replace=False)
val_index = np.setdiff1d(range(0, len(tokenized_sequence_train)), train_index)

In [14]:
X = sequence.pad_sequences(tokenized_sequence_train, maxlen=MAX_LEN)
X_train = X[train_index]
X_val = X[val_index]
X_test = sequence.pad_sequences(tokenized_sequence_test, maxlen=MAX_LEN)
y = np.vstack(df_train['target'].apply(lambda x: np.array([0,1]) if x == 1 else np.array([1,0])).values)
y_train = y[train_index]
y_val = y[val_index]
y_test = np.vstack(df_test['target'].apply(lambda x: np.array([0,1]) if x == 1 else np.array([1,0])).values)
y_test_rand = np.vstack(df_test['target'].apply(lambda x: np.array([0,1]) if np.random.choice([0,1], p=[df_train['target'].mean(), 1-df_train['target'].mean()]) == 1 else np.array([1,0])).values)

print(X_train.shape)
print(X_val.shape)
print(X_test.shape)
print(y_train.shape)
print(y_val.shape)
print(y_test.shape)

(23750, 200)
(1250, 200)
(25000, 200)
(23750, 2)
(1250, 2)
(25000, 2)


In [127]:
np.save('X_train', X_train)
np.save('X_val', X_val)
np.save('X_test', X_test)
np.save('y_train', y_train)
np.save('y_val', y_val)
np.save('y_test', y_test)

### Load Word Vector Model

In [15]:
import gensim

#### Load GoogleNews Vectors
Loading the GoogleNews vectors as a model is straighforward when using the gensim library

In [16]:
if os.path.isfile('wordvectors/GoogleNews-vectors-negative300.bin'):
    wv_model = gensim.models.KeyedVectors.load_word2vec_format('wordvectors/GoogleNews-vectors-negative300.bin', binary=True)
else:
    print('GoogleNews vectors not downloaded and extracted. Please see documentation above.')

In [23]:
len(wv_model.wv.vocab)

3000000

In [24]:
wv_model.wv.word_vec('hello')

array([-0.05419922,  0.01708984, -0.00527954,  0.33203125, -0.25      ,
       -0.01397705, -0.15039062, -0.265625  ,  0.01647949,  0.3828125 ,
       -0.03295898, -0.09716797, -0.16308594, -0.04443359,  0.00946045,
        0.18457031,  0.03637695,  0.16601562,  0.36328125, -0.25585938,
        0.375     ,  0.171875  ,  0.21386719, -0.19921875,  0.13085938,
       -0.07275391, -0.02819824,  0.11621094,  0.15332031,  0.09082031,
        0.06787109, -0.0300293 , -0.16894531, -0.20800781, -0.03710938,
       -0.22753906,  0.26367188,  0.012146  ,  0.18359375,  0.31054688,
       -0.10791016, -0.19140625,  0.21582031,  0.13183594, -0.03515625,
        0.18554688, -0.30859375,  0.04785156, -0.10986328,  0.14355469,
       -0.43554688, -0.0378418 ,  0.10839844,  0.140625  , -0.10595703,
        0.26171875, -0.17089844,  0.39453125,  0.12597656, -0.27734375,
       -0.28125   ,  0.14746094, -0.20996094,  0.02355957,  0.18457031,
        0.00445557, -0.27929688, -0.03637695, -0.29296875,  0.19

In [70]:
wv_model.most_similar(positive=['hello'], topn=3)

[('hi', 0.6548984050750732),
 ('goodbye', 0.639905571937561),
 ('howdy', 0.6310957074165344)]

In [71]:
wv_model.most_similar(positive=['world'], topn=3)

[('globe', 0.6945997476577759),
 ('theworld', 0.6902236342430115),
 ('country', 0.5980385541915894)]

In [72]:
wv_model.most_similar(positive=['Chicago'], topn=5)

[('Windy_City', 0.7112735509872437),
 ('Milwaukee', 0.6598065495491028),
 ('St._Louis', 0.6426857113838196),
 ('Peoria', 0.636687695980072),
 ('Naperville', 0.6306549310684204)]

In [65]:
wv_model.most_similar(positive=['movie'], topn=5)

[('film', 0.8676770925521851),
 ('movies', 0.8013108968734741),
 ('films', 0.7363011837005615),
 ('moive', 0.6830361485481262),
 ('Movie', 0.6693680286407471)]

In [73]:
wv_model.most_similar(positive=['king','woman'], negative=['man'], topn=1)

[('queen', 0.7118192911148071)]

#### Convert and Load Glove Vectors
Loading the Glove vectors using gensim takes one additional step to transform the txt file into the word2vec format; this is easily accomplished using the glove2word2vec script.  Glove vectors also come in various dimensions (50, 100, 200, 300), so you can pick the corresponding text file to load and covnvert based on need.

In [19]:
from gensim.scripts.glove2word2vec import glove2word2vec

In [20]:
glove_files = ['./wordvectors/glove.6B.50d.txt', './wordvectors/glove.6B.100d.txt', './wordvectors/glove.6B.200d.txt', './wordvectors/glove.6B.300d.txt']

for f in glove_files:
    print('Converting: {}'.format(f))
    glove_file = f
    tmp_file = f[:-4] + '.w2v'
    glove2word2vec(glove_file, tmp_file)

Converting: ./wordvectors/glove.6B.50d.txt
Converting: ./wordvectors/glove.6B.100d.txt
Converting: ./wordvectors/glove.6B.200d.txt
Converting: ./wordvectors/glove.6B.300d.txt


In [21]:
if os.path.isfile('wordvectors/wordvectors/glove.6B.50d.w2v'):
    wv_model = gensim.models.KeyedVectors.load_word2vec_format('wordvectors/glove.6B.50d.w2v')
else:
    print('Glove vectors not downloaded and converted. Please see documentation above.')

In [22]:
wv_model.wv.word_vec('hello')

array([-0.38497001,  0.80092001,  0.064106  , -0.28354999, -0.026759  ,
       -0.34531999, -0.64253002, -0.11729   , -0.33256999,  0.55242997,
       -0.087813  ,  0.90350002,  0.47102001,  0.56656998,  0.69849998,
       -0.35229   , -0.86541998,  0.90573001,  0.03576   , -0.071705  ,
       -0.12327   ,  0.54922998,  0.47005001,  0.35572001,  1.26110005,
       -0.67580998, -0.94983   ,  0.68665999,  0.38710001, -1.34920001,
        0.63511997,  0.46416   , -0.48813999,  0.83827001, -0.92460001,
       -0.33722001,  0.53741002, -1.06159997, -0.081403  , -0.67110997,
        0.30923   , -0.39230001, -0.55001998, -0.68826997,  0.58048999,
       -0.11626   ,  0.013139  , -0.57653999,  0.048833  ,  0.67203999], dtype=float32)

### Create Embedding Matrix

We need to create a bank of word vectors that are associated with the integer hashes used in the tokenizer.  To do this we loop through the tokenizer vocabulary, and try to find the word in our word2vec model.  If it exists, we add the vector to an embedding matrix in the row corresponding to the integer has in the tokenizer.  If it does not exist, then we fill in that row with a random vector.  The end result is an embedding weights matrix that is of the shape (vocab_size + 1, word_embedding_size).  The additional row on the vocab is for index 0, which represents a special padding value.

In [17]:
def create_embedding_matrix(wv_model, index_word_dict, max_len=None, verbose=False):
    EMBED_SIZE = wv_model.vector_size
    unknown_word_count = 0
    unknown_word_list = list()
    
    def choose_embedded_vector(wv_model, word, unknown_word_count, verbose=verbose):
        if word in wv_model.wv.vocab:
            return wv_model.wv.word_vec(word), unknown_word_count
        else:
            if verbose:
                unknown_word_list.append(word)
            return np.random.uniform(low=-0.05, high=0.05, size=EMBED_SIZE), (unknown_word_count+1)

    word_index_dict = {v:k for k, v in index_word_dict.items()}
    num_words = max_len + 1 if max_len else len(index_word_dict) + 1


    embedding_weights = np.zeros((num_words, EMBED_SIZE))
    for word, index in word_index_dict.items():
        if max_len:
            if index <= max_len:
                embedding_weights[index,:], unknown_word_count = choose_embedded_vector(wv_model, word, unknown_word_count, verbose=verbose)
        else:
            embedding_weights[index,:], unknown_word_count = choose_embedded_vector(wv_model, word, unknown_word_count, verbose=verbose)
    
    if verbose:
        print('The size of the keras token vocab is: {}'.format(num_words))
        print('The size of the word vector vocab is: {}'.format(EMBED_SIZE))
        print('Embedding matrix shape: {}'.format(embedding_weights.shape))
        print('Total amount of words not found in gensim word2vec model: {}'.format(unknown_word_count))
        print('The words not found in gensim word2vec model: {}'.format(str(unknown_word_list)))
        
    
    return embedding_weights

In [18]:
embedding_weights = create_embedding_matrix(wv_model, int2word_hash, verbose=True)

The size of the keras token vocab is: 38213
The size of the word vector vocab is: 300
Embedding matrix shape: (38213, 300)
Total amount of words not found in gensim word2vec model: 7266
The words not found in gensim word2vec model: ['<pad>', '<unk>', ',', '.', ' ', 'and', 'a', 'of', 'to', "'s", '!', '  ', "'", '?', '...', '10', '..', '   ', 'mr.', '20', '30', '80', 'dr.', '90', '15', '100', 'humour', '70', '50', '40', '11', 'favourite', '12', 'theatre', 'keaton', '80s', '70s', '60', 'tarzan', 'ms.', '13', 'welles', 'sinatra', 'i.e.', 'u.s.', '14', 'hitchcock', 'columbo', 'pacino', 'mrs.', 'lugosi', 'branagh', '25', '60s', '1950', '17', 'sandler', '1980', '2000', '1970', 'bettie', '2006', 'stanwyck', '16', 'brando', 'wwii', 'grey', 'matthau', '2001', '1950s', 'seagal', 'widmark', 'spielberg', 'elvira', '18', 'streisand', 'karloff', 'flynn', '24', 'walken', 'denzel', 'eastwood', '2005', 'colour', 'cusack', 'heston', 'mst3', 'ritter', '1990', 'j.', 'astaire', 'miike', '1970s', 'carrey', '

### Generate Model Architecture

In [19]:
from keras.models import Model, load_model
from keras.layers import Input, Embedding, GlobalMaxPool1D, GlobalAvgPool1D, Dense, Dropout, Conv1D, concatenate, LSTM, Bidirectional, BatchNormalization, Layer
from keras.regularizers import l2
from keras import initializers, regularizers, constraints
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.optimizers import SGD, Adam
from keras import backend as K

In [48]:
EMBED_SIZE = wv_model.vector_size
N_CLASSES = y_train.shape[1]
MAX_FEATURES = embedding_weights.shape[0]
HIDDEN_LAYER_SIZE = 128
CONV_FILTER_SIZE = 128
CONV_KERNEL_SIZE = 2


def create_model(MAX_LEN, MAX_FEATURES, EMBED_SIZE, HIDDEN_LAYER_SIZE, N_CLASSES, embedding_weights=None,
                 embedding_trainable=False, global_max_avg = 'max', learning_rate=0.0001):
    input_sequence = Input(shape=(MAX_LEN, ))
    
    x = Embedding(input_dim=MAX_FEATURES, output_dim=EMBED_SIZE, mask_zero=False, weights=[embedding_weights],
                  trainable=embedding_trainable)(input_sequence)
    x = Conv1D(CONV_FILTER_SIZE, CONV_KERNEL_SIZE, padding='same')(x)
    if global_max_avg == 'max':
        x = GlobalMaxPool1D()(x)
    else:
        x = GlobalAvgPool1D()(x)
    x = Dense(HIDDEN_LAYER_SIZE, activation='relu')(x)
    prediction = Dense(N_CLASSES, activation='softmax')(x)
    
    opt = Adam(lr=learning_rate)
    
    model = Model(inputs=input_sequence, outputs=prediction)
    model.compile(loss='categorical_crossentropy',
                  optimizer=opt,
                  metrics=['accuracy'])

    return model

In [49]:
model = create_model(MAX_LEN, MAX_FEATURES, EMBED_SIZE, HIDDEN_LAYER_SIZE, N_CLASSES,
                     embedding_weights=embedding_weights, embedding_trainable=False, global_max_avg = 'max', 
                     learning_rate=0.001)

In [50]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         (None, 200)               0         
_________________________________________________________________
embedding_2 (Embedding)      (None, 200, 300)          11463900  
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 200, 128)          76928     
_________________________________________________________________
global_max_pooling1d_2 (Glob (None, 128)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 128)               16512     
_________________________________________________________________
dense_4 (Dense)              (None, 2)                 258       
Total params: 11,557,598
Trainable params: 93,698
Non-trainable params: 11,463,900
___________________________________________________________

### Train Model

In [51]:
from sklearn.metrics import classification_report, precision_recall_fscore_support, roc_auc_score

In [52]:
BATCH_SIZE = 1024
EPOCHS = 300
FILE_PATH = "models/keras_model_weights.hdf5"
checkpoint = ModelCheckpoint(FILE_PATH, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
early = EarlyStopping(monitor="val_loss", mode="min", patience=10)
callbacks_list = [checkpoint, early]

In [53]:
model.fit(X_train, y_train,
          batch_size=BATCH_SIZE,
          epochs=EPOCHS,
          validation_data=[X_val, y_val],
          callbacks=callbacks_list)

Train on 23750 samples, validate on 1250 samples
Epoch 1/300
Epoch 00001: val_loss improved from inf to 0.49054, saving model to models/keras_model_weights.hdf5
Epoch 2/300
Epoch 00002: val_loss improved from 0.49054 to 0.36974, saving model to models/keras_model_weights.hdf5
Epoch 3/300
Epoch 00003: val_loss improved from 0.36974 to 0.34933, saving model to models/keras_model_weights.hdf5
Epoch 4/300
Epoch 00004: val_loss improved from 0.34933 to 0.33288, saving model to models/keras_model_weights.hdf5
Epoch 5/300
Epoch 00005: val_loss improved from 0.33288 to 0.32014, saving model to models/keras_model_weights.hdf5
Epoch 6/300
Epoch 00006: val_loss improved from 0.32014 to 0.31302, saving model to models/keras_model_weights.hdf5
Epoch 7/300
Epoch 00007: val_loss improved from 0.31302 to 0.31249, saving model to models/keras_model_weights.hdf5
Epoch 8/300
Epoch 00008: val_loss improved from 0.31249 to 0.30544, saving model to models/keras_model_weights.hdf5
Epoch 9/300
Epoch 00009: va

<keras.callbacks.History at 0x7f6633220278>

In [54]:
model = load_model(FILE_PATH)
y_hat = model.predict(X_test)
y_hat = y_hat[:,1] 

In [55]:
print(classification_report(y_test[:,1], y_hat > 0.5))

             precision    recall  f1-score   support

          0       0.89      0.86      0.88     12500
          1       0.87      0.89      0.88     12500

avg / total       0.88      0.88      0.88     25000



In [56]:
scores = precision_recall_fscore_support(y_test[:,1], y_hat > 0.5)
roc = roc_auc_score(y_test[:,1], y_hat)
print(scores, roc)

(array([ 0.89130435,  0.86691986]), array([ 0.86264,  0.8948 ]), array([ 0.87673795,  0.88063932]), array([12500, 12500])) 0.9507295104


#### Concat Multiple Conv Layers

In [20]:
EMBED_SIZE = wv_model.vector_size
N_CLASSES = y_train.shape[1]
MAX_FEATURES = embedding_weights.shape[0]
HIDDEN_LAYER_SIZE = 128
CONV_FILTER_SIZE = 128
CONV_KERNEL_SIZE = 2


def create_multiconv_model(MAX_LEN, MAX_FEATURES, EMBED_SIZE, HIDDEN_LAYER_SIZE, N_CLASSES, embedding_weights=None,
                 embedding_trainable=False, global_max_avg = 'max', learning_rate=0.0001):
    input_sequence = Input(shape=(MAX_LEN, ))
    
    x = Embedding(input_dim=MAX_FEATURES, output_dim=EMBED_SIZE, mask_zero=False, weights=[embedding_weights],
                  trainable=embedding_trainable)(input_sequence)
    x = Dropout(0.5)(x)
    c3 = Conv1D(CONV_FILTER_SIZE, 3, padding='same')(x)
    c4 = Conv1D(CONV_FILTER_SIZE, 4, padding='same')(x)
    c5 = Conv1D(CONV_FILTER_SIZE, 5, padding='same')(x)
    x = concatenate([c3, c4, c5])
    x = Conv1D(CONV_FILTER_SIZE, 3, padding='same')(x)
    x = GlobalMaxPool1D()(x)
    x = Dense(HIDDEN_LAYER_SIZE, activation='relu')(x)
    prediction = Dense(N_CLASSES, activation='softmax')(x)
    
    opt = Adam(lr=learning_rate)
    
    model = Model(inputs=input_sequence, outputs=prediction)
    model.compile(loss='categorical_crossentropy',
                  optimizer=opt,
                  metrics=['accuracy'])

    return model

In [21]:
model = create_multiconv_model(MAX_LEN, MAX_FEATURES, EMBED_SIZE, HIDDEN_LAYER_SIZE, N_CLASSES,
                     embedding_weights=embedding_weights, embedding_trainable=False, global_max_avg = 'max', 
                     learning_rate=0.001)

In [22]:
BATCH_SIZE = 1024
EPOCHS = 300
FILE_PATH = "models/keras_model_weights.hdf5"
checkpoint = ModelCheckpoint(FILE_PATH, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
early = EarlyStopping(monitor="val_loss", mode="min", patience=10)
callbacks_list = [checkpoint, early]

In [23]:
model.fit(X_train, y_train,
          batch_size=BATCH_SIZE,
          epochs=EPOCHS,
          validation_data=[X_val, y_val],
          callbacks=callbacks_list)

Train on 23750 samples, validate on 1250 samples
Epoch 1/300
Epoch 00001: val_loss improved from inf to 0.46862, saving model to models/keras_model_weights.hdf5
Epoch 2/300
Epoch 00002: val_loss improved from 0.46862 to 0.35805, saving model to models/keras_model_weights.hdf5
Epoch 3/300
Epoch 00003: val_loss improved from 0.35805 to 0.33644, saving model to models/keras_model_weights.hdf5
Epoch 4/300
Epoch 00004: val_loss improved from 0.33644 to 0.31690, saving model to models/keras_model_weights.hdf5
Epoch 5/300
Epoch 00005: val_loss improved from 0.31690 to 0.30893, saving model to models/keras_model_weights.hdf5
Epoch 6/300
Epoch 00006: val_loss improved from 0.30893 to 0.30264, saving model to models/keras_model_weights.hdf5
Epoch 7/300
Epoch 00007: val_loss did not improve
Epoch 8/300
Epoch 00008: val_loss did not improve
Epoch 9/300
Epoch 00009: val_loss did not improve
Epoch 10/300
Epoch 00010: val_loss improved from 0.30264 to 0.29678, saving model to models/keras_model_weigh

<keras.callbacks.History at 0x7f592a2d3390>

In [24]:
model = load_model(FILE_PATH)
y_hat = model.predict(X_test)
y_hat = y_hat[:,1] 

In [90]:
print(classification_report(y_test[:,1], y_hat > 0.5))

             precision    recall  f1-score   support

          0       0.91      0.87      0.89     12500
          1       0.88      0.91      0.89     12500

avg / total       0.89      0.89      0.89     25000



### Visualize Conv Activations

In [57]:
from IPython.display import display, HTML
import matplotlib.pyplot as plt
from matplotlib.colors import rgb2hex

In [58]:
def create_get_activation_function(model, output_layer_int):
    inp = model.input
    output = model.layers[output_layer_int].output
    get_activations = K.function([inp]+ [K.learning_phase()], [output])
    return get_activations

In [59]:
act_model = load_model(filepath=FILE_PATH)

In [60]:
act_model.layers

[<keras.engine.topology.InputLayer at 0x7f6632a3b2e8>,
 <keras.layers.embeddings.Embedding at 0x7f6632a3b358>,
 <keras.layers.convolutional.Conv1D at 0x7f6632a3b550>,
 <keras.layers.pooling.GlobalMaxPooling1D at 0x7f6632a3b438>,
 <keras.layers.core.Dense at 0x7f6632a3b5f8>,
 <keras.layers.core.Dense at 0x7f6632a3b128>]

In [61]:
get_activations = create_get_activation_function(act_model, 2)

In [62]:
cmap = plt.cm.get_cmap('RdYlGn')

In [64]:
example = 10

html_string = '<p>For training example: ' + str(example) + '</p>'
text = [int2word_hash.get(x,'') for x in X_train[example]]
html_string += '<p>Example Text: ' + str(' '.join(text)) + '</p>'

for node in range(CONV_FILTER_SIZE):
    activations = get_activations([[X_train[example]], 0.])[0]
    scaled_activations = activations[0,:,node] - activations[0,:,node].min()
    scaled_activations = scaled_activations / scaled_activations.max()
    scaled_activations = pd.rolling_mean(scaled_activations, CONV_KERNEL_SIZE, min_periods=1)
    new_string = ''
    for i, t in enumerate(text):
        new_string += '<span style="background-color: ' + str(rgb2hex(cmap(scaled_activations[i]))) + '">' + t + '</span>' + ' '

    html_string += '<p>CNN Filter: ' + str(node) + '</p><p>' + new_string + '</p>'
    
    
h = HTML(html_string)
display(h)

  # This is added back by InteractiveShellApp.init_path()
