# Sentiment Classification


## Loading the dataset

In [1]:
from keras.datasets import imdb

vocab_size = 10000 #vocab size

Using TensorFlow backend.


In [2]:
import numpy as np
from functools import partial

# save np.load
#old = np.load
old = partial(np.load)
# modify the default parameters of np.load
np.load = lambda *a,**k: old(allow_pickle=True,*a,**k)
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=vocab_size) # vocab_size is no.of words to consider from the dataset, ordering based on frequency.
np.load = old


Downloading data from https://s3.amazonaws.com/text-datasets/imdb.npz


In [0]:
from keras.preprocessing.sequence import pad_sequences
vocab_size = 10000 #vocab size
maxlen = 20  #number of word used from each review

## Train test split

In [0]:
#load dataset as a list of ints
#(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=vocab_size)
#make all sequences of the same length
x_train = pad_sequences(x_train, maxlen=maxlen)
x_test =  pad_sequences(x_test, maxlen=maxlen)

Lets check X_train and y_train data

In [5]:
x_train[0]

array([  65,   16,   38, 1334,   88,   12,   16,  283,    5,   16, 4472,
        113,  103,   32,   15,   16, 5345,   19,  178,   32], dtype=int32)

In [6]:
y_train[0]

1

# Get the word index and then Create a key-value pair for word and word_id

In [7]:
word_to_id=imdb.get_word_index()

Downloading data from https://s3.amazonaws.com/text-datasets/imdb_word_index.json


In [13]:
word_to_id

{u'fawn': 34701,
 u'tsukino': 52006,
 u'nunnery': 52007,
 u'sonja': 16816,
 u'vani': 63951,
 u'woods': 1408,
 u'spiders': 16115,
 u'hanging': 2345,
 u'woody': 2289,
 u'trawling': 52008,
 u"hold's": 52009,
 u'comically': 11307,
 u'localized': 40830,
 u'disobeying': 30568,
 u"'royale": 52010,
 u"harpo's": 40831,
 u'canet': 52011,
 u'aileen': 19313,
 u'acurately': 52012,
 u"diplomat's": 52013,
 u'rickman': 25242,
 u'rumbustious': 52014,
 u'familiarness': 52015,
 u"spider'": 52016,
 u'hahahah': 68804,
 u"wood'": 52017,
 u'transvestism': 40833,
 u"hangin'": 34702,
 u'screaming': 1927,
 u'seamier': 40834,
 u'wooded': 34703,
 u'bravora': 52018,
 u'grueling': 16817,
 u'wooden': 1636,
 u'wednesday': 16818,
 u"'prix": 52019,
 u'altagracia': 34704,
 u'circuitry': 52020,
 u'crotch': 11585,
 u'busybody': 57766,
 u"tart'n'tangy": 52021,
 u'pantheistic': 52022,
 u'thrace': 52023,
 u"tom's": 11038,
 u'snuggles': 52025,
 u"frasier's": 52026,
 u'complainers': 52027,
 u'templarios': 52125,
 u'272': 40835

In [0]:
id_to_word={value:key for key , value in word_to_id.items()}

In [18]:
id_to_word[9097]

u'snatch'

## Build Keras Embedding Layer Model
We can think of the Embedding layer as a dicionary that maps a index assigned to a word to a word vector. This layer is very flexible and can be used in a few ways:

* The embedding layer can be used at the start of a larger deep learning model. 
* Also we could load pre-train word embeddings into the embedding layer when we create our model.
* Use the embedding layer to train our own word2vec models.

The keras embedding layer doesn't require us to onehot encode our words, instead we have to give each word a unqiue intger number as an id. For the imdb dataset we've loaded this has already been done, but if this wasn't the case we could use sklearn [LabelEncoder](http://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.LabelEncoder.html).

In [0]:
from pickle import load
from numpy import array
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Model 
from keras.models import Sequential
from keras.layers import Input
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import Embedding

In [20]:
# define the model
model = Sequential()
model.add(Embedding(vocab_size, 50, input_length=maxlen))
model.add(Flatten())
model.add(Dense(1, activation='sigmoid'))
# compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
# summarize the model
print(model.summary())

W0215 12:15:32.608918 139809852520320 module_wrapper.py:139] From /usr/local/lib/python2.7/dist-packages/keras/backend/tensorflow_backend.py:74: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.

W0215 12:15:32.619138 139809852520320 module_wrapper.py:139] From /usr/local/lib/python2.7/dist-packages/keras/backend/tensorflow_backend.py:517: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.

W0215 12:15:32.627859 139809852520320 module_wrapper.py:139] From /usr/local/lib/python2.7/dist-packages/keras/backend/tensorflow_backend.py:4138: The name tf.random_uniform is deprecated. Please use tf.random.uniform instead.

W0215 12:15:32.653873 139809852520320 module_wrapper.py:139] From /usr/local/lib/python2.7/dist-packages/keras/optimizers.py:790: The name tf.train.Optimizer is deprecated. Please use tf.compat.v1.train.Optimizer instead.

W0215 12:15:32.670579 139809852520320 module_wrapper.py:139] From /usr/local

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 20, 50)            500000    
_________________________________________________________________
flatten_1 (Flatten)          (None, 1000)              0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 1001      
Total params: 501,001
Trainable params: 501,001
Non-trainable params: 0
_________________________________________________________________
None


In [21]:
# fit the model
model.fit(x_train, y_train, epochs=50, verbose=0)
# evaluate the model
loss, accuracy = model.evaluate(x_test, y_test, verbose=0)
print('Accuracy: %f' % (accuracy*100))

W0215 12:17:17.732223 139809852520320 module_wrapper.py:139] From /usr/local/lib/python2.7/dist-packages/keras/backend/tensorflow_backend.py:986: The name tf.assign_add is deprecated. Please use tf.compat.v1.assign_add instead.

W0215 12:17:17.795504 139809852520320 module_wrapper.py:139] From /usr/local/lib/python2.7/dist-packages/keras/backend/tensorflow_backend.py:973: The name tf.assign is deprecated. Please use tf.compat.v1.assign instead.

W0215 12:17:17.817591 139809852520320 module_wrapper.py:139] From /usr/local/lib/python2.7/dist-packages/keras/backend/tensorflow_backend.py:2741: The name tf.Session is deprecated. Please use tf.compat.v1.Session instead.

W0215 12:17:17.822990 139809852520320 module_wrapper.py:139] From /usr/local/lib/python2.7/dist-packages/keras/backend/tensorflow_backend.py:174: The name tf.get_default_session is deprecated. Please use tf.compat.v1.get_default_session instead.

W0215 12:17:17.823956 139809852520320 module_wrapper.py:139] From /usr/local/li

Accuracy: 72.016000


## Retrive the output of each layer in keras for a given single test sample from the trained model you built

In [0]:
from keras import backend as K

# with a Sequential model
get_first_layer_output = K.function([model.layers[0].input],
                                  [model.layers[0].output])
layer_output = get_first_layer_output([x_train])[0]

In [26]:
layer_output

array([[[ 0.6509492 ,  0.22907811,  0.0316095 , ..., -0.17147371,
         -0.1449931 ,  0.23516747],
        [-0.11961807,  0.04291075, -0.00316352, ..., -0.14503194,
          0.14754272,  0.35003486],
        [ 0.30882892,  0.10604101, -0.04369019, ...,  0.28334188,
         -0.01279384, -0.14620961],
        ...,
        [ 0.07816771,  0.05036458, -0.02341873, ..., -0.09876047,
         -0.04911187, -0.11416675],
        [ 0.26770008,  0.07940064, -0.23177004, ..., -0.3359184 ,
          0.07509322,  0.29035595],
        [ 0.21786287,  0.2187307 , -0.04495793, ..., -0.1753574 ,
         -0.23102003, -0.32779476]],

       [[ 0.00638818, -0.01570698, -0.03227109, ..., -0.06457037,
          0.05568272,  0.13160989],
        [ 0.05277974, -0.02564324,  0.00352069, ...,  0.02032801,
         -0.01951245,  0.01196554],
        [-0.22398353,  0.25401175,  0.1319215 , ..., -0.18696077,
         -0.02116876,  0.41942525],
        ...,
        [-0.11961807,  0.04291075, -0.00316352, ..., -

In [0]:
get_2nd_layer_output = K.function([model.layers[1].input],
                                  [model.layers[1].output])
layer_output_2 = get_2nd_layer_output([x_train])[0]

In [31]:
layer_output_2

array([[6.500e+01, 1.600e+01, 3.800e+01, ..., 1.900e+01, 1.780e+02,
        3.200e+01],
       [2.300e+01, 4.000e+00, 1.690e+03, ..., 1.600e+01, 1.450e+02,
        9.500e+01],
       [1.352e+03, 1.300e+01, 1.910e+02, ..., 7.000e+00, 1.290e+02,
        1.130e+02],
       ...,
       [1.100e+01, 1.818e+03, 7.561e+03, ..., 4.000e+00, 3.586e+03,
        2.000e+00],
       [9.200e+01, 4.010e+02, 7.280e+02, ..., 1.200e+01, 9.000e+00,
        2.300e+01],
       [7.640e+02, 4.000e+01, 4.000e+00, ..., 2.040e+02, 1.310e+02,
        9.000e+00]], dtype=float32)