# Sentiment Classification


## Loading the dataset

In [1]:
from keras.datasets import imdb

vocab_size = 10000 #vocab size

(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=vocab_size) # vocab_size is no.of words to consider from the dataset, ordering based on frequency.

Using TensorFlow backend.


Downloading data from https://s3.amazonaws.com/text-datasets/imdb.npz


In [2]:
from keras.preprocessing.sequence import pad_sequences
vocab_size = 10000 #vocab size
maxlen = 300  #number of word used from each review

## Train test split

In [3]:
#load dataset as a list of ints
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=vocab_size)
#make all sequences of the same length
x_train = pad_sequences(x_train, maxlen=maxlen)
x_test =  pad_sequences(x_test, maxlen=maxlen)

In [4]:
import numpy as np

movies_data = np.concatenate((x_train, x_test), axis=0)
targets = np.concatenate((y_train, y_test), axis=0)

print("Categories:", np.unique(targets))
print("Number of unique words:", len(np.unique(np.hstack(movies_data))))

Categories: [0 1]
Number of unique words: 9999


In [11]:
print("Label:", targets[0])
print(movies_data[0])
index = imdb.get_word_index()
ind_reversed = dict([(value, key) for (key, value) in index.items()]) 
review = " ".join( [ind_reversed.get(i - 3, "") for i in movies_data[0]] )
print(review.strip()) 

Label: 1
[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    1   14
   22   16   43  530  973 1622 1385   65  458 4468   66 3941    4  173
   36  256    5   25  100   43  838  112   50  670    2    9   35  480
  284    5  150    4  172  112  167    2  336  385   39    4  172 4536
 1111   17  546   38   13  447    4  192   50   16    6  147 2025   19
   14   22    4 1920 4613  469    4   22   71   87   12   16   43  530
   38   76   15   13 1247    4   22   17  515   17   12   16  626   18
    2    5   62  386   12    8  316    8  106    5    4 2223 5244   16
  480   66 3785   33    4  130   12   16   38  619    5   25  124   

## Build Keras Embedding Layer Model
We can think of the Embedding layer as a dicionary that maps a index assigned to a word to a word vector. This layer is very flexible and can be used in a few ways:

* The embedding layer can be used at the start of a larger deep learning model. 
* Also we could load pre-train word embeddings into the embedding layer when we create our model.
* Use the embedding layer to train our own word2vec models.

The keras embedding layer doesn't require us to onehot encode our words, instead we have to give each word a unqiue intger number as an id. For the imdb dataset we've loaded this has already been done, but if this wasn't the case we could use sklearn [LabelEncoder](http://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.LabelEncoder.html).

In [12]:
from keras import layers
from keras import models
from keras.layers.embeddings import Embedding
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten

model = Sequential()
model.add(Embedding(vocab_size, 32, input_length=maxlen))
model.add(Flatten())
model.add(Dense(250, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 300, 32)           320000    
_________________________________________________________________
flatten_1 (Flatten)          (None, 9600)              0         
_________________________________________________________________
dense_1 (Dense)              (None, 250)               2400250   
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 251       
Total params: 2,720,501
Trainable params: 2,720,501
Non-trainable params: 0
_________________________________________________________________


In [13]:
# Compile the model
model.compile(
 optimizer = "adam",
 loss = "binary_crossentropy",
 metrics = ["accuracy"]
)

In [14]:
#Fit the model
model.fit(x_train, y_train, validation_data=(x_test, y_test), epochs=2, batch_size=128, verbose=2)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 25000 samples, validate on 25000 samples
Epoch 1/2
 - 4s - loss: 0.4470 - accuracy: 0.7655 - val_loss: 0.2995 - val_accuracy: 0.8701
Epoch 2/2
 - 2s - loss: 0.1354 - accuracy: 0.9519 - val_loss: 0.3427 - val_accuracy: 0.8632


<keras.callbacks.callbacks.History at 0x7f8edcd5f6d8>

In [15]:
# Final evaluation of the model
scores = model.evaluate(x_test, y_test, verbose=0)
print("Accuracy: %.2f%%" % (scores[1]*100))

Accuracy: 86.32%


## Retrive the output of each layer in keras for a given single test sample from the trained model you built

In [16]:
from keras import backend as K

inp = model.input                                           # input placeholder
outputs = [layer.output for layer in model.layers]          # all layer outputs
functors = [K.function([inp], [out]) for out in outputs]    # evaluation functions

In [17]:
# Testing
test = np.random.random(maxlen)[np.newaxis,...]
layer_outs = [func([test]) for func in functors]
print(layer_outs)

[[array([[[ 0.0025931 , -0.00346883, -0.00214778, ..., -0.0001134 ,
         -0.00250848,  0.00196609],
        [ 0.0025931 , -0.00346883, -0.00214778, ..., -0.0001134 ,
         -0.00250848,  0.00196609],
        [ 0.0025931 , -0.00346883, -0.00214778, ..., -0.0001134 ,
         -0.00250848,  0.00196609],
        ...,
        [ 0.0025931 , -0.00346883, -0.00214778, ..., -0.0001134 ,
         -0.00250848,  0.00196609],
        [ 0.0025931 , -0.00346883, -0.00214778, ..., -0.0001134 ,
         -0.00250848,  0.00196609],
        [ 0.0025931 , -0.00346883, -0.00214778, ..., -0.0001134 ,
         -0.00250848,  0.00196609]]], dtype=float32)], [array([[ 0.0025931 , -0.00346883, -0.00214778, ..., -0.0001134 ,
        -0.00250848,  0.00196609]], dtype=float32)], [array([[0.09482387, 0.        , 0.1511667 , 0.15032326, 0.1448444 ,
        0.        , 0.04980447, 0.15447068, 0.        , 0.06558153,
        0.03426921, 0.09843654, 0.        , 0.08733436, 0.        ,
        0.09648407, 0.00824288

In [27]:
predicted = model.predict_classes(x_test)
for i in range(5):
	print("Y=%s, Predicted=%s" % (y_test[i], predicted[i]))
 
print(x_train.shape)

x_test_review_0 = " ".join( [ind_reversed.get(i - 3, "") for i in movies_data[25000]] )
print(x_test_review_0.strip()) 

x_test_review_1 = " ".join( [ind_reversed.get(i - 3, "") for i in movies_data[25001]] )
print(x_test_review_1.strip()) 

x_test_review_2 = " ".join( [ind_reversed.get(i - 3, "") for i in movies_data[25002]] )
print(x_test_review_2.strip()) 

x_test_review_3 = " ".join( [ind_reversed.get(i - 3, "") for i in movies_data[25003]] )
print(x_test_review_3.strip()) 

x_test_review_4 = " ".join( [ind_reversed.get(i - 3, "") for i in movies_data[25004]] )
print(x_test_review_4.strip()) 

Y=0, Predicted=[0]
Y=1, Predicted=[1]
Y=1, Predicted=[1]
Y=0, Predicted=[0]
Y=1, Predicted=[1]
(25000, 300)
please give this one a miss br br   and the rest of the cast rendered terrible performances the show is flat flat flat br br i don't know how michael madison could have allowed this one on his plate he almost seemed to know this wasn't going to work out and his performance was quite  so all you madison fans give this a miss
this film requires a lot of patience because it focuses on mood and character development the plot is very simple and many of the scenes take place on the same set in frances  the sandy dennis character apartment but the film builds to a disturbing climax br br the characters create an atmosphere  with sexual tension and psychological  it's very interesting that robert altman directed this considering the style and structure of his other films still the trademark altman audio style is evident here and there i think what really makes this film work is the brill

As we can see model is able to predict sentiment correctly.