# Sentiment Classification


### Loading the dataset (5 points)

In [1]:
from keras.datasets import imdb

import numpy as np
# Recent NumPy releases default np.load to allow_pickle=False, which can make
# imdb.load_data() fail on the pickled IMDB archive with older Keras versions.
# Patch the *default* only, so callers that already pass allow_pickle=...
# (as newer Keras versions do) keep working.
# NOTE(security): allow_pickle=True can execute arbitrary code from a crafted
# file -- only load archives you trust.
def _allow_pickle_by_default(load):
    """Wrap ``load`` so that ``allow_pickle`` defaults to True.

    Args:
        load: the callable to wrap (expected to be the unpatched ``np.load``).

    Returns:
        A function with the same call signature as ``load``. An explicit
        ``allow_pickle=...`` from the caller is passed through unchanged.
        The wrapped callable is exposed as ``._unpatched``.
    """
    def _load(*a, **k):
        # setdefault instead of a hard-coded keyword: forcing the keyword
        # raised "multiple values for keyword argument" for explicit callers.
        k.setdefault("allow_pickle", True)
        return load(*a, **k)

    _load._unpatched = load
    return _load


# save np.load (unwrap first so re-running this cell never stacks wrappers)
np_load_old = getattr(np.load, "_unpatched", np.load)

# modify the default parameters of np.load
np.load = _allow_pickle_by_default(np_load_old)

# Vocabulary cap: only the `vocab_size` most frequent words are kept.
vocab_size = 10000

# load_data returns ((x_train, y_train), (x_test, y_test)); unpack per split.
imdb_splits = imdb.load_data(num_words=vocab_size)
x_train, y_train = imdb_splits[0]
x_test, y_test = imdb_splits[1]

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
from keras.preprocessing.sequence import pad_sequences
# Preprocessing settings used by the loading/padding and model cells below.
maxlen = 300        # number of tokens kept per review (padding length)
vocab_size = 10000  # number of most-frequent words kept in the vocabulary

## Train test split ( 5 points)

In [3]:
# Each review arrives as a variable-length list of integer word ids.
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=vocab_size)

# Bring every review to exactly `maxlen` ids so the splits become 2-D arrays.
x_train, x_test = (
    pad_sequences(split, maxlen=maxlen) for split in (x_train, x_test)
)

In [4]:
# Sanity check: the padded training matrix should be (n_reviews, maxlen).
x_train.shape

(25000, 300)

In [5]:
# Sanity check: the padded test matrix should have the same width (maxlen).
x_test.shape

(25000, 300)

In [6]:
# One label per training review.
y_train.shape

(25000,)

In [7]:
# Inspect one padded review: shorter reviews are left-filled with zeros.
x_train[0]

array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    1,   14,   22,   16,   43,  530,
        973, 1622, 1385,   65,  458, 4468,   66, 3941,    4,  173,   36,
        256,    5,   25,  100,   43,  838,  112,   50,  670,    2,    9,
         35,  480,  284,    5,  150,    4,  172,  112,  167,    2,  336,
        385,   39,    4,  172, 4536, 1111,   17,  546,   38,   13,  447,
          4,  192,   50,   16,    6,  147, 2025,   19,   14,   22,    4,
       1920, 4613,  469,    4,   22,   71,   87,   

In [8]:
# Confirm the target is binary (only 0 and 1 appear).
np.unique(y_train)

array([0, 1], dtype=int64)

## Build Keras Embedding Layer Model (30 points)
We can think of the Embedding layer as a dictionary that maps an index assigned to a word to a word vector. This layer is very flexible and can be used in a few ways:

* The embedding layer can be used at the start of a larger deep learning model. 
* We could also load pre-trained word embeddings into the embedding layer when we create our model.
* Use the embedding layer to train our own word2vec models.

The Keras embedding layer doesn't require us to one-hot encode our words; instead, we have to give each word a unique integer number as an id. For the imdb dataset we've loaded, this has already been done, but if this weren't the case we could use sklearn [LabelEncoder](http://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.LabelEncoder.html).

In [9]:
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.layers import Flatten
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence

from keras import regularizers, optimizers, losses, metrics

In [10]:
# Baseline classifier: 32-dimensional learned word embeddings, flattened and
# fed to one 250-unit ReLU layer, with 50% dropout before and after it.
Init_model = Sequential()
Init_model.add(Embedding(vocab_size, 32, input_length=maxlen))
Init_model.add(Flatten())
Init_model.add(Dropout(0.5))
Init_model.add(Dense(250, activation='relu'))
Init_model.add(Dropout(0.5))
# Single sigmoid unit: output in [0, 1] to match the binary 0/1 labels.
Init_model.add(Dense(1, activation='sigmoid'))
Init_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints the table itself and returns None; wrapping it in print()
# only appended a stray "None" line to the cell output.
Init_model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 300, 32)           320000    
_________________________________________________________________
flatten_1 (Flatten)          (None, 9600)              0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 9600)              0         
_________________________________________________________________
dense_1 (Dense)              (None, 250)               2400250   
_________________________________________________________________
dropout_2 (Dropout)          (None, 250)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 251       
Total params: 2,720,501
Trainable params: 2,720,501
Non-trainable params: 0
_________________________________________________________________


In [11]:
# Smaller, regularised variant of the baseline: the hidden layer shrinks to
# 16 units and its kernel carries an L1 penalty (0.001); trained with RMSprop.
optimized_model = Sequential()
optimized_model.add(Embedding(vocab_size, 32, input_length=maxlen))
optimized_model.add(Flatten())
optimized_model.add(Dropout(0.5))
optimized_model.add(Dense(16, kernel_regularizer=regularizers.l1(0.001),activation='relu'))
optimized_model.add(Dropout(0.5))
# Single sigmoid unit: output in [0, 1] to match the binary 0/1 labels.
optimized_model.add(Dense(1, activation='sigmoid'))
optimized_model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['acc'])
# summary() prints the table itself and returns None; wrapping it in print()
# only appended a stray "None" line to the cell output.
optimized_model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 300, 32)           320000    
_________________________________________________________________
flatten_2 (Flatten)          (None, 9600)              0         
_________________________________________________________________
dropout_3 (Dropout)          (None, 9600)              0         
_________________________________________________________________
dense_3 (Dense)              (None, 16)                153616    
_________________________________________________________________
dropout_4 (Dropout)          (None, 16)                0         
_________________________________________________________________
dense_4 (Dense)              (None, 1)                 17        
Total params: 473,633
Trainable params: 473,633
Non-trainable params: 0
_________________________________________________________________
None

In [12]:
# Training schedule shared by both models so their results are comparable.
BatchSize = 128  # samples per gradient update
NumEpochs = 8    # full passes over the training set

In [13]:
# Train the baseline model. The test split is passed as validation data, so
# the per-epoch validation metrics are measured on the same data that the
# evaluation cell scores afterwards.
Init_model.fit(
    x_train,
    y_train,
    batch_size=BatchSize,
    epochs=NumEpochs,
    verbose=1,
    validation_data=(x_test, y_test),
)

Train on 25000 samples, validate on 25000 samples
Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8


<keras.callbacks.History at 0x1ef4e3f0ef0>

In [14]:
# Train the regularised model with the same schedule and validation data as
# the baseline so the two runs can be compared directly.
optimized_model.fit(
    x_train,
    y_train,
    batch_size=BatchSize,
    epochs=NumEpochs,
    verbose=1,
    validation_data=(x_test, y_test),
)

Train on 25000 samples, validate on 25000 samples
Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8


<keras.callbacks.History at 0x1ef4ed7ef60>

## Accuracy of the model & Retrieve the output of each layer in Keras for a given single test sample from the trained model you built (10 Points)

### Accuracy of both models

In [15]:
# evaluate() returns the loss followed by the compiled metrics, so index 1 is
# the accuracy (as a fraction); convert to a percentage for display.
scores_init_model = Init_model.evaluate(x_test, y_test, verbose=1)
init_accuracy_pct = scores_init_model[1] * 100
print("Accuracy: %.2f%%" % init_accuracy_pct)

Accuracy: 87.32%


In [16]:
# evaluate() returns the loss followed by the compiled metrics, so index 1 is
# the accuracy (as a fraction); convert to a percentage for display.
scores_optimized_model = optimized_model.evaluate(x_test, y_test, verbose=1)
optimized_accuracy_pct = scores_optimized_model[1] * 100
print("Accuracy: %.2f%%" % optimized_accuracy_pct)

Accuracy: 87.37%


<font color='blue'> We can see that the init model is overfitting: its training accuracy is very high, but its validation accuracy is comparatively low.<br>
The optimized model shows reduced overfitting after we added dropout and regularization, and its accuracy also increased.</font>

### Retrieve the output of each layer in Keras for a given single test sample from the trained model you built

In [17]:
from keras import backend as k

# Build one backend function per layer of the trained model. Each function
# takes [model input, learning-phase flag] and returns that layer's output.
inp = optimized_model.input
outputs = []
functions = []
for layer in optimized_model.layers:
    outputs.append(layer.output)
    functions.append(k.function([inp, k.learning_phase()], [outputs[-1]]))

In [18]:
# Wrap a single padded test review in a batch of size 1.
test = np.array([x_test[11],])
# The second input is the Keras learning-phase flag: 0 = inference, 1 = training.
# Passing 1 kept the Dropout layers active, so the captured activations were
# randomly masked and changed between runs; 0 yields the trained model's
# deterministic inference-time outputs.
layer_outs = [func([test, 0.]) for func in functions]
# layer_outs[i] is a one-element list holding layer i's output batch;
# [0][0][0][0] -> first layer (embedding), sample 0, first time step.
print(layer_outs[0][0][0][0])

[-0.00694243  0.00213523  0.0001971  -0.00136693 -0.00631926 -0.00097206
  0.00363801  0.00121643 -0.0012039   0.00234943 -0.0079347   0.00384465
 -0.00143604  0.00505743 -0.00050236 -0.0036982  -0.0013103   0.00368813
 -0.00563823  0.00676969  0.00074808 -0.01156617  0.0130675  -0.00483128
 -0.0031705  -0.00050625 -0.00406987  0.00439637  0.00095135 -0.0079884
  0.00017574  0.00020696]
