# Analyzing IMDB Data in Keras

In [14]:
# Imports
import numpy as np
import keras
from keras.datasets import imdb
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.preprocessing.text import Tokenizer
import matplotlib.pyplot as plt
import warnings

warnings.filterwarnings('ignore')
%matplotlib inline

np.random.seed(42)

## 1. Loading the data
This dataset comes preloaded with Keras, so one simple command will get us training and testing data. There is a parameter for how many words we want to look at. We've set it at 1000, but feel free to experiment.

In [15]:
# Loading the data (it's preloaded in Keras)
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=1000)

print(x_train.shape)
print(x_test.shape)

(25000,)
(25000,)


## 2. Examining the data
Notice that the data has been already pre-processed, where all the words have numbers, and the reviews come in as a vector with the words that the review contains. For example, if the word 'the' is the first one in our dictionary, and a review contains the word 'the', then there is a 1 in the corresponding vector.

The output comes as a vector of 1's and 0's, where 1 is a positive sentiment for the review, and 0 is negative.

In [16]:
print(x_train[0])
print(y_train[0])

[1, 14, 22, 16, 43, 530, 973, 2, 2, 65, 458, 2, 66, 2, 4, 173, 36, 256, 5, 25, 100, 43, 838, 112, 50, 670, 2, 9, 35, 480, 284, 5, 150, 4, 172, 112, 167, 2, 336, 385, 39, 4, 172, 2, 2, 17, 546, 38, 13, 447, 4, 192, 50, 16, 6, 147, 2, 19, 14, 22, 4, 2, 2, 469, 4, 22, 71, 87, 12, 16, 43, 530, 38, 76, 15, 13, 2, 4, 22, 17, 515, 17, 12, 16, 626, 18, 2, 5, 62, 386, 12, 8, 316, 8, 106, 5, 4, 2, 2, 16, 480, 66, 2, 33, 4, 130, 12, 16, 38, 619, 5, 25, 124, 51, 36, 135, 48, 25, 2, 33, 6, 22, 12, 215, 28, 77, 52, 5, 14, 407, 16, 82, 2, 8, 4, 107, 117, 2, 15, 256, 4, 2, 7, 2, 5, 723, 36, 71, 43, 530, 476, 26, 400, 317, 46, 7, 4, 2, 2, 13, 104, 88, 4, 381, 15, 297, 98, 32, 2, 56, 26, 141, 6, 194, 2, 18, 4, 226, 22, 21, 134, 476, 26, 480, 5, 144, 30, 2, 18, 51, 36, 28, 224, 92, 25, 104, 4, 226, 65, 16, 38, 2, 88, 12, 16, 283, 5, 16, 2, 113, 103, 32, 15, 16, 2, 19, 178, 32]
1


## 3. One-hot encoding the output
Here, we'll turn the input vectors into (0,1)-vectors. For example, if the pre-processed vector contains the number 14, then in the processed vector, the 14th entry will be 1.

In [17]:
# One-hot encoding the output into vector mode, each of length 1000
tokenizer = Tokenizer(num_words=1000)
x_train = tokenizer.sequences_to_matrix(x_train, mode='binary')
x_test = tokenizer.sequences_to_matrix(x_test, mode='binary')
print(x_train[0])


[0. 1. 1. 0. 1. 1. 1. 1. 1. 1. 0. 0. 1. 1. 1. 1. 1. 1. 1. 1. 0. 1. 1. 0.
 0. 1. 1. 0. 1. 0. 1. 0. 1. 1. 0. 1. 1. 0. 1. 1. 0. 0. 0. 1. 0. 0. 1. 0.
 1. 0. 1. 1. 1. 0. 0. 0. 1. 0. 0. 0. 0. 0. 1. 0. 0. 1. 1. 0. 0. 0. 0. 1.
 0. 0. 0. 0. 1. 1. 0. 0. 0. 0. 1. 0. 0. 0. 0. 1. 1. 0. 0. 0. 1. 0. 0. 0.
 0. 0. 1. 0. 1. 0. 0. 1. 1. 0. 1. 1. 0. 0. 0. 0. 1. 1. 0. 0. 0. 1. 0. 0.
 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 1. 0. 0. 0. 1. 1. 0. 0. 0. 0. 0. 1. 0. 0.
 1. 0. 0. 1. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.
 0. 0. 0. 0. 1. 1. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 1. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.
 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.

In [18]:
print("Shape of training set :{}".format(x_train.shape))

Shape of training set :(25000, 1000)


And we'll also one-hot encode the output.

In [19]:
# One-hot encoding the output
num_classes = 2
y_train = keras.utils.to_categorical(y_train, num_classes)
y_test = keras.utils.to_categorical(y_test, num_classes)
print(y_train.shape)
print(y_test.shape)

(25000, 2)
(25000, 2)


## 4. Building the  model architecture
Build a model here using sequential. Feel free to experiment with different layers and sizes! Also, experiment adding dropout to reduce overfitting.

In [32]:
# TODO: Build the model architecture

from keras.models import Sequential
from keras.layers.core import Dense , Dropout , Activation

model = Sequential()

units = 512
model.add(Dense(units , input_dim = x_train.shape[1] ))
model.add(Activation('relu'))

model.add(Dense(256 ))
model.add(Activation('relu'))
model.add(Dropout(.5))

# for i in range(10):
#     model.add(Dense(units-10 ))
#     model.add(Activation('relu'))
#     model.add(Dropout(.2))
#     units = units-10
    
model.add(Dense(2))
model.add(Activation('sigmoid'))

# TODO: Compile the model using a loss function and an optimizer.


model.compile(optimizer=keras.optimizers.SGD(lr=0.01, nesterov=True),
             loss='binary_crossentropy' , metrics=['accuracy'])

model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_58 (Dense)             (None, 512)               512512    
_________________________________________________________________
activation_58 (Activation)   (None, 512)               0         
_________________________________________________________________
dense_59 (Dense)             (None, 256)               131328    
_________________________________________________________________
activation_59 (Activation)   (None, 256)               0         
_________________________________________________________________
dropout_41 (Dropout)         (None, 256)               0         
_________________________________________________________________
dense_60 (Dense)             (None, 2)                 514       
_________________________________________________________________
activation_60 (Activation)   (None, 2)                 0         
Total para

# TODO: Build the model architecture

from keras.models import Sequential
from keras.layers.core import Dense , Dropout , Activation

model = Sequential()

model.add(Dense


# TODO: Compile the model using a loss function and an optimizer.


In [33]:
# TODO: Run the model. Feel free to experiment with different batch sizes and number of epochs.

model.fit(x_train ,y_train , epochs=15 ,batch_size = 100, validation_data=(x_test, y_test), verbose=2)


Train on 25000 samples, validate on 25000 samples
Epoch 1/15
 - 6s - loss: 0.6825 - acc: 0.5583 - val_loss: 0.6542 - val_acc: 0.6673
Epoch 2/15
 - 4s - loss: 0.6375 - acc: 0.6531 - val_loss: 0.6016 - val_acc: 0.7437
Epoch 3/15
 - 4s - loss: 0.5803 - acc: 0.7200 - val_loss: 0.5349 - val_acc: 0.7773
Epoch 4/15
 - 4s - loss: 0.5141 - acc: 0.7679 - val_loss: 0.4725 - val_acc: 0.8008
Epoch 5/15
 - 4s - loss: 0.4642 - acc: 0.7929 - val_loss: 0.4298 - val_acc: 0.8159
Epoch 6/15
 - 4s - loss: 0.4263 - acc: 0.8117 - val_loss: 0.4015 - val_acc: 0.8273
Epoch 7/15
 - 4s - loss: 0.4011 - acc: 0.8255 - val_loss: 0.3838 - val_acc: 0.8340
Epoch 8/15
 - 4s - loss: 0.3847 - acc: 0.8348 - val_loss: 0.3718 - val_acc: 0.8397
Epoch 9/15
 - 4s - loss: 0.3711 - acc: 0.8408 - val_loss: 0.3625 - val_acc: 0.8438
Epoch 10/15
 - 4s - loss: 0.3600 - acc: 0.8470 - val_loss: 0.3553 - val_acc: 0.8478
Epoch 11/15
 - 4s - loss: 0.3477 - acc: 0.8523 - val_loss: 0.3511 - val_acc: 0.8500
Epoch 12/15
 - 4s - loss: 0.3392 - 

<keras.callbacks.History at 0x1da9d80ccc0>

## 6. Evaluating the model
This will give you the accuracy of the model, as evaluated on the testing set. Can you get something over 85%?

In [36]:
score = model.evaluate(x_test, y_test, verbose=0)
print("Accuracy: {:.2f} %".format(score[1]*100))

Accuracy: 85.79 %
