# Analyzing IMDB Data in Keras

In [1]:
import numpy as np
import keras
from keras.datasets import imdb
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.preprocessing.text import Tokenizer
import matplotlib.pyplot as plt
%matplotlib inline

# Fix NumPy's global random seed so NumPy-driven randomness repeats across runs.
# NOTE(review): no TensorFlow-level seed is set anywhere in the visible cells —
# confirm whether that matters for the reproducibility of the training results.
np.random.seed(89)

Using TensorFlow backend.


## Loading the data
This dataset is available through `keras.datasets`; Keras downloads it automatically the first time it is loaded.

In [2]:
# Fetch the IMDB reviews through Keras. num_words=1000 caps the vocabulary
# (per the Keras docs, only the most frequent words are kept).
train_split, test_split = imdb.load_data(num_words=1000)
x_train, y_train = train_split
x_test, y_test = test_split

# Report the shape of the review array in each split.
for reviews in (x_train, x_test):
    print(reviews.shape)

Downloading data from https://s3.amazonaws.com/text-datasets/imdb.npz
(25000,)
(25000,)


## One-hot encoding

In [4]:
# Encode the *input* reviews as fixed-length vectors of size 1000 using the
# Tokenizer's 'binary' matrix mode (entries are 0. or 1., as the printed row shows).
# Note: x_train / x_test are rebound to the encoded matrices, so this cell is
# meant to run exactly once after the data-loading cell.
tokenizer = Tokenizer(num_words=1000)
x_train, x_test = (
    tokenizer.sequences_to_matrix(split, mode='binary')
    for split in (x_train, x_test)
)
print(x_train[0])

[0. 1. 1. 0. 1. 1. 1. 1. 1. 1. 0. 0. 1. 1. 1. 1. 1. 1. 1. 1. 0. 1. 1. 0.
 0. 1. 1. 0. 1. 0. 1. 0. 1. 1. 0. 1. 1. 0. 1. 1. 0. 0. 0. 1. 0. 0. 1. 0.
 1. 0. 1. 1. 1. 0. 0. 0. 1. 0. 0. 0. 0. 0. 1. 0. 0. 1. 1. 0. 0. 0. 0. 1.
 0. 0. 0. 0. 1. 1. 0. 0. 0. 0. 1. 0. 0. 0. 0. 1. 1. 0. 0. 0. 1. 0. 0. 0.
 0. 0. 1. 0. 1. 0. 0. 1. 1. 0. 1. 1. 0. 0. 0. 0. 1. 1. 0. 0. 0. 1. 0. 0.
 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 1. 0. 0. 0. 1. 1. 0. 0. 0. 0. 0. 1. 0. 0.
 1. 0. 0. 1. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.
 0. 0. 0. 0. 1. 1. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 1. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.
 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.

In [5]:
# Expand the class labels into one-hot rows, one column per class.
num_classes = 2
y_train = keras.utils.to_categorical(y_train, num_classes=num_classes)
y_test = keras.utils.to_categorical(y_test, num_classes=num_classes)

# Confirm both label arrays now have shape (n_samples, num_classes).
for labels in (y_train, y_test):
    print(labels.shape)

(25000, 2)
(25000, 2)


## Build the model architecture

In [6]:
# Build the model: one hidden ReLU layer, dropout, and a softmax output layer.
# The input width is read from the encoded training matrix instead of being
# hard-coded, so it cannot drift out of sync with the vocabulary size used
# when the reviews were encoded.
n_features = x_train.shape[1]

model = Sequential()
model.add(Dense(512, activation='relu', input_dim=n_features))
model.add(Dropout(0.5))  # dropout rate of 0.5 applied to the hidden layer's output
model.add(Dense(num_classes, activation='softmax'))  # one output unit per class
model.summary()

# Compile the model using categorical_crossentropy loss (matching the one-hot
# labels and softmax output) and the RMSProp optimizer; track accuracy.
model.compile(loss='categorical_crossentropy',
              optimizer='rmsprop',
              metrics=['accuracy'])

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 512)               512512    
_________________________________________________________________
dropout_1 (Dropout)          (None, 512)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 2)                 1026      
Total params: 513,538
Trainable params: 513,538
Non-trainable params: 0
_________________________________________________________________


## Train the model

In [7]:
# Train the model using a batch size of 32 for 10 epochs.
# The test split is passed as validation data, so the val_* metrics logged
# after each epoch are computed on the test set. verbose=2 prints one line per epoch.
fit_options = dict(
    batch_size=32,
    epochs=10,
    validation_data=(x_test, y_test),
    verbose=2,
)
hist = model.fit(x_train, y_train, **fit_options)

Train on 25000 samples, validate on 25000 samples
Epoch 1/10
 - 5s - loss: 0.3987 - accuracy: 0.8264 - val_loss: 0.3334 - val_accuracy: 0.8602
Epoch 2/10
 - 3s - loss: 0.3332 - accuracy: 0.8666 - val_loss: 0.3552 - val_accuracy: 0.8545
Epoch 3/10
 - 3s - loss: 0.3218 - accuracy: 0.8768 - val_loss: 0.3691 - val_accuracy: 0.8594
Epoch 4/10
 - 3s - loss: 0.3151 - accuracy: 0.8825 - val_loss: 0.3703 - val_accuracy: 0.8596
Epoch 5/10
 - 3s - loss: 0.3087 - accuracy: 0.8899 - val_loss: 0.3781 - val_accuracy: 0.8613
Epoch 6/10
 - 3s - loss: 0.2946 - accuracy: 0.8962 - val_loss: 0.4145 - val_accuracy: 0.8541
Epoch 7/10
 - 3s - loss: 0.2926 - accuracy: 0.9020 - val_loss: 0.4173 - val_accuracy: 0.8607
Epoch 8/10
 - 3s - loss: 0.2774 - accuracy: 0.9087 - val_loss: 0.4444 - val_accuracy: 0.8609
Epoch 9/10
 - 3s - loss: 0.2716 - accuracy: 0.9122 - val_loss: 0.4535 - val_accuracy: 0.8592
Epoch 10/10
 - 3s - loss: 0.2568 - accuracy: 0.9186 - val_loss: 0.4826 - val_accuracy: 0.8590


## Evaluating the model

In [8]:
# Evaluate on the test split. `score` holds the loss followed by the metrics
# requested at compile time, so index 1 is the accuracy.
score = model.evaluate(x=x_test, y=y_test, verbose=0)
test_accuracy = score[1]
print("Accuracy: ", test_accuracy)

Accuracy:  0.859000027179718
