# Analysing IMDB Data using Keras

In [11]:
# Imports
import numpy as np
import keras
from keras.datasets import imdb
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.preprocessing.text import Tokenizer
import matplotlib.pyplot as plt
%matplotlib inline

# Seed NumPy's global random generator with a fixed value so that
# NumPy-driven randomness repeats from one run to the next.
# NOTE(review): this may not cover randomness inside the Keras backend —
# confirm if exact run-to-run reproducibility matters.
np.random.seed(42)

## Loading the Dataset

In [12]:
# Fetch the IMDB reviews through Keras' built-in dataset loader.
# num_words=1000 restricts the word indices to the 1000-word vocabulary
# that the rest of the notebook assumes.
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=1000)

# Show the shape of each split, one per line.
print(x_train.shape, x_test.shape, sep="\n")

(25000,)
(25000,)


## One-Hot Encoding
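Here, we'll turn each input review (a list of word indices) into a fixed-length binary vector of length 1000, with a 1 at every index whose word appears in the review and a 0 everywhere else.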

In [13]:
# Encode the *inputs*: every review (a sequence of word indices) becomes a
# binary vector of length 1000 that marks which words occur in it.
tokenizer = Tokenizer(num_words=1000)
x_train, x_test = [
    tokenizer.sequences_to_matrix(sequences, mode='binary')
    for sequences in (x_train, x_test)
]

In [15]:
# Sanity check: after vectorisation the training inputs should form a 2-D
# matrix with one row per review and one column per vocabulary index.
x_train.shape

(25000, 1000)

In [17]:
# One-hot encode the class labels into vectors of length num_classes.
#
# The ndim guards make this cell safe to re-run: calling to_categorical on
# labels that are already one-hot does not raise, it silently adds another
# axis (the shape would become (N, 2, 2)), which only fails later at fit time.
num_classes = 2
if y_train.ndim == 1:
    y_train = keras.utils.to_categorical(y_train, num_classes)
if y_test.ndim == 1:
    y_test = keras.utils.to_categorical(y_test, num_classes)
print(y_train.shape)
print(y_test.shape)

(25000, 2)
(25000, 2)


## Model Architecture

In [19]:
# Fully connected classifier over the 1000-dimensional binary input:
# two ReLU hidden layers (512 and 256 units), each followed by 50% dropout,
# then a softmax layer with one unit per class.
model = Sequential([
    Dense(512, activation='relu', input_dim=1000),
    Dropout(0.5),
    Dense(256, activation='relu'),
    Dropout(0.5),
    Dense(num_classes, activation='softmax'),
])

# Print the layer-by-layer overview before configuring training.
model.summary()

# Categorical cross-entropy pairs with the one-hot labels and softmax output.
model.compile(
    optimizer='Adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_2 (Dense)              (None, 512)               512512    
_________________________________________________________________
dropout_1 (Dropout)          (None, 512)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 256)               131328    
_________________________________________________________________
dropout_2 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_4 (Dense)              (None, 2)                 514       
Total params: 644,354
Trainable params: 644,354
Non-trainable params: 0
_________________________________________________________________


## Training the Model

In [20]:
# Train for 10 epochs in mini-batches of 32; verbose=2 prints one summary
# line per epoch. The returned history object is kept in `hist`.
# The test split is supplied as validation data, so the val_* figures
# reported during training are measured on the test set itself.
hist = model.fit(
    x=x_train,
    y=y_train,
    epochs=10,
    batch_size=32,
    verbose=2,
    validation_data=(x_test, y_test),
)

Train on 25000 samples, validate on 25000 samples
Epoch 1/10
 - 11s - loss: 0.4120 - acc: 0.8135 - val_loss: 0.3833 - val_acc: 0.8177
Epoch 2/10
 - 10s - loss: 0.3259 - acc: 0.8582 - val_loss: 0.3210 - val_acc: 0.8625
Epoch 3/10
 - 9s - loss: 0.2853 - acc: 0.8793 - val_loss: 0.3413 - val_acc: 0.8550
Epoch 4/10
 - 10s - loss: 0.2495 - acc: 0.8978 - val_loss: 0.3339 - val_acc: 0.8582
Epoch 5/10
 - 9s - loss: 0.1976 - acc: 0.9217 - val_loss: 0.3699 - val_acc: 0.8570
Epoch 6/10
 - 9s - loss: 0.1515 - acc: 0.9402 - val_loss: 0.4268 - val_acc: 0.8538
Epoch 7/10
 - 9s - loss: 0.1167 - acc: 0.9560 - val_loss: 0.4593 - val_acc: 0.8498
Epoch 8/10
 - 9s - loss: 0.0917 - acc: 0.9662 - val_loss: 0.5130 - val_acc: 0.8499
Epoch 9/10
 - 9s - loss: 0.0768 - acc: 0.9724 - val_loss: 0.5454 - val_acc: 0.8524
Epoch 10/10
 - 9s - loss: 0.0729 - acc: 0.9744 - val_loss: 0.5362 - val_acc: 0.8504


## Evaluating the Model

In [21]:
# Score the trained model on the test split without progress output.
# evaluate() returns the loss followed by the compiled metrics, so index 1
# is the accuracy.
score = model.evaluate(x=x_test, y=y_test, verbose=0)
print("Accuracy: ", score[1])

Accuracy:  0.85044
