# Analyzing IMDB Data in Keras

In [28]:
# Imports
import numpy as np
import keras
from keras.datasets import imdb
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.preprocessing.text import Tokenizer
import matplotlib.pyplot as plt
%matplotlib inline

np.random.seed(42)

## 1. Loading the data
This dataset comes preloaded with Keras, so one simple command will get us training and testing data. There is a parameter for how many words we want to look at. We've set it at 1000, but feel free to experiment.

In [109]:
# Loading the data (it's preloaded in Keras)
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=1000)

print(x_train.shape)
print(x_test.shape)
print(y_train.shape)
print(y_test.shape)

(25000,)
(25000,)
(25000,)
(25000,)


In [110]:
print(x_train.shape[0])
print(x_test.shape[0])
print(y_train.shape[0])
print(y_test.shape[0])

25000
25000
25000
25000


## 2. Examining the data
Notice that the data has been already pre-processed, where all the words have numbers, and the reviews come in as a vector with the words that the review contains. For example, if the word 'the' is the first one in our dictionary, and a review contains the word 'the', then there is a 1 in the corresponding vector.

The output comes as a vector of 1's and 0's, where 1 is a positive sentiment for the review, and 0 is negative.

In [111]:
print(x_train[0])
print(y_train[0], y_train[1], y_train[2])
print (len(x_train[0]), len(x_train[1]), len(x_train[2]) )
#print (len(y_train[0]), len(y_train[1]), len(y_train[2]) )


[1, 35, 321, 22, 2, 4, 2, 2, 11, 4, 456, 7, 6, 2, 2, 2, 7, 24, 38, 2, 84, 11, 4, 2, 2, 2, 2, 940, 5, 487, 367, 19, 478, 116, 262, 4, 354, 34, 2, 2, 5, 2, 2, 2, 14, 22, 16, 93, 11, 4, 172, 291, 17, 4, 545, 2, 2, 398, 5, 526, 34, 723, 2, 4, 2, 7, 4, 206, 203, 24, 30, 17, 2, 17, 15, 11, 2, 2, 2, 5, 4, 105, 203, 306, 8, 30, 128, 2, 19, 257, 85, 11, 972, 39, 15, 21, 4, 965, 2, 7, 4, 105, 26, 17, 2, 5, 2, 5, 68, 537, 17, 2, 17, 101, 7, 148, 11, 2, 18, 148, 797, 37, 2, 2, 14, 9, 6, 215, 67, 22, 82, 451, 7, 2, 2, 5, 308, 2, 80, 40, 14, 20]
1 1 0
142 171 193


In [112]:
# from keras.preprocessing import sequence

# # Set the maximum number of words per document (for both training and testing)
# max_words = 500

# # TODO: Pad sequences in X_train and X_test
# x_train = sequence.pad_sequences(x_train, maxlen=max_words)
# x_test = sequence.pad_sequences(x_test, maxlen=max_words)
# print(x_train.shape)
# print(x_test.shape)
# print(y_train.shape)
# print(y_test.shape)

## 3. One-hot encoding the output
Here, we'll turn the input vectors into (0,1)-vectors. For example, if the pre-processed vector contains the number 14, then in the processed vector, the 14th entry will be 1.

In [113]:
# One-hot encoding the output into vector mode, each of length 1000
tokenizer = Tokenizer(num_words=1000)
x_train = tokenizer.sequences_to_matrix(x_train, mode='binary')
x_test = tokenizer.sequences_to_matrix(x_test, mode='binary')
print(x_train[0])

[ 0.  1.  1.  0.  1.  1.  1.  1.  1.  1.  0.  1.  0.  0.  1.  1.  1.  1.
  1.  1.  1.  1.  1.  0.  1.  0.  1.  0.  0.  0.  1.  0.  0.  0.  1.  1.
  0.  1.  1.  1.  1.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  1.  1.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  1.  0.  1.  0.  1.  1.  0.  0.  0.  0.
  0.  0.  0.  1.  0.  0.  0.  0.  0.  0.  0.  1.  0.  0.  0.  1.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  1.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  1.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  1.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  1.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  1.  0.  0.  1.  0.  0.  0.  0.  0.  0.  0.  0.  1.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0

And we'll also one-hot encode the output.

In [114]:
# One-hot encoding the output
num_classes = 2
y_train = keras.utils.to_categorical(y_train, num_classes)
y_test = keras.utils.to_categorical(y_test, num_classes)
print(x_train.shape)
print(x_test.shape)
print(y_train.shape)
print(y_test.shape)
print(y_train[0], y_train[1], y_train[2])

(25000, 1000)
(25000, 1000)
(25000, 2)
(25000, 2)
[ 0.  1.] [ 0.  1.] [ 1.  0.]


## 4. Building the  model architecture
Build a model here using sequential. Feel free to experiment with different layers and sizes! Also, experiment adding dropout to reduce overfitting.

In [115]:
# TODO: Build the model architecture
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, Dropout
from keras.optimizers import Adam

# TODO: Design your model
embeding_size =32
model = Sequential()
#model.add(Embedding(1000, embeding_size, input_length=1000))
model.add(Dense(25, input_shape=x_train.shape[1:], activation='relu') )
model.add(Dense(25 , activation='relu') )
model.add(Dense(2,activation='softmax'))
print(model.summary())

# TODO: Compile the model using a loss function and an optimizer.
learning_rate = 1e-3
model.compile(loss='categorical_crossentropy', optimizer = Adam(learning_rate), metrics = ['accuracy'])

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_41 (Dense)             (None, 25)                25025     
_________________________________________________________________
dense_42 (Dense)             (None, 25)                650       
_________________________________________________________________
dense_43 (Dense)             (None, 2)                 52        
Total params: 25,727
Trainable params: 25,727
Non-trainable params: 0
_________________________________________________________________
None


## 5. Training the model
Run the model here. Experiment with different batch_size, and number of epochs!

In [132]:
# TODO: Run the model. Feel free to experiment with different batch sizes and number of epochs.
batch_size = 64
num_epochs = 5
print (x_train.shape, y_train.shape)
x_valid, y_valid = x_train[:batch_size], y_train[:batch_size]
x_train2,y_train2 =x_train[batch_size:], y_train[batch_size:]
print (x_train2.shape, y_train2.shape,x_valid.shape, y_valid.shape )
model.fit(x_train2, y_train2, validation_data=(x_valid, y_valid), batch_size = batch_size, epochs=num_epochs)

(25000, 1000) (25000, 2)
(24936, 1000) (24936, 2) (64, 1000) (64, 2)
Train on 24936 samples, validate on 64 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7fb56e2eb828>

## 6. Evaluating the model
This will give you the accuracy of the model, as evaluated on the testing set. Can you get something over 85%?

In [133]:
score = model.evaluate(x_test, y_test, verbose=0)
print("Accuracy: ", score[1])

Accuracy:  0.825
