In [13]:
from keras.datasets import imdb
import numpy as np

In [14]:
# Load the IMDB dataset. num_words=10000 keeps only the 10,000 most
# frequently occurring words of the training data.
train_split, test_split = imdb.load_data(num_words=10000)
train_data, train_labels = train_split
test_data, test_labels = test_split

In [None]:
# Peek at the first training review, i.e. the integer sequence loaded by imdb.load_data.
train_data[0]

In [None]:
# Label paired with the first training review.
# NOTE(review): the meaning of each label value is not shown in this notebook — confirm against the dataset docs.
train_labels[0]

In [None]:
# Largest word index found in any training review: take each review's own
# maximum, then the maximum of those.
max(map(max, train_data))

In [15]:
# Preparing the data and making it ready to be fed into a neural network
# Encoding the integer sequences into a binary matrix

def vectorize_sequences(sequences, dimension=10000, dtype=float):
    """Multi-hot encode integer sequences into a 2-D binary matrix.

    Row ``i`` of the result holds a 1 in every column whose index occurs in
    ``sequences[i]`` and a 0 everywhere else. Repeated indices and the order
    of indices within a sequence have no effect on the result.

    Parameters
    ----------
    sequences : sequence of sequences of int
        One collection of integer indices per sample.
    dimension : int, default 10000
        Number of columns of the result. Indices are expected to lie in
        ``[0, dimension)``; negative indices are not rejected and wrap
        around from the end of the row (NumPy indexing semantics).
    dtype : data-type, default float
        Element type of the returned matrix. The default keeps NumPy's
        float64. Passing ``'float32'`` halves the memory of the dense
        ``len(sequences) x dimension`` matrix.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(sequences), dimension)`` containing only 0s
        and 1s.

    Raises
    ------
    IndexError
        If a sequence contains an index greater than or equal to
        ``dimension``.
    """
    # Start from an all-zero matrix of shape (len(sequences), dimension).
    results = np.zeros((len(sequences), dimension), dtype=dtype)
    for i, sequence in enumerate(sequences):
        # Indexing with the whole sequence sets every listed column of
        # row i to 1 in a single assignment.
        results[i, sequence] = 1
    return results

# Multi-hot encode the training reviews first, then the test reviews.
x_train, x_test = (
    vectorize_sequences(reviews) for reviews in (train_data, test_data)
)

# Convert the labels to float32 arrays, in the same train-then-test order.
y_train, y_test = (
    np.asarray(labels).astype('float32')
    for labels in (train_labels, test_labels)
)


In [16]:
# The model definition

from keras import layers, models

# Take the input width from the vectorized data instead of repeating the
# vocabulary size as a literal, so the model cannot drift out of sync with
# the encoding step.
input_dim = x_train.shape[1]

# Two hidden layers of 16 ReLU units each, followed by a single sigmoid unit
# that emits one score per review.
model = models.Sequential([
    layers.Dense(16, activation='relu', input_shape=(input_dim,)),
    layers.Dense(16, activation='relu'),
    layers.Dense(1, activation='sigmoid'),
])

# compiling the model: binary cross-entropy loss, with accuracy tracked as
# the reported metric
model.compile(
    optimizer='rmsprop',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

# Train on the full training set, then score the held-out test set.
# `results` holds what evaluate returns (the loss plus the compiled metric).
model.fit(x_train, y_train, epochs=4, batch_size=512)
results = model.evaluate(x_test, y_test)

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


In [17]:
# `results` is the value returned by model.evaluate on the test set; the two
# numbers shown below appear to be the loss followed by the accuracy metric.
# This fairly naive approach achieves an accuracy of 88%. With state-of-the-art approaches, you should be able to get close to 95%.
results


[0.29697418256759645, 0.8825600147247314]

In [18]:
# generate predictions: run the trained model on the vectorized test reviews.
# The final layer is a single sigmoid unit, so each row of the output is one
# score between 0 and 1 for the corresponding test review.
model.predict(x_test)

array([[0.16538502],
       [0.9999002 ],
       [0.77290714],
       ...,
       [0.09522235],
       [0.07184319],
       [0.647887  ]], dtype=float32)