In [1]:
# Classify movie reviews in the IMDB dataset.

In [2]:
import itertools
import numpy as np
import tensorflow as tf
imdb = tf.keras.datasets.imdb


In [3]:
# Load the IMDB reviews together with their labels.
# num_words = 10000 limits the review text to the 10000 most commonly occurring words.
imdb = tf.keras.datasets.imdb
(train_data, train_labels), (test_data, test_labels) = imdb.load_data(num_words=10000)

# smallest and largest word index found anywhere in the training reviews
min_index = min(min(review) for review in train_data)
max_index = max(max(review) for review in train_data)

In [4]:
# Report the container types, shapes and value ranges of the training reviews.
print('summary of train_data')
print('---------------------')
print(f'type: {type(train_data)}')
print(f'shape: {train_data.shape}')
print(f'type of train_data[0]: {type(train_data[0])}')
print(f'length of train_data[:5]: {[len(train_data[i]) for i in range(5)]}')
print(f'type of train_data[0][0]: {type(train_data[0][0])}')
print(f'train_data[0][:10]: {train_data[0][:10]}')
print(f'minimum entry in train_data: {min_index}')
print(f'maximum entry in train_data: {max_index}')
print()
# Report the label container and how many training reviews carry each label.
print('summary of train_labels')
print('-----------------------')
print(f'type: {type(train_labels)}')
print(f'shape: {train_labels.shape}')
print(f'type of train_labels[0]: {type(train_labels[0])}')
print(f'train_labels[:10]: {train_labels[:10]}')
print(f'number of negative reviews: {np.sum(train_labels == 0)}')
print(f'number of positive reviews: {np.sum(train_labels == 1)}')
print()
# The test split only gets a shape check plus the same label counts.
print('summary of test_data')
print('--------------------')
print(f'shape: {test_data.shape}')
print()
print('summary of test_labels')
print('----------------------')
print(f'shape: {test_labels.shape}')
print(f'number of negative reviews: {np.sum(test_labels == 0)}')
print(f'number of positive reviews: {np.sum(test_labels == 1)}')

summary of train_data
---------------------
type: <class 'numpy.ndarray'>
shape: (25000,)
type of train_data[0]: <class 'list'>
length of train_data[:5]: [218, 189, 141, 550, 147]
type of train_data[0][0]: <class 'int'>
train_data[0][:10]: [1, 14, 22, 16, 43, 530, 973, 1622, 1385, 65]
minimum entry in train_data: 1
maximum entry in train_data: 9999

summary of train_labels
-----------------------
type: <class 'numpy.ndarray'>
shape: (25000,)
type of train_labels[0]: <class 'numpy.int64'>
train_labels[:10]: [1 0 0 1 0 0 1 0 1 0]
number of negative reviews: 12500
number of positive reviews: 12500

summary of test_data
--------------------
shape: (25000,)

summary of test_labels
----------------------
shape: (25000,)
number of negative reviews: 12500
number of positive reviews: 12500


In [5]:
# Fetch the word -> index vocabulary, then invert it so that an index can be
# looked up to recover its word.
word_index = imdb.get_word_index()
reverse_word_index = dict((index, word) for word, index in word_index.items())

# helper function to decode index encoded reviews
def decode_review(encoded_review, index_to_word=None, index_from=3):
    """Turn an index-encoded review back into a space-separated string.

    Args:
        encoded_review: iterable of integer word indices.
        index_to_word: mapping from vocabulary index to word. When omitted,
            the notebook-level ``reverse_word_index`` is used, which is the
            original behaviour.
        index_from: offset subtracted from every encoded index before the
            lookup. The default of 3 reproduces the original hard-coded
            ``i-3``; pass the same value that was given to
            ``imdb.load_data(index_from=...)`` if a non-default one was used.

    Returns:
        The decoded words joined by single spaces. Any index that has no
        entry in the mapping after the offset is applied is rendered as '?'.
    """
    if index_to_word is None:
        # fall back to the global table built from imdb.get_word_index()
        index_to_word = reverse_word_index
    return ' '.join(index_to_word.get(i - index_from, '?') for i in encoded_review)

In [6]:
# positions in train_data of the two reviews shown below
positive_index = 6
negative_index = 2

print()
print(f'number of entries word to index mapping dictionary: {len(word_index)}')
print(f'number of entries index to word mapping dictionary: {len(reverse_word_index)}')
print(f'low index values are common words: {[reverse_word_index[i] for i in range(1, 10)]}')
print()
print('example of a positive review:')
print('-----------------------------')
print(decode_review(train_data[positive_index]))
print()
print('example of a negative review:')
print('-----------------------------')
print(decode_review(train_data[negative_index]))


number of entries word to index mapping dictionary: 88584
number of entries index to word mapping dictionary: 88584
low index values are common words: ['the', 'and', 'a', 'of', 'to', 'is', 'br', 'in', 'it']

example of a positive review:
-----------------------------
? lavish production values and solid performances in this straightforward adaption of jane ? satirical classic about the marriage game within and between the classes in ? 18th century england northam and paltrow are a ? mixture as friends who must pass through ? and lies to discover that they love each other good humor is a ? virtue which goes a long way towards explaining the ? of the aged source material which has been toned down a bit in its harsh ? i liked the look of the film and how shots were set up and i thought it didn't rely too much on ? of head shots like most other films of the 80s and 90s do very good results

example of a negative review:
-----------------------------
? this has to be one of the worst films

In [7]:
# create a histogram of frequency counts of each index
import itertools
flatlist = [token for review in train_data for token in review]  # concatenate all reviews into one list
freq = np.bincount(flatlist) / len(flatlist)  # occurrences of each index divided by the total token count

In [8]:
# Pair every entry of the flat list with the entry that follows it.
# NOTE: flatlist is a single concatenation of all reviews, so pairs that
# straddle the end of one review and the start of the next are included too.
bigramlist = list(zip(flatlist, flatlist[1:]))

In [9]:
# binary encoding of reviews
def vectorize_sequences(sequences, dimension=10000):
    """Multi-hot encode a collection of integer index sequences.

    Args:
        sequences: sized collection of index sequences, one per output row.
        dimension: number of columns in the output array.

    Returns:
        A float array of shape (len(sequences), dimension) that is zero
        everywhere except at [row, index] for every index in that row's
        sequence, where it is 1. Repeated indices within a sequence still
        produce a single 1, so only presence is recorded.
    """
    multi_hot = np.zeros(shape=(len(sequences), dimension))
    for row, token_ids in enumerate(sequences):
        # one fancy-index assignment marks all of this row's columns at once
        multi_hot[row, token_ids] = 1
    return multi_hot

# encode both splits with the same encoder and its default dimension
x_train, x_test = (vectorize_sequences(split) for split in (train_data, test_data))

In [10]:
# Cast the integer labels to float32 arrays (int64 --> float32).
# Presumably this keeps the targets in a floating-point dtype like the
# model's outputs -- TODO confirm once the model and loss are defined.
y_train, y_test = (
    np.asarray(labels).astype('float32') for labels in (train_labels, test_labels)
)

In [None]:
models = tf.keras.models
layers = tf.keras.layers

def build_model1():
    