In [3]:
import numpy
from keras.datasets import imdb
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM, SimpleRNN
from keras.layers.embeddings import Embedding
from keras.layers import Flatten
from keras.preprocessing import sequence
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D
from keras.layers.embeddings import Embedding
import numpy as np
# Seed NumPy's global random number generator so that repeated runs draw the
# same sequence of pseudo-random numbers from it.
np.random.seed(1)

In [4]:
# Cap the vocabulary size so that our word matrices do not grow arbitrarily large.
vocabulary_size = 10000

# Cap the review length as well, so that we never have to process really long sentences.
max_review_length = 500

#### Load data

In [10]:
# Load the IMDB reviews, passing `vocabulary_size` as the `num_words` limit.
train_split, test_split = imdb.load_data(num_words=vocabulary_size)
X_train, y_train = train_split
X_test, y_test = test_split

# Quick look at what was loaded.
first_review, fifth_review = X_train[0], X_train[4]
print('Number of reviews', len(X_train))
print('Length of first and fifth review before padding', len(first_review) ,len(fifth_review))
print('First review', first_review)
print('First label', y_train[0])

[[   0    0    0 ...   19  178   32]
 [   0    0    0 ...   16  145   95]
 [   0    0    0 ...    7  129  113]
 [ 687   23    4 ...   21   64 2574]]
Number of reviews 25000
Length of first and fifth review before padding 500 500
First review [   0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0   

In [8]:
# Run both splits through pad_sequences with maxlen=max_review_length so that
# every review ends up with the same number of entries.
X_train, X_test = [
    sequence.pad_sequences(reviews, maxlen=max_review_length)
    for reviews in (X_train, X_test)
]
print('Length of first and fifth review after padding', len(X_train[0]) ,len(X_train[4]))

Length of first and fifth review after padding 500 500


### FEED-FORWARD NETWORKS 

In [11]:
# Baseline: a feed-forward network applied directly to the padded sequences of
# word indices (there is no embedding layer in front of the first Dense layer).
model = Sequential()

model.add(Dense(250, activation='relu', input_dim=max_review_length))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints the table itself and returns None, so it must not be
# wrapped in print() (that would emit a stray "None" line).
model.summary()

# The test split is also passed as validation data, so per-epoch metrics are
# reported on it during training.
model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=10, batch_size=128, verbose=2)
# Final evaluation of the model; scores[1] is the accuracy metric requested in compile().
scores = model.evaluate(X_test, y_test, verbose=0)
print("Accuracy: %.2f%%" % (scores[1]*100))

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_3 (Dense)              (None, 250)               125250    
_________________________________________________________________
dense_4 (Dense)              (None, 1)                 251       
Total params: 125,501
Trainable params: 125,501
Non-trainable params: 0
_________________________________________________________________
None
Train on 25000 samples, validate on 25000 samples
Epoch 1/10
 - 1s - loss: 176.8025 - accuracy: 0.5048 - val_loss: 98.0177 - val_accuracy: 0.4998
Epoch 2/10
 - 1s - loss: 49.7650 - accuracy: 0.5830 - val_loss: 52.1167 - val_accuracy: 0.5024
Epoch 3/10
 - 1s - loss: 17.7281 - accuracy: 0.6645 - val_loss: 32.8438 - val_accuracy: 0.5027
Epoch 4/10
 - 1s - loss: 7.7892 - accuracy: 0.7189 - val_loss: 21.9369 - val_accuracy: 0.5012
Epoch 5/10
 - 1s - loss: 3.8072 - accuracy: 0.7672 - val_loss: 16.1270 - va

In [16]:
# Size of each word vector; passed as the output dimension of the Embedding layers.
embedding_dim = 100

In [14]:
# Feed-forward network on top of learned word embeddings.
model = Sequential()

# The Embedding layer converts inputs of shape batch_size * sentence_length
# into batch_size * sentence_length * embedding_dim.
model.add(Embedding(vocabulary_size, embedding_dim, input_length=max_review_length))
# Flatten joins the per-word vectors into one long vector per review so the
# Dense layers can consume it.
model.add(Flatten())
model.add(Dense(250, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints the table itself and returns None, so it must not be
# wrapped in print() (that would emit a stray "None" line).
model.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 500, 100)          1000000   
_________________________________________________________________
flatten_1 (Flatten)          (None, 50000)             0         
_________________________________________________________________
dense_5 (Dense)              (None, 250)               12500250  
_________________________________________________________________
dense_6 (Dense)              (None, 1)                 251       
Total params: 13,500,501
Trainable params: 13,500,501
Non-trainable params: 0
_________________________________________________________________
None


In [15]:
# Train for two epochs; the test split doubles as validation data, so its
# metrics are reported after every epoch.
model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=2, batch_size=128, verbose=2)

# Score the trained model on the test set; index 1 holds the accuracy metric.
scores = model.evaluate(X_test, y_test, verbose=0)
accuracy = scores[1]
print("Accuracy: %.2f%%" % (accuracy * 100))

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 25000 samples, validate on 25000 samples
Epoch 1/2
 - 52s - loss: 0.4555 - accuracy: 0.7728 - val_loss: 0.2868 - val_accuracy: 0.8772
Epoch 2/2
 - 50s - loss: 0.1172 - accuracy: 0.9584 - val_loss: 0.3433 - val_accuracy: 0.8594
Accuracy: 85.94%


### CNN

In [18]:
# %load sol2.py
# create the CNN: embedding -> 1D convolution -> max pooling -> dense classifier
model = Sequential()
model.add(Embedding(vocabulary_size, embedding_dim, input_length=max_review_length))
# padding='same' keeps the sequence length unchanged by the convolution.
model.add(Conv1D(filters=200, kernel_size=3, padding='same', activation='relu'))
# pool_size=2 halves the sequence length.
model.add(MaxPooling1D(pool_size=2))
model.add(Flatten())
model.add(Dense(250, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints the table itself and returns None, so it must not be
# wrapped in print() (that would emit a stray "None" line).
model.summary()

# train the CNN
model.fit(X_train, y_train, epochs=2, batch_size=128)

# evaluate the CNN; scores[1] is the accuracy metric requested in compile()
scores = model.evaluate(X_test, y_test, verbose=0)
print("Accuracy: %.2f%%" % (scores[1]*100))


Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 500, 100)          1000000   
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 500, 200)          60200     
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 250, 200)          0         
_________________________________________________________________
flatten_2 (Flatten)          (None, 50000)             0         
_________________________________________________________________
dense_7 (Dense)              (None, 250)               12500250  
_________________________________________________________________
dense_8 (Dense)              (None, 1)                 251       
Total params: 13,560,701
Trainable params: 13,560,701
Non-trainable params: 0
__________________________________________

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Epoch 1/2

KeyboardInterrupt: 

### RNN

In [20]:
# %load sol3.py
# Simple recurrent network: embedding -> SimpleRNN -> sigmoid classifier.
model = Sequential()
model.add(Embedding(vocabulary_size, embedding_dim, input_length=max_review_length))
# With the default return_sequences=False only the final hidden state is
# passed on to the Dense layer.
model.add(SimpleRNN(100))

model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints the table itself and returns None, so it must not be
# wrapped in print() (that would emit a stray "None" line).
model.summary()

# NOTE(review): the output recorded for this cell shows 3 epochs, not 20 --
# confirm which setting is intended before re-running.
model.fit(X_train, y_train, epochs=20, batch_size=16)
# Final evaluation of the model; scores[1] is the accuracy metric requested in compile().
scores = model.evaluate(X_test, y_test, verbose=0)
print("Accuracy: %.2f%%" % (scores[1]*100))


Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 500, 100)          1000000   
_________________________________________________________________
simple_rnn_1 (SimpleRNN)     (None, 100)               20100     
_________________________________________________________________
dense_9 (Dense)              (None, 1)                 101       
Total params: 1,020,201
Trainable params: 1,020,201
Non-trainable params: 0
_________________________________________________________________
None


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Epoch 1/3
Epoch 2/3
Epoch 3/3
Accuracy: 71.55%


### LSTM

In [22]:
# %load sol4.py
# LSTM network: embedding -> LSTM -> sigmoid classifier.
model = Sequential()
model.add(Embedding(vocabulary_size, embedding_dim, input_length=max_review_length))
# With the default return_sequences=False only the final hidden state is
# passed on to the Dense layer.
model.add(LSTM(100))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints the table itself and returns None, so it must not be
# wrapped in print() (that would emit a stray "None" line).
model.summary()

model.fit(X_train, y_train, epochs=3, batch_size=64)

# evaluation of the LSTM's performance; scores[1] is the accuracy metric
scores = model.evaluate(X_test, y_test, verbose=0)
print("Accuracy: %.2f%%" % (scores[1]*100))


Model: "sequential_6"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 500, 100)          1000000   
_________________________________________________________________
lstm_1 (LSTM)                (None, 100)               80400     
_________________________________________________________________
dense_10 (Dense)             (None, 1)                 101       
Total params: 1,080,501
Trainable params: 1,080,501
Non-trainable params: 0
_________________________________________________________________
None


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Epoch 1/3
 2368/25000 [=>............................] - ETA: 3:34 - loss: 0.7128 - accuracy: 0.5752

KeyboardInterrupt: 

### CNN and LSTM

In [23]:
# Hybrid network: the convolution and pooling layers shorten the sequence
# before it is handed to the LSTM.
model = Sequential()
model.add(Embedding(vocabulary_size, embedding_dim, input_length=max_review_length))
model.add(Conv1D(filters=32, kernel_size=3, padding='same', activation='relu'))
# pool_size=2 halves the number of time steps the LSTM has to process.
model.add(MaxPooling1D(pool_size=2))
model.add(LSTM(100))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints the table itself and returns None, so it must not be
# wrapped in print() (that would emit a stray "None" line).
model.summary()

model.fit(X_train, y_train, epochs=3, batch_size=64)
# Final evaluation of the model; scores[1] is the accuracy metric requested in compile().
scores = model.evaluate(X_test, y_test, verbose=0)
print("Accuracy: %.2f%%" % (scores[1]*100))

Model: "sequential_12"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_8 (Embedding)      (None, 500, 100)          1000000   
_________________________________________________________________
conv1d_5 (Conv1D)            (None, 500, 32)           9632      
_________________________________________________________________
max_pooling1d_5 (MaxPooling1 (None, 250, 32)           0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 100)               53200     
_________________________________________________________________
dense_23 (Dense)             (None, 1)                 101       
Total params: 1,062,933
Trainable params: 1,062,933
Non-trainable params: 0
_________________________________________________________________
None


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Epoch 1/3
Epoch 2/3
Epoch 3/3
Accuracy: 87.59%


In [26]:
from __future__ import print_function
from keras.models import Sequential
from keras import layers
from keras.layers import Dense, RepeatVector, TimeDistributed
import numpy as np
from six.moves import range

#### The less interesting data generation and preprocessing

In [31]:
class CharacterTable(object):
    """Two-way lookup between characters and integer indices.

    The table is built from the distinct characters of `chars` in sorted
    order and is used to one-hot encode strings and to decode them again.
    """

    def __init__(self, chars):
        """Build both lookup directions from the distinct characters in `chars`."""
        self.chars = sorted(set(chars))
        self.char_indices = {}
        self.indices_char = {}
        for index, char in enumerate(self.chars):
            self.char_indices[char] = index
            self.indices_char[index] = char

    def encode(self, C, num_rows):
        """One-hot encode the string `C` into a (num_rows, len(self.chars)) array.

        Row i has a 1 in the column of character C[i]; rows beyond len(C)
        stay all zeros.
        """
        one_hot = np.zeros((num_rows, len(self.chars)))
        for row, char in enumerate(C):
            column = self.char_indices[char]
            one_hot[row, column] = 1
        return one_hot

    def decode(self, x, calc_argmax=True):
        """Turn `x` back into a string of characters.

        With calc_argmax=True, `x` is treated as one-hot (or score) rows and
        the index of the largest entry in each row is looked up; otherwise
        `x` is taken to already be a sequence of character indices.
        """
        indices = x.argmax(axis=-1) if calc_argmax else x
        return ''.join([self.indices_char[index] for index in indices])


In [32]:
# Number of addition problems to generate (passed to data_generate).
TRAINING_SIZE = 50000
# Maximum number of digits in each operand.
DIGITS = 3
# Answers are padded to this length: a sum can have at most DIGITS + 1 characters.
MAXOUTPUTLEN = DIGITS + 1
# Questions are padded to this length: two operands of up to DIGITS digits plus the '+'.
MAXLEN = DIGITS + 1 + DIGITS

# Every character that can occur in a question or answer; the space is the padding character.
chars = '0123456789+ '
ctable = CharacterTable(chars)

In [33]:
def return_random_digit():
  """Return one randomly chosen digit character ('0'-'9') via np.random.choice."""
  digit_chars = [str(digit) for digit in range(10)]
  return np.random.choice(digit_chars)

# generate a random non-negative integer written with 1 to `DIGITS` digits
def generate_number():
  """Return a random integer assembled from between 1 and DIGITS random digits.

  The digits are joined into a string and converted with int(), so any
  leading zeros disappear from the returned value.
  """
  num_digits = np.random.randint(1, DIGITS + 1)
  digit_chars = [return_random_digit() for _ in range(num_digits)]
  return int(''.join(digit_chars))

# generate `num_examples` unique pairs of random numbers as padded strings
def data_generate(num_examples):
  """Generate `num_examples` distinct addition problems and their answers.

  Each question is the string 'a+b' right-padded with spaces to MAXLEN
  characters; each answer is str(a + b) right-padded with spaces to
  MAXOUTPUTLEN characters. A pair counts as a duplicate regardless of
  operand order, so 'a+b' and 'b+a' never both appear.

  Args:
    num_examples: number of question/answer pairs to produce.

  Returns:
    (questions, answers): two lists of strings, aligned by index.

  Raises:
    ValueError: if more examples are requested than there are distinct
      operand pairs, in which case the sampling loop could never finish.
  """
  # generate_number() can return any integer in [0, 10**DIGITS - 1].
  num_values = 10 ** DIGITS
  max_unique_pairs = num_values * (num_values + 1) // 2
  if num_examples > max_unique_pairs:
      raise ValueError(
          'Cannot generate {} unique questions; only {} distinct pairs exist'.format(
              num_examples, max_unique_pairs))

  questions = []
  answers = []
  seen = set()
  print('Generating data...')
  # Honour the requested size rather than the global TRAINING_SIZE.
  while len(questions) < num_examples:
      a, b = generate_number(), generate_number()

      # don't allow duplicates; this is good practice for training,
      # as we will minimize memorizing seen examples
      key = tuple(sorted((a, b)))
      if key in seen:
          continue
      seen.add(key)

      # pad the data with spaces so that the length is always MAXLEN.
      q = '{}+{}'.format(a, b)
      query = q + ' ' * (MAXLEN - len(q))
      ans = str(a + b)

      # answers can be of maximum size DIGITS + 1.
      ans += ' ' * (MAXOUTPUTLEN - len(ans))
      questions.append(query)
      answers.append(ans)
  print('Total addition questions:', len(questions))
  return questions, answers

def encode_examples(questions, answers):
  """One-hot encode the question/answer strings and shuffle them in unison.

  Args:
    questions: list of question strings, each at most MAXLEN characters.
    answers: list of answer strings, each at most MAXOUTPUTLEN characters,
      aligned by index with `questions`.

  Returns:
    (x, y): boolean arrays of shape (len(questions), MAXLEN, len(chars)) and
    (len(questions), MAXOUTPUTLEN, len(chars)), reordered by the same random
    permutation so that question i still matches answer i.
  """
  # Use the builtin `bool`: the `np.bool` alias is deprecated and no longer
  # exists in newer NumPy releases.
  x = np.zeros((len(questions), MAXLEN, len(chars)), dtype=bool)
  y = np.zeros((len(questions), MAXOUTPUTLEN, len(chars)), dtype=bool)
  for i, sentence in enumerate(questions):
      x[i] = ctable.encode(sentence, MAXLEN)
  for i, sentence in enumerate(answers):
      y[i] = ctable.encode(sentence, MAXOUTPUTLEN)

  # One shared permutation keeps every question aligned with its answer.
  indices = np.arange(len(y))
  np.random.shuffle(indices)
  return x[indices], y[indices]

In [34]:
q, a = data_generate(TRAINING_SIZE)
x, y = encode_examples(q, a)

# Hold out the last tenth of the examples (already shuffled by
# encode_examples) for validation; the rest is used for training.
num_val = len(x) // 10
split_at = len(x) - num_val
x_train, y_train = x[:split_at], y[:split_at]
x_val, y_val = x[split_at:], y[split_at:]

print('Training Data shape:')
print('X : ', x_train.shape)
print('Y : ', y_train.shape)

# Show the first training example both as one-hot rows and as text.
sample_x, sample_y = x_train[0], y_train[0]
print('Sample Question(in encoded form) : ', sample_x, sample_y)
print('Sample Question(in decoded form) : ', ctable.decode(sample_x),'Sample Output : ', ctable.decode(sample_y))

Generating data...
Total addition questions: 50000
Training Data shape:
X :  (45000, 7, 12)
Y :  (45000, 4, 12)
Sample Question(in encoded form) :  [[False False False False  True False False False False False False False]
 [False False False  True False False False False False False False False]
 [False False False False False False False  True False False False False]
 [False  True False False False False False False False False False False]
 [False False False False False False False False False False  True False]
 [False False False False False False  True False False False False False]
 [ True False False False False False False False False False False False]] [[False False False False  True False False False False False False False]
 [False False False False False False False False False False False  True]
 [False False False False False False False False False False False  True]
 [ True False False False False False False False False False False False]]
Sample Question(in decode

#### Let's learn two wrapper functions in Keras - TimeDistributed and RepeatVector with some dummy examples.

**TimeDistributed** is a wrapper layer that applies the wrapped operation independently to every timestep of its input. For instance, if a feed-forward layer converts a 10-dim vector into a 5-dim vector, then wrapping that layer in TimeDistributed converts an input of shape batch_size \* sentence_len \* vector_len (=10) into an output of shape batch_size \* sentence_len \* output_len (=5).

In [35]:
# TimeDistributed applies the wrapped Dense(8) to each of the 3 time steps:
#   input  : batch_size * time_steps * input_vector_dim  (to Dense)
#   output : batch_size * time_steps * output_vector_dim
# Here, Dense() converts a 5-dim input vector to a 8-dim vector.
per_step_dense = TimeDistributed(Dense(8), input_shape=(3, 5))
model = Sequential([per_step_dense])

input_array = np.random.randint(10, size=(1,3,5))
print("Shape of input : ", input_array.shape)

model.compile('rmsprop', 'mse')
output_array = model.predict(input_array)
print("Shape of output : ", output_array.shape)

Shape of input :  (1, 3, 5)
Shape of output :  (1, 3, 8)


**RepeatVector** repeats its input vector a specified number of times. The shape changes from batch_size \* number of elements to batch_size \* number of repetitions \* number of elements.

In [None]:
# Demonstrates RepeatVector by printing the model's output shape before and
# after the layer is added.
model = Sequential()
# Dense converts each 10-dim input vector into a 6-dim vector (1*10 -> 1*6)
model.add(Dense(6, input_dim=10))
# note: the `None` in the printed shape is the batch dimension
print(model.output_shape)

# RepeatVector(3) repeats that 6-dim vector 3 times (1*6 -> 1*3*6)
model.add(RepeatVector(3))
print(model.output_shape) 

# a single random example with 10 integer features
input_array = np.random.randint(1000, size=(1, 10))
print("Shape of input : ", input_array.shape)

model.compile('rmsprop', 'mse')
output_array = model.predict(input_array)

print("Shape of output : ", output_array.shape)
# print the single example and the model's output for it
print('Input : ', input_array[0])
print('Output : ', output_array[0])

### MODEL ARCHITECTURE

<img src="files/fig/LSTM_addition.jpg" width="400">

**Note:** Whenever you initialize an LSTM in Keras, the option `return_sequences` defaults to `False`. This means that the next component only gets to see the hidden layer's values from the final time step. If you set `return_sequences = True` instead, the LSTM returns the hidden layer at every time step, so the next component must be able to consume its input in that sequential form.

Think how this statement is relevant in terms of this model architecture and the TimeDistributed module we just learned.

Build an encoder and a decoder, each a single layer with 128 nodes, plus an appropriate dense layer as needed by the model.

In [37]:
# Hyperparameters
RNN = layers.LSTM
HIDDEN_SIZE = 128
BATCH_SIZE = 128
LAYERS = 1

print('Build model...')

# ENCODING: read the whole question and keep only the final hidden state,
# then repeat it once for every output character.
encoder = RNN(HIDDEN_SIZE, input_shape=(MAXLEN, len(chars)))
repeated_state = RepeatVector(MAXOUTPUTLEN)

# DECODING: each decoder layer returns its hidden state at every time step.
decoder_stack = [RNN(HIDDEN_SIZE, return_sequences=True) for _ in range(LAYERS)]

# A softmax over the character set is applied at each output position.
char_classifier = TimeDistributed(layers.Dense(len(chars), activation='softmax'))

model = Sequential([encoder, repeated_state] + decoder_stack + [char_classifier])
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
model.summary()

Build model...
Model: "sequential_17"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_5 (LSTM)                (None, 128)               72192     
_________________________________________________________________
repeat_vector_2 (RepeatVecto (None, 4, 128)            0         
_________________________________________________________________
lstm_6 (LSTM)                (None, 4, 128)            131584    
_________________________________________________________________
time_distributed_3 (TimeDist (None, 4, 12)             1548      
Total params: 205,324
Trainable params: 205,324
Non-trainable params: 0
_________________________________________________________________


Let's check how well our model trained.

In [38]:
# Train the model, then spot-check it on randomly drawn validation questions.
for iteration in range(1, 2):
    print()
    model.fit(x_train, y_train,
              batch_size=BATCH_SIZE,
              epochs=20,
              validation_data=(x_val, y_val))
    # Select `numtotal` samples from the validation set at random so
    # we can visualize errors.
    print('Finished iteration ', iteration)
    numcorrect = 0
    numtotal = 20

    for i in range(numtotal):
        ind = np.random.randint(0, len(x_val))
        # Index with a one-element array to keep the leading batch dimension.
        rowx, rowy = x_val[np.array([ind])], y_val[np.array([ind])]
        # Take the most probable character at each output position. This
        # replaces Sequential.predict_classes, which newer Keras releases
        # no longer provide.
        preds = model.predict(rowx, verbose=0).argmax(axis=-1)
        q = ctable.decode(rowx[0])
        correct = ctable.decode(rowy[0])
        # preds already holds character indices, so skip the argmax in decode.
        guess = ctable.decode(preds[0], calc_argmax=False)
        print('Question', q, end=' ')
        print('True', correct, end=' ')
        print('Guess', guess, end=' ')
        if guess == correct:
            print('Good job')
            numcorrect += 1
        else:
            print('Fail')
    print('The model scored ', numcorrect*100/numtotal,' % in its test.')


Train on 45000 samples, validate on 5000 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Finished iteration  1
Question 551+581 True 1132 Guess 1132 Good job
Question 332+746 True 1078 Guess 1078 Good job
Question 736+654 True 1390 Guess 1490 Fail
Question 77+705  True 782  Guess 782  Good job
Question 7+266   True 273  Guess 273  Good job
Question 18+7    True 25   Guess 25   Good job
Question 123+666 True 789  Guess 889  Fail
Question 886+48  True 934  Guess 934  Good job
Question 6+862   True 868  Guess 868  Good job
Question 492+378 True 870  Guess 870  Good job
Question 59+30   True 89   Guess 89   Good job
Question 542+95  True 637  Guess 637  Good job
Question 58+738  True 796  Guess 796  Good job
Question 29+62   True 91   Guess 91   Good job
Question 217+21  True 238  Guess 238  Good job
