### Load pre-processed data for training

In [1]:
import pickle
import numpy as np
from keras.models import Model
from keras.layers import Input, Embedding, CuDNNLSTM, Dense, TimeDistributed

Using TensorFlow backend.


In [2]:
# Restore the pre-processed arrays: the pickle holds one 3-item sequence that
# unpacks into encoder inputs, decoder inputs and targets.
# NOTE: pickle can execute arbitrary code -- only load files you produced yourself.
with open('x_and_y.pkl', 'rb') as xy_file:
    X_encoder, X_decoder, y = pickle.load(xy_file)

In [3]:
# Restore the pre-computed vocabulary embedding weights; they are handed to
# the Embedding layer below as its initial weights.
# NOTE: pickle can execute arbitrary code -- only load files you produced yourself.
with open('embedding_weights.pkl', 'rb') as weights_file:
    embedding_weights = pickle.load(weights_file)

#### Now the structure of our model will be like this:
##### 1. Encoder LSTM will take the (embedded) input and, after processing it, will pass on its final hidden state and cell state to the decoder
##### 2. Decoder LSTM will take its initial state from the final state of the encoder, and its inputs will be the expected output shifted one word behind, so basically the decoder LSTM will predict the next word in the output sequence
##### 3. A dense layer, i.e. a regular feed-forward NN applied at every time step, will then predict the word occurring at each position of the expected sentence. The labels are one-hot encoded, i.e. at each position the label is "1" at the index of the vocabulary word present there, and the predicted output is a softmax probability distribution over the vocabulary, so training minimises the negative log loss (categorical cross-entropy)

### Building the Seq2Seq Model

In [4]:
# --- Sequence / model geometry ---
# (the original note "obtained in vocab_embedding" presumably refers to these
#  values coming from the pre-processing / embedding step -- TODO confirm)
max_len = 20          # time steps per input sequence
vocab_size = 15000    # number of rows in the embedding matrix / output classes
embedding_dim = 300   # width of each word vector
hidden_dim = 300      # LSTM units in both encoder and decoder

# --- Dataset split ---
# 207765 of the 221616 samples (93.75 %) are used for training,
# the remainder for validation.
number_of_samples = 221616
number_of_train_samples = 207765
number_of_val_samples = number_of_samples - number_of_train_samples  # 13851

In [5]:
# Embedding Layer
# A single Embedding instance that is applied to both the encoder and the
# decoder inputs further down, so both sides share one lookup table.
# It is initialised from the pickled `embedding_weights` and frozen
# (trainable=False), so the word vectors are not updated during training.
# Assumes embedding_weights has shape (vocab_size, embedding_dim) -- TODO confirm.
embedding_layer = Embedding(
    input_dim=vocab_size, 
    output_dim=embedding_dim,
    input_length=max_len,
    weights=[embedding_weights],
    trainable=False
)

In [6]:
# Encoder
# Takes a sequence of max_len integer token ids, embeds it with the shared
# embedding layer and runs it through an LSTM. return_state=True makes the
# layer return its final hidden state (state_h) and cell state (state_c) in
# addition to its output; those two states seed the decoder below.
# encoder_outputs is not used by the training graph built in this notebook.
encoder_inputs = Input(shape=(max_len,), dtype='int32')
encoder_embedding = embedding_layer(encoder_inputs)
encoder_LSTM = CuDNNLSTM(hidden_dim, return_state=True)
encoder_outputs, state_h, state_c = encoder_LSTM(encoder_embedding)

Instructions for updating:
Colocations handled automatically by placer.


In [7]:
# Decoder
# Takes a second sequence of max_len integer token ids (per the notebook's
# model description: the expected output, one word behind), embeds it with the
# same shared embedding layer and runs it through an LSTM whose initial state
# is the encoder's final [state_h, state_c].
# return_sequences=True yields one output vector per time step; the decoder's
# own returned states are discarded here (`_`, `_`).
decoder_inputs = Input(shape=(max_len,), dtype='int32')
decoder_embedding = embedding_layer(decoder_inputs)
decoder_LSTM = CuDNNLSTM(hidden_dim, return_state=True, return_sequences=True)
decoder_outputs, _, _ = decoder_LSTM(decoder_embedding, initial_state=[state_h, state_c])

In [8]:
# Output
# Apply the same Dense softmax layer to every decoder time step, giving a
# probability distribution over the vocab_size words at each position.
# The trainable model maps [encoder_inputs, decoder_inputs] -> those
# per-step distributions.
outputs = TimeDistributed(Dense(vocab_size, activation='softmax'))(decoder_outputs)
model = Model([encoder_inputs, decoder_inputs], outputs)

In [9]:
# Print the layer-by-layer overview (output shapes, parameter counts, connections)
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            (None, 20)           0                                            
__________________________________________________________________________________________________
input_1 (InputLayer)            (None, 20)           0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 20, 300)      4500000     input_1[0][0]                    
                                                                 input_2[0][0]                    
__________________________________________________________________________________________________
cu_dnnlstm_1 (CuDNNLSTM)        [(None, 300), (None, 722400      embedding_1[0][0]                
__________

In [10]:
# Adam optimizer with categorical cross-entropy: this loss pairs with the
# per-time-step softmax output and with the one-hot float32 targets that the
# train/validation helpers below build. Accuracy is tracked as a metric.
model.compile(optimizer='adam', loss ='categorical_crossentropy', metrics = ['accuracy'])

### Training the Model

#### Preparing train and val generator for training in batches

##### Making Y in one-hot encoded form first
##### The output has shape (number of sequences, max_len, vocab_size), i.e. for each sentence, at each of the max_len positions, it holds the one-hot encoding of the word present there

In [11]:
def train_generator(batch_size):
    """Yield training batches forever, for use with ``model.fit_generator``.

    Reads the notebook-level arrays ``X_encoder``, ``X_decoder`` and ``y`` and
    the constants ``number_of_train_samples``, ``max_len`` and ``vocab_size``.
    Only the first ``number_of_batches * batch_size`` samples are visited per
    pass; a trailing partial batch is dropped.

    Args:
        batch_size: Number of samples per yielded batch (positive int).

    Yields:
        ``([encoder_batch, decoder_batch], Y)`` where the two inputs are slices
        of ``X_encoder`` / ``X_decoder`` and ``Y`` is a freshly allocated
        float32 array of shape ``(batch_size, max_len, vocab_size)`` holding
        the one-hot encoding of the matching slice of ``y``.

    Raises:
        ValueError: If ``batch_size`` does not yield at least one full batch
            (raised on the first ``next()``; previously this case looped
            forever without yielding anything).
    """
    number_of_batches = number_of_train_samples // batch_size
    if number_of_batches < 1:
        raise ValueError(
            "batch_size must be between 1 and number_of_train_samples "
            "(got %r)" % (batch_size,)
        )
    while True:
        for count in range(number_of_batches):
            start = count * batch_size
            stop = start + batch_size
            # Allocate a NEW target array for every batch. Re-using a single
            # buffer and zeroing it after the yield corrupts every batch the
            # consumer is still holding: Keras' fit_generator prefetches
            # batches into a queue, so queued batches would all alias the
            # same array and end up with the labels of a later batch.
            # Memory note: each array takes batch_size * max_len * vocab_size
            # * 4 bytes; lower fit_generator's max_queue_size if that is too much.
            Y = np.zeros(shape=(batch_size, max_len, vocab_size), dtype="float32")
            for i, sequences in enumerate(y[start:stop]):
                for j, sequence in enumerate(sequences):
                    # `sequence` is the vocabulary index of the word at step j
                    Y[i][j][sequence] = 1
            yield ([X_encoder[start:stop], X_decoder[start:stop]], Y)

In [12]:
def validation_generator(val_size):
    """Build the validation set in one piece (despite the name, nothing is yielded).

    Uses the last ``val_size`` samples of the notebook-level arrays
    ``X_encoder``, ``X_decoder`` and ``y``; relies on the constants
    ``number_of_samples``, ``max_len`` and ``vocab_size``.

    Args:
        val_size: Number of trailing samples to use for validation.

    Returns:
        ``([encoder_inputs, decoder_inputs], targets)`` where ``targets`` is a
        float32 array of shape ``(val_size, max_len, vocab_size)`` containing
        the one-hot encoding of the trailing slice of ``y``.
    """
    first_val_row = number_of_samples - val_size
    one_hot = np.zeros(shape=(val_size, max_len, vocab_size), dtype="float32")

    for row, word_ids in enumerate(y[first_val_row:]):
        for step, word_id in enumerate(word_ids):
            # mark the vocabulary slot of the word found at this time step
            one_hot[row][step][word_id] = 1

    model_inputs = [X_encoder[first_val_row:], X_decoder[first_val_row:]]
    return (model_inputs, one_hot)

In [13]:
# Training hyper-parameters
batch_size = 1   # samples per batch produced by train_generator
epochs = 5       # number of epochs requested from fit_generator

In [14]:
# Training data is streamed batch by batch from an endless generator, while the
# validation data is one fully built ([encoder_in, decoder_in], targets) tuple.
# NOTE: the figures "513 batches of 405 samples" apply to batch_size = 405
# (513 x 405 = 207765 training samples; + 13851 validation = 221616 total).
# With the batch_size configured above, an epoch takes
# int(number_of_train_samples / batch_size) steps.
train_gen = train_generator(batch_size)
val_gen = validation_generator(number_of_val_samples)

# history is used for plotting
history = model.fit_generator(
    train_gen,
    steps_per_epoch=int(number_of_train_samples / batch_size),
    epochs=epochs,
    validation_data=val_gen,
    shuffle=False,
)

Instructions for updating:
Use tf.cast instead.
Epoch 1/5
Epoch 2/5
  4107/207765 [..............................] - ETA: 1:11:17 - loss: 2.0928 - acc: 0.6795

KeyboardInterrupt: 

### Save Model

In [15]:
# Save model Architecture

# save as JSON
json_string = model.to_json()
# Write through a context manager so the file is flushed and closed
# deterministically; a bare open(...).write(...) leaves the handle open
# until the garbage collector happens to reclaim it.
with open('model_architecture.json', 'w') as json_file:
    json_file.write(json_string)

  '. They will not be included '


3350

In [None]:
## Save the whole model
# Persists the model object to a single file, 'model.h5', in the working directory.

model.save('model.h5')

In [None]:
# Save model weights
# Writes only the weights to 'model_weights.h5'; the architecture is stored
# separately as JSON in the "Save model Architecture" cell.

model.save_weights('model_weights.h5')

In [None]:
# model.predict()