In [1]:
import numpy as np

from keras.models import Model, Sequential
from keras.layers import Input, Dense, BatchNormalization, RepeatVector, Concatenate, Merge, Masking
from keras.layers import LSTM, GRU, Embedding, TimeDistributed, Bidirectional
from keras import backend as K
from keras import optimizers
from keras.models import load_model

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.
  return f(*args, **kwds)


In [2]:
# Open the .npz archive that holds the train / validation / test arrays;
# individual arrays are pulled out of it by key in the next cell.
all_data = np.load('train_dev_test.npz')

In [3]:
# Each split provides three arrays: encoder output, decoder input and decoder
# target. The target is sliced with [:, 1:, :], i.e. its first position along
# axis 1 is dropped (presumably the <bos> step -- confirm against the
# preprocessing code).
train_encoder_output, train_decoder_input, train_decoder_target = (
    all_data['train_encoder_output'],
    all_data['train_decoder_input'],
    all_data['train_decoder_target'][:, 1:, :],
)

validation_encoder_output, validation_decoder_input, validation_decoder_target = (
    all_data['validation_encoder_output'],
    all_data['validation_decoder_input'],
    all_data['validation_decoder_target'][:, 1:, :],
)

test_encoder_output, test_decoder_input, test_decoder_target = (
    all_data['test_encoder_output'],
    all_data['test_decoder_input'],
    all_data['test_decoder_target'][:, 1:, :],
)

In [4]:
# Sanity check: report the shape of every training array.
for label, array in (
    ("Train Encoder Output", train_encoder_output),
    ("Train Decoder Input", train_decoder_input),
    ("Train Decoder Target", train_decoder_target),
):
    print(label, array.shape)

Train Encoder Output (30000, 512)
Train Decoder Input (30000, 38)
Train Decoder Target (30000, 38, 2531)


In [5]:
from caption_utils import *
# Per-split lists returned by caption_utils (presumably image file names,
# judging by the "fns" naming -- confirm in caption_utils).
train_fns_list, dev_fns_list, test_fns_list = load_split_lists()

train_captions_raw, dev_captions_raw, test_captions_raw = get_caption_split()

# The vocabulary and both index mappings are derived from the training captions only.
vocab = create_vocab(train_captions_raw)
token2idx, idx2token = vocab_to_index(vocab)

# process_captions receives copies, so the *_raw objects themselves are not handed over.
captions_data = tuple(
    raw.copy() for raw in (train_captions_raw, dev_captions_raw, test_captions_raw)
)
train_captions, dev_captions, test_captions = process_captions(captions_data, token2idx)

In [6]:
# Load the saved encoder and decoder Keras models used by the decoding functions below.
encoder_model, decoder_model = [
    load_model(path)
    for path in ('saved_models/encoder_model.h5', 'saved_models/decoder_model.h5')
]

  ' Found: ' + str(self.outputs))


In [7]:
def seq_to_sentence(sent):
    """Convert an iterable of vocabulary indices into a space-separated string.

    Every index is looked up in the module-level ``idx2token`` mapping; an
    empty input yields an empty string.
    """
    words = []
    for token_id in sent:
        words.append(idx2token[token_id])
    return ' '.join(words)

In [83]:
def generate_seq(img_input, alpha=1., max_length=30):
    """Greedily decode a caption for a single image feature vector.

    Starting from ``<bos>``, the decoder is run one token at a time and the
    most probable token is always chosen. Decoding stops when ``<eos>`` is
    predicted or when ``max_length`` tokens have been produced.

    Parameters
    ----------
    img_input : numpy.ndarray
        Image features; reshaped to ``(1, 512)`` before being fed to
        ``encoder_model`` (a ``ValueError`` is raised by ``reshape`` if the
        array does not hold exactly 512 values).
    alpha : float, optional
        Kept only for backward compatibility. It used to feed a
        length-normalised score that was re-divided on every iteration and
        never returned, so it has no influence on the result.
    max_length : int, optional
        Maximum number of tokens in the returned caption (default 30, the
        value that was previously hard-coded).

    Returns
    -------
    str
        The decoded tokens joined by single spaces, without ``<eos>``.
    """
    img_input = img_input.reshape(1, 512)

    # The encoder output is used as the decoder's initial state list.
    states_value = encoder_model.predict(img_input)
    target_seq = np.array([token2idx['<bos>']]).reshape(1, 1)

    decoded_sentence = []
    while len(decoded_sentence) < max_length:
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value)
        sampled_token_index = int(np.argmax(np.squeeze(output_tokens)))
        sampled_char = idx2token[sampled_token_index]

        if sampled_char == '<eos>':
            break
        decoded_sentence.append(sampled_char)

        # Feed the chosen token and the updated states back into the decoder.
        target_seq = np.array([sampled_token_index]).reshape(1, 1)
        states_value = [h, c]

    return ' '.join(decoded_sentence)

In [84]:
def decoder_one_step(sent, beam_size=5, len_norm=True, alpha=1, excluded_token_ids=(0, 1, 3)):
    """Expand one beam hypothesis by a single decoder step.

    Parameters
    ----------
    sent : tuple
        ``([neg_log_prob, token_ids], [h, c])`` -- the hypothesis score, its
        token index list (starting with ``<bos>``) and the decoder states
        that go with it.
    beam_size : int, optional
        Number of continuations to return.
    len_norm : bool, optional
        When true, scores are length-normalised: the previous score is
        un-normalised, the new token's negative log-probability is added, and
        the sum is divided by ``(number of generated tokens) ** alpha``.
    alpha : float, optional
        Exponent of the length normalisation.
    excluded_token_ids : iterable of int, optional
        Vocabulary indices that may never be proposed. Defaults to
        ``(0, 1, 3)``, the ids that were previously hard-coded (presumably
        special tokens such as padding / <bos> / <unk> -- verify against
        ``token2idx``).

    Returns
    -------
    list
        ``beam_size`` items shaped like ``sent``, ordered from most to least
        probable next token. All items share the same new ``[h, c]`` states.

    Raises
    ------
    AssertionError
        If the hypothesis already ends in ``<eos>`` or ``<unk>``, or if fewer
        than ``beam_size`` admissible tokens exist.
    """
    (prev_log_prob, prev_sent), states_value = sent
    last_word_idx = prev_sent[-1]

    assert last_word_idx not in (token2idx['<eos>'], token2idx['<unk>'])

    target_seq = np.array([last_word_idx]).reshape(1, 1)
    output_tokens, h, c = decoder_model.predict([target_seq] + states_value)
    output_tokens = np.squeeze(output_tokens)

    excluded = set(() if excluded_token_ids is None else excluded_token_ids)

    # Walk the vocabulary from most to least probable and stop as soon as
    # enough admissible tokens have been collected.
    output_tokens_beam = []
    for candidate_idx in np.argsort(-output_tokens).tolist():
        if len(output_tokens_beam) >= beam_size:
            break
        if candidate_idx not in excluded:
            output_tokens_beam.append(candidate_idx)

    assert len(output_tokens_beam) == beam_size

    # Token counts exclude the leading <bos>; clamp to 1 so the very first
    # step neither multiplies nor divides by zero.
    prev_len = max(len(prev_sent) - 1, 1)
    new_len = max(len(prev_sent), 1)

    predicted_sentences = []
    for predict_idx in output_tokens_beam:
        new_sent = prev_sent + [predict_idx]
        token_neg_log_prob = -np.log(output_tokens[predict_idx])

        if len_norm:
            neg_log_prob = prev_log_prob * prev_len**alpha + token_neg_log_prob
            neg_log_prob /= new_len**alpha
        else:
            neg_log_prob = prev_log_prob + token_neg_log_prob

        predicted_sentences.append(([neg_log_prob, new_sent], [h, c]))

    return predicted_sentences

In [88]:
def beam_search(img_input, beam_size=5, max_length=20, len_norm=True, alpha=1.):
    """Decode captions for one image with beam search.

    The image features are encoded once, then ``decoder_one_step`` is used to
    grow hypotheses. After every step only the ``beam_size`` hypotheses with
    the lowest score (negative log-probability, optionally length-normalised)
    are kept. Hypotheses that already end in ``<eos>`` are carried over
    unchanged. The search stops when every kept hypothesis ends in ``<eos>``
    or when any kept hypothesis reaches ``max_length`` indices (``<bos>``
    included); in the latter case ``'Max len reached'`` is printed.

    Parameters
    ----------
    img_input : numpy.ndarray
        Image features; reshaped to ``(1, 512)``.
    beam_size : int, optional
        Number of hypotheses kept after each step.
    max_length : int, optional
        Length (in indices, counting ``<bos>``) at which the search is cut off.
    len_norm : bool, optional
        Forwarded to ``decoder_one_step``.
    alpha : float, optional
        Forwarded to ``decoder_one_step``.

    Returns
    -------
    list of str
        The kept hypotheses, best first, rendered with ``seq_to_sentence``.
        The leading ``<bos>`` is removed, and a trailing ``<eos>`` is removed
        when present; hypotheses cut off by ``max_length`` keep all of their
        words.

    Notes
    -----
    Known issue recorded by the original author: an error is thrown with
    ``beam_size`` 1 when ``<unk>`` is produced (``decoder_one_step`` asserts
    that the hypothesis it expands does not end in ``<unk>``).
    """
    img_input = img_input.reshape(1, 512)
    states_value_initial = encoder_model.predict(img_input)

    bos_idx = token2idx['<bos>']
    eos_idx = token2idx['<eos>']

    beg_sent_and_states = ([0., [bos_idx]], states_value_initial)
    top_sentences = decoder_one_step(beg_sent_and_states, beam_size, len_norm, alpha)

    while True:
        new_top_sentences = []
        for sent in top_sentences:
            if sent[0][1][-1] == eos_idx:
                # Finished hypotheses compete unchanged with the new ones.
                new_top_sentences.append(sent)
            else:
                new_top_sentences.extend(decoder_one_step(sent, beam_size, len_norm, alpha))

        top_sentences = sorted(new_top_sentences, key=lambda x: x[0][0])[: beam_size]
        assert len(top_sentences) == beam_size

        if any(len(sent[0][1]) >= max_length for sent in top_sentences):
            print('Max len reached')
            break
        if all(sent[0][1][-1] == eos_idx for sent in top_sentences):
            break

    captions = []
    for sent in top_sentences:
        token_ids = sent[0][1][1:]  # drop the leading <bos>
        # Only strip the last index when it really is <eos>; a hypothesis cut
        # off by max_length would otherwise lose its final word.
        if token_ids and token_ids[-1] == eos_idx:
            token_ids = token_ids[:-1]
        captions.append(seq_to_sentence(token_ids))
    return captions

In [89]:
# Compare the reference caption with greedy and beam-search decoding on every
# fifth test sample among the first hundred.
for i in range(1, 100, 5):
    # Reference: arg-max over the last axis of the target array for sample i.
    reference_ids = np.argmax(test_decoder_target[i, :], -1)
    print("Original\n", seq_to_sentence(reference_ids))

    greedy_caption = generate_seq(test_encoder_output[i, :], alpha=0.7)
    print("Greedy\n", greedy_caption)

    print("Beam Search")
    top_sentences = beam_search(
        test_encoder_output[i, :],
        beam_size=5,
        max_length=30,
        alpha=0.7,
    )
    for sent in top_sentences:
        print(sent)

    print('*' * 25)

Original
 the dogs play on the snow <eos> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad>
Greedy
 two dogs are running in the snow
Beam Search
two brown dogs play in the snow
two dogs play in the snow
two brown dogs are running in the snow
two brown dogs are playing in the snow
a brown dog is running in the snow
*************************
Original
 a dog in a swimming pool swims toward <unk> <unk> <unk> see <eos> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad>
Greedy
 a dog with a blue ball in its mouth is laying on the ground
Beam Search
a dog with a blue ball in its mouth
a brown and white dog with a blue ball in its mouth
a white dog with a blue ball in its mouth
a brown and white dog with a blue ball in his mouth
a brown and white dog with a blue 

In [29]:
# Scratch check: slicing past the end of a list does not raise; it simply
# returns the whole list (so `[: beam_size]` is safe on short lists).
[1,2,3][:5]

[1, 2, 3]