In [2]:
from keras.preprocessing import image
from keras.models import load_model
import numpy as np

from nltk.translate.bleu_score import sentence_bleu
from nltk.translate.bleu_score import corpus_bleu

from caption_utils import *

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [3]:
def generate_seq(img_input):
    
    if img_input.shape != (1, 512):
        img_input = img_input.reshape(1, 512)

    
    assert(img_input.shape == (1, 512))
    stop_condition = False
    decoded_sentence = []
    target_seq = np.array([token2idx['<bos>']]).reshape(1, 1)
    states_value = encoder_model.predict(img_input)

    while not stop_condition:
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value)
        sampled_token_index = int(np.argmax(output_tokens[0, -1, :]))
        sampled_char = idx2token[sampled_token_index]
        decoded_sentence += [sampled_char]
        if (sampled_char == '<eos>' or len(decoded_sentence) > 30):
            stop_condition = True
        target_seq = np.array([sampled_token_index]).reshape(1, 1)
        states_value = [h, c]

    return ' '.join(decoded_sentence[:-1])

In [4]:
def get_captions(model, img_path):   
    #img_path = 'data/Arnav_Hankyu_Pulkit2.jpg'
    img = image.load_img(img_path, target_size=(224, 224))
    x = image.img_to_array(img)
    x = np.expand_dims(x, axis=0)
    x = preprocess_input(x)

    features = model.predict(x)
    return generate_seq(features)

In [5]:
train_fns_list, dev_fns_list, test_fns_list = load_split_lists()
train_captions_raw, dev_captions_raw, test_captions_raw = get_caption_split()
vocab = create_vocab(train_captions_raw)
token2idx, idx2token = vocab_to_index(vocab)

## VGG16

In [6]:
from keras.applications.vgg16 import VGG16
from keras.applications.vgg16 import preprocess_input

encoder_model = load_model('saved_models/encoder_model.h5')
decoder_model = load_model('saved_models/decoder_model.h5')
VGG16_model = VGG16(weights='imagenet', include_top=False, pooling='avg')

  ' Found: ' + str(self.outputs))


In [7]:
bleu1 = np.zeros(len(test_fns_list))
bleu2 = np.zeros(len(test_fns_list))
bleu3 = np.zeros(len(test_fns_list))
bleu4 = np.zeros(len(test_fns_list))

for i, filename in enumerate(test_fns_list):
    if i%100 == 0:
        print(i, "images processed")
    candidates = [get_captions(VGG16_model, "data/Flicker8k_Dataset/"+filename).split()]*5
    references = []    
    for j, caption in enumerate(test_captions_raw[filename]):
        references.append(caption[:-1].split())
    bleu1[i] = corpus_bleu(references, candidates, weights=(1, 0, 0, 0))
    bleu2[i] = corpus_bleu(references, candidates, weights=(0, 1, 0, 0))
    bleu3[i] = corpus_bleu(references, candidates, weights=(0, 0, 1, 0))
    bleu4[i] = corpus_bleu(references, candidates, weights=(0, 0, 0, 1))

0 images processed


Corpus/Sentence contains 0 counts of 2-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().


100 images processed
200 images processed
300 images processed
400 images processed
500 images processed
600 images processed
700 images processed
800 images processed
900 images processed


In [21]:
print("VGG16")
print("Bleu1 Score: ", bleu1.mean())
print("Bleu2 Score: ", bleu2.mean())
print("Bleu3 Score: ", bleu3.mean())
print("Bleu4 Score: ", bleu4.mean())

VGG16
Bleu1 Score:  0.10055779131835635
Bleu2 Score:  0.30610375207562346
Bleu3 Score:  0.44651866284028346
Bleu4 Score:  0.5399454288695043


# Run it until here

-------------


## VGG19

In [8]:
from keras.applications.vgg19 import VGG19
from keras.applications.vgg19 import preprocess_input

encoder_model = load_model('saved_models/encoder_model_VGG19.h5')
decoder_model = load_model('saved_models/decoder_model_VGG19.h5')
VGG19_model = VGG19(weights='imagenet', include_top=False, pooling='avg')

  ' Found: ' + str(self.outputs))


In [9]:
for i, filename in enumerate(test_fns_list):
    if i%100 == 0:
        print(i, "images processed")
    candidate = get_captions(VGG19_model, "data/Flicker8k_Dataset/"+filename).split()
    for j, caption in enumerate(test_captions_raw[filename]):
        reference = caption[:-1].split()
        bleu1[i*5+j] = sentence_bleu(reference, candidate, weights=(1, 0, 0, 0))
        bleu2[i*5+j] = sentence_bleu(reference, candidate, weights=(0, 1, 0, 0))
        bleu3[i*5+j] = sentence_bleu(reference, candidate, weights=(0, 0, 1, 0))
        bleu4[i*5+j] = sentence_bleu(reference, candidate, weights=(0, 0, 0, 1))  

0 images processed


Corpus/Sentence contains 0 counts of 2-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().


100 images processed
200 images processed
300 images processed
400 images processed
500 images processed
600 images processed
700 images processed
800 images processed
900 images processed


In [10]:
print("VGG19")
print("Bleu1 Score: ", bleu1.mean())
print("Bleu2 Score: ", bleu2.mean())
print("Bleu3 Score: ", bleu3.mean())
print("Bleu4 Score: ", bleu4.mean())

VGG19
Bleu1 Score:  0.10948410832083721
Bleu2 Score:  0.9566502893814297
Bleu3 Score:  0.9566502893814297
Bleu4 Score:  0.9566502893814297


## ResNet50

In [11]:
def generate_seq(img_input):
    
    if img_input.shape != (1, 2048):
        img_input = img_input.reshape(1, 2048)

    
    assert(img_input.shape == (1, 2048))
    stop_condition = False
    decoded_sentence = []
    target_seq = np.array([token2idx['<bos>']]).reshape(1, 1)
    states_value = encoder_model.predict(img_input)

    while not stop_condition:
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value)
        sampled_token_index = int(np.argmax(output_tokens[0, -1, :]))
        sampled_char = idx2token[sampled_token_index]
        decoded_sentence += [sampled_char]
        if (sampled_char == '<eos>' or len(decoded_sentence) > 30):
            stop_condition = True
        target_seq = np.array([sampled_token_index]).reshape(1, 1)
        states_value = [h, c]

    return ' '.join(decoded_sentence[:-1])

In [12]:
from keras.applications.resnet50 import ResNet50
from keras.applications.resnet50 import preprocess_input

encoder_model = load_model('saved_models/encoder_model_ResNet50.h5')
decoder_model = load_model('saved_models/decoder_model_ResNet50.h5')
ResNet50_model = ResNet50(weights='imagenet', include_top=False, pooling='avg')

  ' Found: ' + str(self.outputs))


In [13]:
for i, filename in enumerate(test_fns_list):
    if i%100 == 0:
        print(i, "images processed")
    candidate = get_captions(ResNet50_model, "data/Flicker8k_Dataset/"+filename).split()
    for j, caption in enumerate(test_captions_raw[filename]):
        reference = caption[:-1].split()
        bleu1[i*5+j] = sentence_bleu(reference, candidate, weights=(1, 0, 0, 0))
        bleu2[i*5+j] = sentence_bleu(reference, candidate, weights=(0, 1, 0, 0))
        bleu3[i*5+j] = sentence_bleu(reference, candidate, weights=(0, 0, 1, 0))
        bleu4[i*5+j] = sentence_bleu(reference, candidate, weights=(0, 0, 0, 1))  

0 images processed


Corpus/Sentence contains 0 counts of 2-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().


100 images processed
200 images processed
300 images processed
400 images processed
500 images processed
600 images processed
700 images processed
800 images processed
900 images processed


In [14]:
print("ResNet50")
print("Bleu1 Score: ", bleu1.mean())
print("Bleu2 Score: ", bleu2.mean())
print("Bleu3 Score: ", bleu3.mean())
print("Bleu4 Score: ", bleu4.mean())

ResNet50
Bleu1 Score:  0.10003500434853214
Bleu2 Score:  0.9325552583614292
Bleu3 Score:  0.9325552583614292
Bleu4 Score:  0.9325552583614292


## Xception

In [15]:
from keras.applications.xception import Xception
from keras.applications.xception import preprocess_input

encoder_model = load_model('saved_models/encoder_model_ResNet50.h5')
decoder_model = load_model('saved_models/decoder_model_ResNet50.h5')
Xception_model = Xception(weights='imagenet', include_top=False, pooling='avg')

  ' Found: ' + str(self.outputs))


In [16]:
for i, filename in enumerate(test_fns_list):
    if i%100 == 0:
        print(i, "images processed")
    candidate = get_captions(Xception_model, "data/Flicker8k_Dataset/"+filename).split()
    for j, caption in enumerate(test_captions_raw[filename]):
        reference = caption[:-1].split()
        bleu1[i*5+j] = sentence_bleu(reference, candidate, weights=(1, 0, 0, 0))
        bleu2[i*5+j] = sentence_bleu(reference, candidate, weights=(0, 1, 0, 0))
        bleu3[i*5+j] = sentence_bleu(reference, candidate, weights=(0, 0, 1, 0))
        bleu4[i*5+j] = sentence_bleu(reference, candidate, weights=(0, 0, 0, 1))  

0 images processed


Corpus/Sentence contains 0 counts of 2-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().


100 images processed
200 images processed
300 images processed
400 images processed
500 images processed
600 images processed
700 images processed
800 images processed
900 images processed


In [17]:
print("Xception")
print("Bleu1 Score: ", bleu1.mean())
print("Bleu2 Score: ", bleu2.mean())
print("Bleu3 Score: ", bleu3.mean())
print("Bleu4 Score: ", bleu4.mean())

Xception
Bleu1 Score:  0.15455040708797751
Bleu2 Score:  0.9731898361621599
Bleu3 Score:  0.9731898361621599
Bleu4 Score:  0.9731898361621599
