In [1]:
import random
import argparse
import cv2
import pickle
import nltk
import numpy as np
from pycocotools.coco import COCO

from utils import print_progress_bar

## Load Dataset

In [2]:
def read_glove_vecs(glove_file):
    """Load GloVe word embeddings from a whitespace-separated text file.

    Every non-blank line is expected to hold a token followed by the
    components of its embedding vector.

    Args:
        glove_file: Path to the GloVe text file (read as UTF-8).

    Returns:
        A tuple ``(words_to_index, index_to_words, word_to_vec_map)``:
            words_to_index: dict mapping each word to its index. Indices
                start at 1 and follow the sorted order of the words, so
                index 0 is never assigned to a word.
            index_to_words: the inverse mapping of ``words_to_index``.
            word_to_vec_map: dict mapping each word to its embedding as a
                ``float64`` numpy array.
    """
    print('Creating word to vec map...')
    word_to_vec_map = {}
    # GloVe files are UTF-8 encoded; do not rely on the platform default.
    with open(glove_file, 'r', encoding='utf-8') as f:
        for line in f:
            fields = line.split()
            if not fields:
                # Tolerate blank lines (e.g. trailing newlines at end of file).
                continue
            word_to_vec_map[fields[0]] = np.array(fields[1:], dtype=np.float64)

    words_to_index = {}
    index_to_words = {}
    # Start at 1 so that 0 stays free (it is not a valid word index).
    for i, w in enumerate(sorted(word_to_vec_map), start=1):
        words_to_index[w] = i
        index_to_words[i] = w
    print('Done!')
    return words_to_index, index_to_words, word_to_vec_map

In [4]:
# load the GloVe embeddings (glove.6B.50d.txt) from the dataset directory
word_to_index, index_to_word, word_to_vec_map = read_glove_vecs('dataset/glove.6B.50d.txt')

Creating word to vec map...
Done!


In [3]:
def create_caption_vector(caption, word_to_index):
    """Turn a caption string into a list of vocabulary indices.

    The caption is lower-cased and tokenized with ``nltk.word_tokenize``.
    Tokens that are missing from ``word_to_index`` are replaced by the index
    stored under the ``'unk'`` key.

    Args:
        caption: The caption text.
        word_to_index: Mapping from word to integer index; must contain
            ``'unk'`` if any token of the caption is out of vocabulary.

    Returns:
        A list with one index per token, in token order.
    """
    tokens = nltk.word_tokenize(caption.lower())
    return [
        word_to_index[token] if token in word_to_index else word_to_index['unk']
        for token in tokens
    ]

In [4]:
def map_image_to_caption(coco, word_to_index):
    """
    Map every image id to one randomly chosen caption, as a vector of indices.

    key: image id
    value: a caption of the image (list of word indices produced by
           ``create_caption_vector``)

    Args:
        coco: COCO caption API object; ``getImgIds``, ``getAnnIds`` and
            ``loadAnns`` are used.
        word_to_index: Mapping from word to integer index.

    Returns:
        A tuple ``(image_to_caption, max_len)`` where ``max_len`` is the
        length of the longest caption vector, or 0 when no image has a
        caption.
    """
    image_to_caption = {}
    for img_id in coco.getImgIds():
        annotation_ids = coco.getAnnIds(img_id)
        if not annotation_ids:
            # No caption available for this image: leave it out of the map.
            continue
        # Choose among however many captions exist instead of assuming
        # there are exactly five of them.
        annotation_id = random.choice(annotation_ids)
        caption = coco.loadAnns(annotation_id)[0]['caption']
        image_to_caption[img_id] = create_caption_vector(caption.lower(), word_to_index)
    # default=0 keeps an empty mapping from raising ValueError.
    max_len = max(map(len, image_to_caption.values()), default=0)
    return image_to_caption, max_len

In [5]:
def create_dataset(coco, word_to_index, image_path, img_size, dataset_size):
    """Build image and caption arrays from a random sample of COCO images.

    Args:
        coco: COCO caption API object; ``loadImgs`` is used here and the
            object is forwarded to ``map_image_to_caption``.
        word_to_index: Mapping from word to integer index.
        image_path: Directory that contains the image files.
        img_size: Side length; every image is resized to
            ``(img_size, img_size)``.
        dataset_size: Number of images to sample.

    Returns:
        A tuple ``(x, y)``:
            x: ``float32`` array of grayscale images divided by 255, with a
               trailing channel axis of size 1.
            y: ``int64`` array of caption index vectors, right-padded with 0
               up to the longest caption returned by
               ``map_image_to_caption`` (i.e. over all mapped images, not
               only the sampled ones).

    Raises:
        FileNotFoundError: If an image file cannot be read.
        ValueError: From ``random.sample`` when ``dataset_size`` exceeds the
            number of available images.
    """
    x, y = [], []

    # Load image to caption map
    image_to_caption, max_len = map_image_to_caption(coco, word_to_index)

    # Initial call to print 0% progress
    progress = 0
    print_progress_bar(progress, dataset_size, prefix = 'Progress:', suffix = 'Complete', length = 50)

    # random.sample needs a sequence: dict views are rejected on Python 3.11+.
    # Sampling also shuffles the items, which reduces homogeneity.
    sampled_items = random.sample(list(image_to_caption.items()), dataset_size)
    for img_id, caption_vector in sampled_items:
        # load image array
        img = coco.loadImgs([img_id])[0]
        img_file = '%s/%s' % (image_path, img['file_name'])
        img_array = cv2.imread(img_file, cv2.IMREAD_GRAYSCALE)
        if img_array is None:
            # cv2.imread signals a missing/unreadable file by returning None.
            raise FileNotFoundError('Could not read image: %s' % img_file)
        new_img_array = cv2.resize(img_array, (img_size, img_size))

        # store data in input and output vector
        x.append(new_img_array)
        y.append(caption_vector + [0] * (max_len - len(caption_vector)))

        # Update Progress Bar
        progress += 1
        print_progress_bar(progress, dataset_size, prefix = 'Progress:', suffix = 'Complete', length = 50)

    # convert to numpy array (add the channel axis and scale pixel values)
    x = np.expand_dims(np.array(x, dtype=np.float32), axis=-1) / 255.
    y = np.array(y, dtype=np.int64)

    return x, y

In [6]:
# path to the directory where the data is stored; the cells below build the
# annotation, embedding and image paths relative to it
data_dir = 'dataset'

In [7]:
# initialize the COCO caption API for the training and validation splits
ann_file_train = '%s/annotations/captions_train2017.json' % data_dir
ann_file_val = '%s/annotations/captions_val2017.json' % data_dir

coco_train, coco_val = COCO(ann_file_train), COCO(ann_file_val)

loading annotations into memory...
Done (t=10.37s)
creating index...
index created!
loading annotations into memory...
Done (t=0.45s)
creating index...
index created!


In [8]:
# load the GloVe embeddings (glove.6B.50d.txt) from the data directory
word_to_index, index_to_word, word_to_vec_map = read_glove_vecs('%s/glove.6B.50d.txt' % data_dir)

In [9]:
# Load training data: 5000 sampled images, each resized to 250x250
x_train, y_train = create_dataset(
    coco_train,
    word_to_index,
    image_path=data_dir + '/train2017',
    img_size=250,
    dataset_size=5000,
)
print('\nx_train:', x_train.shape)
print('y_train:', y_train.shape)

Progress: |██████████████████████████████████████████████████| 100.0% Complete
x_train: (5000, 250, 250, 1)
y_train: (5000, 55)


In [10]:
# Load validation data: 1500 sampled images, each resized to 250x250
x_val, y_val = create_dataset(
    coco_val,
    word_to_index,
    image_path=data_dir + '/val2017',
    img_size=250,
    dataset_size=1500,
)
print('\nx_val:', x_val.shape)
print('y_val:', y_val.shape)

Progress: |██████████████████████████████████████████████████| 100.0% Complete
x_val: (1500, 250, 250, 1)
y_val: (1500, 36)


## Define Image Model

In [None]:
import keras.backend as K
from keras import layers, models, optimizers
from keras import callbacks
from PIL import Image

from utils import combine_images
from capsule_layers import CapsuleLayer, PrimaryCap, Length, Mask

# use the channels-last image layout, matching the trailing channel axis that
# create_dataset appends to the image arrays
K.set_image_data_format('channels_last')

In [None]:
# per-sample input shape: x_train's shape without its leading (sample) axis
input_shape = x_train.shape[1:]
# passed as num_capsule to the 'caption_caps' CapsuleLayer below
# NOTE(review): the value 12 is not explained anywhere in view; confirm its meaning
n_class = 12
# passed as routings to the 'caption_caps' CapsuleLayer below
routings = 3

In [None]:
# input tensor of the image model, shaped like a single sample of x_train
x = layers.Input(shape=input_shape)

In [None]:
# Layers 1-3: three conventional Conv2D layers stacked on the input tensor
conv1 = layers.Conv2D(
    filters=96, kernel_size=13, strides=4,
    padding='valid', activation='relu', name='conv1',
)(x)
conv2 = layers.Conv2D(
    filters=96, kernel_size=5, strides=2,
    padding='valid', activation='relu', name='conv2',
)(conv1)
conv3 = layers.Conv2D(
    filters=256, kernel_size=9, strides=1,
    padding='valid', activation='relu', name='conv3',
)(conv2)
conv3

In [None]:
# Layer 4: Conv2D layer with `squash` activation, then reshape to [None, num_capsule, dim_capsule]
primary_caps = PrimaryCap(
    conv3,
    dim_capsule=8,
    n_channels=32,
    kernel_size=9,
    strides=2,
    padding='valid',
)
primary_caps

In [None]:
# Layer 5: Capsule layer. Routing algorithm works here.
caption_caps_layer = CapsuleLayer(
    num_capsule=n_class,
    dim_capsule=16,
    routings=routings,
    name='caption_caps',
)
caption_caps = caption_caps_layer(primary_caps)
caption_caps