In [1]:
import tensorflow as tf
import numpy as np

import json

import os
#To free memory
import gc


In [2]:
tf.__version__
print(tf.config.experimental.list_physical_devices())
print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))

#tf.debugging.set_log_device_placement(True)

[PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU'), PhysicalDevice(name='/physical_device:XLA_CPU:0', device_type='XLA_CPU'), PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU'), PhysicalDevice(name='/physical_device:XLA_GPU:0', device_type='XLA_GPU')]
Num GPUs Available:  1


In [3]:
if not "annotations" in os.listdir():
    if not "annotations_trainval2014.zip" in os.listdir():
        !wget "http://images.cocodataset.org/annotations/annotations_trainval2014.zip"
    !unzip annotations_trainval2014.zip

In [4]:
if not "train2014" in os.listdir():
    if not "train2014.zip" in os.listdir():
        !wget "http://images.cocodataset.org/zips/train2014.zip"
    !unzip train2014.zip

In [5]:
annotations_file = open("annotations/captions_train2014.json")
annotation_js = json.load(annotations_file)

In [6]:
annotation_list = [(j["image_id"],j["caption"]) for j in annotation_js["annotations"]]
annotation_list.sort()
annotation_list

data = [{"id" : annotation_list[0][0] , "captions" : [annotation_list[0][1]]}]

for img_id , capt in annotation_list[1:]:
    
    if data[-1]["id"] != img_id:
        data.append({"id" : img_id , "captions" : []})
        
    data[-1]["captions"].append(capt)
    
data

[{'id': 9,
  'captions': ['A bunch of trays that have different food.',
   'A meal is presented in brightly colored plastic trays.',
   'Closeup of bins of food that include broccoli and bread.',
   'Colorful dishes holding meat, vegetables, fruit, and bread.',
   'there are containers filled with different kinds of foods']},
 {'id': 25,
  'captions': ['A giraffe eating food from the top of the tree.',
   'A giraffe mother with its baby in the forest.',
   'A giraffe standing next to a forest filled with trees.',
   'A giraffe standing up nearby a tree ',
   'Two giraffes standing in a tree filled area.']},
 {'id': 30,
  'captions': ['A flower vase is sitting on a porch stand.',
   'A vase with red and white flowers outside on a sunny day.',
   'A white vase filled with different colored flowers.',
   'White vase with different colored flowers sitting inside of it. ',
   'a white vase with many flowers on a stage']},
 {'id': 34,
  'captions': ['A lone zebra grazing in some green grass.

## Preprocessing

In [7]:
#To have a small set, for fast experiments
data_small = data[:100]

### Text: Captions

In [8]:
annotation_list = ["<start> " + annotation[1] + " <end>" for annotation in annotation_list]

In [9]:
annotation_list

['<start> A bunch of trays that have different food. <end>',
 '<start> A meal is presented in brightly colored plastic trays. <end>',
 '<start> Closeup of bins of food that include broccoli and bread. <end>',
 '<start> Colorful dishes holding meat, vegetables, fruit, and bread. <end>',
 '<start> there are containers filled with different kinds of foods <end>',
 '<start> A giraffe eating food from the top of the tree. <end>',
 '<start> A giraffe mother with its baby in the forest. <end>',
 '<start> A giraffe standing next to a forest filled with trees. <end>',
 '<start> A giraffe standing up nearby a tree  <end>',
 '<start> Two giraffes standing in a tree filled area. <end>',
 '<start> A flower vase is sitting on a porch stand. <end>',
 '<start> A vase with red and white flowers outside on a sunny day. <end>',
 '<start> A white vase filled with different colored flowers. <end>',
 '<start> White vase with different colored flowers sitting inside of it.  <end>',
 '<start> a white vase wit

In [10]:
annotation_lens = [ len(ann.split()) for ann in annotation_list]
max(annotation_lens)

51

In [11]:
help(tf.keras.preprocessing.text.Tokenizer)

Help on class Tokenizer in module keras_preprocessing.text:

class Tokenizer(builtins.object)
 |  Tokenizer(num_words=None, filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n', lower=True, split=' ', char_level=False, oov_token=None, document_count=0, **kwargs)
 |  
 |  Text tokenization utility class.
 |  
 |  This class allows to vectorize a text corpus, by turning each
 |  text into either a sequence of integers (each integer being the index
 |  of a token in a dictionary) or into a vector where the coefficient
 |  for each token could be binary, based on word count, based on tf-idf...
 |  
 |  # Arguments
 |      num_words: the maximum number of words to keep, based
 |          on word frequency. Only the most common `num_words-1` words will
 |          be kept.
 |      filters: a string where each element is a character that will be
 |          filtered from the texts. The default is all punctuation, plus
 |          tabs and line breaks, minus the `'` character.
 |      lower: boolean

In [12]:
tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='!"#$%&()*+,-./:;=?@[\\]^_`{|}~\t\n')
tokenizer.fit_on_texts(annotation_list)

In [13]:
# Recorver the word index that was created with the tokenizer
word_index = tokenizer.word_index
print('Found {} unique tokens.\n'.format(len(word_index)))
word_count = tokenizer.word_counts
print("Show the most frequent word index:")
for i, word in enumerate(sorted(word_count, key=word_count.get, reverse=True)):
    print('   {} ({}) --> {}'.format(word, word_count[word], word_index[word]))
    if i == 9: 
        print('')
        break

print("Show the least frequent word index:")
for i, word in enumerate(sorted(word_count, key=word_count.get, reverse=False)):
    print('   {} ({}) --> {}'.format(word, word_count[word], word_index[word]))
    if i == 9: 
        print('')
        break


Found 23685 unique tokens.

Show the most frequent word index:
   a (684598) --> 1
   <start> (414113) --> 2
   <end> (414113) --> 3
   on (150693) --> 4
   of (142768) --> 5
   the (137986) --> 6
   in (129002) --> 7
   with (107712) --> 8
   and (98854) --> 9
   is (68668) --> 10

Show the least frequent word index:
   ruched (1) --> 14300
   soldier's (1) --> 14301
   businessman's (1) --> 14302
   disinfecting (1) --> 14303
   clorox (1) --> 14304
   bottommost (1) --> 14305
   nashville (1) --> 14306
   tennessee (1) --> 14307
   allen (1) --> 14308
   hex (1) --> 14309



In [14]:
max_window = max(annotation_lens)
capt_train = tokenizer.texts_to_sequences(annotation_list)
capt_train = tf.keras.preprocessing.sequence.pad_sequences(capt_train,maxlen=max_window)
capt_train

array([[   0,    0,    0, ...,  191,   60,    3],
       [   0,    0,    0, ...,  492, 1076,    3],
       [   0,    0,    0, ...,    9,  438,    3],
       ...,
       [   0,    0,    0, ...,    6,  261,    3],
       [   0,    0,    0, ...,   47,  114,    3],
       [   0,    0,    0, ...,   70,  114,    3]], dtype=int32)

In [15]:
capt_mask = capt_train != 0
capt_mask

array([[False, False, False, ...,  True,  True,  True],
       [False, False, False, ...,  True,  True,  True],
       [False, False, False, ...,  True,  True,  True],
       ...,
       [False, False, False, ...,  True,  True,  True],
       [False, False, False, ...,  True,  True,  True],
       [False, False, False, ...,  True,  True,  True]])

### Images

In [15]:
def load_images(ids):
    images = []
    zero_padding = 12

    for idd in ids:

        img_id = str(idd)
        pic_name = "train2014/COCO_train2014_" + ("0"*(zero_padding - len(img_id))) + img_id + ".jpg"

        img = tf.keras.preprocessing.image.load_img(pic_name, target_size=(224, 224))
        pre_img = tf.keras.preprocessing.image.img_to_array(img)
        images.append(pre_img)
        
        
        
    np_images = np.array(images)
    del(images)
    pre_np_images = tf.keras.applications.resnet_v2.preprocess_input(np_images)
    del(np_images)
    
    return pre_np_images
    

# D1: Basic caption generation model
In this section we build a basic modern caption generation model. To generate captions out of images, we embrace the encoder-decoder architecture, which consists in encoding the image in a dense short vector (with a CNN for example) and decoding the vector as a sequence of words (using some sort of RNN, e.g. LSTMs).

...

## Encoder
We are going to use a State-Of-The-Art image classification network as our image encoder. The network in question is a CNN based residual net called ResNet.

In [16]:
resnet152 = tf.keras.applications.ResNet152V2(weights="imagenet") #load ResNet152, trained on imagenet
resnet152.summary()

Model: "resnet152v2"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 224, 224, 3) 0                                            
__________________________________________________________________________________________________
conv1_pad (ZeroPadding2D)       (None, 230, 230, 3)  0           input_1[0][0]                    
__________________________________________________________________________________________________
conv1_conv (Conv2D)             (None, 112, 112, 64) 9472        conv1_pad[0][0]                  
__________________________________________________________________________________________________
pool1_pad (ZeroPadding2D)       (None, 114, 114, 64) 0           conv1_conv[0][0]                 
________________________________________________________________________________________

In [17]:
some_images = load_images([datum["id"] for datum in data_small])
pred = resnet152.predict(some_images)
pred_classes = tf.keras.applications.resnet_v2.decode_predictions(pred,top=3)
print(pred_classes)

# To free space, trust me you'll need this
%xdel pred
%xdel some_images
%xdel pred_classes

gc.collect()

[[('n07715103', 'cauliflower', 0.6370351), ('n07714990', 'broccoli', 0.35870326), ('n07711569', 'mashed_potato', 0.0042259563)], [('n02130308', 'cheetah', 0.85301125), ('n02127052', 'lynx', 0.111899145), ('n02128385', 'leopard', 0.02228276)], [('n04522168', 'vase', 0.8673538), ('n03991062', 'pot', 0.11452658), ('n03903868', 'pedestal', 0.0147273475)], [('n02391049', 'zebra', 1.0), ('n01518878', 'ostrich', 1.2668588e-10), ('n03447447', 'gondola', 1.0958564e-10)], [('n04507155', 'umbrella', 0.9989201), ('n03710721', 'maillot', 0.0008845), ('n03710637', 'maillot', 0.0001253053)], [('n02091831', 'Saluki', 0.3834884), ('n02090622', 'borzoi', 0.29409832), ('n02088094', 'Afghan_hound', 0.1393124)], [('n01580077', 'jay', 0.4168986), ('n02484975', 'guenon', 0.089360826), ('n02488291', 'langur', 0.081555486)], [('n02708093', 'analog_clock', 0.98425645), ('n04548280', 'wall_clock', 0.015669608), ('n02794156', 'barometer', 7.205418e-05)], [('n03272562', 'electric_locomotive', 0.406674), ('n0339391

9

In [18]:
#Remove top dense layer, wrap it in a model for convenience
resnet_encoder = tf.keras.Model(inputs=resnet152.input,outputs=resnet152.layers[-2].output)
resnet_encoder.trainable = False #Freeze layers
resnet_encoder.compile(loss=tf.keras.losses.CosineSimilarity()) # Needed for the freezing to make effect

#resnet_encoder.summary()

In [19]:
#Loading all the images at once and running the network on the whole thing is not feasible for most systems
#This function loads and extracts the features of the images by batches, removing images that have already been processed
def load_image_features_batched(image_batch_size = 4000):
    image_features = []

    for batch_i in range(0,len(data),image_batch_size):

        start_i = batch_i
        end_i = start_i + image_batch_size

        if (end_i > len(data)):
            end_i = len(data)

        ids = [datum["id"] for datum in data[start_i:end_i]]

        some_images = load_images(ids)
        pred = resnet_encoder.predict(some_images)
        image_features.append(pred)

        #Delete uneeded stuff
        gc.collect()
        
    image_features_array = np.concatenate(image_features) # All but last, since last shape might be different
    return image_features_array

In [20]:
if not "features" in os.listdir():
    os.mkdir("features")
    
if not "resnet_image_embdd.npz" in os.listdir("features"):
    image_features = load_image_features_batched()
    np.savez_compressed("features/resnet_image_embdd.npz",image_features)
    %xdel image_features
    gc.collect()

In [16]:
image_features_compressed = np.load("features/resnet_image_embdd.npz")

In [17]:
image_features = image_features_compressed["arr_0"]
image_features_compressed.close()
gc.collect()

68

## Decoder

In [16]:
hidden_len = 512
embedding_len = 512

dec_in_caption = tf.keras.Input(shape=(max_window))
#dec_in_img_enc = tf.keras.Input(shape=(embedding_len))
dec_in_cropped = dec_in_caption[:,:-1]
embdd = tf.keras.layers.Embedding(len(tokenizer.word_counts) + 1,embedding_len,input_length=max_window)(dec_in_cropped)

#img_enc = tf.keras.layers.Reshape((1,dec_in_img_enc.shape[1]))(dec_in_img_enc)
#padded = tf.keras.layers.Concatenate(axis=1)([img_enc,embdd])

#hidden = [resnet_enc.output , tf.keras.layers.InputLayer(input_tensor=tf.)]

lstm  = tf.keras.layers.LSTM(units=hidden_len,return_sequences=True,unroll=False)(embdd)
#cropped = tf.keras.layers.Cropping1D(cropping=(1,0))(lstm)
dense = tf.keras.layers.TimeDistributed(tf.keras.layers.Dense(units=len(tokenizer.word_counts),activation=tf.keras.activations.softmax))(lstm)

In [17]:
image_embedding_decoder = tf.keras.Model(inputs=[dec_in_caption],outputs=dense)
image_embedding_decoder.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 51)]              0         
_________________________________________________________________
tf_op_layer_strided_slice (T [(None, 50)]              0         
_________________________________________________________________
embedding (Embedding)        (None, 50, 512)           12127232  
_________________________________________________________________
lstm (LSTM)                  (None, 50, 512)           2099200   
_________________________________________________________________
time_distributed (TimeDistri (None, 50, 23685)         12150405  
Total params: 26,376,837
Trainable params: 26,376,837
Non-trainable params: 0
_________________________________________________________________


In [27]:
def masked_cropped_sparse_categorical_crossentropy(mask_layer):
    
    def cropped_sparse_categorical_crossentropy(y_true,y_pred):
        y_true_cropped = y_true[:,1:]
        return tf.keras.losses.SparseCategoricalCrossentropy()(y_true_cropped,y_pred,sample_weight=mask_layer)
    
    return cropped_sparse_categorical_crossentropy

In [25]:
image_embedding_decoder.compile(loss=cropped_sparse_categorical_crossentropy, optimizer=tf.keras.optimizers.SGD())

In [26]:
image_embedding_decoder.fit([capt_train],[capt_train], batch_size=128, epochs=20)

Train on 414113 samples
Epoch 1/20
 18560/414113 [>.............................] - ETA: 11:15 - loss: 2.3019

KeyboardInterrupt: 

In [30]:
gc.collect()

769

In [35]:
help(tf.keras.preprocessing.text.Tokenizer)

Help on class Tokenizer in module keras_preprocessing.text:

class Tokenizer(builtins.object)
 |  Tokenizer(num_words=None, filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n', lower=True, split=' ', char_level=False, oov_token=None, document_count=0, **kwargs)
 |  
 |  Text tokenization utility class.
 |  
 |  This class allows to vectorize a text corpus, by turning each
 |  text into either a sequence of integers (each integer being the index
 |  of a token in a dictionary) or into a vector where the coefficient
 |  for each token could be binary, based on word count, based on tf-idf...
 |  
 |  # Arguments
 |      num_words: the maximum number of words to keep, based
 |          on word frequency. Only the most common `num_words-1` words will
 |          be kept.
 |      filters: a string where each element is a character that will be
 |          filtered from the texts. The default is all punctuation, plus
 |          tabs and line breaks, minus the `'` character.
 |      lower: boolean

In [39]:
print(tokenizer.index_word)

