## Requirement Setup
In this section the requirements for running the notebook are leveraged

In [1]:
#!nvidia-smi

In [2]:
#!pip install tensorflow-gpu==2.0

In [1]:
import tensorflow as tf
import numpy as np

import json

import os
#To free memory
import gc


In [None]:
print(tf.__version__)
print(tf.config.experimental.list_physical_devices())
print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))

#tf.debugging.set_log_device_placement(True)

2.0.0
[PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU'), PhysicalDevice(name='/physical_device:XLA_CPU:0', device_type='XLA_CPU'), PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU'), PhysicalDevice(name='/physical_device:XLA_GPU:0', device_type='XLA_GPU')]
Num GPUs Available:  1


## Data manipulation
### Data Obtention

In [3]:
if not "annotations" in os.listdir():
    if not "annotations_trainval2014.zip" in os.listdir():
        !wget "http://images.cocodataset.org/annotations/annotations_trainval2014.zip"
    !unzip annotations_trainval2014.zip

In [4]:
if not "val2014" in os.listdir():
    if not "val2014.zip" in os.listdir():
        !wget "http://images.cocodataset.org/zips/val2014.zip"
    !unzip val2014.zip

In [5]:
def load_annotation_data(annotation_filename):
    annotations_file = open(annotation_filename)
    annotations_js = json.load(annotations_file)
    
    annotation_list = [(j["image_id"],j["caption"]) for j in annotations_js["annotations"]]
    annotation_list.sort()

    data = [{"id" : annotation_list[0][0] , "captions" : [annotation_list[0][1]]}]

    for img_id , capt in annotation_list[1:]:

        if data[-1]["id"] != img_id:
            data.append({"id" : img_id , "captions" : []})

        data[-1]["captions"].append(capt)

    return data

In [6]:
annotations_data_train = load_annotation_data("annotations/captions_train2014.json")
annotations_data_val = load_annotation_data("annotations/captions_val2014.json")

In [7]:
print("Data as of the files:")
print()

print("Image count train: ",len(annotations_data_train))
print("Image count val: ",len(annotations_data_val))

Data as of the files:

Image count train:  82783
Image count val:  40504


In [8]:
val_size = 5000
test_size = 10000

annotations_data_test = annotations_data_val[:test_size]
annotations_data_val = annotations_data_val[test_size:]
annotations_data_train += annotations_data_val[val_size:]
annotations_data_val = annotations_data_val[:val_size]

In [9]:
print("Redistributed data:")
print()

print("Image count train: ",len(annotations_data_train))
print("Image count val: ",len(annotations_data_val))
print("Image count test: ",len(annotations_data_test))

Redistributed data:

Image count train:  108287
Image count val:  5000
Image count test:  10000


In [10]:
annotations_data_train

[{'id': 9,
  'captions': ['A bunch of trays that have different food.',
   'A meal is presented in brightly colored plastic trays.',
   'Closeup of bins of food that include broccoli and bread.',
   'Colorful dishes holding meat, vegetables, fruit, and bread.',
   'there are containers filled with different kinds of foods']},
 {'id': 25,
  'captions': ['A giraffe eating food from the top of the tree.',
   'A giraffe mother with its baby in the forest.',
   'A giraffe standing next to a forest filled with trees.',
   'A giraffe standing up nearby a tree ',
   'Two giraffes standing in a tree filled area.']},
 {'id': 30,
  'captions': ['A flower vase is sitting on a porch stand.',
   'A vase with red and white flowers outside on a sunny day.',
   'A white vase filled with different colored flowers.',
   'White vase with different colored flowers sitting inside of it. ',
   'a white vase with many flowers on a stage']},
 {'id': 34,
  'captions': ['A lone zebra grazing in some green grass.

### Data Preprocessing

#### Text: Captions

In [11]:
annotation_list_train = []
annotation_list_val = []
annotation_list_test = []

for data in annotations_data_train:
    annotation_list_train += data["captions"]
    
annotation_list_train = ["<start> " + annotation + " <end>" for annotation in annotation_list_train]

for data in annotations_data_val:
    annotation_list_val += data["captions"]
    
annotation_list_val = ["<start> " + annotation + " <end>" for annotation in annotation_list_val]
    
for data in annotations_data_test:
    annotation_list_test += data["captions"]

annotation_list_test = ["<start> " + annotation + " <end>" for annotation in annotation_list_test]

In [12]:
print("Caption count train: ",len(annotation_list_train))
print("Caption count val: ",len(annotation_list_val))
print("Caption count test: ",len(annotation_list_test))

Caption count train:  541714
Caption count val:  25021
Caption count test:  50032


In [13]:
annotation_list_train

['<start> A bunch of trays that have different food. <end>',
 '<start> A meal is presented in brightly colored plastic trays. <end>',
 '<start> Closeup of bins of food that include broccoli and bread. <end>',
 '<start> Colorful dishes holding meat, vegetables, fruit, and bread. <end>',
 '<start> there are containers filled with different kinds of foods <end>',
 '<start> A giraffe eating food from the top of the tree. <end>',
 '<start> A giraffe mother with its baby in the forest. <end>',
 '<start> A giraffe standing next to a forest filled with trees. <end>',
 '<start> A giraffe standing up nearby a tree  <end>',
 '<start> Two giraffes standing in a tree filled area. <end>',
 '<start> A flower vase is sitting on a porch stand. <end>',
 '<start> A vase with red and white flowers outside on a sunny day. <end>',
 '<start> A white vase filled with different colored flowers. <end>',
 '<start> White vase with different colored flowers sitting inside of it.  <end>',
 '<start> a white vase wit

In [None]:
tokenizer = tf.keras.preprocessing.text.Tokenizer(oov_token="<unk>", filters='!"#$%&()*+,-./:;=?@[\\]^_`{|}~\t\n')
tokenizer.fit_on_texts(annotation_list_train)

In [None]:
# Recorver the word index that was created with the tokenizer
word_index = tokenizer.word_index
word_count = tokenizer.word_counts

sorted_wc = sorted(word_count, key=word_count.get)

print('Found {} unique tokens.\n'.format(len(word_index)))

print("Show the most frequent word index:")
for i, word in enumerate(sorted_wc[::-1]):
    wi = word_index[word]
    wc = word_count[word]
    print('   {} ({}) --> {}'.format(word, wc, wi))
    if i == 20: 
        print('')
        break

print("Show the least frequent word index:")
for i, word in enumerate(sorted_wc):
    wi = word_index[word]
    wc = word_count[word]
    print('   {} ({}) --> {}'.format(word, wc, wi))
    if i == 20: 
        print('')
        break


In [18]:
#Trim by number of appearances
trimming_point = 1

trimmed_count = 0
trimmed_words = 0
for word in sorted_wc:
    wc = word_count[word]
    trimmed_count += wc
    trimmed_words +=1
    
    if wc > trimming_point:
        break
trimmed_words_appear_less = trimmed_words
trimmed_words_appear_less

10586

In [19]:
#Trim by appearance distribution proportion
trim_proportion = 0.01

total_wc = sum(tokenizer.word_counts.values())

to_trim = trim_proportion*total_wc
trimmed_count = 0
trimmed_words = 0

for word in sorted_wc:
    wc = word_count[word]
    trimmed_count += wc
    trimmed_words +=1
    
    if trimmed_count > to_trim:
        break
    
trimmed_words_proportional = trimmed_words
trimmed_words_proportional

21130

In [20]:
trimmed_words = trimmed_words_proportional

In [21]:
vocab_size = len(word_index) - trimmed_words
tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=vocab_size,oov_token="<unk>", filters='!"#$%&()*+,-./:;=?@[\\]^_`{|}~\t\n')
tokenizer.fit_on_texts(annotation_list_train)

In [22]:
embedding_count = vocab_size+1

In [23]:
annotation_lens_train = [ len(ann.split()) for ann in annotation_list_train]
max_window = max(annotation_lens_train)
max_window

52

In [24]:
capt_train = tokenizer.texts_to_sequences(annotation_list_train)
capt_train = tf.keras.preprocessing.sequence.pad_sequences(capt_train,maxlen=max_window)
capt_train

array([[   0,    0,    0, ...,  192,   62,    4],
       [   0,    0,    0, ...,  505, 1080,    4],
       [   0,    0,    0, ...,   10,  437,    4],
       ...,
       [   0,    0,    0, ...,   66,   36,    4],
       [   0,    0,    0, ...,   66,   36,    4],
       [   0,    0,    0, ...,  214,   36,    4]], dtype=int32)

In [25]:
capt_train2 = capt_train[:,1:]
capt_train2.shape

(541714, 51)

In [26]:
capt_val = tokenizer.texts_to_sequences(annotation_list_val)
capt_val = tf.keras.preprocessing.sequence.pad_sequences(capt_val,maxlen=max_window)
capt_val

array([[   0,    0,    0, ...,   61, 1717,    4],
       [   0,    0,    0, ...,    2,   25,    4],
       [   0,    0,    0, ...,   10,  209,    4],
       ...,
       [   0,    0,    0, ...,    2,  254,    4],
       [   0,    0,    0, ..., 4069,  254,    4],
       [   0,    0,    0, ...,    2,  951,    4]], dtype=int32)

In [27]:
capt_val2 = capt_val[:,1:]
capt_val2.shape

(25021, 51)

In [28]:
capt_test = tokenizer.texts_to_sequences(annotation_list_test)
capt_test = tf.keras.preprocessing.sequence.pad_sequences(capt_test,maxlen=max_window)
capt_test

array([[   0,    0,    0, ...,    7,  635,    4],
       [   0,    0,    0, ...,    7,  635,    4],
       [   0,    0,    0, ...,   10, 3093,    4],
       ...,
       [   0,    0,    0, ...,  211,   36,    4],
       [   0,    0,    0, ...,    7,  138,    4],
       [   0,    0,    0, ...,    2, 3047,    4]], dtype=int32)

In [29]:
capt_test2 = capt_test[:,1:]
capt_test2.shape

(50032, 51)

#### Images

In [7]:
def load_images(ids,folder):
    images = []
    zero_padding = 12

    for idd in ids:

        img_id = str(idd)
        pic_name = "{}/COCO_{}_".format(folder,folder) + ("0"*(zero_padding - len(img_id))) + img_id + ".jpg"

        img = tf.keras.preprocessing.image.load_img(pic_name, target_size=(299, 299))
        pre_img = tf.keras.preprocessing.image.img_to_array(img)
        images.append(pre_img)

        
        
    np_images = np.array(images)
    del(images)
    pre_np_images = tf.keras.applications.resnet_v2.preprocess_input(np_images)
    del(np_images)
    
    return pre_np_images
    

# D1: Basic caption generation model
In this section we build a basic modern caption generation model. To generate captions out of images, we embrace the encoder-decoder architecture, which consists in encoding the image in a dense short vector (with a CNN for example) and decoding the vector as a sequence of words (using some sort of RNN, e.g. LSTMs).

...

## Encoder
We are going to use a State-Of-The-Art image classification network as our image encoder. The network in question is a CNN based residual net called ResNet.

In [8]:
inceptionV3 = tf.keras.applications.InceptionV3(weights="imagenet") #load inceptionV3, trained on imagenet
inceptionV3.summary()

Model: "inception_v3"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 299, 299, 3) 0                                            
__________________________________________________________________________________________________
conv2d (Conv2D)                 (None, 149, 149, 32) 864         input_1[0][0]                    
__________________________________________________________________________________________________
batch_normalization (BatchNorma (None, 149, 149, 32) 96          conv2d[0][0]                     
__________________________________________________________________________________________________
activation (Activation)         (None, 149, 149, 32) 0           batch_normalization[0][0]        
_______________________________________________________________________________________

In [9]:
#Remove top dense layer, wrap it in a model for convenience
inception_encoder = tf.keras.Model(inputs=inceptionV3.input,outputs=inceptionV3.layers[-2].output)
inception_encoder.trainable = False #Freeze layers
inception_encoder.compile(loss=tf.keras.losses.CosineSimilarity()) # Needed for the freezing to make effect


In [10]:
#Loading all the images at once and running the network on the whole thing is not feasible for most systems
#This function loads and extracts the features of the images by batches, removing images that have already been processed
def load_image_features_batched(data,folder,image_batch_size = 2000,verbose=0):
    image_features = []

    for batch_i in range(0,len(data),image_batch_size):

        if verbose:
        
        start_i = batch_i
        end_i = start_i + image_batch_size

        if (end_i > len(data)):
            end_i = len(data)

        ids = [datum["id"] for datum in data[start_i:end_i]]

        some_images = load_images(ids,folder)
        pred = inception_encoder.predict(some_images)
        image_features.append(pred)

        #Delete uneeded stuff
        del(some_images)
        gc.collect()
        
    image_features_array = np.concatenate(image_features) # All but last, since last shape might be different
    return image_features_array

In [11]:
if not "features" in os.listdir():
    os.mkdir("features")
    
if not "image_embdd_train.npz" in os.listdir("features"):
    image_features_train = load_image_features_batched(annotations_data_train, "train2014")
    np.savez_compressed("features/image_embdd_train.npz",image_features_train)
    %xdel image_features_train
    gc.collect()
    
if not "image_embdd_val.npz" in os.listdir("features"):
    image_features_val = load_image_features_batched(
        annotations_data_val, "val2014")
    np.savez_compressed("features/image_embdd_val.npz",image_features_val)
    %xdel image_features_val
    gc.collect()



NameError: name 'annotations_data_test' is not defined

In [12]:
image_features_comp_train = np.load("features/image_embdd_train.npz")
image_features_comp_val = np.load("features/image_embdd_val.npz")

In [13]:
image_features_train = image_features_comp_train["arr_0"]
image_features_val = image_features_comp_val["arr_0"]
image_features_comp_train.close()
image_features_comp_val.close()
gc.collect()

56

In [8]:
repeats = [len(datum["captions"]) for datum in annotations_data_train]
image_features_train_rep = np.repeat(image_features_train, repeats, axis=0)
repeats = [len(datum["captions"]) for datum in annotations_data_val]
image_features_val_rep = np.repeat(image_features_val, repeats, axis=0)
repeats = [len(datum["captions"]) for datum in annotations_data_test]
image_features_test_rep = np.repeat(image_features_test, repeats, axis=0)

NameError: name 'annotations_data_train' is not defined

In [37]:
class LanguageModel(tf.keras.Model):
    
    def __init__(self, embedding_layer, recurrent_layer, top_layer):
        super(LanguageModel,self).__init__()
        self.embedding_layer = embedding_layer
        self.recurrent_layer = recurrent_layer
        self.top_layer = top_layer
        
    def call(self,captions):
        
        cropped_captions = captions[:,:-1]
        embeddings = self.embedding_layer(cropped_captions)
        hiddens = self.recurrent_layer(embeddings)
        output = self.top_layer(hiddens)
        
        return output

In [38]:
class EmbeddingLayer(tf.keras.layers.Layer):
        
        def __init__(self,max_window,embedding_count,embedding_len):
            super(EmbeddingLayer,self).__init__()
            self.embedding_layer = tf.keras.layers.Embedding(embedding_count,embedding_len,input_length=max_window,mask_zero=True)
            
        def call(self,x):
            return self.embedding_layer(x)
        
class LSTMLayer(tf.keras.layers.Layer):
        
        def __init__(self,hidden_len):
            super(LSTMLayer,self).__init__()
            self.lstm_layer = tf.keras.layers.LSTM(units=hidden_len,return_sequences=True)
            
        def call(self,x):
            return self.lstm_layer(x)
        
class MultilayerSoftmax(tf.keras.layers.Layer):
    
    def __init__(self,sizes):
            super(MultilayerSoftmax,self).__init__()
            
            self.hidden_layers = []
            self.softmax_layer = self.distributed_softmax = tf.keras.layers.TimeDistributed(tf.keras.layers.Dense(units=sizes[-1],activation=tf.keras.activations.softmax))
            
            for hidden in sizes[:-1]:
                self.hidden_layers += [tf.keras.layers.TimeDistributed(tf.keras.layers.Dense(units=hidden,activation=tf.keras.activations.relu))]
            
        
    def call(self,x):
        
        for hidden_layer in self.hidden_layers:
            x = hidden_layer(x)
        
        x = self.softmax_layer(x)
        
        return x
        
        

In [39]:
embedding_layers = []
embedding_layers += [{"embedding_len":embdd} for embdd in [128,256,512]]

recurrent_layers = []
recurrent_layers += [{"model":"lstm" , "hidden_size": hidden} for hidden in [512, 1024, 2048, 4096]]

top_layers = []
top_layers += [{"model":"multiSoftmax" , "sizes": sizes} for sizes in 
               [[embedding_count], [embedding_count//2,embedding_count], [embedding_count//4,embedding_count//2,embedding_count]]]
top_layers

[{'model': 'multiSoftmax', 'sizes': [5433]},
 {'model': 'multiSoftmax', 'sizes': [2716, 5433]},
 {'model': 'multiSoftmax', 'sizes': [1358, 2716, 5433]}]

In [40]:
def cross_lists(listOfLists):
    
    if len(listOfLists) == 1:
        return [[lst] for lst in listOfLists[0]]
    
    crossed = cross_lists(listOfLists[1:])
    result = []
    
    for i in listOfLists[0]:
        for j in crossed:
            result.append([i]+j)
    
    return result
        

In [41]:
model_list = cross_lists([embedding_layers,recurrent_layers,top_layers])
len(model_list)

36

In [42]:
models = [{"embedding_layer":model[0],"recurrent_layer":model[1],"top_layer":model[2]} for model in model_list]
models

[{'embedding_layer': {'embedding_len': 128},
  'recurrent_layer': {'model': 'lstm', 'hidden_size': 512},
  'top_layer': {'model': 'multiSoftmax', 'sizes': [5433]}},
 {'embedding_layer': {'embedding_len': 128},
  'recurrent_layer': {'model': 'lstm', 'hidden_size': 512},
  'top_layer': {'model': 'multiSoftmax', 'sizes': [2716, 5433]}},
 {'embedding_layer': {'embedding_len': 128},
  'recurrent_layer': {'model': 'lstm', 'hidden_size': 512},
  'top_layer': {'model': 'multiSoftmax', 'sizes': [1358, 2716, 5433]}},
 {'embedding_layer': {'embedding_len': 128},
  'recurrent_layer': {'model': 'lstm', 'hidden_size': 1024},
  'top_layer': {'model': 'multiSoftmax', 'sizes': [5433]}},
 {'embedding_layer': {'embedding_len': 128},
  'recurrent_layer': {'model': 'lstm', 'hidden_size': 1024},
  'top_layer': {'model': 'multiSoftmax', 'sizes': [2716, 5433]}},
 {'embedding_layer': {'embedding_len': 128},
  'recurrent_layer': {'model': 'lstm', 'hidden_size': 1024},
  'top_layer': {'model': 'multiSoftmax', 's

In [43]:
def generate_model_name(model):
    
    embedding_layer = model["embedding_layer"]
    recurrent_layer = model["recurrent_layer"]
    top_layer = model["top_layer"]
    
    string = recurrent_layer["model"] + str(recurrent_layer["hidden_size"])
    string += "_embedd" + str(embedding_layer["embedding_len"])
    string += "_" + top_layer["model"] + "-".join([ str(hidd) for hidd in top_layer["sizes"]])
    
    return string

In [44]:
def build_embedding_layer(embedding_layer):
    return EmbeddingLayer(max_window=max_window,embedding_count=embedding_count,embedding_len=embedding_layer["embedding_len"])

def build_recurrent_layer(recurrent_layer):
    
    if (recurrent_layer["model"] == "lstm"):
        return LSTMLayer(hidden_len=recurrent_layer["hidden_size"])
    
def build_top_layer(top_layer):
    
    if (top_layer["model"] == "multiSoftmax"):
        return MultilayerSoftmax(sizes=top_layer["sizes"])

def build_model(model_desc):
    
    embedding_layer = model_desc["embedding_layer"]
    recurrent_layer = model_desc["recurrent_layer"]
    top_layer = model_desc["top_layer"]
    
    embedding = build_embedding_layer(embedding_layer)
    recurrent = build_recurrent_layer(recurrent_layer)
    top = build_top_layer(top_layer)
    
    model = LanguageModel(embedding,recurrent,top)
    model.build(input_shape=(None,max_window))
    return model
    

In [45]:
def train_model(model):

    early_stopping = tf.keras.callbacks.EarlyStopping(monitor="val_loss",patience=2)
    
    model.compile(loss=tf.keras.losses.SparseCategoricalCrossentropy(), optimizer = tf.keras.optimizers.Adam())
    history = model.fit(capt_train,capt_train2, validation_data=[capt_val,capt_val2],epochs=20,batch_size=64,callbacks = [early_stopping])
    
    return history

In [None]:
if not "models" in os.listdir():
    os.mkdir("models")
    
for model in models:
    model_name = generate_model_name(model) + ".h5"
    
    if not model_name in os.listdir("models"):
        built_model = build_model(model)
        
        print("Training model: ",model_name)
        
        train_model(built_model)
        built_model.save_weights("models/"+model_name)
        
        del(built_model)
        gc.collect()
        

Training model:  lstm512_embedd128_multiSoftmax5433.h5
Train on 541714 samples, validate on 25021 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20

Best: 

In [None]:
for embedding_size in [128, 256, 512]:
    embedding = EmbeddingLayer(max_window=max_window,embedding_count=embedding_count,embedding_len=embedding_size)
    recurrent = LSTMLayer(hidden_len=#!!!!)
    softmax = DenseSoftmax(embedding_count=embedding_count)
    image_embedding_decoder = LanguageModel(embedding,recurrent,softmax)
    image_embedding_decoder.build(input_shape=(None,max_window))
    image_embedding_decoder.summary()
    image_embedding_decoder.fit(capt_train,capt_train2, batch_size=64, epochs=10 ,validation_data=[capt_val,capt_val2],callbacks=[early_stopping])

In [None]:
# Byte-pair encoding

In [None]:
embedding = EmbeddingLayer(max_window=max_window,embedding_count=embedding_count,embedding_len=512)
recurrent = LSTMLayer(hidden_len=512)
softmax = DenseSoftmax(embedding_count=embedding_count)



In [None]:
image_embedding_decoder = LanguageModel(embedding,recurrent,softmax)
image_embedding_decoder.build(input_shape=(None,max_window))
image_embedding_decoder.summary()

In [None]:
loss = tf.keras.losses.SparseCategoricalCrossentropy()
optimizer = tf.keras.optimizers.Adam()

image_embedding_decoder.compile(loss=loss, optimizer=optimizer)

In [None]:
early_stopping = tf.keras.callbacks.EarlyStopping(monitor="val_loss",patience=3)

In [None]:
image_embedding_decoder.fit(capt_train,capt_train2, batch_size=64, epochs=10 ,validation_data=[capt_val,capt_val2],callbacks=[early_stopping])

In [None]:
image_embedding_decoder.save('model.h5')

In [None]:
image_embedding_decoder = tf.keras.models.load_model('model.h5')
#    custom_objects={'cropped_sparse_categorical_crossentropy': 
                    #cropped_sparse_categorical_crossentropy})

In [None]:
tf.keras.backend.clear_session()

In [None]:
#reverse_word_map = dict(map(reversed, tokenizer.word_index.items()))

In [None]:
capt_train[0]

In [None]:
sent_1 = image_embedding_decoder.predict(capt_train[0].reshape(1, -1))[0]

In [None]:
capt_train[0].shape

In [None]:
[(tokenizer.index_word[position1.argmax()], tokenizer.index_word[position2]) for position1, position2 in zip(sent_1[-9:], capt_train[0,-9:])]

In [None]:
for sentence in annotation_list_train[:10]:
    sentence = ' '.join(sentence.split()[:6])
    for _ in range(max_window):
        sent_tokenized = tokenizer.texts_to_sequences([sentence + " <end>" ])
        sent_padded = tf.keras.preprocessing.sequence.pad_sequences(
            sent_tokenized, maxlen=max_window)
        pred = image_embedding_decoder.predict(sent_padded)
        index = pred[0, -1].argmax()
        word = tokenizer.index_word[index]
        if word == "<end>":
            break
        sentence += " "+word
    print(sentence)

In [None]:
pred.shape

In [None]:
pred[0, -1].argmax()


In [None]:
reverse_word_map.get(4)

In [None]:
help(tf.keras.preprocessing.text.Tokenizer)

In [None]:
print(tokenizer.index_word)

## Decoder

In [None]:
class Decoder(tf.keras.Model):
    
    def __init__(self, dense, embedding_layer, recurrent_layer, top_layer):
        super(Decoder, self).__init__()
        self.dense = dense
        self.embedding_layer = embedding_layer
        self.recurrent_layer = recurrent_layer
        self.top_layer = top_layer
        
    def call(self, model_input):
        img_features = model_input[0]
        captions = model_input[1]
        img_embedding = self.dense(img_features)
        img_embedding = tf.expand_dims(img_embedding, 1)
        cropped_captions = captions[:, :-1]
        embeddings = self.embedding_layer(cropped_captions)
        rnn_input = tf.concat([img_embedding, embeddings], 1)
        hiddens = self.recurrent_layer(rnn_input)
        output = self.top_layer(hiddens)
        
        return output[:, 1:]

In [None]:
dense = tf.keras.layers.Dense(512)

embedding = tf.keras.layers.Embedding(embedding_count,512,input_length=max_window,mask_zero=True)
#embedding = EmbeddingLayer(max_window=max_window, 
#    embedding_count=embedding_count, embedding_len=512)
recurrent = tf.keras.layers.LSTM(units=512,return_sequences=True,unroll=False)
#recurrent = LSTMLayer(hidden_len=512)
softmax = tf.keras.layers.Dense(
    embedding_count, activation=tf.keras.activations.softmax)

In [None]:
decoder = Decoder(dense, embedding, recurrent, softmax)
#decoder.build(input_shape=(None,max_window))
#decoder.summary()

In [None]:
decoder.compile(loss=loss, optimizer=optimizer, sample_weight_mode="temporal")

In [None]:
start_index = word_index['<start>']
train_mask = np.logical_not(np.logical_or(
        capt_train2 == 0, capt_train2 == start_index))
val_mask = np.logical_not(np.logical_or(
        capt_val2 == 0, capt_val2 == start_index))

In [None]:
capt_train3 = np.expand_dims(capt_train2, -1)
capt_val3 = np.expand_dims(capt_val2, -1)

In [None]:
decoder.fit([image_features_train_rep, capt_train], 
            capt_train3, 
            batch_size=64, 
            epochs=50,
            #callbacks=[early_stopping], 
            sample_weight=train_mask)

In [None]:
tokenizer.index_word[29]

In [None]:
# random.randrange(len(image_features_train))
for index in range(0, 200, 5):
    image = image_features_train_rep[index]
    annotation = annotation_list_train[index]
    print("Train: ", annotation)
    #print(image[:5])
    sentence = '<start> A bunch of trays that'
    for _ in range(max_window):
        sent_tokenized = tokenizer.texts_to_sequences([sentence + " <end>"])
        sent_padded = tf.keras.preprocessing.sequence.pad_sequences(
            sent_tokenized, maxlen=max_window)
        pred = decoder.predict([np.expand_dims(image, 0), sent_padded])
        index = pred[0, -1].argmax()
        denak = pred[0, :].argmax(-1)
        #print(denak)
        word = tokenizer.index_word[index]
        if word == "<end>":
            break
        sentence += " "+word
    print("Pred:  ", sentence)