In [None]:
## Try efficientnet in addition to InceptionV3 of the original example.
# Install cell: pins a TensorFlow release candidate and adds the `efficientnet` package.
!pip install -q tensorflow==2.2-rc4 # fix TPU memory issue
# NOTE(review): `efficientnet` is installed without a version pin.
!pip install -q efficientnet

# Vocabulary-size cap; read further down as `top_k` when the Keras tokenizer is built.
N_VOCABS = 20000 # all vocabs of flickr30k is around 18k, so we choose them all -- if training loss does not work well, change to 5K

In [None]:
USE_PREVIOUS_SAVE = True

In [None]:
import tensorflow as tf

# You'll generate plots of attention in order to see which parts of an image
# our model focuses on during captioning
import matplotlib.pyplot as plt

# Scikit-learn includes many helpful utilities
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

import re
import numpy as np
import os
import time
import json
import gc
from glob import glob
from PIL import Image
import pickle
import pandas as pd

In [None]:
from kaggle_datasets import KaggleDatasets
import efficientnet.tfkeras as efn 
from tokenizers import ByteLevelBPETokenizer

In [None]:
# Pick the distribution strategy: a TPU strategy when a TPU cluster can be
# resolved, otherwise TensorFlow's default strategy.
try:
    # TPU detection. No parameters necessary if TPU_NAME environment variable is
    # set: this is always the case on Kaggle.
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    print('Running on TPU ', tpu.master())
except ValueError:
    tpu = None

if not tpu:
    # Default distribution strategy in Tensorflow. Works on CPU and single GPU.
    strategy = tf.distribute.get_strategy()
else:
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)

print("REPLICAS: ", strategy.num_replicas_in_sync)

In [None]:
# Local (Kaggle input mount) locations of the Flickr30k dataset.
LOCAL_FLICKR_PATH = '/kaggle/input/flickr-image-dataset/flickr30k_images/'
# Caption file; it is read with '|' as the delimiter further down.
annotation_file = LOCAL_FLICKR_PATH + 'results.csv'
# The image files live one directory deeper, under a folder of the same name.
LOCAL_IMG_PATH = LOCAL_FLICKR_PATH + 'flickr30k_images/'

# Sanity check: count the entries in the image folder.
!ls {LOCAL_IMG_PATH} | wc

In [None]:
%%time
## This step will take around 25 minutes offline ...
# GCS_DS_PATH is only assigned when exactly 8 replicas are in sync; on any
# other replica count this cell defines nothing.
if strategy.num_replicas_in_sync == 8:
#     GCS_DS_PATH_FLICKR = KaggleDatasets().get_gcs_path('flickr8k-sau') # 2gb # 5 mins
    GCS_DS_PATH = KaggleDatasets().get_gcs_path('flickr-image-dataset') # 8gb # 20-25 mins
    print('yeah')

In [None]:
# Choose where images are read from: the GCS bucket when running on 8 replicas,
# the local input mount otherwise. Defines FLICKR_PATH and IMG_PATH either way.
if strategy.num_replicas_in_sync == 8:
    # print(GCS_DS_PATH_FLICKR)
    # !gsutil ls $GCS_DS_PATH_FLICKR

    print(GCS_DS_PATH)
    !gsutil ls $GCS_DS_PATH
    
    # Same directory layout as the local copy, rooted at the bucket path.
    FLICKR_PATH = GCS_DS_PATH + '/flickr30k_images/'
    IMG_PATH = FLICKR_PATH + 'flickr30k_images/'
    # less than 10sec
    !gsutil ls {IMG_PATH} | wc
else: 
    FLICKR_PATH = LOCAL_FLICKR_PATH
    IMG_PATH = LOCAL_IMG_PATH

In [None]:
# convert the loaded descriptions into a vocabulary of words
def to_vocabulary(descriptions):
    """Return the set of distinct whitespace-separated tokens in all descriptions.

    Args:
        descriptions: mapping whose values are iterables of caption strings.

    Returns:
        A set containing every token produced by ``str.split()`` on each caption.
        An empty mapping yields an empty set.
    """
    vocab = set()
    # Explicit loops instead of a list comprehension run only for its side effect.
    for captions in descriptions.values():
        for caption in captions:
            vocab.update(caption.split())
    return vocab

# summarize vocabulary
# NOTE(review): `descriptions` is not assigned anywhere in the visible notebook
# before this point, so on a fresh kernel this call looks like it would raise
# NameError -- confirm whether this cell is a leftover and can be removed.
vocabulary = to_vocabulary(descriptions)
print('Vocabulary Size: %d' % len(vocabulary))

In [None]:
# Load the caption table; the file is '|'-delimited rather than comma-delimited.
df = pd.read_csv(annotation_file, delimiter='|') # a trick learned from other kernel
print(df.shape)
# The third column name carries a leading space (' comment'); this check prints
# the name and whether it matches that exact spelling.
print(df.columns[2], df.columns[2] == ' comment')
# Bare expression: not the last line of the cell, so it displays nothing.
df[' comment'].values[0]
df.head(6)

In [None]:
from tqdm import tqdm, tqdm_notebook
# Registers `progress_apply` on pandas objects (used when wrapping captions).
tqdm.pandas()
# Sentinel tokens added around every caption; the spaces keep them as
# separate whitespace-delimited words.
START_TOKEN = '<start> '
END_TOKEN = ' <end>'

# NOTE(review): the name `tokenizer` is rebound to a Keras Tokenizer further
# down, so this byte-level BPE instance appears unused afterwards -- confirm.
tokenizer = ByteLevelBPETokenizer(lowercase=True)
tokenizer

In [None]:
def add_start_end(text):
    """Wrap ``text`` (coerced to ``str``) between START_TOKEN and END_TOKEN."""
    return ''.join((START_TOKEN, str(text), END_TOKEN))

df['comment'] = df[' comment'].progress_apply(add_start_end)
df.comment.values[:6]

In [None]:
## Don't need to do all_captions_dict anymore thanks to xhlulu "how to use Dataset" instead of DataGen
## https://www.kaggle.com/xhlulu/plant-pathology-very-concise-tpu-efficientnet
## If preparing captions_dict this will take 13 minutes!!!

# all_captions_dict = {} # for data generator : dict of list of all captions
# img_name_list = [] # only image name, maybe for easier future reference

# One full image path (including the gs/local prefix) per caption row, in row order.
image_names = df.image_name.values
full_img_name_list = [
    IMG_PATH + image_names[row]
    for row in tqdm_notebook(range(len(df)))
]
                        
#     captions = df[df['image_name']==name].comment.values
#     all_captions_dict[name] = captions

# len(all_captions_dict), len(full_img_name_list)

In [None]:
all_captions_list = list(df.comment.values)
print(len(all_captions_list), all_captions_list[:5])
print(full_img_name_list[:3])

In [None]:
import gc
gc.collect()

In [None]:
from nltk.stem import PorterStemmer
ps = PorterStemmer()
from nltk.stem.lancaster import LancasterStemmer
lc = LancasterStemmer()
from nltk.stem import SnowballStemmer
sb = SnowballStemmer("english")

from gensim.models import KeyedVectors
import gensim
def build_matrix(word_index, embedding_index, vec_dim):
    """Build an embedding matrix whose rows line up with ``word_index``.

    Every row starts as a draw from a normal distribution with the fixed
    mean/std below. For each word, the first of these spellings found in
    ``embedding_index`` overwrites the row: the word itself, lower-case,
    upper-case, capitalized, then its Porter, Lancaster and Snowball stems.

    Args:
        word_index: mapping of word -> integer row index.
        embedding_index: object supporting ``candidate in obj`` and
            ``obj[candidate]`` lookups by word.
        vec_dim: width of each embedding vector.

    Returns:
        Array of shape ``(len(word_index) + 1, vec_dim)``. The number of words
        with no match is printed, not returned.
    """
    emb_mean, emb_std = -0.0033470048, 0.109855264
    embedding_matrix = np.random.normal(emb_mean, emb_std, (len(word_index) + 1,vec_dim))
#     embedding_matrix = np.zeros((len(word_index) + 1, vec_dim))

    # Candidates are produced lazily, in the same priority order as before, so
    # the (comparatively slow) stemmers only run when the cheaper case variants
    # have all missed.
    candidate_makers = (
        lambda w: w,
        lambda w: w.lower(),
        lambda w: w.upper(),
        lambda w: w.capitalize(),
        ps.stem,
        lc.stem,
        sb.stem,
    )

    num_unk = 0
    for word, i in word_index.items():
        for make_candidate in candidate_makers:
            candidate = make_candidate(word)
            if candidate in embedding_index:
                embedding_matrix[i] = embedding_index[candidate]
                break
        else:
            # No spelling matched: the row keeps its random initialization.
            num_unk += 1

    print('number of unknown words is ', num_unk)
    return embedding_matrix

In [None]:
%%time
# Pre-converted gensim embedding files; only the second (GloVe) one is loaded here.
EMBEDDING_FILES = [
    '../input/gensim-embeddings-dataset/crawl-300d-2M.gensim',
    '../input/gensim-embeddings-dataset/glove.840B.300d.gensim'
]
# mmap='r' is passed so the vectors are opened memory-mapped, read-only.
glove_model = gensim.models.KeyedVectors.load(EMBEDDING_FILES[1], mmap='r')
gensim_words = glove_model.index2word
print(len(gensim_words), gensim_words[:20])
# How to use
print(glove_model['the'].shape)
# Membership test: the same `in` check that build_matrix relies on.
'the' in glove_model

In [None]:
# Find the maximum length of any caption in our dataset
def calc_max_length(tensor):
    """Return the length of the longest element of ``tensor``.

    Args:
        tensor: iterable of sized sequences (e.g. lists of token ids).

    Returns:
        The largest ``len(t)``; 0 when ``tensor`` is empty (previously this
        raised ``ValueError`` from ``max()`` on an empty sequence).
    """
    return max((len(t) for t in tensor), default=0)

In [None]:
%%time
# Choose the top_k words from the vocabulary
top_k = N_VOCABS 
# The filter list leaves out '<', '>' and the apostrophe, so the <start>/<end>
# markers survive tokenization. The backslash is written as '\\' because '\]'
# is an invalid escape sequence in a normal string literal; the resulting
# runtime string is unchanged.
tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=top_k,
                                                  oov_token="<unk>",
                                                  filters='!"#$%&()*+.,-/:;=?@[\\]^_`{|}~ ') # note 'a'
tokenizer.fit_on_texts(all_captions_list)
train_seqs = tokenizer.texts_to_sequences(all_captions_list)

# Reserve index 0 for padding in both lookup directions.
tokenizer.word_index['<pad>'] = 0
tokenizer.index_word[0] = '<pad>'

In [None]:
# make list from dict
# Attaches a plain list to the tokenizer object so position i holds the word
# with index i (index 0 is '<pad>', added to both lookups when the tokenizer was fitted).
tokenizer.index2word = [tokenizer.index_word[ii] for ii in range(len(tokenizer.word_index)) ] 
print(tokenizer.index2word[:20]) # see top-20 most frequent words
print(tokenizer.index2word[-20:]) # these all come to <unk>
len(tokenizer.index2word)

In [None]:
# Probe two indices. Note the fallback passed to .get() is the integer index of
# '<end>', not the string '<end>', so a miss prints a number rather than a word.
print(tokenizer.index_word.get(2000, tokenizer.word_index['<end>']))
print(tokenizer.index_word.get(19999, tokenizer.word_index['<end>']))
print(tokenizer.word_index['<end>'])

In [None]:
# Whitespace-token count per caption (start/end markers included, since they
# were already added to the caption strings).
len_cap = np.array([len(text.split()) for text in all_captions_list])
print(len_cap.mean(), len_cap.std(), len_cap.max(), len_cap.min())
# Sequence length cap: the 99.9th percentile of caption lengths, truncated to int.
max_seq_len = int(np.percentile(len_cap,99.9))

In [None]:
%%time
# Create the tokenized vectors
train_seqs = tokenizer.texts_to_sequences(all_captions_list)

# Pad (and truncate) each vector at the end to exactly max_seq_len entries.
# If you do not provide a max_length value, pad_sequences calculates it automatically
cap_vector = tf.keras.preprocessing.sequence.pad_sequences(train_seqs, padding='post', maxlen = max_seq_len, truncating='post')

# Longest *unpadded, untruncated* sequence; this can exceed max_seq_len.
# It is overwritten with max_seq_len in the next cell.
max_length = calc_max_length(train_seqs) # TF2.1 official calculation --> strange to me, should base on cap_vector

In [None]:
# Row lengths of the padded caption matrix, printed as a sanity check.
lenx = np.array([len(x) for x in cap_vector])
print(lenx.min(), lenx.mean(), cap_vector[0])
print(max_length)
# From here on max_length means the padded sequence length, replacing the
# value computed from the raw sequences.
max_length = max_seq_len
print(max_length)

In [None]:
from sklearn.model_selection import KFold, GroupKFold
# Create training and validation sets using an train_test_split --> Here not use, avoid leakage of the same name, using GroupKFolds
# img_name_train, img_name_val, cap_train, cap_val = train_test_split(full_img_name_list,
#                                                                     cap_vector,
#                                                                     test_size=0.2,
#                                                                     random_state=0)

# 2.5% valid = 3975 captions = 795 images
# Grouping by image path keeps all captions of one image on the same side of the split.
kf = GroupKFold(n_splits=40).split(X=full_img_name_list, groups=full_img_name_list)

# Only the first of the 40 folds is used.
tr, val = next(iter(kf))

all_img_names = np.array(full_img_name_list) # np.array make indexing possible
img_name_train = all_img_names[tr]
img_name_val = all_img_names[val]

cap_train = cap_vector[tr]
cap_val = cap_vector[val]

In [None]:
print(img_name_train[:6],'\n')
print(cap_train[:6],'\n')
len(img_name_train), len(cap_train), len(img_name_val), len(cap_val)

In [None]:
target_size = (299, 299,3)
AUTO = tf.data.experimental.AUTOTUNE

def decode_image(filename, label=None, image_size=(target_size[0],target_size[1])):
    """Read a JPEG file and return it normalized and resized.

    The image is decoded with 3 channels, scaled to [0, 1], standardized per
    channel with the fixed means/stds below, and then resized to ``image_size``.

    Args:
        filename: path of the JPEG file, passed to ``tf.io.read_file``.
        label: optional value passed through unchanged.
        image_size: (height, width) given to ``tf.image.resize``.

    Returns:
        The image alone when ``label`` is None, otherwise ``(image, label)``.
    """
    channel_means = [0.485, 0.456, 0.406]
    channel_stds = [0.229, 0.224, 0.225]

    raw_bytes = tf.io.read_file(filename)
    pixels = tf.cast(tf.image.decode_jpeg(raw_bytes, channels=3), tf.float32)

#     image = (tf.cast(image, tf.float32) / 127.5) - 1
    scaled = pixels / 255.0
    standardized = (scaled - channel_means) / channel_stds # for qubvel EfficientNet

    resized = tf.image.resize(standardized, image_size)

    if label is not None:
        return resized, label
    return resized

def data_augment(image, label=None):
    """Apply a random left/right flip to ``image``.

    Returns the flipped image alone when ``label`` is None, otherwise
    ``(image, label)`` with the label untouched.
    """
    flipped = tf.image.random_flip_left_right(image)
#     image = tf.image.random_flip_up_down(image)

    if label is not None:
        return flipped, label
    return flipped

In [None]:
# Feel free to change these parameters according to your system's configuration
LR = 3e-4
# Global batch size: 64 per replica, except on a single replica where it is forced to 1.
BATCH_SIZE = 64 * strategy.num_replicas_in_sync
if strategy.num_replicas_in_sync == 1:
    BATCH_SIZE = 1

BUFFER_SIZE = 1000
embedding_dim = 300 #embedding_matrix.shape[1] # 300 for Glove
units = 512
# Size of the decoder's output layer: top_k word ids plus one.
vocab_size = top_k + 1 # <unk>

## OLD VERSION, in this new version, this shape will be determined automatically
# Shape of the vector extracted from InceptionV3 is (64, 2048)
# These two variables represent that vector shape
# features_shape = 2048
# attention_features_shape = bf.shape[0] # 64 for InceptionV3, 100 for B1

# Number of attention locations per image and the side length used when the
# attention vector is reshaped for plotting (10 * 10 = 100).
attention_features_shape = 100
attention_viz_dim = 10 # 8 for inceptionV3

In [None]:
class BahdanauAttention(tf.keras.Model):
  """Additive attention over the encoder's per-location image features.

  ``call`` returns the attention-weighted sum of ``features`` together with
  the attention weights themselves.
  """

  def __init__(self, units):
    super(BahdanauAttention, self).__init__()
    self.W1 = tf.keras.layers.Dense(units)
    self.W2 = tf.keras.layers.Dense(units)
    self.V = tf.keras.layers.Dense(1)

  def call(self, features, hidden):
    # features: assumed (batch_size, num_locations, feature_dim) -- TODO confirm
    # hidden:   assumed (batch_size, hidden_size) -- TODO confirm

    # Insert a length-1 axis so the hidden state broadcasts across locations.
    query = tf.expand_dims(hidden, 1)

    # Additive score, one `units`-wide vector per location.
    projected = self.W1(features) + self.W2(query)
    score = tf.nn.tanh(projected)

    # V collapses the last axis to 1; softmax normalizes across axis 1 (locations).
    attention_weights = tf.nn.softmax(self.V(score), axis=1)

    # Weighted sum over axis 1 removes the location axis.
    context_vector = tf.reduce_sum(attention_weights * features, axis=1)

    return context_vector, attention_weights

In [None]:
class CNN_Encoder(tf.keras.Model):
    """Image encoder: frozen EfficientNetB3 backbone followed by a Dense + ReLU.

    ``call`` maps a batch of images of shape ``target_size`` to a 3-D tensor
    (batch, spatial locations, embedding_dim).
    """

    def __init__(self, embedding_dim):
        super(CNN_Encoder, self).__init__()
        
        
        self.cnn0 = efn.EfficientNetB3(weights='noisy-student', 
                                      input_shape=target_size, include_top=False)
        
        
        # e.g. layers[-1].output = TensorShape([None, 10, 10, 1536]) for B3 (not global pooling)
        self.cnn = tf.keras.Model(self.cnn0.input, self.cnn0.layers[-1].output) 
        # The backbone is frozen; only the Dense projection below is trained.
        self.cnn.trainable = False
        
        # shape after fc == (batch_size, attention_features_shape, embedding_dim) >> this is my mistake, should be hidden instead of embedding_dim
        self.fc = tf.keras.layers.Dense(embedding_dim)
        
    # here, x is img-tensor of target_size
    def call(self, x):
        x = self.cnn(x) # 4D
        # Flatten the spatial grid into one axis. The batch axis is inferred
        # with -1 rather than read from x.shape[0], which is None whenever the
        # batch size is not statically known while tracing; the spatial and
        # channel sizes are static because the backbone has a fixed input_shape.
        x = tf.reshape(x, (-1, x.shape[1] * x.shape[2], x.shape[3])) # 3D
        x = self.fc(x)
        x = tf.nn.relu(x)
        return x

In [None]:
class RNN_Decoder(tf.keras.Model):
  # Single-step caption decoder: attention over image features, a frozen
  # pretrained embedding, a GRU and two Dense layers producing vocabulary logits.
  def __init__(self, embedding_matrix, units, vocab_size):
    super(RNN_Decoder, self).__init__()
    self.units = units
    
    # Embedding table size comes from the matrix itself, while the output
    # layer (fc2) uses the `vocab_size` argument; the two may differ.
    self.vocab_size = embedding_matrix.shape[0]
    
    # new interface of pretrained embedding weights : https://github.com/tensorflow/tensorflow/issues/31086
    # see also : https://stackoverflow.com/questions/55770009/how-to-use-a-pre-trained-embedding-matrix-in-tensorflow-2-0-rnn-as-initial-weigh
    self.embedding = tf.keras.layers.Embedding(self.vocab_size, embedding_matrix.shape[1], 
                                               embeddings_initializer=tf.keras.initializers.Constant(embedding_matrix), 
                                               trainable=False,
                                               mask_zero=True)
    self.gru = tf.keras.layers.GRU(self.units,
                                   return_sequences=True,
                                   return_state=True,
                                   recurrent_initializer='glorot_uniform')
    self.fc1 = tf.keras.layers.Dense(self.units)
    self.fc2 = tf.keras.layers.Dense(vocab_size)

    self.attention = BahdanauAttention(self.units)
  
  # x=sequence of words
  # features=image's extracted features 
  # hidden=GRU's hidden unit
  def call(self, x, features, hidden):
    
    # `hidden` is consumed here by the attention module only.
    context_vector, attention_weights = self.attention(features, hidden)

    # x shape after passing through embedding == (batch_size, 1, embedding_dim)
    x = self.embedding(x)

    # Concatenate the context vector (given a length-1 time axis) with the
    # embedded token along the last axis.
    x = tf.concat([tf.expand_dims(context_vector, 1), x], axis=-1)

    # passing the concatenated vector to the GRU
    # Note: no initial_state is passed, so the GRU does not start from `hidden`.
    output, state = self.gru(x)

    # last axis becomes `units` after fc1
    x = self.fc1(output)

    # merge the batch and time axes: (batch_size * time_steps, units)
    x = tf.reshape(x, (-1, x.shape[2]))

    # logits over the output vocabulary: (batch_size * time_steps, vocab_size)
    x = self.fc2(x)

    return x, state, attention_weights

  def reset_state(self, batch_size):
    # Zero initial hidden state of shape (batch_size, units).
    return tf.zeros((batch_size, self.units))

In [None]:
# Create the embedding matrix, encoder, decoder, optimizer and loss inside the
# distribution strategy scope.
with strategy.scope():
    # tf.keras.backend.clear_session()
    embedding_matrix = build_matrix(tokenizer.word_index, glove_model, embedding_dim)
    print(embedding_matrix.shape) # if not use stop-stem trick, num of unknowns is 495 (vs. current 287)
    
    encoder = CNN_Encoder(embedding_dim)
    decoder = RNN_Decoder(embedding_matrix, units, vocab_size)
    
    optimizer = tf.keras.optimizers.Adam(learning_rate=LR)
    loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
        from_logits=True, reduction='none') 
    # Set reduction to `none` so we can do the reduction afterwards and divide by
    # global batch size.

    def loss_function(real, pred):
        # Masked cross-entropy for one decoding step: positions whose target
        # id is 0 (padding) contribute zero loss. The per-example losses are
        # summed and divided by the global BATCH_SIZE.
        mask = tf.math.logical_not(tf.math.equal(real, 0))
        loss_ = loss_object(real, pred)

        mask = tf.cast(mask, dtype=loss_.dtype)
        loss_ *= mask
        
        # About why we use `tf.nn.compute_average_loss`, please check this tutorial
        # https://www.tensorflow.org/tutorials/distribute/custom_training#define_the_loss_function
#         loss_ = tf.reduce_mean(loss_)
        loss_ = tf.nn.compute_average_loss(loss_, global_batch_size=BATCH_SIZE)
        
        return loss_

In [None]:
# Checkpoint objects tracking encoder, decoder and optimizer. The manager keeps
# at most 5 checkpoints; the only ckpt_manager.save() call in the training loop
# is commented out.
with strategy.scope():
    checkpoint_path = "./checkpoints/train"
    ckpt = tf.train.Checkpoint(encoder=encoder,
                           decoder=decoder,
                           optimizer = optimizer)
    ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_path, max_to_keep=5)

gc.collect()

In [None]:
def get_training_dataset():
    """Build the distributed, endlessly repeating training dataset.

    Pipeline: (path, caption) pairs -> decode -> cache -> augment -> repeat ->
    shuffle -> batch -> prefetch, then distributed with ``strategy``.
    """
    pairs = tf.data.Dataset.from_tensor_slices((img_name_train, cap_train))

    # Decoded images are cached; augmentation runs after the cache so the
    # random flip is re-drawn on every pass.
    decoded = pairs.map(decode_image, num_parallel_calls=AUTO).cache()
    augmented = decoded.map(data_augment, num_parallel_calls=AUTO)

    # Maybe not repeat in custom training (so when and how??) <-- the current version is bug because it repeat indefinitely
    endless = augmented.repeat()
    shuffled = endless.shuffle(BATCH_SIZE*8, reshuffle_each_iteration=True)
    batched = shuffled.batch(BATCH_SIZE, drop_remainder=False)
    train_dataset = batched.prefetch(AUTO)

    return strategy.experimental_distribute_dataset(train_dataset)


# if use keras.model.fit, no need for repeat and drop_remainder
# Validation pipeline: no augmentation, no shuffle, no repeat. Incomplete final
# batches are dropped, and whole batches (not single images) are cached.
valid_dataset = (
    tf.data.Dataset
    .from_tensor_slices((img_name_val, cap_val))
    .map(decode_image, num_parallel_calls=AUTO)
#     .repeat()
    .batch(BATCH_SIZE, drop_remainder=True)
    .cache()
    .prefetch(AUTO)
)

valid_dist_dataset = strategy.experimental_distribute_dataset(valid_dataset)
with strategy.scope():
    @tf.function
    def train_step(img_tensor, target):
        # One per-replica training step with teacher forcing.
        # Returns (loss summed over decoding steps, that sum divided by the
        # sequence length).
        loss = 0

        # initializing the hidden state for each batch
        # because the captions are not related from image to image
        hidden = decoder.reset_state(batch_size=target.shape[0])

        # Every caption starts from the '<start>' token id, shape (batch, 1).
        dec_input = tf.expand_dims([tokenizer.word_index['<start>']] * target.shape[0], 1)

        with tf.GradientTape() as tape:
            features = encoder(img_tensor)

            # Position 0 is skipped as a target; each step predicts token i.
            for i in range(1, target.shape[1]):
                # passing the features through the decoder
                predictions, hidden, _ = decoder(dec_input, features, hidden)

                loss += loss_function(target[:, i], predictions)

                # using teacher forcing
                dec_input = tf.expand_dims(target[:, i], 1)

        # Reported value only; gradients below are taken from the summed `loss`.
        total_loss = (loss / int(target.shape[1]))

        trainable_variables = encoder.trainable_variables + decoder.trainable_variables

        gradients = tape.gradient(loss, trainable_variables)

        optimizer.apply_gradients(zip(gradients, trainable_variables))

        return loss, total_loss
    
    @tf.function
    def distributed_train_step(inputs):
        # Runs train_step on every replica and returns its per-replica results
        # unreduced; the caller is responsible for reducing them.

        (images, labels) = inputs
#         loss = strategy.experimental_run_v2(train_step, args=(images, labels))
        loss = strategy.run(train_step, args=(images, labels))
        
        return loss
with strategy.scope():
    valid_loss = tf.keras.metrics.Sum()
    
    @tf.function 
    def val_step(img_tensor, target, teacher_forcing=True):
        # Per-replica validation step: the same teacher-forced loop as
        # training, without gradients. `teacher_forcing` is accepted but not
        # read; teacher forcing is always used.
        # Non-teacher-forcing val_loss is too complicated at the moment
        loss = 0
#         print(target.shape) # (batch, 47) >> strange that we get None
        batch = target.shape[0] # BATCH_SIZE//strategy.num_replicas_in_sync #
        hidden = decoder.reset_state(batch_size= batch)
#         print(hidden.shape) # (batch,512)
        
        dec_input = tf.expand_dims([tokenizer.word_index['<start>']] * batch, 1)
      #   print(dec_input.shape) # (BATCH_SIZE, 1)
        features = encoder(img_tensor)
      #   print(features.shape) # (BATCH_SIZE, IMG_FEAT_LEN, ENCODER_HID) = 64 100 256
        for i in range(1, target.shape[1]):
            predictions, hidden, _ = decoder(dec_input, features, hidden)
            loss += loss_function(target[:, i], predictions)

            # using teacher forcing
            dec_input = tf.expand_dims(target[:, i], 1)

        # Summed step losses divided by the sequence length.
        avg_loss = (loss / int(target.shape[1]))
        return loss, avg_loss
    

    @tf.function
    def cal_val_loss(val_dataset):
        # Average the per-batch validation loss over the validation set.
        # Iterates a fixed number of full batches (len(img_name_val) // BATCH_SIZE),
        # sums val_step's average loss across replicas for each batch, and
        # returns the mean over batches. The same value is also written to the
        # `valid_loss` metric after resetting it.
        # target.shape = (64,49) = (Per Replica BATCH_SIZE?, SEQ_LEN)
        val_num_steps = len(img_name_val) // BATCH_SIZE
        valid_data_iter = iter(val_dataset)
        valid_loss.reset_states()
        
        total_loss = 0.0
        for ii in tf.range(val_num_steps):
            _, per_replica_val_loss = strategy.run(val_step, args=next(valid_data_iter))
            t_loss = strategy.reduce(tf.distribute.ReduceOp.SUM, per_replica_val_loss, axis=None)
            total_loss += t_loss
#             print(total_loss)
            
        valid_loss.update_state(total_loss/val_num_steps)
#         tf.print('val loss',valid_loss.result().numpy())
#             tf.print(total_loss)
#         tf.print ('Valid Loss -- %4f' % (total_loss.eval()/val_num_steps) )
        return total_loss/val_num_steps
    

In [None]:
# if USE_PREVIOUS_SAVE: # 
#     print('Use prev. save weights, so make this cell error')
#     %%time

# Custom training loop. The training dataset repeats forever, so epochs are
# delimited by counting batches: an "epoch" is `num_steps` consecutive batches.
with strategy.scope():
    loss_plot = []
    val_loss_plot = []
    EPOCHS = 20 # 1st epoch takes 1hour, after that with cache power, it's just 3-4 mins /epoch
    best_val_loss = 100
    start_epoch = 0
    # At least one step per epoch: a zero step count (fewer images than one
    # global batch) would otherwise divide by zero in the epoch summary.
    num_steps = max(1, len(img_name_train) // (BATCH_SIZE))
    start = time.time()
    total_loss = 0
    epoch = 0
    train_dist_dataset = get_training_dataset()
    
    if USE_PREVIOUS_SAVE: # 
        print('Use prev. save weights, so run for few epochs')
        EPOCHS,num_steps = 1,1
        
    # Batch index at which the current epoch ends.
    num_steps_accum = num_steps
    print(num_steps, BATCH_SIZE, num_steps*BATCH_SIZE)
    
    for (batch, inputs) in tqdm_notebook(enumerate(train_dist_dataset)): # by .repeat() this will indefinitely run
            
        if batch >= num_steps_accum:
            epoch += 1
            print('end of epoch ', epoch)
            
            # total_loss is never reset, so this is the running mean over all
            # steps so far, not the mean of the last epoch only.
            loss_plot.append(total_loss / num_steps_accum)    
            print ('Epoch {} Loss {:.6f}'.format(epoch,
                                         total_loss/num_steps_accum))
            print ('Time taken for 1 epoch {} sec\n'.format(time.time() - start))
            # Restart the timer so the next report covers one epoch only.
            start = time.time()
            
            # `>=` stops after exactly EPOCHS epochs; with `>` the loop ran
            # one extra epoch before breaking.
            if num_steps_accum >= num_steps*EPOCHS:
                print('end of training!!')
                break

            num_steps_accum += num_steps
            print('next numsteps ', num_steps_accum)

                
        # unsupported operand type(s) for +=: 'int' and 'PerReplica'
        # -> reduce the per-replica values to a single tensor before accumulating.
        _, per_replica_train_loss = distributed_train_step(inputs)
        t_loss = strategy.reduce(tf.distribute.ReduceOp.SUM, per_replica_train_loss,
                         axis=None)
            
        total_loss += t_loss
            
        # Every 50 batches: log the step loss, run validation, and keep the
        # weights with the lowest validation loss seen so far.
        if batch % 50 == 0:
            print ('Epoch {} Batch {} Loss {:.4f}'.format(epoch + 1, batch, t_loss.numpy() ))

            val_loss = cal_val_loss(valid_dist_dataset)
            val_loss_plot.append(val_loss)
            
            print('val result', val_loss.numpy())
            if val_loss.numpy() < best_val_loss:
                print('update best val loss from %.4f to %.4f' % (best_val_loss, val_loss.numpy()))
                best_val_loss = val_loss.numpy()
                encoder.save_weights('encoder_best.h5')
                decoder.save_weights('decoder_best.h5')
#                 ckpt_manager.save()
#                 ckpt_manager.save()

In [None]:
# Loss curves are only meaningful after a real training run. When previously
# saved weights are used, skip the plots explicitly: a `%%time` cell magic is
# only recognized on the first line of a cell, so placing it under `if` to make
# the cell fail is not a reliable way to skip it.
if USE_PREVIOUS_SAVE:
    print('Use prev. save weights, so skip the loss plots')
else:
    print(total_loss, t_loss)

    # One point per completed epoch.
    plt.plot(loss_plot)
    plt.xlabel('Epochs')
    plt.ylabel('Train Loss')
    plt.title('Loss Plot')
    plt.show()

    # plt.plot(loss_plot)
    # One point per validation run (every 50 training batches), not per epoch.
    plt.plot(val_loss_plot)
    plt.xlabel('Validation checks (every 50 batches)')
    plt.ylabel('Val Loss')
    plt.title('Loss Plot')
    plt.show()

In [None]:
# Optionally replace the freshly initialized weights with ones saved by an
# earlier run: try the "best" files first, then fall back to the final ones.
if USE_PREVIOUS_SAVE:
    '''
    ## build construct input_layer, otherwise there is no input_layer and we cannot load weights
    encoder.build(input_shape = (BATCH_SIZE,299,299,3))

    #>> I don't know how to use model.build with multiple inputs
    #>> So, I have to use functional API, and manually specify input tensor
    # >> still error
    decoder_layer = RNN_Decoder(embedding_matrix, units, vocab_size)
    inp1 = tf.keras.layers.Input(shape=(1,))
    inp2 = tf.keras.layers.Input(shape=(attention_features_shape,embedding_dim,))
    inp3 = tf.keras.layers.Input(shape=(units,))
    decoder_out = decoder_layer(inp1,inp2,inp3)
    decoder = tf.keras.Model(inputs=[inp1,inp2,inp3],outputs=decoder_out)
    '''
    PATH = '/kaggle/input/image-caption-tf21-v12/'
    with strategy.scope():
        try:
            encoder.load_weights(PATH+'encoder_best.h5')
            decoder.load_weights(PATH+'decoder_best.h5') 
            # trick still fails due to layer mismatched when call(), have to construct with functional API exactly like subclass
#             decoder.layers[-1].load_weights(PATH+'decoder_best.h5') # trick to load into layers,see decoder.summary()
            print(1)
        except Exception as load_error:
            # `Exception` rather than a bare except, so KeyboardInterrupt and
            # SystemExit still propagate; report why the first attempt failed
            # instead of hiding it.
            print('could not load *_best.h5 weights:', repr(load_error))
            encoder.load_weights(PATH+'encoder.h5')
            decoder.load_weights(PATH+'decoder.h5')
#             decoder.layers[-1].load_weights(PATH+'decoder.h5')
            print(2)

In [None]:
# Write the current encoder/decoder weights to the working directory, then list
# the directory with file sizes.
encoder.save_weights('encoder.h5')
decoder.save_weights('decoder.h5')
!ls -sh

In [None]:
def show_image(image,figsize=None,title=None):
    """Draw ``image`` with matplotlib.

    Args:
        image: array with an ``ndim`` attribute; 2-D input is drawn with the
            gray colormap, anything else with matplotlib's defaults.
        figsize: when given, a new figure of this size is created first.
        title: when given, set as the plot title.
    """
    if figsize is not None:
        plt.figure(figsize=figsize)

    imshow_options = {'cmap': 'gray'} if image.ndim == 2 else {}
    plt.imshow(image, **imshow_options)

    if title is not None:
        plt.title(title)
        
        
def show_Nimages(imgs,scale=1):
    """Show all ``imgs`` side by side in one row, without axis ticks.

    The figure measures (25/scale, 16/scale) inches.
    """
    count = len(imgs)
    fig = plt.figure(figsize=(25/scale, 16/scale))
    for position, img in enumerate(imgs, start=1):
        # add_subplot makes the new axes current, so show_image draws into it.
        fig.add_subplot(1, count, position, xticks=[], yticks=[])
        show_image(img)
    plt.show()

In [None]:
def evaluate(image):
    """Generate a caption for one image by sampling, recording attention.

    Args:
        image: path of a JPEG file, as accepted by ``decode_image``.

    Returns:
        ``(result, attention_plot)`` where ``result`` is the list of sampled
        words (ending with '<end>' if it was produced within ``max_length``
        steps) and ``attention_plot`` has one row of attention weights per
        word in ``result``.
    """
    attention_plot = np.zeros((max_length, attention_features_shape))
    
    try:
        hidden = decoder.reset_state(batch_size=1)
    except AttributeError:
        # `decoder` may be a wrapper model whose last layer is the RNN_Decoder.
        hidden = decoder.layers[-1].reset_state(batch_size=1)
        
    img_tensor_val = tf.expand_dims(decode_image(image), 0)
#     print(img_tensor_val.shape)
    features = encoder(img_tensor_val)
#     print(features.shape)
    dec_input = tf.expand_dims([tokenizer.word_index['<start>']], 0)
    result = []

    for i in range(max_length):
        predictions, hidden, attention_weights = decoder(dec_input, features, hidden)

        attention_plot[i] = tf.reshape(attention_weights, (-1, )).numpy()

        # Sample (not argmax) the next token id from the logits.
        predicted_id = tf.random.categorical(predictions, 1)[0][0].numpy()
        # The output layer can emit ids with no entry in index_word; treat
        # such an id as end-of-caption instead of raising KeyError.
        word = tokenizer.index_word.get(predicted_id, '<end>')
        result.append(word)

        if word == '<end>':
            break

        dec_input = tf.expand_dims([predicted_id], 0)

    # Trim on every path so the rows always match the words in `result`.
    attention_plot = attention_plot[:len(result), :]
    return result, attention_plot
def plot_attention(image, result, attention_plot):
    """Overlay each word's attention map on the image, one subplot per word.

    Args:
        image: path of the JPEG file to display.
        result: list of caption words.
        attention_plot: array with at least ``len(result)`` rows; each row is
            resized to (attention_viz_dim, attention_viz_dim).

    Returns:
        The decoded image as a numpy array.
    """
    bits = tf.io.read_file(image)
    image = tf.image.decode_jpeg(bits, channels=3)
    
    temp_image = np.array(image)

    fig = plt.figure(figsize=(10, 10))

    len_result = len(result)
    # Smallest square grid that holds every word. A (len//2) x (len//2) grid
    # has too few cells for captions of 1, 2, 3 or 5 words.
    grid_size = max(int(np.ceil(np.sqrt(len_result))), 1)
    for l in range(len_result):
        temp_att = np.resize(attention_plot[l], (attention_viz_dim, attention_viz_dim))
        ax = fig.add_subplot(grid_size, grid_size, l+1)
        ax.set_title(result[l])
        img = ax.imshow(temp_image)
        # Stretch the coarse attention map over the full image extent.
        ax.imshow(temp_att, cmap='gray', alpha=0.6, extent=img.get_extent())

    plt.tight_layout()
    plt.show()
    
    return temp_image
def print_all_captions(img_list, caps, rid):
    """Print every reference caption near ``rid`` that belongs to the same image.

    Scans the entries from ``rid - 5`` up to (not including) ``rid + 5``,
    clipped to the bounds of ``img_list``, and prints the decoded caption of
    each entry whose image name equals ``img_list[rid]``. Padding ids (0) are
    omitted from the printed text.

    Returns:
        Always 0.
    """
    orig = img_list[rid]
    # Clip the window: without this, a `rid` within the last 5 entries indexes
    # past the end, and a `rid` below 5 wraps around to the end of the list.
    first = max(rid - 5, 0)
    last = min(rid + 5, len(img_list))
    for rr in range(first, last):
        image_name = img_list[rr]
        if image_name == orig:
            real_caption = ' '.join([tokenizer.index_word[i] for i in caps[rr] if i not in [0]])
            print ('Real Caption:', real_caption)
    return 0
# captions on the train set
# Six random training rows: print the reference captions, sample a predicted
# caption, plot its attention maps, and show the collected images in pairs.
imgs = []
for ii in range(6):
    rid = np.random.randint(0, len(img_name_train))
    print_all_captions(img_name_train,cap_train,rid)
    image = img_name_train[rid]
    result, attention_plot = evaluate(image)
    print ('Prediction Caption:', ' '.join(result))
    img = plot_attention(image, result, attention_plot)
    imgs.append(img)
    # Flush after every second image so each row shows two pictures.
    if (ii+1) %2 == 0:
        show_Nimages(imgs)
        imgs = []

In [None]:
# captions on the validation set
# Same procedure as for the training rows, drawing from the validation split.
imgs = []
for ii in range(6):
    rid = np.random.randint(0, len(img_name_val))
    print_all_captions(img_name_val,cap_val,rid)
    image = img_name_val[rid]
    result, attention_plot = evaluate(image)
    print ('Prediction Caption:', ' '.join(result))
    img = plot_attention(image, result, attention_plot)
    imgs.append(img)
    # Flush after every second image so each row shows two pictures.
    if (ii+1) %2 == 0:
        show_Nimages(imgs)
        imgs = []

In [None]:
# import gc
# del dataset
gc.collect()

In [None]:
def gen_cap(image):
    """Generate a caption for one image by sampling from the decoder.

    Args:
        image: path of a JPEG file, as accepted by ``decode_image``.

    Returns:
        List of word strings, ending with '<end>' if it was produced within
        ``max_length`` steps.
    """
    hidden = decoder.reset_state(batch_size=1)
    img_tensor_val = tf.expand_dims(decode_image(image), 0)
    features = encoder(img_tensor_val)
    dec_input = tf.expand_dims([tokenizer.word_index['<start>']], 0)
    result = []

    for i in range(max_length):
        # Attention weights are not needed here, only the logits and state.
        predictions, hidden, _ = decoder(dec_input, features, hidden)
        predicted_id = tf.random.categorical(predictions, 1)[0][0].numpy()
        # Fall back to the *string* '<end>' for ids missing from index_word.
        # The previous fallback was the integer index of '<end>', which never
        # matched the stop test below and put a non-string into `result`.
        word = tokenizer.index_word.get(predicted_id, '<end>')
        result.append(word)
        if word == '<end>':
            return result

        dec_input = tf.expand_dims([predicted_id], 0)

    return result

In [None]:
# Bare expression: not the last line of the cell, so it displays nothing.
len(img_name_train),len(img_name_val)
# Half-open range of training-row indices whose captions are generated below.
# NOTE(review): both bounds are hard-coded; confirm END <= len(img_name_train).
START = 120000
END = 150000

In [None]:
# Generate captions for training rows START..END-1 (rows are taken from
# img_name_train / cap_train, i.e. the training split).
imgs = []
real_caps, pred_caps = [], []
for rid in tqdm_notebook(range(START, END)): # 100 captions / 1:05 >> 10000 caps / 110mins >> 30,000 / 330mins+30min(preparing) = 6hours
    image = img_name_train[rid]
    result = gen_cap(image)
    
    # Reference caption decoded from token ids, with padding (id 0) removed.
    real_caps.append(' '.join([tokenizer.index_word[i] for i in cap_train[rid] if i not in [0]]))
    pred_caps.append(' '.join(result))

In [None]:
# real_caps, pred_caps
# One caption per line; line k of both files refers to the same training row.
np.savetxt('real_caps.txt', real_caps, fmt='%s')
np.savetxt('pred_caps.txt', pred_caps, fmt='%s')

In [None]:
!cat real_caps.txt | head
!cat pred_caps.txt | head

In [None]:
def show_image(image,figsize=None,title=None):
    """Draw ``image`` with matplotlib (re-declares the helper of the same name).

    2-D input is drawn with the gray colormap, anything else with matplotlib's
    defaults. ``figsize`` creates a new figure first; ``title`` sets the title.
    """
    if figsize is not None:
        plt.figure(figsize=figsize)

    imshow_options = {'cmap': 'gray'} if image.ndim == 2 else {}
    plt.imshow(image, **imshow_options)

    if title is not None:
        plt.title(title)

In [None]:
import PIL # We will import the packages at "use-time (just for this kernel)

PIL.Image.open("../input/sample-img/Parisgesch1.JPG")

In [None]:
gen_cap("../input/sample-img/Parisgesch1.JPG")

It's not perfect, I know :P