In [None]:
# Pin TensorFlow to 2.2 in the first cell, before any import of it; later cells
# use the `tf.distribute.experimental.TPUStrategy` and
# `tf.config.experimental_connect_to_cluster` names.
!pip install -q tensorflow==2.2

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import tensorflow as tf
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding, LSTM, Dense, Dropout
from keras.preprocessing.text import Tokenizer
from keras.callbacks import EarlyStopping
from keras.models import Sequential
import re, string, time, random
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from PIL import Image
from kaggle_datasets import KaggleDatasets
import os, csv, collections

In [None]:
# Configuration flag; none of the visible cells read it.
USE_PREVIOUS_SAVE = True

# Probe for a TPU. The resolver signals "no TPU available" with ValueError,
# in which case we fall back to the default distribution strategy.
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    print('Running on TPU ', tpu.master())
except ValueError:
    tpu = None

if not tpu:
    # No TPU: use whatever strategy TensorFlow considers current/default.
    strategy = tf.distribute.get_strategy()
else:
    # Attach to the TPU cluster and initialise it before building the strategy.
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)

print("REPLICAS: ", strategy.num_replicas_in_sync)

In [None]:
# Root of the Flickr30k dataset as mounted under /kaggle/input.
LOCAL_FLICKR_PATH = '/kaggle/input/flickr-image-dataset/flickr30k_images/'
# Caption annotations file that sits in the dataset root.
annotation_file = LOCAL_FLICKR_PATH + 'results.csv'
# Nested directory whose contents are listed below (the image files, judging by the name).
LOCAL_IMG_PATH = LOCAL_FLICKR_PATH + 'flickr30k_images/'

# Sanity check: pipe the directory listing through `wc` to see how many entries it has.
!ls {LOCAL_IMG_PATH} | wc

In [None]:
%%time
# Resolve the dataset's GCS location only when the strategy reports exactly 8
# replicas (presumably the TPU case -- confirm). GCS_DS_PATH stays undefined otherwise.
if strategy.num_replicas_in_sync == 8:
    GCS_DS_PATH = KaggleDatasets().get_gcs_path('flickr-image-dataset') # 8gb # 20-25 mins
    print('yeah')

In [None]:
# Choose where images are read from: the GCS copy when 8 replicas are in sync
# (GCS_DS_PATH is only defined in that case), otherwise the local Kaggle mount.
# Either way FLICKR_PATH and IMG_PATH end up defined for the cells below.
if strategy.num_replicas_in_sync == 8:
    # print(GCS_DS_PATH_FLICKR)
    # !gsutil ls $GCS_DS_PATH_FLICKR

    # Show the bucket path and its top-level contents.
    print(GCS_DS_PATH)
    !gsutil ls $GCS_DS_PATH
    
    # Mirror the local directory layout underneath the bucket root.
    FLICKR_PATH = GCS_DS_PATH + '/flickr30k_images/'
    IMG_PATH = FLICKR_PATH + 'flickr30k_images/'
    # less than 10sec
    !gsutil ls {IMG_PATH} | wc
else: 
    FLICKR_PATH = LOCAL_FLICKR_PATH
    IMG_PATH = LOCAL_IMG_PATH

In [None]:
# Build `image_caption`: image path (IMG_PATH + file name) -> list of captions,
# each caption wrapped in "<start> ... <end>" markers.
image_caption = collections.defaultdict(list)
# Kept for backward compatibility; nothing in the visible cells reads `prepath`.
prepath='/kaggle/input/flickr-image-dataset/flickr30k_images/flickr30k_images/flickr30k_images/'
# Reuse the path defined in the configuration cell instead of hard-coding it again.
anotation = annotation_file
# The file is split on '| ' (first field = image file name, last field = caption).
# Splitting the raw line directly avoids routing it through a comma-based CSV
# parser, whose quote handling can alter or merge rows.
with open(anotation, 'r', encoding='utf-8') as f:
    next(f)  # skip the first line (presumably the column header -- confirm)
    for line in f:
        split_str = line.rstrip('\r\n').split('| ')
        if len(split_str) < 2:
            # Blank or delimiter-less line: there is no (image, caption) pair to record.
            continue
        caption = f"<start> {split_str[-1]} <end>"
        image_caption[IMG_PATH+split_str[0]].append(caption)

In [None]:
# Image-level train/validation split: images are shuffled and partitioned by
# key, so every caption of an image lands in the same subset.
SPLIT_SEED = 42           # fixed seed so the split is reproducible across runs
NUM_TRAIN_IMAGES = 28000  # images used for training
NUM_VAL_IMAGES = 2000     # images used for validation


def _flatten_captions(image_keys):
    """Return parallel lists (captions, paths) with one entry per caption.

    Each image path in `image_keys` is repeated once for every caption stored
    under it in `image_caption`, so the two lists line up element-wise.
    """
    captions, paths = [], []
    for key in image_keys:
        caption_list = image_caption[key]
        captions.extend(caption_list)
        paths.extend([key] * len(caption_list))
    return captions, paths


allkeys = list(image_caption.keys())
# A dedicated RNG keeps the shuffle deterministic without touching the global
# `random` state.
random.Random(SPLIT_SEED).shuffle(allkeys)

keys = allkeys[:NUM_TRAIN_IMAGES]
val_keys = allkeys[NUM_TRAIN_IMAGES:NUM_TRAIN_IMAGES + NUM_VAL_IMAGES]

train_caption, train_path = _flatten_captions(keys)
val_caption, val_path = _flatten_captions(val_keys)

In [None]:
#Captioning Tokenization
# '<' and '>' are deliberately absent from `filters` so the "<start>", "<end>"
# and "<unk>" markers survive tokenization. The backslash is escaped explicitly
# ('\\]') so the literal no longer relies on an invalid escape sequence.
tokenizer = tf.keras.preprocessing.text.Tokenizer(
    num_words=10000,
    oov_token='<unk>',
    filters='!"#$%&()*+.,-/:;=?@[\\]^_`{|}~ ',
)

# Cap sequence length at the 95th percentile of training caption lengths
# (in whitespace-separated tokens); longer captions are truncated below.
len_seq = [len(seq.split()) for seq in train_caption]
max_len=int(np.percentile(len_seq, 95))

# Fit the vocabulary on every training caption. Previously only the first
# caption of each image (including held-out images) was used, so most training
# text never contributed to the vocabulary while non-training text did.
tokenizer.fit_on_texts(train_caption)

# Reserve id 0 for padding so padded positions can be masked out of the loss.
tokenizer.word_index['<pad>'] = 0
tokenizer.index_word[0] = '<pad>'

train_sequences = tokenizer.texts_to_sequences(train_caption)
train_sequences = pad_sequences(train_sequences, padding='post', maxlen=max_len, truncating='post')
val_sequences = tokenizer.texts_to_sequences(val_caption)
val_sequences = pad_sequences(val_sequences, padding='post', maxlen=max_len, truncating='post')

In [None]:
# Report, one per line: number of images, number of training (path, caption)
# pairs, number of training sequences, number of validation sequences.
print(
    *map(len, (image_caption, train_path, train_sequences, val_sequences)),
    sep='\n',
)

In [None]:
# Let tf.data pick parallelism / prefetch depth.
AUTO = tf.data.experimental.AUTOTUNE
embed_dims=256  # size of the encoder projection and of the word embeddings
units=512       # GRU / attention hidden size

# Number of distinct token ids the decoder must score. `tokenizer.word_index`
# already contains the '<pad>' entry (id 0), so its length is the count of
# known ids; `num_words` caps the ids that texts_to_sequences can emit.
# The old `len(word_index)+1` allocated rows for ids that never occur and one
# id with no `index_word` entry.
vocab_size = min(tokenizer.num_words or len(tokenizer.word_index), len(tokenizer.word_index))

# NOTE(review): BATCH_SIZE is never read by the visible cells (they use
# `Batch_size` below); kept only so the name stays defined as before.
if strategy.num_replicas_in_sync == 1:
    BATCH_SIZE = 1

# Global batch size: 64 examples per replica.
Batch_size = 64 * strategy.num_replicas_in_sync

# Ceiling division: the datasets are batched with drop_remainder=False, so a
# trailing partial batch counts as a step too.
num_steps = (len(train_path) + Batch_size - 1) // Batch_size
val_num_steps = (len(val_path) + Batch_size - 1) // Batch_size

In [None]:
@tf.function
def image_processing(path, label=None):
    """Load a JPEG file and turn it into an InceptionV3-ready tensor.

    Args:
        path: file path of the JPEG image (anything `tf.io.read_file` accepts).
        label: optional value passed through untouched, so the function can be
            used directly in `Dataset.map` on (path, label) pairs.

    Returns:
        The float32 image resized to 299x299x3, or `(image, label)` when a
        label is supplied.
    """
    img = tf.io.read_file(path)
    img = tf.image.decode_jpeg(img, channels=3)
    img = tf.image.resize(tf.cast(img, tf.float32), (299, 299))
    # The encoder is a frozen, ImageNet-pretrained InceptionV3, which expects
    # the scaling applied by its own preprocess_input (pixels in [-1, 1]),
    # not the plain [0, 1] range used before.
    img = tf.keras.applications.inception_v3.preprocess_input(img)

    if label is None:
        return img
    return img, label

In [None]:
@tf.function
def augment(image, label=None):
    """Randomly mirror an image horizontally, then vertically.

    `label`, when given, is returned unchanged next to the augmented image so
    the function can be mapped over (image, label) datasets; otherwise only
    the image is returned.
    """
    mirrored = tf.image.random_flip_left_right(image)
    mirrored = tf.image.random_flip_up_down(mirrored)
    return mirrored if label is None else (mirrored, label)

In [None]:
# Training input pipeline: decode -> cache -> augment -> shuffle -> batch -> prefetch.
# Caching sits before augmentation, so the random flips are re-drawn every epoch.
# NOTE(review): cache() is called without a file name, so decoded images are
# kept in memory -- confirm the full training set fits.
dataset = (
    tf.data.Dataset.from_tensor_slices((train_path, train_sequences))
    .map(image_processing, num_parallel_calls=AUTO)
    .cache()
    .map(augment, num_parallel_calls=AUTO)
    .shuffle(Batch_size*8, reshuffle_each_iteration=True)
    .batch(Batch_size, drop_remainder=False)
    .prefetch(AUTO)
)
# Split each global batch across the replicas of the active strategy.
dataset = strategy.experimental_distribute_dataset(dataset)

In [None]:
# Validation input pipeline: same decoding as training, but no augmentation
# and no shuffling; distributed across replicas like the training set.
val_dataset = strategy.experimental_distribute_dataset(
    tf.data.Dataset.from_tensor_slices((val_path, val_sequences))
    .map(image_processing, num_parallel_calls=AUTO)
    .cache()
    .batch(Batch_size, drop_remainder=False)
    .prefetch(AUTO)
)

In [None]:
class Encoder(tf.keras.Model):
    """Image encoder: frozen InceptionV3 features followed by a trainable projection.

    Args:
        embed_dims: output size of the dense projection applied to every
            spatial feature vector.
    """
    def __init__(self, embed_dims):
        super().__init__()
        # Convolutional trunk only (no classification head), ImageNet weights.
        self.V3 = tf.keras.applications.InceptionV3(include_top=False, weights="imagenet")
        self.model = tf.keras.Model(self.V3.input, self.V3.layers[-1].output)
        # The backbone is a fixed feature extractor; only the Dense layer trains.
        self.model.trainable = False
        self.Dense = tf.keras.layers.Dense(embed_dims)

    def call(self, x):
        """Map a batch of images to (batch, positions, embed_dims) features."""
        x = self.model(x)
        # Flatten the spatial grid into one "positions" axis. Use the static
        # batch size when it is known and fall back to the dynamic one when the
        # function is traced with an unknown batch dimension (static size None),
        # which previously made the reshape fail.
        batch = x.shape[0] if x.shape[0] is not None else tf.shape(x)[0]
        x = tf.reshape(x, (batch, -1, x.shape[-1]))
        x = self.Dense(x)
        x = tf.nn.relu(x)
        return x

In [None]:
class Attention(tf.keras.Model):
    """Additive (tanh-scored) attention over the encoder's feature positions."""
    def __init__(self, units):
        super().__init__()
        self.units = units
        self.W1 = tf.keras.layers.Dense(self.units)  # projects encoder features
        self.W2 = tf.keras.layers.Dense(self.units)  # projects the decoder state
        self.V = tf.keras.layers.Dense(1)            # collapses to one score per position

    def call(self, x, hidden):
        """Return (context_vector, weights) for features `x` given state `hidden`.

        The weights are a softmax over axis 1 of `x`; the context vector is
        the weight-averaged sum of `x` along that axis.
        """
        # Insert a length-1 axis so the state broadcasts against every position.
        query = tf.expand_dims(hidden, 1)
        score = self.V(tf.nn.tanh(self.W1(x) + self.W2(query)))
        weights = tf.nn.softmax(score, axis=1)
        context_vector = tf.reduce_sum(weights * x, axis=1)
        return context_vector, weights

In [None]:
class Decoder(tf.keras.Model):
    """Single-step caption decoder: attention + embedding + GRU + two dense layers."""
    def __init__(self, units, embed_dims, vocab_size):
        super().__init__()
        self.units = units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embed_dims)
        self.Dense1 = tf.keras.layers.Dense(self.units)
        self.Dense2 = tf.keras.layers.Dense(vocab_size)
        self.gru = tf.keras.layers.GRU(
            self.units,
            return_sequences=True,
            return_state=True,
            recurrent_initializer='glorot_uniform',
        )
        self.attention = Attention(self.units)

    def call(self, encoder_output, x, hidden):
        """Run one decoding step.

        Returns `(logits, state, weights)`: vocabulary logits flattened to two
        dimensions, the GRU's final state, and the attention weights.
        """
        context_vector, weights = self.attention(encoder_output, hidden)
        embedded = self.embedding(x)
        # Prepend the attended image context to the token embedding.
        gru_input = tf.concat([tf.expand_dims(context_vector, 1), embedded], axis=-1)
        # The GRU is called without an initial state; `hidden` only feeds attention.
        output, state = self.gru(gru_input)
        projected = self.Dense1(output)
        flat = tf.reshape(projected, (-1, projected.shape[-1]))
        return self.Dense2(flat), state, weights

    def reset_state(self, batch_size):
        """Return an all-zero (batch_size, units) initial hidden state."""
        return tf.zeros(shape=(batch_size, self.units))

In [None]:
with strategy.scope():
    optimizer = tf.keras.optimizers.Adam()
    # reduction='none' keeps one loss value per example so padding can be
    # masked before averaging.
    loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')

    def loss_function(target, output):
        """Masked cross-entropy for one decoding step.

        Positions whose target id is 0 (padding) contribute nothing; the
        remaining per-example losses are summed and divided by the global
        batch size `Batch_size`.
        """
        per_example = loss_object(target, output)
        keep = tf.cast(tf.math.not_equal(target, 0), dtype=per_example.dtype)
        return tf.nn.compute_average_loss(per_example * keep, global_batch_size=Batch_size)

In [None]:
with strategy.scope():
    @tf.function
    def train_step(img, target):
        """Run one teacher-forced training step on a per-replica batch.

        Returns `(loss, total_loss)`: the loss summed over decoding steps and
        that sum divided by the target sequence length.
        """
        start_id = tokenizer.word_index['<start>']
        seq_len = target.shape[1]
        loss = 0
        hidden = decoder.reset_state(batch_size=img.shape[0])
        dec_input = tf.expand_dims([start_id]*target.shape[0], 1)
        with tf.GradientTape() as tape:
            enc_output = encoder(img)
            # Position 0 is the <start> token already fed in, so predict from 1 on.
            for step in range(1, seq_len):
                output, hidden, _ = decoder(enc_output, dec_input, hidden)
                step_target = target[:, step]
                loss += loss_function(step_target, output)
                # Teacher forcing: the ground-truth token is the next input.
                dec_input = tf.expand_dims(step_target, 1)
        total_loss = loss/seq_len
        trainable_variables = encoder.trainable_variables+decoder.trainable_variables
        gradients = tape.gradient(loss, trainable_variables)
        optimizer.apply_gradients(zip(gradients, trainable_variables))
        return loss, total_loss

    def distributed_train_step(inputs):
        """Unpack a distributed (img, target) batch and run train_step on every replica."""
        img, target = inputs
        return strategy.run(train_step, args=(img, target))

In [None]:
with strategy.scope():
    # Metric object kept from the original cell; the visible cells never update it.
    valid_loss = tf.keras.metrics.Sum()

    @tf.function
    def val_step(img, target):
        """Compute the teacher-forced loss on a per-replica validation batch.

        No gradients are needed for validation, so the forward pass runs
        without a GradientTape (the tape previously recorded every op for
        nothing).

        Returns `(loss, batch_loss)`: the loss summed over decoding steps and
        that sum divided by the target sequence length.
        """
        loss=0
        hidden = decoder.reset_state(batch_size=img.shape[0])
        dec_input = tf.expand_dims([tokenizer.word_index['<start>']]*target.shape[0], 1)
        enc_output = encoder(img)
        # Position 0 is the <start> token already fed in, so predict from 1 on.
        for i in range(1, target.shape[1]):
            output, hidden, _ = decoder(enc_output, dec_input, hidden)
            loss+=loss_function(target[:, i], output)
            # Teacher forcing: the ground-truth token is the next input.
            dec_input = tf.expand_dims(target[:, i], 1)
        batch_loss = loss/(target.shape[1])
        return loss, batch_loss

    @tf.function
    def distributed_val_step(inputs):
        """Unpack a distributed (img, target) batch and run val_step on every replica."""
        (img, target) = inputs
        loss = strategy.run(val_step, args=(img, target))
        return loss

In [None]:
with strategy.scope():
    # One entry per epoch: mean per-batch loss for training / validation.
    loss_record=[]
    val_loss_record=[]
    epochs=10
    # Models are created inside the strategy scope so their variables are
    # placed/mirrored according to the active strategy.
    encoder = Encoder(embed_dims)
    decoder = Decoder(units, embed_dims, vocab_size)
    # Epochs are numbered from 1; `epochs + 1` so that all `epochs` epochs run
    # (the previous bound stopped one epoch short).
    for epoch in range(1, epochs + 1):
        start = time.time()

        # ---- training pass ----
        epoch_loss=0
        train_batches=0
        for (batch, (inputs_batch)) in enumerate(dataset):
            _, replica_loss = distributed_train_step(inputs_batch)
            # Sum the per-replica losses into one value for the global batch.
            total_loss = strategy.reduce(tf.distribute.ReduceOp.SUM, replica_loss, axis=None)
            epoch_loss+=total_loss
            train_batches+=1
            if batch%100 ==0:
                print('Train: Epoch {} Batch {} Loss {}'.format(epoch, batch, total_loss))
        # Record once per epoch (not once per batch), averaged over the batches
        # actually seen; max() guards against an empty dataset.
        loss_record.append(epoch_loss/max(train_batches, 1))
        print('Epoch {} Loss {}'.format(epoch, epoch_loss))
        print('Total time used for 1 epoch {} sec\n'.format(time.time()-start))

        # ---- validation pass ----
        val_epoch_loss=0
        val_batches=0
        for (batch, (inputs_batch)) in enumerate(val_dataset):
            _, replica_loss = distributed_val_step(inputs_batch)
            total_loss = strategy.reduce(tf.distribute.ReduceOp.SUM, replica_loss, axis=None)
            val_epoch_loss+=total_loss
            val_batches+=1
            if batch%5 ==0:
                print('Val: Epoch {} Batch {} Loss {}'.format(epoch, batch, total_loss))
        val_loss_record.append(val_epoch_loss/max(val_batches, 1))

In [None]:
def evaluate(image):
    """Generate a caption for one image by sampling from the decoder.

    Args:
        image: path of the image file, as accepted by `image_processing`.

    Returns:
        `(generator, attention)`: the list of generated words (ending with
        '<end>' if it was sampled before the length limit) and an array with
        one row of attention weights per generated word.

    Sampling uses `tf.random.categorical`, so repeated calls can return
    different captions.
    """
    max_steps = train_sequences.shape[1]
    hidden = decoder.reset_state(batch_size=1)
    # Preprocess once and add the batch axis the encoder expects.
    img_input = tf.expand_dims(image_processing(image), 0)
    encoded_output = encoder(img_input)
    # One row per step; the width follows the encoder's number of positions
    # instead of a hard-coded 64.
    attention = np.zeros((max_steps, encoded_output.shape[1]))
    decoder_input = tf.expand_dims([tokenizer.word_index['<start>']], 0)
    generator = []
    for i in range(max_steps):
        # Feed the new state back in; previously `hidden` stayed at its zero
        # initial value for every step, unlike during training.
        output, hidden, weights = decoder(encoded_output, decoder_input, hidden)
        attention[i] = tf.reshape(weights, (-1,)).numpy()
        output_id = int(tf.random.categorical(output, 1)[0][0].numpy())
        # Fall back to the tokenizer's OOV marker for an id without a word.
        word = tokenizer.index_word.get(output_id, '<unk>')
        generator.append(word)
        if word == '<end>':
            break
        decoder_input = tf.expand_dims([output_id], 0)

    # Trim unused rows on every exit path (the early '<end>' return used to
    # hand back the full zero-padded array).
    attention = attention[:len(generator),:]
    return generator, attention

In [None]:
# Every image that is not in the training keys. Only `keys` is subtracted, so
# the validation images are part of this list as well.
test_list = list(set(image_caption) - set(keys))

In [None]:
def random_generator():
    """Caption one randomly chosen image from `test_list`.

    Prints the reference captions (comma-joined) and the generated caption,
    then returns the original image as a PIL image.
    """
    image_path = test_list[np.random.randint(0, len(test_list))]
    reference = ','.join(image_caption[image_path])
    # The attention weights returned by evaluate() are not used here.
    words, _ = evaluate(image_path)
    print ('Real Caption:', reference)
    print('')
    print ('Prediction Caption:', ' '.join(words))
    # Re-read the raw file so the returned picture is the un-resized original.
    pixels = tf.image.decode_jpeg(tf.io.read_file(image_path), channels=3)
    return tf.keras.preprocessing.image.array_to_img(pixels)

In [None]:
# Caption one randomly picked image from test_list; the returned image is the cell output.
random_generator()

In [None]:
# Caption one randomly picked image from test_list; the returned image is the cell output.
random_generator()

In [None]:
# Caption one randomly picked image from test_list; the returned image is the cell output.
random_generator()

In [None]:
# Caption one randomly picked image from test_list; the returned image is the cell output.
random_generator()

In [None]:
# Caption one randomly picked image from test_list; the returned image is the cell output.
random_generator()

In [None]:
# Caption one randomly picked image from test_list; the returned image is the cell output.
random_generator()

In [None]:
# Caption one randomly picked image from test_list; the returned image is the cell output.
random_generator()

In [None]:
# Caption one randomly picked image from test_list; the returned image is the cell output.
random_generator()

In [None]:
def plota(image, generator, attention):
    """Plot the image once per generated word with that word's attention overlaid.

    Args:
        image: path of the image file, opened with PIL.
        generator: list of generated words; one subplot is drawn per word.
        attention: array with one row of attention weights per word; each row
            is resized to an 8x8 grid for display.

    Nothing is drawn when `generator` is empty.
    """
    len_gen = len(generator)
    if len_gen == 0:
        return

    image_np = np.array(Image.open(image))
    fig = plt.figure(figsize=(10,10))
    # Smallest square grid that holds every word (the old len//2 x len//2 grid
    # had too few cells for short captions and zero cells for a single word).
    grid = int(np.ceil(np.sqrt(len_gen)))
    for l in range(len_gen):
        # Index with the loop variable `l` and title with `generator[l]`; the
        # previous `i` / `result` names are not defined in this function.
        temp_att = np.resize(attention[l], (8, 8))
        ax = fig.add_subplot(grid, grid, l+1)
        ax.set_title(generator[l])
        img = ax.imshow(image_np)
        # Stretch the coarse attention map over the full image extent.
        ax.imshow(temp_att, cmap='gray', alpha=0.6, extent=img.get_extent())

    plt.tight_layout()
    plt.show()