# <center>Image Captioning <center/>

111062697 吳律穎

---

Requirements
1. You can use any model architectures (or any code above) you want, as long as accomplishing the goal.
2. You should train your own model architecture. In other words, except the feature extractor part, do not load the model or any pre-trained weights directly from other sources.
    - You can use pretrained inception_v3 as feature extractor; however, it's not recommended since inception is pretrained on ImageNet, where the image pattern is quite different to English words.
3. You should use the first 100,000 images as training data, the next 20,000 as validation data, and the rest (final 20,000) as testing data.
    - `spec_train_val`.txt contains the labels of only first 120,000 images.
4. Only if the whole word matches exactly does it count as correct.
5. You need to predict the answer to the testing data and write them in a file.
6. Your testing accuracy should be at least 90% in validation set.

# setup

In [1]:
import tensorflow as tf
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        # Currently, memory growth needs to be the same across GPUs
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        tf.config.experimental.set_visible_devices(gpus[0], 'GPU')
        logical_gpus = tf.config.experimental.list_logical_devices('GPU')
        print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
    except RuntimeError as e:
        # Memory growth must be set before GPUs have been initialized
        print(e)

1 Physical GPUs, 1 Logical GPUs


In [2]:
import matplotlib.pyplot as plt

# Scikit-learn includes many helpful utilities
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

import glob
import re
import numpy as np
import os
import time
from tqdm import tqdm

# Data preparing

### load data

In [3]:
all_captions = []
all_img_name_vector = []

txt_path = './dataset/words_captcha/spec_train_val.txt'
with open(txt_path, 'r') as fin:
    for line in fin:
        image_name, caption = line.strip().split()
        all_captions.append('<start> ' + ' '.join(caption) + ' <end>')
        all_img_name_vector.append(f'./dataset/words_captcha/{image_name}.png')
        
train_captions, img_name_vector = shuffle(all_captions, all_img_name_vector, random_state=1)


In [4]:
img_name_train, img_name_val = img_name_vector[:100000], img_name_vector[100000:]
cap_train, cap_val = train_captions[:100000], train_captions[100000:]

In [5]:
img_name_test = []
for i in range(120000, 140000):
    img_name_test.append(f'./dataset/words_captcha/a{i}.png')
# img_name_test = shuffle(img_name_test, random_state=1)

# for i in range(5):
#     print(img_name_test[i])
# print('------------------------------------------------------')
# for i in range(5):
#     print(img_name_train[i])
# print('------------------------------------------------------')
# for i in range(5):
#     print(img_name_val[i])
# print('------------------------------------------------------')

In [6]:
len(img_name_train), len(img_name_val), len(img_name_test)

(100000, 20000, 20000)

### tokenize the captions

In [7]:
def calc_max_length(tensor):
    return max(len(t) for t in tensor)

In [8]:
print(train_captions[0])
print(cap_train[0])

<start> i r a q <end>
<start> i r a q <end>


In [9]:
tokenizer = tf.keras.preprocessing.text.Tokenizer(oov_token=" ", filters='!"#$%&()*+.,-/:;=?@[\]^_`{|}~ ')
tokenizer.fit_on_texts(train_captions)

cap_train = tokenizer.texts_to_sequences(cap_train)
cap_train = tf.keras.preprocessing.sequence.pad_sequences(cap_train, padding='post')

cap_val = tokenizer.texts_to_sequences(cap_val)
cap_val = tf.keras.preprocessing.sequence.pad_sequences(cap_val, padding='post')

max_length_test = calc_max_length(cap_val)
max_length_train = calc_max_length(cap_train)
max_length = max(max_length_test, max_length_train)
print(max_length_train)
print(max_length_test)

print(max_length)


7
7
7


### parameter

In [10]:
BATCH_SIZE = 50
BUFFER_SIZE = 5000
embedding_dim = 256
units = 512
vocab_size = len(tokenizer.word_index)+1
STEPS = len(img_name_train) // BATCH_SIZE
EPOCHS = 10
LEARNING_RATE = 1e-4

### build dataset

In [11]:
def map_func(img_name, cap):
    img = tf.io.read_file(img_name)
    img = tf.image.decode_png(img, channels=3)
    img = tf.image.resize(img, (244, 244))
    img = tf.keras.applications.resnet50.preprocess_input(img)
    return img, cap

In [12]:
dataset_train = tf.data.Dataset.from_tensor_slices((img_name_train, cap_train))\
                               .map(map_func, num_parallel_calls=tf.data.experimental.AUTOTUNE)\
                               .shuffle(BUFFER_SIZE)\
                               .batch(BATCH_SIZE, drop_remainder=True)\
                               .prefetch(tf.data.experimental.AUTOTUNE)

dataset_val = tf.data.Dataset.from_tensor_slices((img_name_val, cap_val))\
                               .map(map_func, num_parallel_calls=tf.data.experimental.AUTOTUNE)\
                               .shuffle(BUFFER_SIZE)\
                               .batch(BATCH_SIZE, drop_remainder=True)\
                               .prefetch(tf.data.experimental.AUTOTUNE)

# Model

## Restnet50 Encoder

In [13]:
image_model = tf.keras.applications.ResNet50(include_top=False, weights='imagenet')
new_input = image_model.input
hidden_layer = image_model.layers[-1].output

feature_extractor = tf.keras.Model(new_input, hidden_layer)

In [14]:
class CNN_Encoder(tf.keras.Model):
    def __init__(self, embedding_dim):
        super(CNN_Encoder, self).__init__()
        self.fc = tf.keras.layers.Dense(embedding_dim)
        self.feature_extractor = feature_extractor

    def call(self, x):
        # x shape after passing through fc == (batch_size, 15, embedding_dim)
        x = self.feature_extractor(x)
        x = tf.reshape(x, (x.shape[0], -1, x.shape[3]))
        x = self.fc(x)
        x = tf.nn.relu(x)
        return x

In [15]:
encoder = CNN_Encoder(embedding_dim)

## RNN Decoder

In [16]:
class BahdanauAttention(tf.keras.Model):
    def __init__(self, units):
        super(BahdanauAttention, self).__init__()
        self.W1 = tf.keras.layers.Dense(units)
        self.W2 = tf.keras.layers.Dense(units)
        self.V = tf.keras.layers.Dense(1)

    def call(self, features, hidden):
        # features(CNN_encoder output) shape == (batch_size, 64, embedding_dim)

        # hidden shape == (batch_size, hidden_size)
        # hidden_with_time_axis shape == (batch_size, 1, hidden_size)
        hidden_with_time_axis = tf.expand_dims(hidden, 1)

        # score shape == (batch_size, 64, hidden_size)
        score = tf.nn.tanh(self.W1(features) + self.W2(hidden_with_time_axis))

        # attention_weights shape == (batch_size, 64, 1)
        # you get 1 at the last axis because you are applying score to self.V
        attention_weights = tf.nn.softmax(self.V(score), axis=1)

        # context_vector shape after sum == (batch_size, hidden_size)
        context_vector = attention_weights * features
        context_vector = tf.reduce_sum(context_vector, axis=1)

        return context_vector, attention_weights

In [17]:
class RNN_Decoder(tf.keras.Model):
    def __init__(self, embedding_dim, units, vocab_size):
        super(RNN_Decoder, self).__init__()
        self.units = units

        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.gru = tf.keras.layers.GRU(self.units, return_sequences=True, return_state=True, recurrent_initializer="glorot_uniform")
        self.fc1 = tf.keras.layers.Dense(self.units)
        self.fc2 = tf.keras.layers.Dense(vocab_size)

        self.attention = BahdanauAttention(self.units)

    def call(self, x, features, hidden):
        # defining attention as a separate model
        context_vector, attention_weights = self.attention(features, hidden)

        # x shape after passing through embedding == (batch_size, 1, embedding_dim)
        x = self.embedding(x)

        # x shape after concatenation == (batch_size, 1, embedding_dim + hidden_size)
        x = tf.concat([tf.expand_dims(context_vector, 1), x], axis=-1)

        # passing the concatenated vector to the GRU
        output, state = self.gru(x)

        # shape == (batch_size, max_length, hidden_size)
        x = self.fc1(output)

        # x shape == (batch_size * max_length, hidden_size)
        x = tf.reshape(x, (-1, x.shape[2]))

        # output shape == (batch_size * max_length, vocab)
        x = self.fc2(x)

        return x, state, attention_weights

    def reset_state(self, batch_size):
        return tf.zeros((batch_size, self.units))

In [18]:
decoder = RNN_Decoder(embedding_dim, units, vocab_size)

# Train Model

In [19]:
optimizer = tf.keras.optimizers.Adam(learning_rate=LEARNING_RATE)
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction='none')


def loss_function(real, pred):
    mask = tf.math.logical_not(tf.math.equal(real, 0))
    loss_ = loss_object(real, pred)

    mask = tf.cast(mask, dtype=loss_.dtype)
    loss_ *= mask

    return tf.reduce_mean(loss_)

In [20]:
checkpoint_path = './checkpoints/Resnet50V2/'
ckpt = tf.train.Checkpoint(encoder=encoder, decoder=decoder, optimizer=optimizer)
ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_path, max_to_keep=5)

In [21]:
start_epoch = 0
if ckpt_manager.latest_checkpoint:
    start_epoch = int(ckpt_manager.latest_checkpoint.split('-')[-1])
    ckpt.restore(ckpt_manager.latest_checkpoint).expect_partial()

In [22]:
@tf.function
def train_step(img_tensor, target):
    loss = 0

    # initializing the hidden state for each batch
    # because the captions are not related from image to image
    hidden = decoder.reset_state(batch_size=target.shape[0])
    dec_input = tf.expand_dims([tokenizer.word_index['<start>']] * BATCH_SIZE, 1)

    with tf.GradientTape() as tape:
        features = encoder(img_tensor)

        for i in range(1, target.shape[1]):
            # passing the features through the decoder
            predictions, hidden, _ = decoder(dec_input, features, hidden)

            loss += loss_function(target[:, i], predictions)

            # using teacher forcing
            dec_input = tf.expand_dims(target[:, i], 1)

    mean_loss = (loss / int(target.shape[1]))

    trainable_variables = encoder.trainable_variables + decoder.trainable_variables

    gradients = tape.gradient(loss, trainable_variables)

    optimizer.apply_gradients(zip(gradients, trainable_variables))

    return mean_loss

In [23]:
def predict(img_tensor):
    batch_size = img_tensor.shape[0]
    hidden = decoder.reset_state(batch_size=batch_size)
    dec_input = tf.expand_dims([tokenizer.word_index['<start>']] * batch_size, 1)
    features = encoder(img_tensor)
    
    result = tf.expand_dims([tokenizer.word_index['<start>']] * batch_size, 1)
    for _ in range(max_length-1): #前面有start了，所以只需max_length-1
        predictions, hidden, _ = decoder(dec_input, features, hidden)
        predicted_id = tf.argmax(predictions, axis=1).numpy()
        dec_input = tf.expand_dims(predicted_id, 1)
        result = tf.concat([result, predicted_id.reshape((batch_size, 1))], axis=1)

    return result

In [24]:
def postprocess(segs):
    result_list = []
    for seq in segs:
        result = ''
        for s in seq[1:]:
            if s == tokenizer.word_index['<end>']:
                break
            result += tokenizer.index_word[s]
        result_list.append(result)
    return result_list

In [25]:
def evaluate(dataset_valid):
    sample_count = 0
    correct_count = 0
    for img_tensor, target in dataset_valid:
        pred_list = postprocess(predict(img_tensor).numpy())
        real_list = postprocess(target.numpy())
        for pred, real in zip(pred_list, real_list):
            sample_count += 1
            if pred == real:
                correct_count += 1

    return correct_count / sample_count

In [26]:
start = time.time()
loss_plot = []
for epoch in range(start_epoch, EPOCHS):
    loss = 0
    pbar = tqdm(dataset_train, total=STEPS, desc=f'Epoch {epoch + 1:2d}')
    for (step, (img_tensor, target)) in enumerate(pbar):
        loss += train_step(img_tensor, target)
        pbar.set_postfix({'loss': loss.numpy() / (step + 1)})

    loss_plot.append(loss / STEPS)
    ckpt_manager.save()

    score = evaluate(dataset_val)
    print(f'Validation accuracy: {score:.2f}')
    
print('Time taken for {} epoch {} sec\n'.format(EPOCHS - start_epoch, time.time() - start))

Epoch  1: 100%|██████████| 2000/2000 [11:25<00:00,  2.92it/s, loss=0.609]


Validation accuracy: 0.95


Epoch  2: 100%|██████████| 2000/2000 [12:07<00:00,  2.75it/s, loss=0.0291]


Validation accuracy: 0.96


Epoch  3: 100%|██████████| 2000/2000 [12:30<00:00,  2.66it/s, loss=0.0189]


Validation accuracy: 0.97


Epoch  4: 100%|██████████| 2000/2000 [12:55<00:00,  2.58it/s, loss=0.0165]


Validation accuracy: 0.84


Epoch  5: 100%|██████████| 2000/2000 [12:55<00:00,  2.58it/s, loss=0.0131]


Validation accuracy: 0.99


Epoch  6: 100%|██████████| 2000/2000 [13:29<00:00,  2.47it/s, loss=0.0126] 


Validation accuracy: 0.98


Epoch  7: 100%|██████████| 2000/2000 [13:47<00:00,  2.42it/s, loss=0.00721]


Validation accuracy: 0.94


Epoch  8: 100%|██████████| 2000/2000 [13:57<00:00,  2.39it/s, loss=0.00822]


Validation accuracy: 0.99


Epoch  9: 100%|██████████| 2000/2000 [13:58<00:00,  2.39it/s, loss=0.00698]


Validation accuracy: 0.99


Epoch 10: 100%|██████████| 2000/2000 [13:59<00:00,  2.38it/s, loss=0.00576]


Validation accuracy: 0.98
Time taken for 10 epoch 8788.310842752457 sec



# Predict Testing Data

In [27]:
ckpt.restore('./checkpoints/Resnet50V2/ckpt-9').expect_partial()

<tensorflow.python.checkpoint.checkpoint.CheckpointLoadStatus at 0x23168441990>

In [28]:
def map_func_test(img_name):
    img = tf.io.read_file(img_name)
    img = tf.image.decode_png(img, channels=3)
    img = tf.image.resize(img, (244, 244))
    img = tf.keras.applications.resnet50.preprocess_input(img)
    return img, img_name

In [29]:
dataset_test = tf.data.Dataset.from_tensor_slices((img_name_test))\
                               .map(map_func_test, num_parallel_calls=tf.data.experimental.AUTOTUNE)\
                               .batch(BATCH_SIZE, drop_remainder=True)\
                               .prefetch(tf.data.experimental.AUTOTUNE)

In [30]:
import re

with open('./Lab12-2_111062697.txt', 'w') as fout:
    for step, (img_tensor, img_name) in enumerate(tqdm(dataset_test)):
        pred_list = postprocess(predict(img_tensor).numpy())
        for path, pred in zip(img_name, pred_list):
            path = path.numpy().decode('utf-8')
            name = re.search('(a[0-9]+)', path).group(1)
            fout.write(f'{name} {pred}\n')

100%|██████████| 400/400 [01:23<00:00,  4.78it/s]


## Report

我以 ResNet50 作為 feature extractor，並使用 pretrained on ImageNet 的 weight，來當 pretrain weight，其餘的部分則與助教的相同。由於看到 Requirements 中說 ImageNet 作為 pretrain weight 效果不好，所以將 ResNet50 的 trainable_variables 也納入需要訓練並更新的參數中。從訓練結果可以看到，在第一個 Epoch 就有 0.95 的 Validation accuracy，第五個 Epoch 便達到 0.99 ，最後我選擇了同樣是 0.99 的第十個 Epoch 來產生 testset 的結果。