<h3>What is image captioning?</h3>
<p>簡單來說就是「給機器一張圖，機器會輸出一段文字來描述這張圖」</p>

步驟如下:
1. 準備圖片與文字資料
 * 圖片:load images、resize image、normalization
 * 文字:tokenizer、create dictionary、sequence padding
 * 將資料分成train、val和test
 * 資料分段 batch size

2. Model
 * CNN: feature extraction


In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf

from keras.models import Sequential
from keras.layers import Dense,Activation, Dropout, Input
from keras.layers import Conv2D,MaxPool2D
from keras.utils import plot_model

import tensorflow.contrib.layers as layers


  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


"""
CNN可按照喜好使用vgg16、vgg19、resnet50、inception...等，已被train好的模型，這裡將使用vgg16。
"""

# 載入CNN Model as Encoder
1. 用tensorflow手刻模型
2. 用Keras直接載入

In [2]:
import keras
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout, Flatten
from keras.layers import Conv2D, MaxPool2D
from keras.utils import plot_model

from keras.applications.vgg16 import VGG16
from keras.preprocessing.image import load_img
from keras.preprocessing.image import img_to_array
from keras.applications.vgg16 import preprocess_input
from keras.models import Model

In [3]:
# Hand-built VGG16 convolutional stack in raw TensorFlow.
batch_size = 32
image_shape = [224, 224, 3]

kernel_size = (3, 3)
strides = (1, 1)

# Fixed-size input batch: [batch_size, 224, 224, 3].
images = tf.placeholder(tf.float32, shape=[batch_size] + image_shape)


def _vgg_conv(inputs, filters, name):
    """Apply one 'same'-padded ReLU convolution using the shared kernel_size/strides."""
    return tf.layers.conv2d(inputs, filters, kernel_size, strides,
                            padding='same',
                            activation=tf.nn.relu,
                            use_bias=True,
                            name=name)


def _vgg_pool(inputs, name):
    """Apply one 2x2 max-pooling layer with stride 2."""
    return tf.layers.max_pooling2d(inputs, pool_size=2, strides=2, name=name)


with tf.variable_scope("vgg16"):
    # Block 1: two 64-filter convolutions, then pooling.
    conv1_1_feats = _vgg_conv(images, 64, 'conv1_1')
    conv1_2_feats = _vgg_conv(conv1_1_feats, 64, 'conv1_2')
    pool1_feats = _vgg_pool(conv1_2_feats, 'pool1')

    # Block 2: two 128-filter convolutions, then pooling.
    conv2_1_feats = _vgg_conv(pool1_feats, 128, 'conv2_1')
    conv2_2_feats = _vgg_conv(conv2_1_feats, 128, 'conv2_2')
    pool2_feats = _vgg_pool(conv2_2_feats, 'pool2')

    # Block 3: three 256-filter convolutions, then pooling.
    conv3_1_feats = _vgg_conv(pool2_feats, 256, 'conv3_1')
    conv3_2_feats = _vgg_conv(conv3_1_feats, 256, 'conv3_2')
    conv3_3_feats = _vgg_conv(conv3_2_feats, 256, 'conv3_3')
    pool3_feats = _vgg_pool(conv3_3_feats, 'pool3')

    # Block 4: three 512-filter convolutions, then pooling.
    conv4_1_feats = _vgg_conv(pool3_feats, 512, 'conv4_1')
    conv4_2_feats = _vgg_conv(conv4_1_feats, 512, 'conv4_2')
    conv4_3_feats = _vgg_conv(conv4_2_feats, 512, 'conv4_3')
    pool4_feats = _vgg_pool(conv4_3_feats, 'pool4')

    # Block 5: three 512-filter convolutions; no pooling follows, so the
    # spatial grid of conv5_3 is kept for the attention mechanism.
    conv5_1_feats = _vgg_conv(pool4_feats, 512, 'conv5_1')
    conv5_2_feats = _vgg_conv(conv5_1_feats, 512, 'conv5_2')
    conv5_3_feats = _vgg_conv(conv5_2_feats, 512, 'conv5_3')

In [4]:
# Flatten the 14 x 14 spatial grid of conv5_3 into 196 region vectors of size 512.
reshaped_conv5_3_feats = tf.reshape(conv5_3_feats,
                                    shape=[batch_size, 14 * 14, 512])

In [5]:
# Load VGG16 through Keras (ImageNet weights, classifier head removed).
image_model = tf.keras.applications.VGG16(weights='imagenet',
                                          include_top=False)

# Wrap the network so that it maps its own input to its last layer's output.
new_input = image_model.input
hidden_layer = image_model.layers[-1].output
image_features_extract_model = tf.keras.Model(inputs=new_input,
                                              outputs=hidden_layer)

# reshaped_feats = tf.reshape(hidden_layer,(32,196,512))
# reshaped_feats

<h3>RNN Model</h3>

In [6]:
# --- Hyperparameters of the caption decoder ---

# Vocabulary / embedding
vocab_size = 5000
dim_embedding = 512
max_caption_length = 20

# LSTM
num_lstm_units = 512
# Alias of vocab_size kept for the code that reads `vocabulary_size`; deriving
# it from vocab_size keeps the embedding table and the decoder output layer
# the same size.
vocabulary_size = vocab_size

# Attention contexts: each image yields 196 regions, and every region is
# represented by one context vector of dim_ctx values.
num_ctx = 196
dim_ctx = 512

# Regularisation
fc_drop_rate = 0.5
lstm_drop_rate = 0.3
attention_loss_factor = 0.01


In [7]:
# Uniform initializer over [-scale, +scale].
fc_kernel_initializer_scale = 0.08
fc_kernel_initializer = tf.random_uniform_initializer(
    -fc_kernel_initializer_scale,
    fc_kernel_initializer_scale)

# Global train/inference switch read by the graph-building code.
is_train = True

# L2 regularizer, created only when training with a positive scale.
fc_kernel_regularizer_scale = 1e-4
fc_kernel_regularizer = (
    tf.contrib.layers.l2_regularizer(scale=fc_kernel_regularizer_scale)
    if is_train and fc_kernel_regularizer_scale > 0
    else None
)
    

# activity_regularizer = tf.contrib.layers.l1_regularizer(scale = 0.0, scope = None)

In [8]:
# The reshaped conv5_3 features serve as the attention contexts.
conv_feats = contexts = reshaped_conv5_3_feats

# Caption word ids and their masks: [batch_size, max_caption_length] (32 x 20).
sentences = tf.placeholder(tf.int32, [batch_size, max_caption_length])
masks = tf.placeholder(tf.float32, [batch_size, max_caption_length])

# Previous LSTM memory and output: [batch_size, num_lstm_units] (32 x 512).
last_memory = tf.placeholder(tf.float32, [batch_size, num_lstm_units])
last_output = tf.placeholder(tf.float32, [batch_size, num_lstm_units])

# Previous word id, one per example: [batch_size] (32).
last_word = tf.placeholder(tf.int32, [batch_size])

In [9]:
"""以context_mean來初始化"""
def initialize(cont_mean):
    """Build the initial LSTM memory and output from the mean context vector.

    Args:
        cont_mean: mean of the context vectors over the region axis.

    Returns:
        A ``(memory, output)`` tuple; each is a linear projection of the
        dropped-out mean context to ``num_lstm_units`` units.
    """
    # Dropout uses fc_drop_rate and is only active when is_train is True.
    context_mean = tf.layers.dropout(inputs=cont_mean,
                                     rate=fc_drop_rate,
                                     training=is_train)

    # Both projections must read the dropped-out tensor; feeding `cont_mean`
    # directly would leave the dropout op above disconnected from the graph.
    memory = tf.layers.dense(context_mean,
                             units=num_lstm_units,
                             activation=None,
                             use_bias=True,
                             trainable=is_train,
                             activity_regularizer=None)

    output = tf.layers.dense(context_mean,
                             units=num_lstm_units,
                             activation=None,
                             use_bias=True,
                             trainable=is_train,
                             activity_regularizer=None)
    return memory, output


In [10]:
# attention
def attend(contexts, output):
    """Compute attention weights over the image regions.

    Steps:
      1. compute a match score for every region;
      2. pass the scores through a softmax to obtain alpha, which can be read
         as a probability distribution over regions.

    Args:
        contexts: region features (conv_feats), 32 x 196 x 512.
        output: the previous LSTM output, 32 x 512.

    Returns:
        alpha, the attention weights, 32 x 196.
    """
    # `training` must be passed explicitly: tf.layers.dropout defaults to
    # training=False, which turns the op into an identity.
    reshaped_context = tf.reshape(contexts, [-1, dim_ctx])  # 6272 x 512
    reshaped_context = tf.layers.dropout(reshaped_context,
                                         rate=fc_drop_rate,
                                         training=is_train)
    output = tf.layers.dropout(output,
                               rate=fc_drop_rate,
                               training=is_train)  # 32 x 512

    # One score per region, computed from the region feature alone.
    logits1 = tf.layers.dense(reshaped_context,
                              units=1,
                              activation=None,
                              use_bias=False)  # 6272 x 1
    logits1 = tf.reshape(logits1, [-1, num_ctx])  # 32 x 196

    # One score per region, computed from the previous LSTM output.
    logits2 = tf.layers.dense(output,
                              units=num_ctx,
                              activation=None,
                              use_bias=False)  # 32 x 196

    logits = logits1 + logits2  # 32 x 196
    alpha = tf.nn.softmax(logits)  # 32 x 196

    return alpha
  

In [11]:
def decode(expanded_output):
    """Decode the expanded output of the LSTM into word logits.

    Args:
        expanded_output: tensor fed to the output layer.

    Returns:
        Unnormalised scores over the ``vocabulary_size`` words.
    """
    # tf.layers.dropout is an identity unless `training` is set, so pass the
    # same rate and training flag that `initialize` uses.
    expanded_output = tf.layers.dropout(expanded_output,
                                        rate=fc_drop_rate,
                                        training=is_train)

    logits = tf.layers.dense(expanded_output,
                             units=vocabulary_size,
                             activation=None,
                             name='fc')
    return logits

In [12]:
# Word-embedding table: one dim_embedding-sized row per vocabulary entry.
with tf.variable_scope('word_embedding'):
    embedding_matrix = tf.get_variable(
        'weights',
        shape=[vocab_size, dim_embedding],
        initializer=fc_kernel_initializer,
        trainable=is_train)

In [13]:
# LSTM cell wrapped with dropout on its inputs, outputs and state; every keep
# probability is 1 - lstm_drop_rate.
lstm = tf.nn.rnn_cell.DropoutWrapper(
    tf.nn.rnn_cell.LSTMCell(num_lstm_units,
                            initializer=fc_kernel_initializer),
    input_keep_prob=1.0 - lstm_drop_rate,
    output_keep_prob=1.0 - lstm_drop_rate,
    state_keep_prob=1.0 - lstm_drop_rate)


In [14]:
# Derive the starting LSTM state from the mean of the context vectors.
with tf.variable_scope("initialize"):
    context_mean = tf.reduce_mean(conv_feats, axis=1)  # 32 x 512
    initial_memory, initial_output = initialize(context_mean)
    initial_state = (initial_memory, initial_output)  # each 32 x 512

In [15]:
# Accumulators that the unrolled decoding loop appends to, one entry per step.
predictions, alphas = [], []
cross_entropies, predictions_correct = [], []
num_steps = max_caption_length

# Seed the recurrence: the initial memory/output act as the "previous" step,
# and the previous word starts out as id 0 for every example in the batch.
last_memory, last_output = initial_memory, initial_output
last_word = tf.zeros([batch_size], dtype=tf.int32)

# State of the previous cell, as the tuple (last_memory, last_output).
last_state = (last_memory, last_output)


In [18]:
# Unroll the decoder: every caption has 20 words, so num_steps = 20.
# Each step:
#   1. runs attention and records the masked alpha values;
#   2. looks up the previous word's vector in embedding_matrix;
#   3. feeds context + word vector into the LSTM;
#   4. decodes the LSTM output into word logits and accumulates loss terms.
for idx in range(num_steps):
    # Attention mechanism
    with tf.variable_scope('attend', reuse=tf.AUTO_REUSE):
        alpha = attend(contexts, last_output)

        # contexts is 32 x 196 x 512 and alpha is 32 x 196; give alpha a third
        # axis (32 x 196 x 1), weight the contexts with it and sum over the
        # region axis.
        context = tf.reduce_sum(contexts * tf.expand_dims(alpha, 2),
                                axis=1)  # 32 x 512

        # Zero the attention of examples whose caption is masked at this step.
        tiled_masks = tf.tile(tf.expand_dims(masks[:, idx], 1),
                              [1, num_ctx])
        masked_alpha = alpha * tiled_masks
        alphas.append(tf.reshape(masked_alpha, [-1]))

    # Embed the last word
    with tf.variable_scope("word_embedding"):
        word_embed = tf.nn.embedding_lookup(embedding_matrix, last_word)

    # Apply the LSTM to the attended context joined with the last word.
    with tf.variable_scope("lstm"):
        current_input = tf.concat([context, word_embed], 1)
        output, state = lstm(current_input, last_state)
        memory, _ = state

    # Decode the expanded output of the LSTM into a word
    with tf.variable_scope("decode", reuse=tf.AUTO_REUSE):
        expanded_output = tf.concat([output, context, word_embed], axis=1)
        logits = decode(expanded_output)
        probs = tf.nn.softmax(logits)

        prediction = tf.argmax(logits, 1)
        predictions.append(prediction)

        # Per-example cross entropy against this step's ground-truth word,
        # zeroed where the mask is 0.
        cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
            labels=sentences[:, idx],
            logits=logits)
        masked_cross_entropy = cross_entropy * masks[:, idx]
        cross_entropies.append(masked_cross_entropy)

        # A correct prediction contributes its mask value, anything else 0.
        ground_truth = tf.cast(sentences[:, idx], tf.int64)
        prediction_correct = tf.where(
            tf.equal(prediction, ground_truth),
            tf.cast(masks[:, idx], tf.float32),
            tf.cast(tf.zeros_like(prediction), tf.float32))
        predictions_correct.append(prediction_correct)

        # Carry state forward; the next step is fed the ground-truth word.
        last_output = output
        last_memory = memory
        last_state = state
        last_word = sentences[:, idx]

        # NOTE(review): the attend/decode scopes already use AUTO_REUSE, so
        # this call looks redundant; kept as in the original.
        tf.get_variable_scope().reuse_variables()

# Aggregation has to run once, after the loop: stacking inside the loop would
# turn the Python lists into tensors and break `.append` on the next step.
cross_entropies = tf.stack(cross_entropies, axis=1)
cross_entropy_loss = tf.reduce_sum(cross_entropies) / tf.reduce_sum(masks)

# Sum each region's attention over all steps and penalise deviation from 1.
alphas = tf.stack(alphas, axis=1)
alphas = tf.reshape(alphas, [batch_size, num_ctx, -1])
attentions = tf.reduce_sum(alphas, axis=2)
diffs = tf.ones_like(attentions) - attentions
attention_loss = (attention_loss_factor
                  * tf.nn.l2_loss(diffs)
                  / (batch_size * num_ctx))

reg_loss = tf.losses.get_regularization_loss()

total_loss = cross_entropy_loss + attention_loss + reg_loss

# Masked accuracy over all steps.
predictions_correct = tf.stack(predictions_correct, axis=1)
accuracy = tf.reduce_sum(predictions_correct) / tf.reduce_sum(masks)

In [None]:
# Training-run settings: epoch and batch counts, and the directory that
# receives the summary files.
num_epochs, num_batches = 100, 32
summary_dir = './summary/'

def train(sess, train_data):
    """Run the training loop and write summaries to ``summary_dir``.

    Args:
        sess: session used to run the optimisation op.
        train_data: batch provider; this function reads ``num_batches`` and
            calls ``next_batch()`` and ``reset()`` on it.

    NOTE(review): the body refers to ``self``, ``config`` and ``tqdm``, none
    of which are defined or imported in this notebook. It looks like a method
    lifted out of a model class; those names must be bound before it can run.
    """
    import os

    # Create the directory only if it is missing. Training itself must not
    # depend on whether the directory already existed.
    os.makedirs(summary_dir, exist_ok=True)

    train_writer = tf.summary.FileWriter(summary_dir, sess.graph)

    for _ in tqdm(range(num_epochs), desc='epoch'):
        for _ in tqdm(range(train_data.num_batches), desc='batch'):
            batch = train_data.next_batch()
            image_files, sentences, masks = batch
            images = self.image_loader.load_images(image_files)
            feed_dict = {self.images: images,
                         self.sentences: sentences,
                         self.masks: masks}
            _, summary, global_step = sess.run([self.opt_op,
                                                self.summary,
                                                self.global_step],
                                               feed_dict=feed_dict)
            # Checkpoint every config.save_period steps.
            if (global_step + 1) % config.save_period == 0:
                self.save()
            train_writer.add_summary(summary, global_step)
        # Rewind the data provider at the end of each epoch.
        train_data.reset()

    self.save()
    train_writer.close()
    print("Training complete.")


In [None]:
# Driver sketch: build the model, initialise variables, train, then evaluate.
# NOTE(review): `CaptionGenerator`, `config` and `data_for_reward` are not
# defined in this notebook, and `data` is only assigned by the commented-out
# line below, so this cell cannot run as written.
with tf.Session() as sess:
#     data = prepare_train_data(config)
    model = CaptionGenerator(config)
    sess.run(tf.global_variables_initializer())
#     if FLAGS.load:
#         model.load(sess, FLAGS.model_file)
#     if FLAGS.load_cnn:
#         model.load_cnn(sess, FLAGS.cnn_model_file)
    # Finalize the graph before the training and evaluation calls.
    tf.get_default_graph().finalize()
    model.train(sess, data)
    coco, data_reward, vocabulary = data_for_reward(config)
    model.train_eval(sess, coco, data_reward, vocabulary)