In [1]:
import tensorflow as tf
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from scipy.misc import imread, imresize

In [2]:
def set_x(image):
    return tf.transpose(set_y(image))


def set_y(image):
    [height, width, _] = image.get_shape().as_list()
    rows = list(range(height))
    rows = [tf.ones((width,)) * x for x in rows]
    return tf.stack(rows)

In [15]:
def get_attention_region_param(conv_layer):
    """output the three region proposal parameters, ie, tx, ty, tl"""
    anp_pool = tf.nn.max_pool(conv_layer, ksize=(1, 2, 2, 1), strides=(1, 2, 2, 1),
                            padding="SAME") # padding setting may not correct here
                                            # need investigation
    anp_pool_flat = tf.contrib.layers.flatten(anp_pool)
    
    # gaussian initializer is used here in the original work
    get_abc1 = tf.layers.dense(inputs=anp_pool_flat, units=1024, 
                               activation=tf.nn.tanh,
                              kernel_initializer=tf.contrib.layers.xavier_initializer()
                               )
    get_abc2 = tf.layers.dense(inputs=get_abc1, units=3,
                              activation=tf.nn.sigmoid,
                              kernel_initializer=tf.contrib.layers.xavier_initializer()
                              )
    # get448 = (0 +   448 * get_abc2)^1
    # the original image size is 448
    # get448 has three elements, tx, ty, tl
    get448 = tf.multiply(get_abc2, image_size)
    return get_448
    
def get_tx_ty_tl(param):
    return param[:, 0], param[:, 1], param[:, 2]

def get_corners(param):
    tx, ty, tl = param[0], param[1], param[2]
    tx_top_left = tf.maximum(tx - tl, 0.)
    ty_top_left = tf.maximum(ty - tl, 0.)
    tx_bottom_right = tf.minimum(tx + tl, 1.)
    ty_bottom_right = tf.minimum(ty + tl, 1.)
    return tx_top_left, ty_top_left, tx_bottom_right, ty_bottom_right
    

        
    
def get_attention_region(image, param, k=10):
    """get the attention region given image and tx, ty, tl"""
    # get tx, ty, tl, I am not sure that tensor indexing is working
    # this way, but I will use it for now
    # also the order of tx, ty, tl in param may be different
    tx, ty, tl = get_tx_ty_tl(param)
    tx_top_left, ty_top_left, tx_bottom_right, ty_bottom_right = get_corners(param)
    
    [_, height, width, _] = image.get_shape().as_list()
    tx_top_left = tf.map_fn(lambda x: tf.ones((height, width)) * x, tx_top_left)
    ty_top_left = tf.map_fn(lambda x: tf.ones((height, width)) * x, ty_top_left)
    tx_bottom_right = tf.map_fn(lambda x: tf.ones((height, width)) * x, tx_bottom_right)
    ty_bottom_right = tf.map_fn(lambda x: tf.ones((height, width)) * x, ty_bottom_right)
    
    h = lambda x: tf.divide(1.0, tf.add(1.0, tf.exp(tf.multiply(x, -k))))
    # construct the mask image, I will use the most simple and obvious 
    # implementation for now, which may not even be able to run on 
    # tensorflow
    # attention_mask = tf.zeros_like(image)
    # use layers to replace this
    mask_x = tf.map_fn(set_x, image)
    mask_y = tf.map_fn(set_y, image)
    
    mask_x1 = h(tf.subtract(mask_x, tx_top_left))
    mask_x2 = h(tf.subtract(mask_x, tx_bottom_right))
    
    mask_y1 = h(tf.subtract(mask_y, ty_top_left))
    mask_y2 = h(tf.subtract(mask_y, ty_bottom_right))
    
    mask_x = mask_x1 - mask_x2
    mask_y = mask_y1 - mask_y2
    attention_mask = tf.multiply(mask_x, mask_y)
    attention_mask = tf.stack([attention_mask, attention_mask, attention_mask], axis=3)
    
    # element wise multiplication of the original image and attention_mask
    return tf.multiply(image, attention_mask)



    
def crop_and_zoom(image_param):
    image = image_param[0]
    tx, ty, tl = image_param[1]
    out_image_size = image_param[2]
    tx_tl = tx - tl if (tx - tl) > 0 else 0
    ty_tl = ty - tl if (ty - tl) > 0 else 0
    tx_br = tx + tl if (tx + tl) < 1 else 1
    ty_br = ty + tl if (ty + tl) < 1 else 1
    
    
    
    
    
    
def attention_crop(images, params, out_image_size,name=None):
    with tf.name_scope(name, "AttentionCrop", [images, params, out_image_size]) as scope:
        images = tf.convert_to_tensor(images, name="images")
        params = tf.convert_to_tensor(params, name="params")
        crop_size = tf.convert_to_tensor([out_image_size, out_image_size], name="crop_size")
        # computation 
        boxes = tf.map_fn(get_corners, params)
        box_ind = tf.scan(lambda a, _: a + 1, params, -1)
        out_images = tf.image.crop_and_resize(images, boxes, box_ind, crop_size)
        return out_images

In [16]:
path = "./th.jpg"
image = imread(path)
image = imresize(image, (448, 448))
image_copy = image.copy()
#plt.imshow(image)
image = np.expand_dims(image, axis=0)
image = np.concatenate((image, image), axis=0)
print(image.shape)

(2, 448, 448, 3)


In [17]:
def neural_net_image_input(image_shape):
    x = tf.placeholder(tf.float32, shape=(None, 
                                         image_shape[0],
                                         image_shape[1],
                                         image_shape[2]))
    return x

image_input = neural_net_image_input([448, 448, 3])
attention_params = get_attention_region_param(image_input)

croped = attention_crop(image_input, attention_params, 350)

ValueError: The two structures don't have the same number of elements. First structure: <dtype: 'float32'>, second structure: (<tf.Tensor 'AttentionCrop_4/map/while/Maximum:0' shape=() dtype=float32>, <tf.Tensor 'AttentionCrop_4/map/while/Maximum_1:0' shape=() dtype=float32>, <tf.Tensor 'AttentionCrop_4/map/while/Minimum:0' shape=() dtype=float32>, <tf.Tensor 'AttentionCrop_4/map/while/Minimum_1:0' shape=() dtype=float32>).

In [6]:
#tf.reset_default_graph()
images = []
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    params = sess.run(corners, feed_dict = {image_input: image})
    #ys = sess.run(ys, feed_dict={image_input: image})
    #xs = sess.run(xs, feed_dict={image_input: image})
    attention_region = sess.run(attention_region, feed_dict={image_input:image})
    for i in range(attention_region.shape[0]):
        images.append(np.squeeze(attention_region[i]))

## Optimizer
The proposed recurrent attention CNN is optimized by two types of supervision, i.e., intra-scale classification loss and inter-scale pairwise ranking loss, for alternatively 
generating accurate region attention and learning more fine-grained features. Specifically, we minimize an objective function following a multi-task loss. The loss function for an image sample is defined as:

\begin{equation*}
L \left( X \right) = \sum_{s=1}^3 \left( L_{cls} \left( Y^{\left( s \right)}, Y^* \right) \right) + \sum_{s=1}^2 \left( L_{rank} \left( p_t^{\left(s \right)}, p_t^{\left(  s+1 \right)} \right) \right)
\end{equation*}

$p_t^{\left(s \right)}$ from pairwise ranking loss $L_{rank}$ denotes the prediction probability on the correct category labels t. Specifically, the lanking loss is given by:

\begin{equation*}
L_{rank} \left( p_t^{\left(s \right)}, p_t^{\left(s + 1 \right)}  \right) = max \left( 0, p_t^{\left(s \right)}  - p_t^{\left( s + 1\right)} + margin \right)
\end{equation*}

* Input images (at scale 1) and attended regions (at scale 2, 3) are resized to 448*448 and 224*224 pixels respectively in training, due to the smaller object size in the corse scale.

* we find that k in Eqn.(6) and the margin in Eqn.(9) are robust to optimization, thun we empirically set k as 10 and margin as 0.05.



## Training strategy
1. we initialize convolutional/classification layers in Figure 2 (b1 to b3 and c1 to c3) by the same pre-trained VGG network from ImageNet.
2. we consider a square (represented by tx, tx, tl) with the half length of the side of the original image. The square is selected by searching regions in the original image, with the highest response value in the last convolutional layer (i.e., conv5_4 in VGG-19). We can further obtain a smaller square by aalyzing convolutional responses at the second scale in a similar way. These selected squares are used to pre-train APN to obtain parameters in Figure 2 by learning the transformation from convolutional feature maps to {tx, ty, tl}.

3. we optimize the parameters in the above two steps in an alternative way. Specifically, we keep APN parameters unchanged, and optimize the softmax losses at three scales to converge. Then we fix parameters in convolutional/classification layers, and switch to ranking loss to optimize the two APNs. The learning process for the two parts is iterative, until the two types of losses no longer change. Besides, tl at each scale is constrained to be no less than one-third of the previous tl at coarse scale, to avoid the incompleteness of object structures when tl is too small.

## questions
1. Does the two APN share parameters or not?
2. If the two APN do not share parametrs, have they been trained as one in the pre-training process.
3. Does "we keep APN parameters unchanged, and optimize the softmax losses at three scales to converge." means that the classification layer for 3 scales are trained seperately, thus, error derivatives is not following through the APN network.
4. Optimize the two APNs seperately or together, ie, does the errer derivatives following through.
