In [None]:
import tensorflow as tf
import cv2
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Render matplotlib figures inline, directly below the cell that creates them.
%matplotlib inline

In [None]:
# Maps a VGG16 convolution name to its position in `vgg16.layers`
# (the model is built without the top classifier; position 0 is skipped
# by get_embedding and is presumably the input layer).
layer_pos_dict = dict(
    conv1_2=2,
    conv2_2=5,
    conv3_2=8,
    conv4_2=12,
    conv5_2=16,
)

# Layer whose filter responses are used to build the Gram matrices.
layer_pick = "conv5_2"

In [None]:
# Style image, given relative to the notebook's working directory.
IMAGE_PATH = "../../data/images/VanGogh.jpg"

# cv2.imread does not raise on a missing or unreadable file, it returns None;
# fail here with a clear message instead of an opaque error inside cv2.resize.
img = cv2.imread(IMAGE_PATH, cv2.IMREAD_COLOR)
if img is None:
    raise IOError("Could not read image: " + IMAGE_PATH)

# The rest of the notebook works on 224x224 images (OpenCV keeps BGR channel order).
img = cv2.resize(img, (224, 224))

In [None]:
# Per-channel means subtracted before feeding an image to VGG16. The values are
# applied to channels 0, 1, 2 in that order; with OpenCV-loaded images that is
# B, G, R.
IMAGENET_MEANS = [103.939, 116.779, 123.68]

def process_image(img):
    """Return a float32 copy of `img` with the per-channel means subtracted.

    Parameters
    ----------
    img : array-like of shape (height, width, channels), channels >= 3
        Image to centre. The input is not modified.

    Returns
    -------
    numpy.ndarray (float32), same shape as `img`.
    """
    processed_image = np.array(img).astype(np.float32)
    # Only the first three channels are shifted.
    processed_image[:, :, :3] -= np.asarray(IMAGENET_MEANS, dtype=np.float32)
    return processed_image

def restore_image(img):
    """Undo `process_image`: add the means back and convert to uint8.

    Values are rounded to the nearest integer and clipped to [0, 255] before the
    cast, so out-of-range pixels saturate instead of wrapping around.

    Parameters
    ----------
    img : array-like of shape (height, width, channels), channels >= 3
        Mean-centred image. The input is not modified.

    Returns
    -------
    numpy.ndarray (uint8), same shape as `img`.
    """
    restored_image = np.array(img).astype(np.float32)
    restored_image[:, :, :3] += np.asarray(IMAGENET_MEANS, dtype=np.float32)
    # ndarray.clip returns a new array rather than clipping in place, so the
    # result has to be kept; rounding first makes a process/restore round trip exact.
    restored_image = np.clip(np.rint(restored_image), 0, 255)
    return restored_image.astype(np.uint8)

## Broken approach

The section below does not really work. The problem arises when I try to re-chain the layers of the VGG16 model: the outputs of intermediate layers differ even when they are generated by the same block of code from the same fixed input. The cause may be that re-chaining Keras layers gives them several input and output nodes, and TensorFlow may get confused when collecting them. I decided to leave the code as it is in case I ever want to investigate that behaviour further.

In [None]:
# Convolutional part of VGG16 with ImageNet weights; no input tensor is bound
# here, the layers are re-applied by hand further down.
vgg16 = tf.contrib.keras.applications.vgg16.VGG16(
    weights='imagenet',
    include_top=False,
    input_shape=None,
    input_tensor=None)

# Mark every layer as non-trainable.
for frozen_layer in vgg16.layers:
    frozen_layer.trainable = False

In [None]:
# Print the model summary; handy for checking the positions used in layer_pos_dict.
vgg16.summary()

In [None]:
# Style image is fed at run time as a single (224, 224, 3) float32 array.
target_image = tf.placeholder(tf.float32, shape=(224, 224, 3), name="target_image")
# Image being optimised, started from Gaussian noise of the same shape.
recovered_image = tf.Variable(tf.random_normal([224, 224, 3]), name="recovered_image")

In [None]:
def get_embedding(image):
    """Apply the vgg16 layers 1..layer_pos_dict[layer_pick] to a single image.

    `image` is given a leading batch axis of size one and then passed through
    the layers in order (position 0 of `vgg16.layers` is skipped). The output of
    the picked layer is returned, still carrying the batch axis.
    """
    activation = tf.expand_dims(image, axis=0)
    stop_position = layer_pos_dict[layer_pick]
    position = 1
    while position <= stop_position:
        activation = vgg16.layers[position](activation)
        position += 1
    return activation

In [None]:
def gram_matrix(embedding):
    """Gram matrix of a 4-D embedding whose last axis indexes the filters.

    The filter axis is moved to the front, every filter's responses are
    flattened into one row, and the rows are multiplied with their own
    transpose, giving an (n_filters, n_filters) tensor.
    """
    keras_backend = tf.contrib.keras.backend
    filters_leading = tf.transpose(embedding, perm=[3, 0, 1, 2])
    # batch_flatten keeps axis 0 and collapses all remaining axes.
    response_rows = keras_backend.batch_flatten(filters_leading)
    return tf.matmul(response_rows, response_rows, transpose_b=True)

In [None]:
# Gram matrices of the picked layer for the fed style image and for the image being optimised.
target_gram = gram_matrix(get_embedding(target_image))
recovered_gram = gram_matrix(get_embedding(recovered_image))

In [None]:
# Unnormalised style loss: sum of squared element-wise differences between the two Gram matrices.
style_loss = tf.reduce_sum(tf.squared_difference(target_gram, recovered_gram))

In [None]:
# Optimise the image only. Without var_list, minimize() updates every variable
# in the graph's trainable collection, which still contains the VGG16 weights
# (setting layer.trainable = False after construction does not remove them).
adam = tf.train.AdamOptimizer(1e-4).minimize(style_loss, var_list=[recovered_image])

In [None]:
# Keras loaded the ImageNet weights into its own backend session. A fresh
# tf.Session() does not contain them, and tf.global_variables_initializer()
# would replace them with random values, so reuse the Keras session and
# initialise only the variables that do not belong to vgg16.
# The session is owned by Keras and is deliberately left open.
sess = tf.contrib.keras.backend.get_session()
pretrained_names = {weight.name for weight in vgg16.weights}
sess.run(tf.variables_initializer(
    [var for var in tf.global_variables() if var.name not in pretrained_names]))

# One optimisation step with the preprocessed style image fed as the target.
feed_dict = {target_image : process_image(img)}
sess.run(adam, feed_dict=feed_dict)

## Working approach

This section is supposed to properly reconstruct the Gram matrices of the filters inside the convolutional layers of VGG16. The main difference from the broken section is that the input tensor is passed to the VGG16 model when it is constructed - this way each Keras layer inside it has only one input node and one output node, and I can properly extract the embeddings I need.

In [None]:
# The style image is baked into the graph as a constant this time.
target_image = tf.constant(process_image(img))

# Image being optimised; it already carries a batch axis of size one.
recovered_image = tf.Variable(
    tf.random_normal([1, 224, 224, 3]),
    trainable=True,
    name="recovered_image")

# Batch of two fed to VGG16: index 0 is the style image, index 1 the recovered image.
concatenated_input = tf.concat(
    [tf.expand_dims(target_image, axis=0), recovered_image],
    axis=0)

In [None]:
# Build VGG16 directly on top of the two-image batch so that every layer has a
# single input and a single output.
vgg16 = tf.contrib.keras.applications.vgg16.VGG16(
    weights='imagenet',
    include_top=False,
    input_tensor=concatenated_input,
    input_shape=None)

# Mark every layer as non-trainable.
for frozen_layer in vgg16.layers:
    frozen_layer.trainable = False

In [None]:
# Output of the picked layer for the whole two-image batch.
picked_position = layer_pos_dict[layer_pick]
embeddings = vgg16.layers[picked_position].output

# Split along the batch axis: the style image was concatenated first, the recovered image second.
target_embeddings, recovered_embeddings = embeddings[0], embeddings[1]

In [None]:
def gram_matrix(embedding):
    """Gram matrix of a 3-D embedding whose last axis indexes the filters.

    Note: this definition replaces the 4-D `gram_matrix` defined earlier in the
    notebook. The filter axis is moved to the front, every filter's responses
    are flattened into one row, and the rows are multiplied with their own
    transpose, giving an (n_filters, n_filters) tensor.
    """
    filters_leading = tf.transpose(embedding, perm=[2, 0, 1])
    # batch_flatten keeps axis 0 and collapses all remaining axes.
    response_rows = tf.contrib.keras.backend.batch_flatten(filters_leading)
    return tf.matmul(response_rows, response_rows, transpose_b=True)

# Gram matrices for the style image and for the image being optimised.
target_gram = gram_matrix(target_embeddings)
recovered_gram = gram_matrix(recovered_embeddings)

In [None]:
# Static shape of the picked layer's output: [batch, axis 1, axis 2, filters].
embedding_shape = embeddings.get_shape().as_list()
# Only the product of the two spatial sizes is used below, so which of them is
# called width and which height does not affect the result.
layer_width, layer_height, n_filters = embedding_shape[1:4]

# Normalisation 4 * (n_filters * width * height)^2
# (presumably the style-loss scaling from Gatys et al. - confirm).
style_norm = 4 * (n_filters * layer_width * layer_height) ** 2

squared_gram_gap = tf.squared_difference(target_gram, recovered_gram)
style_loss = tf.reduce_sum(squared_gram_gap) / style_norm

In [None]:
def total_variation_loss(x):
    """Sum of squared differences between neighbouring pixels of `x`.

    Parameters
    ----------
    x : 4-D tensor indexed as x[batch, axis 1, axis 2, channel]
        Any spatial size is accepted; the slice bounds are relative to the
        tensor instead of being hard-coded to 224x224.

    Returns
    -------
    Scalar tensor: the squared differences to the next element along axis 1
    and along axis 2, summed over every position, channel and batch element.
    """
    # Drop the last row and column so both difference maps have the same shape.
    anchor = x[:, :-1, :-1, :]
    axis1_offset = tf.square(anchor - x[:, 1:, :-1, :])
    axis2_offset = tf.square(anchor - x[:, :-1, 1:, :])
    return tf.reduce_sum(axis1_offset + axis2_offset)

# Settings noted for the conv3_2 layer: lr=3e+1, beta=10^(-8)
# (layer_pick may point at a different layer - retune if needed).
beta = 10 ** -8

# Style loss plus a weighted total-variation penalty on the recovered image.
smoothness_penalty = total_variation_loss(recovered_image)
total_loss = style_loss + beta * smoothness_penalty

# Only the recovered image is optimised.
adam = tf.train.AdamOptimizer(learning_rate=3e+1).minimize(
    total_loss,
    var_list=[recovered_image])

In [None]:
# Keras loaded the ImageNet weights into its own backend session. A fresh
# tf.Session() does not contain them, and tf.global_variables_initializer()
# would replace them with random values, so reuse the Keras session and
# initialise only the variables that do not belong to vgg16 (the recovered
# image and the optimiser state). Doing this on every run restarts the
# optimisation from noise, as before.
# The session is owned by Keras and is deliberately left open.
sess = tf.contrib.keras.backend.get_session()
pretrained_names = {weight.name for weight in vgg16.weights}
sess.run(tf.variables_initializer(
    [var for var in tf.global_variables() if var.name not in pretrained_names]))

for i in range(10):
    # The losses are fetched in the same run call as the update step.
    current_style_loss, current_total_loss, _ = sess.run([style_loss, total_loss, adam])
    print(i, current_style_loss, current_total_loss)

# Drop the batch axis to get a single (height, width, channels) image.
final_image = sess.run(recovered_image)[0, :, :, :]

In [None]:
# Recovered image on the left, original style image on the right.
fig, (recovered_axis, original_axis) = plt.subplots(1, 2, figsize=(15, 15))

# Channels are reordered [2, 1, 0] before display (OpenCV's BGR -> matplotlib's RGB).
recovered_axis.imshow(restore_image(final_image)[:, :, [2, 1, 0]])
original_axis.imshow(img[:, :, [2, 1, 0]])