# Extract Local Patches

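* Keep color (note: `get_local_patches` below currently averages the color channels, so the extracted patches are greyscale)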
* Patches of mxm pixels

In [None]:
import caffe
import lmdb
from PIL import Image
import cv2
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
import os, time
import datetime
import random

# Directory listings of the raw clips and of the annotation files.
video_files = os.listdir("videoclips")
annotation_files = os.listdir("annotations")

# Exclude example.avi from the clip list.  list.remove() raises ValueError
# when the entry is missing, so only remove it if it is actually there.
if "example.avi" in video_files:
    video_files.remove("example.avi")

def log(string):
    """Print ``string`` prefixed with the current local time.

    The prefix has the form ``YYYY-MM-DD HH:MM:SS``.  It is built with
    ``strftime`` rather than by slicing ``str(datetime.now())``: ``str()``
    omits the fractional part when the microsecond field is 0, so a fixed
    slice would then cut into the seconds.

    Args:
        string: Message to print after the timestamp.
    """
    stamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    # A single parenthesised argument prints identically on Python 2 and 3.
    print("%s: %s" % (stamp, string))
    
# Fraction of every annotation list to keep; expected to lie in [0, 1].
proportion = 1.0


def _read_annotations(path):
    """Return the lines of ``path`` with their last two characters removed."""
    with open(path) as fp:
        # NOTE(review): the fixed two-character cut presumably strips a line
        # terminator -- confirm against the annotation file format.
        return [line[:-2] for line in fp]


def _subsample(entries, fraction):
    """Return a random ``fraction`` of ``entries`` as a NumPy array.

    Entries are drawn without replacement so that no annotation appears
    twice and, for ``fraction == 1.0``, none is left out.
    """
    return np.random.choice(entries, int(len(entries) * fraction), replace=False)


train_clean = _read_annotations("annotations/train_clean.txt")
print(len(train_clean))
train_clean = _subsample(train_clean, proportion)

test_clean = _subsample(_read_annotations("annotations/test_clean.txt"), proportion)

train_auto = _subsample(_read_annotations("annotations/train_auto.txt"), proportion)

height = width = 100
    
def get_local_patches(fname, resize=(100, 100), patch_size=(50, 50), stride=1, trim=None, show=False):
    """Cut greyscale patches out of every frame of a video file.

    Each frame is divided by 255, resized, averaged over its channel axis and
    scanned with a sliding window.  A window is kept only if its mean is at
    least 0.1 and its variance is above 0.01.

    Args:
        fname: Path of the video, handed to ``cv2.VideoCapture``.
        resize: Target size passed to ``cv2.resize`` as ``dsize``.
        patch_size: ``(rows, cols)`` of every patch.
        stride: Step between consecutive window origins on both axes.
        trim: If not None and more than ``trim`` patches were kept, a random
            subset of ``trim`` distinct patches is returned instead.
        show: If True, plot one randomly chosen patch and log its shape,
            mean and variance.

    Returns:
        A list of arrays of shape ``(patch_size[0], patch_size[1], 1)``.
        The list is empty when no frame could be read or no window passed
        the brightness/variance filter.
    """
    patch_rows, patch_cols = patch_size
    patches = []

    cap = cv2.VideoCapture(fname)
    try:
        while True:
            ret, frame = cap.read()
            if not ret:
                break

            frame = cv2.resize(frame / 255., resize, interpolation=cv2.INTER_AREA)
            # Greyscale by averaging the channel axis.
            frame = np.mean(frame, axis=2)

            # The last valid origin is (size - patch), hence the "+ 1" on the
            # exclusive range bound; this keeps the windows that sit flush
            # with the bottom and right edges.
            for top in range(0, frame.shape[0] - patch_rows + 1, stride):
                for left in range(0, frame.shape[1] - patch_cols + 1, stride):
                    # Trailing axis of length 1 so every patch is 3-D.
                    patch = frame[top:top + patch_rows, left:left + patch_cols, np.newaxis]
                    # Skip windows that are very dark or nearly uniform.
                    if np.mean(patch) >= 0.1 and patch.var() > 0.01:
                        patches.append(patch)
    finally:
        # Release the capture even if decoding or resizing raised.
        cap.release()

    log("Number of patches: %i" % len(patches))

    if show:
        if patches:
            sample = random.choice(patches)
            log("Shape: %s" % str(sample.shape))
            log("Mean of this image is %.2f" % np.mean(sample))
            log("Variance of this image is %.2f" % sample.var())
            plt.imshow(np.mean(sample, axis=2), cmap="Greys_r")
            plt.show()
        else:
            log("No patches to show")

    if trim is not None and len(patches) > trim:
        orig = len(patches)
        # Draw without replacement so the subset has no duplicate patches.
        keep = np.random.choice(orig, trim, replace=False)
        patches = [patches[i] for i in keep]
        log("Trimmed from %i to %i patches" % (orig, len(patches)))

    return patches

In [None]:
# Try the extractor on the first training entry and display one patch.
# The clip name is the text between the first pair of double quotes.
video = train_clean[0].split('"')[1]
patches = get_local_patches(
    "videoclips/" + video,
    resize=(200, 200),
    patch_size=(80, 80),
    stride=20,
    trim=1000,
    show=True,
)

In [None]:
# Extraction settings applied to every training clip.
trim = 1000
resize = (200, 200)
patch_size = (80, 80)
stride = 20

all_patches = []

log("%i files in total" % len(train_clean))

for idx, f in enumerate(train_clean):
    # The clip name is the text between the first pair of double quotes.
    video = f.split("\"")[1]
    patches = get_local_patches("videoclips/" + video,
                    patch_size=patch_size, resize=resize, stride=stride, trim=trim)

    log("Video %i, added %i patches" % (idx, len(patches)))

    # extend() appends in place; `all_patches + patches` would copy the whole
    # accumulated list on every iteration.
    all_patches.extend(patches)

In [None]:
try:
    # Current home of train_test_split.
    from sklearn.model_selection import train_test_split
except ImportError:
    # Older scikit-learn releases only provide the deprecated module.
    from sklearn.cross_validation import train_test_split

# NOTE: test_size=0.6 holds out 60% of the patches, so X_train receives 40%.
X_train, X_test = train_test_split(all_patches, test_size=0.6)

X_train = np.array(X_train)
print(X_train.shape)

In [None]:
# Preview the first training patch, averaged over its channel axis.
first_patch = X_train[0]
plt.imshow(first_patch.mean(axis=2), cmap="Greys_r")

In [None]:
import tensorflow as tf
import math

# Rebind the module-level height/width (set to 100 earlier) to the patch
# size; the definitions below read them as default argument values.
height, width = patch_size

def autoencoder(input_shape=[None, height, width, 1],
                n_filters=[1, 3, 3, 3, 3, 3],
                filter_sizes=[3, 3, 3, 3, 3],
                corruption=False):
    """Build a convolutional autoencoder whose decoder reuses the encoder weights.

    The encoder is a stack of stride-1, SAME-padded convolutions followed by
    sigmoids; the decoder applies the matching transposed convolutions in
    reverse order with the same weight tensors and fresh biases.

    Args:
        input_shape: Shape of the input placeholder.  A rank-2 shape is
            reshaped to a square image with ``n_filters[0]`` channels; a
            rank-4 shape is used as is.
        n_filters: ``n_filters[1:]`` gives the number of output channels of
            each encoder layer.  ``n_filters[0]`` is only read when the
            input is rank 2.
        filter_sizes: Square kernel size of each encoder layer; needs at
            least ``len(n_filters) - 1`` entries.
        corruption: Not used by this function.

    The list defaults are only read, never modified.

    Returns:
        dict with keys ``'x'`` (input placeholder), ``'z'`` (output of the
        last encoder layer), ``'y'`` (reconstruction), ``'cost'`` (sum of
        squared differences between ``y`` and the input tensor),
        ``'encoder'`` (list of encoder layer outputs) and ``'decoder'``
        (same tensor as ``'y'``).

    Raises:
        ValueError: If ``filter_sizes`` is shorter than the number of layers,
            or the input is neither rank 2 (with a square second dimension)
            nor rank 4.
    """
    # Validate before touching the graph so a bad configuration does not
    # leave half-built ops behind.
    n_layers = len(n_filters) - 1
    if len(filter_sizes) < n_layers:
        raise ValueError(
            'filter_sizes has %d entries but %d layers were requested'
            % (len(filter_sizes), n_layers))

    # TensorFlow 1.0 removed tf.pack in favour of tf.stack; use whichever
    # this installation provides.
    pack = getattr(tf, 'stack', None) or tf.pack

    # %%
    # input to the network
    x = tf.placeholder(tf.float32, input_shape, name='x')

    # %%
    # ensure 2-d is converted to square tensor.
    rank = len(x.get_shape())
    if rank == 2:
        x_dim = np.sqrt(x.get_shape().as_list()[1])
        if x_dim != int(x_dim):
            raise ValueError('Unsupported input dimensions')
        x_dim = int(x_dim)
        x_tensor = tf.reshape(
            x, [-1, x_dim, x_dim, n_filters[0]])
    elif rank == 4:
        x_tensor = x
    else:
        raise ValueError('Unsupported input dimensions')
    current_input = x_tensor

    # %%
    # Build the encoder
    encoder_weights = []
    encoder_ops = []
    shapes = []
    for layer_i, n_output in enumerate(n_filters[1:]):
        layer_shape = current_input.get_shape().as_list()
        n_input = layer_shape[3]
        # Remember each layer's input shape: the decoder restores it.
        shapes.append(layer_shape)
        bound = 1.0 / math.sqrt(n_input)
        W = tf.Variable(
            tf.random_uniform([
                filter_sizes[layer_i],
                filter_sizes[layer_i],
                n_input, n_output],
                -bound, bound))
        b = tf.Variable(tf.zeros([n_output]))
        encoder_weights.append(W)
        output = tf.nn.sigmoid(
            tf.add(tf.nn.conv2d(
                current_input, W, strides=[1, 1, 1, 1], padding='SAME'), b))
        encoder_ops.append(output)
        current_input = output

    # %%
    # store the latent representation
    z = current_input

    # %%
    # Build the decoder using the same weights, last encoder layer first
    for W, shape in zip(reversed(encoder_weights), reversed(shapes)):
        b = tf.Variable(tf.zeros([W.get_shape().as_list()[2]]))
        # The batch size is taken from the input at run time; the remaining
        # dimensions come from the recorded encoder input shape.
        output_shape = pack([tf.shape(x)[0], shape[1], shape[2], shape[3]])
        output = tf.nn.sigmoid(
            tf.add(tf.nn.conv2d_transpose(
                current_input, W, output_shape,
                strides=[1, 1, 1, 1], padding='SAME'), b))
        current_input = output

    # %%
    # now have the reconstruction through the network
    y = current_input
    # cost function measures pixel-wise difference
    cost = tf.reduce_sum(tf.square(y - x_tensor))

    # %%
    return {'x': x, 'z': z, 'y': y, 'cost': cost,
            "encoder": encoder_ops, "decoder": y}


# %%
def test_hollywood(X_train, X_test, n_filters, filter_sizes):
    """Train the convolutional autoencoder on ``X_train``.

    Training runs for 50 epochs of RMSProp (learning rate 0.001) over
    consecutive batches of 100 rows; rows beyond the last full batch are not
    used.  Every 10th epoch the cost of the last batch is printed.

    Args:
        X_train: Array whose first axis indexes training samples and whose
            remaining axes match the autoencoder's input placeholder.
        X_test: Not used by this function.
        n_filters: Forwarded to ``autoencoder``.
        filter_sizes: Forwarded to ``autoencoder``.

    Returns:
        The dict returned by ``autoencoder`` with an extra ``"session"`` key
        holding the open ``tf.Session`` that contains the trained variables.

    Raises:
        ValueError: If ``X_train`` holds fewer rows than one batch.
    """
    import tensorflow as tf

    learning_rate = 0.001
    batch_size = 100
    n_epochs = 50
    step_size = 10

    # Fail early and clearly: without a full batch the training loop body
    # would never run and the progress report would have nothing to evaluate.
    n_batches = X_train.shape[0] // batch_size
    if n_batches == 0:
        raise ValueError(
            "X_train has %d samples; at least %d are required"
            % (X_train.shape[0], batch_size))

    ae = autoencoder(n_filters=n_filters, filter_sizes=filter_sizes)

    # %%
    optimizer = tf.train.RMSPropOptimizer(learning_rate).minimize(ae['cost'])

    # %%
    # We create a session to use the graph
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    # initialize_all_variables was superseded by global_variables_initializer;
    # use whichever this TensorFlow installation provides.
    make_init = (getattr(tf, 'global_variables_initializer', None)
                 or tf.initialize_all_variables)
    sess.run(make_init())

    # %%
    # Fit all training data
    for epoch_i in range(n_epochs):
        for batch_i in range(n_batches):
            batch_xs = X_train[batch_i * batch_size:(batch_i + 1) * batch_size]
            sess.run(optimizer, feed_dict={ae['x']: batch_xs})
        if epoch_i % step_size == 0:
            # Cost of the last batch of this epoch.
            cost = sess.run(ae['cost'], feed_dict={ae['x']: batch_xs})
            # One pre-formatted string prints the same on Python 2 and 3.
            print("%s %s %s" % (datetime.datetime.now(), epoch_i, cost))

    # The session is handed back to the caller, who is responsible for it.
    ae["session"] = sess

    return ae

In [None]:
# Train four encoder layers with five 3x3 filters each.  The leading entry
# of n_filters is only read by autoencoder() when its input is rank 2.
ae = test_hollywood(
    X_train,
    X_test,
    n_filters=[3, 5, 5, 5, 5],
    filter_sizes=[3, 3, 3, 3],
)

In [None]:
# Drop the held-out split, which is not used again.  `ae` (and the session
# it carries) must stay alive: the following cell reads ae["session"] and
# ae["encoder"], and no module-level `sess` exists yet at this point.
del X_test

In [None]:
# Build one feature vector per training patch by concatenating the
# flattened activations of every encoder layer.
train = X_train[:40000]
combined = []
sess = ae["session"]
batch_size = 100
for batch_i in range(train.shape[0] // batch_size):
    batch_xs = train[batch_i * batch_size:(batch_i + 1) * batch_size]
    # Fetch all encoder layers in one run() call instead of one call per
    # layer, so the batch is pushed through the network only once.
    layers = sess.run(ae["encoder"], feed_dict={ae['x']: batch_xs})
    # Flatten each layer to (batch, features).  np.hstack is given a list
    # because it requires a real sequence rather than a generator.
    interm = np.hstack([layer.reshape(len(layer), -1) for layer in layers])
    combined.append(interm)

combined = np.vstack(combined)
print(combined.shape)

In [None]:
from scipy.spatial import cKDTree
import random

# Choose ten distinct feature vectors at random to act as queries.
test_idxes = random.sample(range(combined.shape[0]), 10)
print(test_idxes)

# Index every feature vector and look up 20 neighbours for each query.
tree = cKDTree(combined)
test_array = combined[test_idxes]
query_res = tree.query(test_array, k=20)

# query_res[1] holds one row of neighbour indices per query; fetch the
# corresponding patches and stack them into a single array.
nns = np.stack([train[row.ravel()] for row in query_res[1]])

In [None]:
# Display the shape of the stacked neighbour array (first axis: queries,
# second axis: neighbours of each query).
nns.shape

In [None]:
def plot_nearest_neighbors(nns, reshape=(height, width, 3), cols=5, reverse=False):
    """Draw a grid of patches: one row per query, one column per neighbour.

    Args:
        nns: Array indexed as ``nns[query, neighbour]``, each entry being a
            patch with a trailing channel axis.
        reshape: If it has exactly two entries, channel 0 of each patch is
            reshaped to that 2-D shape before plotting; otherwise the patch
            is averaged over its channel axis.
        cols: Number of columns (neighbours shown per query).
        reverse: If False, column ``j`` shows ``nns[i, j]``.  If True, column
            ``j`` shows ``nns[i, -j]``, so column 0 is still entry 0 and the
            remaining columns walk backwards from the end.
            NOTE(review): confirm whether ``-(j + 1)`` was intended.
    """
    n_rows = nns.shape[0]
    # squeeze=False keeps `axs` two-dimensional even when there is a single
    # row or a single column, so axs[i, j] always works.
    fig, axs = plt.subplots(n_rows, cols, figsize=(32, 32), squeeze=False)

    for i in range(n_rows):
        for j in range(cols):
            neighbor_index = -j if reverse else j
            patch = nns[i, neighbor_index]
            if len(reshape) != 2:
                image = np.mean(patch, axis=2)
            else:
                image = patch[:, :, 0].reshape(reshape)
            ax = axs[i, j]
            ax.imshow(image, cmap="Greys_r")
            ax.axis("off")
    fig.subplots_adjust(wspace=0, hspace=0)
    fig.tight_layout()

In [None]:
# Plot, for every query row, the entries nns[i, 0] .. nns[i, cols - 1]
# (cols defaults to 5).
plot_nearest_neighbors(nns, reverse=False)