In [None]:
from PIL import Image
import cv2
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
import os, time
import tensorflow as tf
from sklearn.cross_validation import train_test_split
import math
import datetime
import random

# Target size, in pixels, that every video frame is scaled to.
height, width = 100, 100
# PIL's Image.resize expects its size argument as (width, height); the
# resulting NumPy array then has shape (height, width), which is the layout
# the network's input_shape uses. (Identical to the previous value while the
# frame is square.)
resize = (width, height)

def get_frames(fname, resize=resize, num_frames=None):
    """Decode a video file into an array of small grayscale frames.

    Each decoded frame is resized to ``resize``, scaled to the range
    [0, 1] and averaged over its colour channels.

    Args:
        fname: Path of the video file, handed to ``cv2.VideoCapture``.
        resize: Size tuple passed to PIL's ``Image.resize``.
        num_frames: Stop after this many frames; ``None`` reads until the
            capture stops returning frames.

    Returns:
        ``np.array`` of the collected frames, one 2-D float frame per entry
        along the first axis. The array is empty when no frame could be read.
    """
    cap = cv2.VideoCapture(fname)
    frames = []
    try:
        while True:
            ret, frame = cap.read()
            if not ret:
                break
            # LANCZOS is the filter formerly exposed as ANTIALIAS; the old
            # alias is no longer available in current Pillow releases.
            small = Image.fromarray(frame.astype(np.uint8)).resize(
                resize, Image.LANCZOS)
            # Scale to [0, 1], then collapse the colour channels.
            frames.append(np.mean(np.array(small) / 255.0, axis=2))
            if num_frames is not None and len(frames) == num_frames:
                break
    finally:
        # Release the capture even when decoding or resizing fails.
        cap.release()
        cv2.destroyAllWindows()
    return np.array(frames)

# Class index -> action name, in index order.
label_hash = dict(enumerate([
    "boxing",
    "handclapping",
    "handwaving",
    "jogging",
    "running",
    "walking",
]))

# Action name -> class index (inverse of label_hash).
reverse_labels = dict((name, idx) for idx, name in label_hash.items())

# Number of action classes.
num_labels = len(label_hash)

from itertools import islice


def group_adjacent(a, k):
    """Split ``a`` into consecutive, non-overlapping groups of ``k`` items.

    Group ``j`` holds ``a[j*k], ..., a[j*k + k - 1]``. Trailing items that do
    not fill a whole group are dropped. ``a`` is iterated ``k`` times, so it
    must be re-iterable (a list or array, not a one-shot iterator).
    """
    # The offset-th strided view supplies the offset-th member of every group.
    strided = [islice(a, offset, None, k) for offset in range(k)]
    return zip(*strided)

# Collect the file names found in each action's directory (the directories
# are named after the actions and resolved relative to the working directory).
video_files = []
for i in range(num_labels):
    # os.listdir returns entries in arbitrary order; sort them so the list
    # is the same on every run and every machine.
    video_files.extend(sorted(os.listdir(label_hash[i])))

# Mix the classes together. A dedicated, fixed-seed generator keeps the order
# reproducible across kernel restarts without touching the global RNG state.
random.Random(0).shuffle(video_files)

In [None]:
def group_frames(fname, vol_size):
    """Cut one video into consecutive volumes of ``vol_size`` frames.

    The action name is taken from the second ``_``-separated token of
    ``fname``; the file is read from the directory named after that action.

    Args:
        fname: Video file name (without directory).
        vol_size: Number of consecutive frames per volume.

    Returns:
        ``(volumes, labels)``: ``volumes`` has shape
        ``(n_volumes, frame_height, frame_width, vol_size)`` and ``labels`` is
        a list holding the video's class index once per volume. Frames left
        over after the last full volume are dropped. A video with fewer than
        ``vol_size`` frames yields an empty volume array (first axis 0) and an
        empty label list instead of raising.
    """
    action_name = fname.split("_")[1]
    label = reverse_labels[action_name]
    frames = get_frames(os.path.join(action_name, fname))

    n_vols = len(frames) // vol_size
    if n_vols == 0:
        if frames.ndim == 3:
            frame_shape = tuple(frames.shape[1:])
        else:
            # No frame was decoded at all; fall back to the configured size.
            # An image resized to (w, h) becomes an array of shape (h, w).
            frame_shape = (resize[1], resize[0])
        empty = np.empty((0,) + frame_shape + (vol_size,), dtype=frames.dtype)
        return empty, []

    usable = frames[:n_vols * vol_size]
    frame_h, frame_w = usable.shape[1:]
    # (n_vols, vol_size, h, w) -> (n_vols, h, w, vol_size): the frames of a
    # volume end up on the last (channel) axis.
    final_vols = np.ascontiguousarray(
        usable.reshape(n_vols, vol_size, frame_h, frame_w).transpose(0, 2, 3, 1))
    return final_vols, [label] * n_vols

def get_all_volumes_and_labels(f_store, vol_size=10):
    """Load every video in ``f_store`` and stack all of their volumes.

    Args:
        f_store: Sequence of video file names, each passed to ``group_frames``.
        vol_size: Number of consecutive frames per volume.

    Returns:
        ``(volumes, labels)``: all volumes stacked along the first axis, and a
        NumPy array with one class index per volume, in the same order.

    Raises:
        ValueError: If no video produced a volume (for example when
            ``f_store`` is empty), since there is nothing to stack.
    """
    all_vols, all_labels = [], []
    # Single-argument print(...) behaves the same on Python 2 and 3.
    print("%i videos in total" % len(f_store))
    for idx, f in enumerate(f_store):
        vols, labels = group_frames(f, vol_size)
        if len(vols) > 0:
            all_vols.append(vols)
            all_labels.extend(labels)
        print("Video %i, added %i sequences" % (idx, len(vols)))
    if not all_vols:
        raise ValueError("no volumes were extracted from the given videos")
    volumes = np.vstack(all_vols)
    labels = np.array(all_labels)
    return volumes, labels

In [None]:
# Number of consecutive frames stacked into one input volume.
vol_size = 5
# Load every listed video; `train` holds all volumes and `labels` one class
# index per volume.
train, labels = get_all_volumes_and_labels(video_files, vol_size=vol_size)

In [None]:
# Hold out half of the volumes. A fixed random_state makes the split, and so
# every number computed from it, reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    train, labels, test_size=0.5, random_state=0)

In [None]:
# Sanity check: display the shape of the training split.
X_train.shape

In [None]:
# Frames per volume; used below as the channel dimension of the network input.
vol = vol_size

def autoencoder(input_shape=[None, height, width, vol],
                n_filters=[3, 30, 50, 50],
                filter_sizes=[5, 10, 10, 10],
                corruption=False):
    """Build a convolutional autoencoder whose decoder reuses the encoder weights.

    Every layer is a stride-1, SAME-padded convolution followed by a sigmoid,
    so the spatial size is unchanged through the network. The decoder applies
    the encoder filters in reverse order through ``conv2d_transpose``.

    Args:
        input_shape: Shape of the input placeholder. Either 4-D
            ``[batch, rows, cols, channels]`` or 2-D ``[batch, features]``;
            a 2-D input is reshaped to square images with ``n_filters[0]``
            channels.
        n_filters: ``n_filters[1:]`` gives the number of output filters of
            each encoder layer. ``n_filters[0]`` is only read for 2-D input;
            for 4-D input the channel count comes from the tensor itself.
        filter_sizes: Side length of the square kernel of each encoder layer;
            needs at least ``len(n_filters) - 1`` entries (extras are ignored).
        corruption: Accepted for interface compatibility; not used.

    Returns:
        dict with keys ``'x'`` (input placeholder), ``'z'`` (output of the
        last encoder layer), ``'y'`` and ``'decoder'`` (the reconstruction),
        ``'cost'`` (sum of squared differences between reconstruction and
        input) and ``'encoder'`` (list with the output of every encoder
        layer, first layer first).

    Raises:
        ValueError: If the input is neither 2-D nor 4-D, if a 2-D input
            cannot be reshaped to square images with ``n_filters[0]``
            channels, or if ``filter_sizes`` is too short.
    """
    # The list defaults above are shared between calls; they are only read
    # here, never modified.
    n_layers = len(n_filters) - 1
    if len(filter_sizes) < n_layers:
        raise ValueError(
            'filter_sizes needs at least %d entries (one per encoder layer), '
            'got %d' % (n_layers, len(filter_sizes)))

    # `tf.pack` is called `tf.stack` in later TensorFlow releases; use
    # whichever one this installation provides.
    stack_op = getattr(tf, 'stack', None) or tf.pack

    # input to the network
    x = tf.placeholder(tf.float32, input_shape, name='x')

    # ensure 2-d is converted to square tensor.
    if len(x.get_shape()) == 2:
        flat_dim = x.get_shape().as_list()[1]
        n_channels = n_filters[0]
        # Each sample has rows * cols * channels values, so the square side
        # must be derived from the per-channel pixel count, not from the
        # whole flat length.
        if flat_dim is None or flat_dim % n_channels != 0:
            raise ValueError('Unsupported input dimensions')
        x_dim = np.sqrt(flat_dim // n_channels)
        if x_dim != int(x_dim):
            raise ValueError('Unsupported input dimensions')
        x_dim = int(x_dim)
        x_tensor = tf.reshape(
            x, [-1, x_dim, x_dim, n_channels])
    elif len(x.get_shape()) == 4:
        x_tensor = x
    else:
        raise ValueError('Unsupported input dimensions')
    current_input = x_tensor

    # Build the encoder
    encoder_weights = []
    encoder_ops = []
    shapes = []
    for layer_i, n_output in enumerate(n_filters[1:]):
        n_input = current_input.get_shape().as_list()[3]
        # Remember each layer's input shape; the decoder restores it.
        shapes.append(current_input.get_shape().as_list())
        W = tf.Variable(
            tf.random_uniform([
                filter_sizes[layer_i],
                filter_sizes[layer_i],
                n_input, n_output],
                -1.0 / math.sqrt(n_input),
                1.0 / math.sqrt(n_input)))
        b = tf.Variable(tf.zeros([n_output]))
        encoder_weights.append(W)
        output = tf.nn.sigmoid(
            tf.add(tf.nn.conv2d(
                current_input, W, strides=[1, 1, 1, 1], padding='SAME'), b))
        encoder_ops.append(output)
        current_input = output

    # store the latent representation
    z = current_input
    encoder_weights.reverse()
    shapes.reverse()

    # Build the decoder using the same weights
    for layer_i, shape in enumerate(shapes):
        W = encoder_weights[layer_i]
        # The transposed convolution outputs W's input-channel count.
        b = tf.Variable(tf.zeros([W.get_shape().as_list()[2]]))
        output = tf.nn.sigmoid(
            tf.add(tf.nn.conv2d_transpose(
                current_input, W,
                # Batch size is only known at run time, hence tf.shape(x).
                stack_op([tf.shape(x)[0], shape[1], shape[2], shape[3]]),
                strides=[1, 1, 1, 1], padding='SAME'), b))
        current_input = output

    decoder = current_input

    # now have the reconstruction through the network
    y = current_input
    # cost function measures pixel-wise difference
    cost = tf.reduce_sum(tf.square(y - x_tensor))

    return {'x': x, 'z': z, 'y': y, 'cost': cost,
            "encoder": encoder_ops, "decoder": decoder}


def test_kth(X_train, X_test, n_filters, filter_sizes):
    """Train the convolutional autoencoder on ``X_train``.

    Args:
        X_train: Training volumes; batches are sliced along the first axis
            and fed to the autoencoder's input placeholder.
        X_test: Accepted for interface compatibility; not used.
        n_filters: Forwarded to ``autoencoder``.
        filter_sizes: Forwarded to ``autoencoder``.

    Returns:
        The dict returned by ``autoencoder`` with one extra key,
        ``"session"``, holding the open ``tf.Session`` the model was trained
        in. The caller is responsible for closing it.

    Raises:
        ValueError: If ``X_train`` contains no samples.
    """
    n_samples = X_train.shape[0]
    if n_samples == 0:
        raise ValueError("X_train is empty; nothing to train on")

    ae = autoencoder(n_filters=n_filters, filter_sizes=filter_sizes)

    learning_rate = 0.01
    optimizer = tf.train.RMSPropOptimizer(learning_rate).minimize(ae['cost'])

    # We create a session to use the graph
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    sess.run(tf.initialize_all_variables())

    # Fit all training data
    batch_size = 100
    n_epochs = 100
    step_size = 10
    # Only full batches are used, so every update (and the logged cost, which
    # is a sum over the batch) refers to the same number of samples. With
    # fewer than batch_size samples, train on the single short batch instead
    # of silently doing nothing.
    n_batches = max(1, n_samples // batch_size)
    for epoch_i in range(n_epochs):
        for batch_i in range(n_batches):
            batch_xs = X_train[batch_i * batch_size:(batch_i + 1) * batch_size]
            sess.run(optimizer, feed_dict={ae['x']: batch_xs})
        if epoch_i % step_size == 0:
            # Progress log: cost on the last batch of this epoch.
            last_cost = sess.run(ae['cost'], feed_dict={ae['x']: batch_xs})
            print("%s %i %s" % (datetime.datetime.now(), epoch_i, last_cost))

    ae["session"] = sess

    return ae

In [None]:
# Train an autoencoder with three encoder layers of 5 filters each, all using
# 3x3 kernels. The first n_filters entry mirrors the input channel count (vol).
ae = test_kth(X_train, X_test,
                n_filters=[vol, 5, 5, 5],
                filter_sizes=[3, 3, 3])

In [None]:
# Encode the training volumes: for every full batch, concatenate the flattened
# activations of all encoder layers into one feature row per sample.
train = X_train
combined = []
sess = ae["session"]
batch_size = 100
# Only full batches are encoded, so `combined` can end up with fewer rows
# than X_train.
for batch_i in range(train.shape[0] // batch_size):
    batch_xs = train[batch_i * batch_size:(batch_i + 1) * batch_size]
    # Fetch every encoder layer in a single forward pass.
    layers = sess.run(ae["encoder"], feed_dict={ae['x']: batch_xs})
    # Flatten each layer to (samples, features). This must be a real list:
    # np.hstack needs a sequence of arrays, not a generator.
    ravels = [layer.reshape(layer.shape[0], -1) for layer in layers]
    interm = np.hstack(ravels)
    combined.append(interm)

# Release the session's resources before dropping the references, then free
# the large arrays that are no longer needed.
sess.close()
del sess, ae, X_train, train

In [None]:
# Stack the per-batch feature blocks into one (samples, features) matrix.
combined = np.vstack(combined)
# Single-argument print(...) behaves the same on Python 2 and 3.
print(combined.shape)

In [None]:
# Keep at most the first 5000 feature rows for the steps that follow.
combined = combined[:5000]

In [None]:
# Scale and visualize the embedding vectors
def plot_embedding(X, y, title=None):
    """Draw each sample's label at its min-max scaled 2-D position.

    Args:
        X: 2-D array of points; columns 0 and 1 are used as coordinates.
            Every column is rescaled to the range [0, 1] before plotting.
        y: Numeric labels, indexable by row number. Each label is drawn as
            text and mapped to a ``Set1`` colour through ``label / 10``.
        title: Optional figure title.
    """
    x_min, x_max = np.min(X, 0), np.max(X, 0)
    span = x_max - x_min
    # A constant column has zero span; divide by 1 instead so it maps to 0
    # rather than producing NaN/inf coordinates.
    span = np.where(span == 0, 1.0, span)
    X = (X - x_min) / span

    plt.figure(figsize=(20, 10))
    plt.subplot(111)
    for i in range(X.shape[0]):
        plt.text(X[i, 0], X[i, 1], str(y[i]),
                 color=plt.cm.Set1(y[i] / 10.),
                 fontdict={'weight': 'bold', 'size': 12})

    # Coordinates are normalised, so tick values carry no meaning.
    plt.xticks([])
    plt.yticks([])
    if title is not None:
        plt.title(title)

# Second name for the feature matrix; not referenced by the cells shown here.
vectorized_imgs = combined

In [None]:
from sklearn.neighbors import KNeighborsClassifier
# Fit a 3-nearest-neighbour classifier on the encoder features. The labels are
# cut to the number of feature rows so both line up one-to-one.
knn = KNeighborsClassifier(n_neighbors=3)
# Single-argument print(...) behaves the same on Python 2 and 3.
print(knn.fit(combined, y_train[:combined.shape[0]]))
print(str(datetime.datetime.now()))

In [None]:
# NOTE: this scores the classifier on the same rows it was fitted on, so the
# number is training accuracy, not an estimate of held-out performance.
# Single-argument print(...) behaves the same on Python 2 and 3.
print(knn.score(combined, y_train[:combined.shape[0]]))
print(str(datetime.datetime.now()))