# Stack multiple frames to create a volume

In [None]:
from keras.layers import Input, Dense
from keras.models import Model
from keras.datasets import cifar10

import tensorflow as tf

import PIL
from PIL import Image
import cv2
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
import os
import datetime
import math
from scipy.misc import imresize

from sklearn.cross_validation import train_test_split

# List the entries of the "videoclips" and "annotations" directories, relative
# to the current working directory.
video_files = os.listdir("videoclips")
annotation_files = os.listdir("annotations")

# Drop the "example.avi" entry from the clip listing. list.remove raises
# ValueError when that name is not present.
video_files.remove("example.avi")

def log(string):
    """Print ``string`` to stdout, prefixed with the current local time.

    The emitted line has the form ``YYYY-MM-DD HH:MM:SS: <string>``.

    Args:
        string: Message to print; it is interpolated with ``%s``.
    """
    # Format the timestamp explicitly instead of slicing str(now): the slice
    # left a trailing "." behind, and cut into the time itself whenever the
    # microsecond field was zero (str() then omits the fractional part).
    stamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    # A single parenthesised argument prints identically with the Python 2
    # print statement and the Python 3 print function.
    print("%s: %s" % (stamp, string))
    
# Fraction of each annotation list to keep; 1.0 keeps every line.
proportion = 1.0


def _sample_annotation_lines(path, fraction):
    """Read an annotation file and return a random subset of its lines.

    Args:
        path: Annotation text file to read.
        fraction: Share of the lines to keep.

    Returns:
        Tuple ``(sample, total)``: a NumPy array holding
        ``int(total * fraction)`` lines in random order, and the number of
        lines that were read from the file.
    """
    with open(path) as fp:
        # The last two characters of every line are dropped, exactly as the
        # original parsing did (presumably a two-character line ending --
        # TODO confirm against the annotation files).
        lines = [line[:-2] for line in fp]
    total = len(lines)
    count = int(total * fraction)
    # np.random.choice draws WITH replacement by default, which duplicated
    # some lines and silently dropped others even for fraction == 1.0.
    # Draw without replacement; only fall back to replacement when more
    # samples than available lines are requested (fraction > 1).
    sample = np.random.choice(lines, count, replace=count > total)
    return sample, total


train_clean, n_train_clean = _sample_annotation_lines(
    "annotations/train_clean.txt", proportion)
# Report how many clean training annotations were read (before sampling).
print(n_train_clean)

test_clean = _sample_annotation_lines(
    "annotations/test_clean.txt", proportion)[0]

train_auto = _sample_annotation_lines(
    "annotations/train_auto.txt", proportion)[0]
    
# def get_frames(fname, resize=(128, 128), trim=0.2, grey=False, show=False):
def get_frames(fname, resize=(100, 100), trim=None, grey=False, show=False):
    """
    Get all frames from a video file
    Also have option to trim the number of frames

    Every decoded frame is divided by 255. before any further processing.

    Args:
        fname: Path handed to ``cv2.VideoCapture``.
        resize: Target size passed to ``cv2.resize`` (OpenCV reads it as
            ``(width, height)``), or None to keep the decoded size.
        trim: Fraction of the frames to drop from EACH end, or None to keep
            all frames. Should be < 0.5, otherwise nothing is left.
        grey: If True, average each frame over its last (channel) axis.
        show: If True, display one processed frame with matplotlib and log
            its shape.

    Returns:
        Tuple ``(frames, originals)``: the processed (resized / greyscale /
        trimmed) frames, and the scaled but otherwise unprocessed frames.
        ``originals`` is never trimmed.
    """
    cap = cv2.VideoCapture(fname)
    frames = []
    originals = []
    try:
        while True:
            ret, frame = cap.read()
            if not ret:
                break
            frame = frame / 255.
            originals.append(frame)
            if resize is not None:
                frame = cv2.resize(frame, resize)
            if grey:
                frame = np.mean(frame, axis=2)
            frames.append(frame)
    finally:
        # Release the capture even if decoding or resizing raised.
        cap.release()
        cv2.destroyAllWindows()

    if show and frames:
        # Prefer the second frame as before, but do not fail on clips that
        # contain a single frame.
        preview = frames[1] if len(frames) > 1 else frames[0]
        # log() takes exactly one argument, so format the shape into it.
        log("Shape: %s" % (preview.shape,))
        plt.imshow(preview)
        plt.show()

    log("Number of frames: %i" % len(frames))

    if trim is not None:
        # Trim should < 0.5
        n_trim = int(len(frames) * trim)
        # frames[0:-0] is an empty slice, so only cut when there is
        # something to cut.
        if n_trim > 0:
            frames = frames[n_trim:-n_trim]
        log("Trimmed to %i frames" % len(frames))

    return frames, originals

# Action name -> integer class id.
label_bank = {"None": 0, "SitUp": 1, "GetOutCar": 2, "StandUp": 3, "AnswerPhone": 4,
             "Kiss": 5, "HugPerson": 6, "HandShake": 7, "SitDown": 8}

# Per-video lookup: frame index -> list of class ids annotated on that frame.
video_to_frame_to_label = dict((fname, {}) for fname in video_files)

# Only the clean train/test annotations feed the lookup; train_auto is left out.
stores = [train_clean, test_clean]

for store in stores:
    for line in store:
        # The parsing below expects: a double-quoted video name, a
        # "(begin-end)" frame span, and one or more "<Label>" tags.
        video_title = line.split("\"")[1]
        frame_span = line.split("(")[1].split(")")[0]
        begin, end = [int(bound) for bound in frame_span.split("-")]
        labels = [label_bank[tagged.split(">")[0]]
                  for tagged in line.split("<")[1:]]

        # Tag every frame from begin - 1 through end (inclusive); frames hit
        # by several annotation lines accumulate all of their labels.
        for i in range(begin - 1, end + 1):
            per_frame = video_to_frame_to_label[video_title]
            if i not in per_frame:
                per_frame[i] = labels
            else:
                per_frame[i] = per_frame[i] + labels

# group_adjacent = lambda a, k: zip(*([iter(a)] * k))                
from itertools import islice


def group_adjacent(a, k):
    """Split ``a`` into consecutive, non-overlapping groups of ``k`` items.

    Trailing items that do not fill a complete group are dropped.

    Args:
        a: Sequence to split.
        k: Number of items per group.

    Returns:
        The result of ``zip`` over ``k`` strided views of ``a``; each element
        is a tuple of ``k`` adjacent items.
    """
    # View number `offset` yields a[offset], a[offset + k], a[offset + 2k], ...
    strided_views = [islice(a, offset, None, k) for offset in range(k)]
    return zip(*strided_views)
from scipy import stats

def group_frames(fname, volume_size):
    """Cut a video into stacked greyscale volumes and label each volume.

    Frames are read with ``get_frames(fname, trim=None, grey=True)`` and
    split into consecutive groups of ``volume_size`` frames (an incomplete
    trailing group is dropped). Each group is stacked so that the frame
    index becomes the LAST axis of the volume.

    A volume is labelled from the annotations of its FIRST frame only: the
    most frequent class id recorded for that frame in
    ``video_to_frame_to_label`` (smallest id on ties), or 0 when the frame
    has no annotation.

    Args:
        fname: Path to the video; its final path component is the key used
            in ``video_to_frame_to_label``.
        volume_size: Number of frames per volume.

    Returns:
        Tuple ``(volumes, labels)`` where ``volumes`` is one array stacking
        all volumes along axis 0 and ``labels`` is a list with one class id
        per volume; ``(None, None)`` when the video yields no frames or
        fewer than ``volume_size`` frames.
    """
    frames, _ = get_frames(fname, trim=None, grey=True)
    if len(frames) < 1:
        return None, None
    # basename instead of split("/")[1]: the latter picks the wrong component
    # as soon as the path has more than one directory level.
    video_title = os.path.basename(fname)
    frame_labels = video_to_frame_to_label[video_title]
    return_vols = []
    return_labels = []
    for idx, group in enumerate(group_adjacent(frames, volume_size)):
        absolute_pos = idx * volume_size
        # (volume_size, H, W) -> (H, W, volume_size)
        return_vols.append(np.stack(group).transpose((1, 2, 0)))
        labels = frame_labels.get(absolute_pos)
        if labels:
            print("Frame %s  labels %s" % (absolute_pos, labels))
            # Most frequent id; argmax returns the smallest id on ties, the
            # same choice scipy.stats.mode makes. Unlike indexing the
            # stats.mode result with [0][0], this does not depend on the
            # installed SciPy version.
            return_labels.append(int(np.bincount(labels).argmax()))
        else:
            return_labels.append(0)
    if len(return_vols) == 0:
        return None, None
    final_vols = np.stack(return_vols)
    return final_vols, return_labels

In [None]:
# Display the shape of the combined train + test annotation array.
np.concatenate((train_clean, test_clean)).shape

# Merge both annotation sets. NOTE: this also rebinds train_clean to the
# merged array, so re-running this cell appends test_clean once more.
train_and_test_files = train_clean = np.concatenate((train_clean, test_clean))

In [None]:
# Show the second-to-last annotation line and the quoted video name in it.
print train_clean[-2]
print train_clean[-2].split("\"")[1]

In [None]:
# Decode that clip as 100x100 greyscale frames for a quick visual check.
sample_f = "videoclips/" + train_clean[-2].split("\"")[1]
frames, originals = get_frames(sample_f, resize=(100, 100), trim=None, grey=True)

In [None]:
# Shape of the first processed frame.
frames[0].shape

In [None]:
# Display the first processed frame.
plt.imshow(frames[0])

In [None]:
# Build 50-frame volumes for the same clip and inspect their shape and labels.
video = "videoclips/" + train_clean[-2].split("\"")[1]
vols, labels = group_frames(video, 50)
print vols.shape
print labels

In [None]:
# Number of videos tracked in the frame-label lookup.
len(video_to_frame_to_label.keys())

In [None]:
def get_all_volumes_and_labels(f_store, vol_size=20):
    """Build frame volumes and labels for every annotated clip in ``f_store``.

    Args:
        f_store: Iterable of annotation lines; the clip file name is the
            text between the first pair of double quotes in each line.
        vol_size: Number of frames per volume, forwarded to ``group_frames``.

    Returns:
        Tuple ``(all_vols, all_labels)``: a list with one volume array per
        usable clip, and a flat list with the labels of all volumes in the
        same order. Clips for which ``group_frames`` returns no volumes are
        skipped.
    """
    all_vols = []
    all_labels = []
    for annotation in f_store:
        clip_name = annotation.split("\"")[1]
        vols, labels = group_frames("videoclips/" + clip_name, vol_size)
        if vols is None:
            # Unreadable or too-short clip: nothing to collect.
            continue
        all_vols.append(vols)
        all_labels = all_labels + labels
    return all_vols, all_labels

# Number of frames per volume. The autoencoder's default input_shape also
# reads this module-level value as its channel count.
vol = 50
train_frames, train_labels = get_all_volumes_and_labels(train_and_test_files, vol_size=vol)

# Let's try to make an LMDB out of this

In [None]:
# NOTE(review): caffe and lmdb are imported here but not referenced anywhere
# else in the visible part of this file.
import caffe
import lmdb

# Stack the per-clip volume arrays along axis 0 into one training array and
# turn the flat label list into an array.
X_train = np.vstack((train_frames))
y_train = np.array(train_labels)
print X_train.shape
print len(y_train)

In [None]:
# Number of volumes per class id.
np.bincount(y_train)

In [None]:
# test_frames, test_labels = get_all_volumes_and_labels(test_clean)
# X_test = np.vstack((test_frames))
# y_test = np.array(test_labels)
# print X_test.shape
# print len(y_test)

In [None]:
# np.bincount(y_test)

In [None]:
# from keras.layers import Input, Dense, Convolution2D, MaxPooling2D, UpSampling2D
# from keras.models import Model

# import tensorflow as tf
# config = tf.ConfigProto()
# config.gpu_options.allow_growth=True
# sess = tf.Session(config=config)

# from keras import backend as K
# K.set_session(sess)

# height, width = 128, 128
# vol = 20

# input_img = Input(shape=(height, width, vol))

# # input_img = tf.placeholder(tf.float32, [None, 32, 32, 3], name="x")

# encoding_layers = []
# kernel_sizes = [3, 3, 3, 3, 3]
# num_filters = [10, 10, 10, 10, 10]

# kernel_sizes = [5, 5, 5, 3, 3]
# num_filters = [32, 32, 64, 10, 10]

# kernel_sizes = [5, 5, 5, 5, 5]
# num_filters = [48, 48, 48, 48, 48]

# kernel_sizes = [3 for i in range(3)]
# num_filters = [vol for i in range(3)]

# encoded = []
# for idx, (kernel, f) in enumerate(zip(kernel_sizes, num_filters)):
#     if idx == 0:
#         x = Convolution2D(f, kernel, kernel, activation="sigmoid", border_mode="same")(input_img)
#     else:
#         x = Convolution2D(f, kernel, kernel, activation="sigmoid", border_mode="same")(x)
#     x = MaxPooling2D((2, 2), border_mode="same")(x)
#     encoding_layers.append(x)
#     encoded = x

# prev = []
# for idx, (kernel, f) in enumerate(zip(kernel_sizes[::-1], num_filters[::-1])):
#     if idx == 0:
#         x = Convolution2D(f, kernel, kernel, activation="sigmoid", border_mode="same")(encoded)
#     else:
#         x = Convolution2D(f, kernel, kernel, activation="sigmoid", border_mode="same")(x)
#     x = UpSampling2D((2, 2))(x)
#     prev = x

# decoded = Convolution2D(vol, 3, 3, activation="sigmoid", border_mode="same")(prev)



# encoder = Model(input=input_img, output=encoded)

# autoencoder = Model(input_img, decoded)
# autoencoder.compile(optimizer='adadelta', loss='mse')

In [None]:
# layers = []

# with sess.as_default():
#     autoencoder.fit(np.array(X_train), np.array(X_train), nb_epoch=50,
#                batch_size=100, shuffle=True, 
#                 validation_data=(np.array(X_train), np.array(X_train)),
#                verbose=0)
#     testing = X_train
#     layers = [layer.eval(feed_dict={input_img: testing}) 
#                   for layer in encoding_layers]

In [None]:
# Spatial size of the frames fed to the network.
height = width = 100

def autoencoder(input_shape=[None, height, width, vol],
                n_filters=[3, 30, 50, 50],
                filter_sizes=[5, 10, 10, 10],
                corruption=False):
    """Build a convolutional autoencoder graph with tied weights.

    The encoder applies one stride-1, 'SAME'-padded convolution followed by
    a sigmoid per entry of ``n_filters[1:]``. The decoder walks the encoder
    weights in reverse and applies ``conv2d_transpose`` with the same
    weight variables, each with a fresh bias, again followed by a sigmoid.

    Args:
        input_shape: Shape of the input placeholder; it must have rank 2 or
            4. The default is evaluated once, at definition time, from the
            module-level ``height``, ``width`` and ``vol``.
        n_filters: ``n_filters[1:]`` gives the output channel count of each
            encoder layer. ``n_filters[0]`` is only read when a rank-2 input
            is reshaped.
        filter_sizes: Square kernel size per encoder layer; only the first
            ``len(n_filters) - 1`` entries are read.
        corruption: Not referenced in the body.

    Returns:
        Dict with keys 'x' (input placeholder), 'z' (output of the last
        encoder layer), 'y' (reconstruction), 'cost' (sum of squared
        differences between 'y' and the input tensor), 'encoder' (list of
        every encoder layer output, in order) and 'decoder' (the same
        tensor as 'y').

    Raises:
        ValueError: If the placeholder is neither rank 2 nor rank 4, or if
            a rank-2 input's second dimension is not a perfect square.
    """
    # input to the network
    x = tf.placeholder(tf.float32, input_shape, name='x')

    # ensure 2-d is converted to square tensor.
    if len(x.get_shape()) == 2:
        x_dim = np.sqrt(x.get_shape().as_list()[1])
        if x_dim != int(x_dim):
            raise ValueError('Unsupported input dimensions')
        x_dim = int(x_dim)
        # NOTE(review): x_dim is the square root of the WHOLE second
        # dimension, yet the reshape also asks for n_filters[0] channels --
        # this looks consistent only when n_filters[0] == 1; confirm.
        x_tensor = tf.reshape(
            x, [-1, x_dim, x_dim, n_filters[0]])
    elif len(x.get_shape()) == 4:
        x_tensor = x
    else:
        raise ValueError('Unsupported input dimensions')
    current_input = x_tensor

    # Build the encoder
    encoder_weights = []
    encoder_ops = []
    shapes = []
    for layer_i, n_output in enumerate(n_filters[1:]):
        # Channel count of the tensor entering this layer.
        n_input = current_input.get_shape().as_list()[3]
        # Remember the static input shape; the decoder uses it as the
        # output shape of the matching transposed convolution.
        shapes.append(current_input.get_shape().as_list())
        # Kernel initialised uniformly in [-1/sqrt(n_input), 1/sqrt(n_input)).
        W = tf.Variable(
            tf.random_uniform([
                filter_sizes[layer_i],
                filter_sizes[layer_i],
                n_input, n_output],
                -1.0 / math.sqrt(n_input),
                1.0 / math.sqrt(n_input)))
        b = tf.Variable(tf.zeros([n_output]))
        encoder_weights.append(W)
        output = tf.nn.sigmoid(
            tf.add(tf.nn.conv2d(
                current_input, W, strides=[1, 1, 1, 1], padding='SAME'), b))
        encoder_ops.append(output)
        current_input = output

    # store the latent representation
    z = current_input
    # Reverse in place so the decoder visits the layers last-to-first.
    encoder_weights.reverse()
    shapes.reverse()

    # Build the decoder using the same weights
    for layer_i, shape in enumerate(shapes):
        W = encoder_weights[layer_i]
        # New bias sized to the kernel's input-channel dimension (index 2),
        # which is the channel count the transposed convolution produces.
        b = tf.Variable(tf.zeros([W.get_shape().as_list()[2]]))
        # The batch dimension is taken dynamically from x; the remaining
        # dimensions come from the recorded encoder input shape.
        output = tf.nn.sigmoid(
            tf.add(tf.nn.conv2d_transpose(
                current_input, W,
                tf.pack([tf.shape(x)[0], shape[1], shape[2], shape[3]]),
                strides=[1, 1, 1, 1], padding='SAME'), b))
        current_input = output
        
    decoder = current_input

    # now have the reconstruction through the network
    y = current_input
    # cost function measures pixel-wise difference
    cost = tf.reduce_sum(tf.square(y - x_tensor))

    return {'x': x, 'z': z, 'y': y, 'cost': cost, 
            "encoder": encoder_ops, "decoder": decoder}


def test_hollywood(X_train, X_test, n_filters, filter_sizes):
    """Train the convolutional autoencoder on ``X_train``.

    Builds the graph with ``autoencoder(n_filters=..., filter_sizes=...)``,
    minimises its 'cost' with RMSProp (learning rate 0.001) for 100 epochs
    in batches of up to 100 samples, and prints the cost of the last batch
    every 10 epochs.

    Args:
        X_train: Array of training samples, sliced along axis 0 into batches
            that are fed to the autoencoder's 'x' placeholder.
        X_test: Not used; kept so existing callers keep working.
        n_filters: Forwarded to ``autoencoder``.
        filter_sizes: Forwarded to ``autoencoder``.

    Returns:
        The dict returned by ``autoencoder`` with an extra "session" entry
        holding the open ``tf.Session`` that owns the trained variables.
        The caller is responsible for that session.
    """
    import tensorflow as tf
    ae = autoencoder(n_filters=n_filters, filter_sizes=filter_sizes)

    learning_rate = 0.001
    optimizer = tf.train.RMSPropOptimizer(learning_rate).minimize(ae['cost'])

    # We create a session to use the graph
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    sess.run(tf.initialize_all_variables())

    # Fit all training data
    batch_size = 100
    n_epochs = 100
    step_size = 10
    n_samples = X_train.shape[0]
    for epoch_i in range(n_epochs):
        batch_xs = None
        # Stepping by batch_size (instead of looping n_samples // batch_size
        # times) also trains on the final, shorter batch and still works when
        # there are fewer samples than one full batch.
        for start in range(0, n_samples, batch_size):
            batch_xs = X_train[start:start + batch_size]
            sess.run(optimizer, feed_dict={ae['x']: batch_xs})
        # Report the cost on the last batch of the epoch. With no samples at
        # all there is nothing to report; previously this was a NameError.
        if epoch_i % step_size == 0 and batch_xs is not None:
            cost = sess.run(ae['cost'], feed_dict={ae['x']: batch_xs})
            # One pre-formatted string prints the same under Python 2 and 3.
            print("%s %s %s" % (datetime.datetime.now(), epoch_i, cost))

    ae["session"] = sess

    return ae

In [None]:
# Train the autoencoder on X_train (also passed as the X_test argument).
# The first n_filters entry is vol, the number of frames per volume; three
# 20-filter layers with 3x3 kernels follow.
ae = test_hollywood(X_train, X_train,
                n_filters=[vol, 20, 20, 20],
                filter_sizes=[3, 3, 3, 3])

In [None]:
# Encode X_train batch by batch and flatten the activations of every encoder
# layer into one feature row per sample.
train = X_train
combined = []
sess = ae["session"]
batch_size = 100
# Step through every sample, including the final shorter batch, so that the
# feature rows stay aligned one-to-one with X_train / y_train.
for batch_start in range(0, train.shape[0], batch_size):
    batch_xs = train[batch_start:batch_start + batch_size]
    # A single run call evaluates all encoder layers in one forward pass
    # instead of recomputing the network once per layer.
    layers = sess.run(ae["encoder"], feed_dict={ae['x']: batch_xs})
    # Flatten each layer's output to (batch, features). A list is passed on
    # because np.hstack deprecated generator arguments.
    ravels = [layer.reshape(layer.shape[0], -1) for layer in layers]
    interm = np.hstack(ravels)
    combined.append(interm)

In [None]:
# ravels = (np.array([row.ravel() for row in layers[i]]) for i in range(len(encoding_layers)))
# Stack the per-batch feature blocks into one (samples, features) array.
combined = np.vstack((combined))

# Drop the module-level references to the autoencoder dict and its session.
del ae, sess

# Scale and visualize the embedding vectors
def plot_embedding(X, y, title=None):
    """Draw a 2-D embedding with each point rendered as its label text.

    Every column of ``X`` is min-max scaled to [0, 1]; the first two columns
    are then used as plot coordinates.

    Args:
        X: Array of shape (n_points, >= 2) with the embedding coordinates.
        y: Label per point, indexable by point position; it is shown as the
            text and, divided by 10, selects the colour from ``plt.cm.Set1``.
        title: Optional figure title.
    """
    x_min, x_max = np.min(X, 0), np.max(X, 0)
    span = x_max - x_min
    # A constant column has zero range; divide by 1 there so the column
    # scales to 0 instead of turning into NaN/inf.
    span = np.where(span == 0, 1, span)
    X = (X - x_min) / span

    plt.figure(figsize=(20, 10))
    plt.subplot(111)
    for i, point in enumerate(X):
        plt.text(point[0], point[1], str(y[i]),
                 color=plt.cm.Set1(y[i] / 10.),
                 fontdict={'weight': 'bold', 'size': 12})

    # Hide the axis ticks; only the relative layout is meaningful.
    plt.xticks([])
    plt.yticks([])
    if title is not None:
        plt.title(title)

# Alias for the feature matrix; it is only referenced by the commented-out
# t-SNE cell further down.
vectorized_imgs = combined

In [None]:
# Shape of the stacked feature matrix.
combined.shape

In [None]:
# from sklearn.manifold import TSNE

# tsne = TSNE(n_components=2, random_state=0)
# np.set_printoptions(suppress=True)
# X_tsne = tsne.fit_transform(vectorized_imgs)

In [None]:
# plot_embedding(X_tsne, y_train)

In [None]:
# from sklearn.neighbors import KNeighborsClassifier
# knn = KNeighborsClassifier(n_neighbors=3)
# knn.fit(X_tsne, y_train[:1600])

In [None]:
# knn.score(X_tsne, y_train[:1600])

In [None]:
# 3-nearest-neighbour classifier on the autoencoder features. y_train is
# cut to the number of feature rows so both arguments have equal length.
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(combined, y_train[:combined.shape[0]])

In [None]:
# Scored on the same rows that were used for fitting.
knn.score(combined, y_train[:combined.shape[0]])

In [None]:
# Keep y_train at the same length as the feature matrix from here on.
y_train = y_train[:combined.shape[0]]

In [None]:
from scipy.spatial import cKDTree
import random

# test_idxes = random.sample(range(combined.shape[0]), 10)
test_idxes = random.sample(np.where(y_train == 0)[0], 10)
print test_idxes

tree = cKDTree(combined)
test_array = combined[test_idxes]
query_res = tree.query(test_array, k=10)

nns = []

for idx, row in enumerate(query_res[1]):
    nn = X_train[row.ravel()]
#     to_plot = np.vstack([X_test[idx], nn])
    nns.append(nn)
    
nns = np.stack(nns)

In [None]:
nns.shape

In [None]:
def plot_nearest_neighbors(nns, reshape=(100, 100), cols=5, reverse=False):
    """Plot the first frame of each neighbour volume, one query per row.

    Args:
        nns: Array indexed as ``nns[query, neighbour]`` whose entries are
            volumes with the frame index on the last axis.
        reshape: Shape each frame is reshaped to before display; a 2-tuple
            selects the grey colormap, anything else the default colormap.
        cols: Number of neighbours (grid columns) shown per query.
        reverse: If True, column ``j`` shows neighbour ``-j``: column 0 is
            still neighbour 0, the remaining columns count back from the
            end of the neighbour list.
    """
    n_rows = nns.shape[0]
    # squeeze=False keeps axs two-dimensional, so axs[i][j] also works for a
    # single query row or a single column.
    fig, axs = plt.subplots(n_rows, cols, figsize=(32, 32), squeeze=False)
    imshow_kwargs = {"cmap": "Greys_r"} if len(reshape) == 2 else {}

    for i in range(n_rows):
        for j in range(cols):
            neighbor_index = -j if reverse else j
            first_frame = nns[i, neighbor_index][:, :, 0]
            axs[i][j].imshow(first_frame.reshape(reshape), **imshow_kwargs)
            axs[i][j].axis("off")
    fig.subplots_adjust(wspace=0, hspace=0)
    fig.tight_layout()

In [None]:
# Column 0 shows neighbour 0; the other columns show neighbours taken from
# the far end of each query's neighbour list.
plot_nearest_neighbors(nns, reverse=True)

In [None]:
# The first five neighbours of each query, in query-result order.
plot_nearest_neighbors(nns)

In [None]:
nns.shape

In [None]:
def plot_nn_volumes(nns, reshape=(100, 100), row_examples=2, cols=5,
                    reverse=False, skip=7):
    """Plot several frames from each neighbour volume.

    The grid has one row per (query, neighbour) pair and ``cols`` columns;
    column ``k`` shows frame ``k * skip`` of the volume.

    Args:
        nns: Array indexed as ``nns[query, neighbour]`` whose entries are
            volumes with the frame index on the last axis. It must hold at
            least ``row_examples`` queries, and every volume must have more
            than ``(cols - 1) * skip`` frames.
        reshape: Shape each frame is reshaped to before display; a 2-tuple
            selects the grey colormap, anything else the default colormap.
        row_examples: Number of queries to show.
        cols: Number of neighbours per query, and also the number of frames
            shown per volume.
        reverse: If True, neighbour ``-j`` is used in place of neighbour
            ``j`` (neighbour 0 first, then counting back from the end).
        skip: Step between displayed frames. Defaults to 7, the value that
            was previously hard-coded.
    """
    vol_size = cols
    # squeeze=False keeps axs two-dimensional even for a 1-row/1-column grid.
    fig, axs = plt.subplots(row_examples * cols, vol_size, figsize=(32, 32),
                            squeeze=False)
    imshow_kwargs = {"cmap": "Greys_r"} if len(reshape) == 2 else {}

    for i in range(row_examples):
        for j in range(cols):
            neighbor_index = -j if reverse else j
            volume = nns[i, neighbor_index]
            grid_row = axs[i * cols + j]
            for k in range(vol_size):  # frames of the volume we want to see
                frame = volume[:, :, k * skip]
                grid_row[k].imshow(frame.reshape(reshape), **imshow_kwargs)
                grid_row[k].axis("off")
    fig.subplots_adjust(wspace=0, hspace=0)
    fig.tight_layout()

In [None]:
# Show frames from the neighbour volumes of the queries from index 5 on.
plot_nn_volumes(nns[5:], reverse=True)

# just knn

In [None]:
# Flatten every volume to one row. NOTE: X_train is rebound to the flattened
# array, so earlier cells that expect volume-shaped X_train cannot be re-run
# after this point.
X_train_ravelled = np.array([row.ravel() for row in X_train])
X_train = X_train_ravelled

In [None]:
# 3-nearest-neighbour baseline on the flattened volumes.
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train, y_train)

In [None]:
# Scored on the same rows that were used for fitting.
knn.score(X_train, y_train)

# Just k-means

In [None]:
from sklearn.cluster import KMeans

# Work on at most the first 5000 samples. Both names are rebound, so later
# cells see the truncated arrays.
X_train = X_train[:5000]
y_train = y_train[:5000]

# NOTE(review): 6 clusters are requested while label_bank defines 9 class
# ids -- confirm this is intended.
kmeans = KMeans(n_clusters=6, random_state=0)
kmeans.fit(X_train)

def cluster_acc(Y_pred, Y):
    """
    Finds the cluster accuracy

    Cluster ids are matched one-to-one to class ids so that the number of
    agreeing samples is maximal; the accuracy is the share of samples that
    agree under that matching.

    Args:
        Y_pred: Non-negative integer cluster id per sample.
        Y: Non-negative integer class id per sample, same length as
            ``Y_pred``.

    Returns:
        Tuple ``(accuracy, w)`` where ``accuracy`` is a float in [0, 1] and
        ``w`` is the (D, D) count matrix with ``w[c, k]`` the number of
        samples in cluster ``c`` with class ``k``; D is the largest id seen
        in either input plus one.
    """
    # Local import, matching the original's function-scope import. The
    # sklearn.utils.linear_assignment_ helper used before was removed from
    # scikit-learn; SciPy provides the same optimal-assignment solver.
    from scipy.optimize import linear_sum_assignment
    Y_pred = np.array(Y_pred)
    Y = np.array(Y)
    D = max(Y_pred.max(), Y.max()) + 1
    w = np.zeros((D, D), dtype=np.int64)
    # Unbuffered in-place add: repeated (cluster, class) pairs all count.
    np.add.at(w, (Y_pred, Y), 1)
    # The solver minimises cost, so turn the counts into costs.
    rows, cols = linear_sum_assignment(w.max() - w)
    return w[rows, cols].sum() * 1.0 / Y_pred.size, w

# Cluster assignments for the same samples k-means was fitted on, scored
# against the labels.
y_pred = kmeans.predict(X_train)
cluster_acc(y_pred, y_train)

# just t-SNE

In [None]:
from sklearn.manifold import TSNE

# Embed the flattened volumes into two dimensions. No random_state is set.
tsne = TSNE(n_components=2)
np.set_printoptions(suppress=True)
X_tsne = tsne.fit_transform(X_train)

# 3-nearest-neighbour classifier on the 2-D embedding.
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_tsne, y_train)

In [None]:
# Scored on the same embedded points that were used for fitting.
knn.score(X_tsne, y_train[:X_tsne.shape[0]])