In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
from scipy.io import loadmat
import os
# os.environ["CUDA_VISIBLE_DEVICES"]="1"
# from __future__ import print_function

import tensorflow as tf
from tensorflow.contrib import rnn

In [None]:
# Sanity check: print the devices TensorFlow reports for this machine before
# building the graph.
from tensorflow.python.client import device_lib
print(device_lib.list_local_devices())

# Process Joint Labels

In [None]:
# Load every annotation file under `directory`, keep the 'squat' videos, and
# build one feature dict per video. Joint coordinates are normalised with the
# union of the video's per-frame bounding boxes so that the top-left corner of
# that union maps to (0, 0) and its extent maps to 1 on each axis.
directory = 'labels/'
labeled_frames = []
for filename in os.listdir(directory):
    # os.path.join keeps working even if `directory` loses its trailing slash.
    annotations = loadmat(os.path.join(directory, filename))
    if annotations['action'][0] != 'squat':
        continue

    # Create Nx13x2 joint labels for each video
    xy = np.stack([annotations['x'], annotations['y']], axis=2).astype(float)

    # bbox columns are consumed as (x_min, y_min, x_max, y_max) per frame;
    # reduce over frames to get one box that encloses the whole video.
    bboxes = annotations['bbox']
    x_min, y_min = np.min(bboxes[:, :2], axis=0)
    x_max, y_max = np.max(bboxes[:, 2:], axis=0)
    xy_min = np.array([x_min, y_min]).reshape(1, 1, 2)
    xy_range = np.array([x_max - x_min, y_max - y_min]).reshape(1, 1, 2)

    normed_coord = (xy - xy_min) / xy_range
    # Joints left of / above the box are clamped to 0; values above 1 are kept.
    normed_coord[normed_coord < 0] = 0

    labeled_frames.append({
        'file' : filename,
        'nframes' : annotations['nframes'],
        'pose' : annotations['pose'],
        'coord' : xy,
        'norm_coord' : normed_coord,
        'visibility' : annotations['visibility'],
        'y_min' : y_min,
        'y_max' : y_max,
        'x_min' : x_min,
        'x_max' : x_max
    })

# os.listdir order is arbitrary; sort by file name for a stable ordering.
labeled_frames = sorted(labeled_frames, key=lambda entry: entry['file'])
poses = set(f['pose'][0] for f in labeled_frames)
print(poses)

In [None]:
# Keep the normalised joint tracks of left-facing videos with at least 70
# frames, then shuffle them and split: first 65 for training, rest for testing.
left_facing_tracks = []
for entry in labeled_frames:
    if entry['pose'] == u'left' and entry['nframes'] >= 70:
        left_facing_tracks.append(entry['norm_coord'])
selected_frames = np.array(left_facing_tracks)

indices = np.random.permutation(np.arange(len(selected_frames)))
train_indices, test_indices = indices[:65], indices[65:]
frames_train = selected_frames[train_indices]
frames_test = selected_frames[test_indices]

# LSTM Params

In [None]:
L = 13 # number of joints per frame (each joint contributes an x and a y value)
# One sample needs k + T = 60 consecutive frames; the selection cell keeps only videos with nframes >= 70
k = 15 # number of observed frames fed to the LSTM
T = 45 # number of future frames to predict
H = 1024 # hidden layer size

In [None]:
def RNN(p, weights, biases):
    """Encode k observed pose frames and decode T predicted pose frames.

    The observed frames are run through an LSTM; its final state seeds a
    second pass of the same cell (variables shared via ``reuse=True``) that is
    unrolled for T steps on all-zero inputs. Every decoder output is mapped to
    joint coordinates by two sigmoid dense layers.

    Args:
        p: float32 tensor of shape (batch_size, k, 2 * L) with the observed
            joint coordinates.
        weights: weight matrix of the first dense layer, applied to the
            (batch_size * T, H) decoder outputs.
        biases: bias vector of the first dense layer.

    Returns:
        Tensor of shape (batch_size, T, 2 * L) with values in (0, 1).

    Relies on the module-level constants k, T, H, L and on the module-level
    variables W1 and b1 (second dense layer).
    """
    batch_size = tf.shape(p)[0]

    # unstack gets us a list of k (batch_size, 2 * L) tensors
    observed = tf.unstack(p, k, axis=1)
    lstm_cell = rnn.BasicLSTMCell(H, forget_bias=1.0)
    _, encoder_state = rnn.static_rnn(lstm_cell, observed, dtype=tf.float32)

    # static_rnn already returns the final state as an LSTMStateTuple(c, h).
    # Pass it on unchanged: rebuilding it from states[-1] / outputs[-1] would
    # put h in both slots and silently discard the cell state c.
    lstm_cell_pred = rnn.BasicLSTMCell(H, forget_bias=1.0, reuse=True)
    # The decoder is driven by zeros rather than by its own previous
    # prediction; the same zero tensor is reused for all T steps.
    decoder_inputs = [tf.zeros((batch_size, L * 2))] * T
    outputs, _ = rnn.static_rnn(lstm_cell_pred, decoder_inputs,
                                initial_state=encoder_state, dtype=tf.float32)

    # outputs is a list of T (batch_size, H) tensors
    # concat_outputs is (batch_size * T, H), ordered time-major
    concat_outputs = tf.concat(outputs, axis=0)

    # predictions is (batch_size * T, 2 * L)
    hidden = tf.nn.sigmoid(tf.matmul(concat_outputs, weights) + biases)
    predictions = tf.sigmoid(tf.matmul(hidden, W1) + b1)

    # reshape into (T, batch_size, 2 * L) then transpose into (batch_size, T, 2 * L)
    time_major = tf.reshape(predictions, (T, batch_size, L * 2))
    return tf.transpose(time_major, perm=[1, 0, 2])

In [None]:
# Build the training graph from scratch. Resetting the default graph first
# lets this cell be re-run without the get_variable calls below clashing with
# variables left over from a previous run.
tf.reset_default_graph()
sess = tf.Session()

learning_rate = 0.0005

# p_input: k observed frames per sample; p_output: the T frames that follow.
# The batch dimension of p_output is copied from p_input, i.e. left unknown.
p_input = tf.placeholder(tf.float32, shape=[None, k, L*2])
p_output = tf.placeholder(tf.float32, shape=[p_input.get_shape()[0], T, L*2])

# First dense layer: LSTM output (H) -> 100 units.
W = tf.get_variable('W', shape=[H, 100], dtype=tf.float32, initializer=tf.contrib.layers.xavier_initializer())
b = tf.get_variable('b', shape=[100], dtype=tf.float32, initializer=tf.zeros_initializer())

# Second dense layer: 100 units -> 2 * L joint coordinates. RNN reads W1 and
# b1 as globals, so they must exist before RNN is called below.
W1 = tf.get_variable('W1', shape=[100, L*2], dtype=tf.float32, initializer=tf.contrib.layers.xavier_initializer())
b1 = tf.get_variable('b1', shape=[L*2], dtype=tf.float32, initializer=tf.zeros_initializer())

p_output_predicted = RNN(p_input, W, b)

# Define loss and optimizer
# Loss is the mean squared error over every sample, time step and coordinate.
loss = tf.reduce_mean(tf.squared_difference(p_output_predicted, p_output))
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(loss)

# Must run after the optimizer is created so its variables are initialised too.
sess.run(tf.global_variables_initializer())

# LSTM Training/Validation

In [None]:
# Train the model: every step draws one random (k + T)-frame window from each
# training video, feeds the first k frames as input and the remaining T frames
# as the regression target, and applies one optimizer update.
epochs = 4000
batch_size = 1
n_videos = len(frames_train)
display_step = 50
save_step = 500

# tf.train.Saver.save raises if the checkpoint's parent directory is missing,
# which would otherwise only surface after the first `save_step` epochs.
if not os.path.isdir('models'):
    os.makedirs('models')

saver = tf.train.Saver()

# NOTE(review): batch_size only determines how many steps make up an "epoch";
# each step below still feeds *all* training videos as a single batch.
total_iter = n_videos // batch_size

mean_losses = []
for epoch in range(epochs):
    total_loss = 0
    for i in range(total_iter):
        inputs = []
        expected_outputs = []
        for frame in frames_train:
            # randint's upper bound is exclusive, so +1 makes the last valid
            # start reachable and allows videos of exactly k + T frames
            # (matches the sampling used in the evaluation cell).
            start_time = np.random.randint(frame.shape[0] - (k + T) + 1)
            inputs.append(frame[start_time : start_time + k].reshape(k, 2 * L))
            expected_outputs.append(frame[start_time + k : start_time + k + T].reshape(T, 2 * L))
        _, loss_value = sess.run((optimizer, loss), feed_dict={ p_input : np.asarray(inputs), p_output : np.asarray(expected_outputs) })
        total_loss += loss_value
    mean_loss = total_loss / total_iter
    mean_losses.append(mean_loss)
    if (epoch + 1) % display_step == 0:
        print('epoch %s: loss=%.8f' % (epoch + 1, mean_loss))
    if (epoch + 1) % save_step == 0:
        saver.save(sess, 'models/lstm-zee', global_step=(epoch + 1))

In [None]:
# Manually checkpoint the current session state under 'models/lstm-zee'.
# NOTE(review): global_step is hard-coded to 350 and is unrelated to the number
# of epochs actually trained; update it (or derive it) before relying on the
# checkpoint name.
saver = tf.train.Saver()
saver.save(sess, 'models/lstm-zee', global_step=350)

In [None]:
# Evaluate on the held-out videos: keep those with at least k + T frames, cut
# one random (k + T)-frame window out of each, and run the network on the
# first k frames of every window.
frames_ = [clip for clip in frames_test if clip.shape[0] >= k + T]

inputs, expected_outputs = [], []
for clip in frames_:
    # Upper bound is exclusive, hence +1 to include the last valid start.
    start_time = np.random.randint(clip.shape[0] - (k + T) + 1)
    split = start_time + k
    inputs.append(clip[start_time : split].reshape(k, 2 * L))
    expected_outputs.append(clip[split : split + T].reshape(T, 2 * L))

# Unflatten both predictions and targets to (videos, T, L, 2).
n_eval = len(frames_)
predicted = sess.run(p_output_predicted, feed_dict={ p_input : np.asarray(inputs) })
output = predicted.reshape(n_eval, T, L, 2)
expected_output = np.asarray(expected_outputs).reshape(n_eval, T, L, 2)

In [None]:
# For each predicted time step, print the Euclidean distance between predicted
# and expected joint positions, averaged over all test videos and joints.
for step in range(T):
    joint_distance = np.linalg.norm(output[:, step] - expected_output[:, step], axis=2)
    print(np.mean(joint_distance))

In [None]:
%matplotlib notebook

import numpy as np
from matplotlib import pyplot as plt
from matplotlib import animation
from matplotlib import cm

# Animate the predicted joints of one test video on top of the expected ones.
# videos: a list of (T, L, 2) arrays, one per overlaid point set
i = 6  # index into the evaluated test videos; must be < len(output)
videos = [output[i], expected_output[i]]
labels = ['predicted', 'ground truth']

fig = plt.figure()
ax = plt.axes(xlim=(0, 1), ylim=(0, 1))
colors = ['red', 'orange', 'yellow', 'green', 'blue', 'cyan', 'magenta', 'violet', 'black']
# One marker-only line per video; a separate index name avoids clobbering `i`.
lines = [ax.plot([], [], 'o', color=colors[n], label=labels[n])[0] for n in range(len(videos))]
ax.legend(loc='upper right')

def init():
    """Clear every line so the animation starts from an empty frame."""
    for line in lines:
        line.set_data([], [])
    return lines

def animate(t):
    """Move each line's markers to its video's joint positions at frame t."""
    for line, video in zip(lines, videos):
        # video[t] is (L, 2); transpose and unpack into x and y sequences.
        line.set_data(*video[t].T)
    return lines

nframes = min(len(video) for video in videos)
# NOTE(review): `interval` is the delay between frames in milliseconds; it is
# set to the frame count here, which is kept as-is to preserve playback speed.
anim = animation.FuncAnimation(fig, animate, init_func=init, frames=nframes, interval=nframes, blit=True)
plt.show()