In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
from scipy.io import loadmat
import os
os.environ["CUDA_VISIBLE_DEVICES"]="1"
from __future__ import print_function

import tensorflow as tf
from tensorflow.contrib import rnn
from tensorflow.contrib import legacy_seq2seq

In [None]:
from tensorflow.python.client import device_lib
# Print the devices TensorFlow reports for this process, so the effect of the
# CUDA_VISIBLE_DEVICES setting in the first cell can be checked by eye.
print(device_lib.list_local_devices())

# Process Joint Labels

In [None]:
# Load every annotation file in `directory` and keep the joint tracks of the
# videos whose action label is 'squat'.
directory = 'labels/'
frames = []
# os.listdir returns entries in arbitrary order; sort them so the order of
# `frames` (and therefore any positional train/test split of it) is the same
# on every run and every machine.
for filename in sorted(os.listdir(directory)):
    annotations = loadmat(os.path.join(directory, filename))
    if annotations['action'][0] == 'squat':
        # Create Nx13x2 joint labels for each video: x and y are stacked on a
        # new last axis, so frames[v][t, j] is the (x, y) pair of joint j at frame t
        frames.append(np.stack([annotations['x'], annotations['y']], axis=2))

In [None]:
# Keep only videos with more than 70 image frames.
# Iterate over whatever was actually loaded rather than a fixed count, so this
# neither raises IndexError on a smaller dataset nor ignores extra videos.
top_frames = [video for video in frames if video.shape[0] > 70]

# Positional split: the first 150 long-enough videos are used for training,
# the remainder is held out.
frames_train = top_frames[:150]
frames_test = top_frames[150:]

In [None]:
# Number of videos that passed the length filter (training and held-out combined)
len(top_frames)

# LSTM Params

In [None]:
L = 13 # number of joints per frame; a frame is flattened to 2 * L coordinates
k = 50 # number of observed frames fed to the network as input
T = 10 # number of future frames the network predicts
H = 1024 # hidden size of each LSTM layer

In [None]:
def RNN(p, weights, biases):
    """Predict T future frames from k observed frames with a 2-layer LSTM seq2seq.

    p        -- tensor of shape (batch_size, k, 2 * L) holding the observed frames
    weights  -- (H, 2 * L) matrix projecting each LSTM output to joint coordinates
    biases   -- (2 * L,) offset added after the projection

    Returns a tensor of shape (batch_size, T, 2 * L) with the predicted frames.
    """
    # Two stacked LSTM layers of H units each
    lstm_layers = [rnn.BasicLSTMCell(H, forget_bias=1.0) for _ in range(2)]
    cell = rnn.MultiRNNCell(lstm_layers)

    # Batch size is only known at run time, so read it from the tensor itself
    n_examples = tf.shape(p)[0]

    # Split along the time axis: a list of k tensors, each (batch_size, 2 * L)
    observed = tf.unstack(p, k, axis=1)
    # The decoder is fed the last observed frame at each of its T steps
    decoder_feed = [observed[-1]] * T
    decoder_outputs, _ = legacy_seq2seq.basic_rnn_seq2seq(observed, decoder_feed, cell)

    # decoder_outputs is a list of T (batch_size, H) tensors; joining them on
    # axis 0 gives a time-major (T * batch_size, H) matrix
    flat_outputs = tf.concat(decoder_outputs, axis=0)

    # Linear read-out, still time-major: (T * batch_size, 2 * L)
    flat_predictions = tf.matmul(flat_outputs, weights) + biases

    # Recover the time axis, then swap it with the batch axis
    time_major = tf.reshape(flat_predictions, (T, n_examples, L * 2))
    return tf.transpose(time_major, perm=[1, 0, 2])

In [None]:
# Start from an empty default graph so re-running this cell does not collide
# with the variables ('W', 'b', LSTM weights) created by a previous run.
tf.reset_default_graph()

# Parameters
learning_rate = 0.001
epochs = 2000
# NOTE(review): the training loop feeds one window from every training video
# in a single step, so batch_size does not control the size of the fed batch.
batch_size = 10
n_videos = len(frames_train)
display_step = 50  # print the loss every display_step epochs

# Observed frames: (batch, k, 2 * L); the batch dimension is left unspecified
p_input = tf.placeholder(tf.float32, shape=[None, k, L*2])
# Ground-truth future frames: (batch, T, 2 * L); the batch dimension is copied
# from p_input's static shape, which is unspecified as well
p_output = tf.placeholder(tf.float32, shape=[p_input.get_shape()[0], T, L*2])

# Linear read-out that maps an H-dimensional LSTM output to 2 * L coordinates
W = tf.get_variable('W', shape=[H, L*2], dtype=tf.float32, initializer=tf.contrib.layers.xavier_initializer())
b = tf.get_variable('b', shape=[L*2], dtype=tf.float32, initializer=tf.zeros_initializer())

p_output_predicted = RNN(p_input, W, b)

# Define loss and optimizer
# Mean squared error over every coordinate of every predicted frame
loss = tf.reduce_mean(tf.squared_difference(p_output_predicted, p_output))
# `optimizer` holds the result of minimize(), i.e. the op that performs one Adam update step
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(loss)

# LSTM Training/Validation

In [None]:
sess = tf.Session()
sess.run(tf.global_variables_initializer())
# To resume from a saved checkpoint instead of the fresh initialisation above:
# saver = tf.train.Saver()
# saver.restore(sess, 'lstm-reg-20000')
mean_losses = []  # mean training loss of each epoch, in order

window = k + T  # frames needed per sample: k observed + T to predict

# total_iter is fixed at 1: every epoch is a single optimisation step whose
# batch holds one random window from each training video. It does not change
# between epochs, so it is set once here.
total_iter = 1
for epoch in range(epochs):
    total_loss = 0
    for i in range(total_iter):
        inputs = []
        expected_outputs = []
        for frame in frames_train:
            # Random start such that the whole k + T window fits in the video
            start_time = np.random.randint(frame.shape[0] - window + 1)
            # First k frames are the input, the following T frames the target;
            # each frame's (L, 2) joints are flattened to 2 * L values
            inputs.append(frame[start_time : start_time + k].reshape(k, 2 * L))
            expected_outputs.append(frame[start_time + k : start_time + window].reshape(T, 2 * L))
        _, loss_value = sess.run((optimizer, loss), feed_dict={ p_input : np.asarray(inputs), p_output : np.asarray(expected_outputs) })
        total_loss += loss_value
    mean_loss = total_loss / total_iter
    mean_losses.append(mean_loss)
    if (epoch + 1) % display_step == 0:
        print('epoch %s: loss=%.4f' % (epoch + 1, mean_loss))

In [None]:
# Draw one random window of k observed + T future frames from each training
# video, then run the network on the observed part.
# NOTE(review): this samples from frames_train; frames_test is not used
# anywhere in the visible cells. Confirm that evaluating on training data is intended.
window = k + T
inputs, expected_outputs = [], []
for video in frames_train:
    begin = np.random.randint(video.shape[0] - window + 1)
    # Flatten each frame's joints to 2 * L values, then cut the window in two
    clip = video[begin : begin + window].reshape(window, 2 * L)
    inputs.append(clip[:k])
    expected_outputs.append(clip[k:])

output = sess.run(p_output_predicted, feed_dict={p_input: np.asarray(inputs)})


In [None]:
# Mean Euclidean distance between predicted and expected joint positions at
# each predicted time step, averaged over all sampled videos and all joints.
# The leading dimension is -1 so any number of videos is accepted; a fixed
# leading 1 only fits a single video and raises ValueError otherwise.
predicted_joints = np.asarray(output).reshape((-1, T, L, 2))
expected_joints = np.asarray(expected_outputs).reshape((-1, T, L, 2))
for i in range(T):
    # Difference is (videos, L, 2); the norm over the last axis gives one
    # distance per joint per video
    print(np.mean(np.linalg.norm(predicted_joints[:, i] - expected_joints[:, i], axis=2)))

In [None]:
# For the first sampled video, show predicted joints (left panel) next to the
# expected joints (right panel), one figure per predicted time step.
for step in range(T):
    print('T = ', step)
    predicted_xy = output[0][step].reshape(13, 2)
    expected_xy = expected_outputs[0][step].reshape(13, 2)
    for panel, joints in enumerate((predicted_xy, expected_xy), start=1):
        plt.subplot(1, 2, panel)
        # A 1x1 black image is drawn before the points; presumably this gives
        # the panel image-style axes - TODO confirm the intent
        plt.imshow(np.zeros((1,1)), cmap = 'gray')
        plt.scatter(joints[:, 0], joints[:, 1])
    plt.show()

In [None]:
# Print the predicted joint coordinates of the first sampled video for every
# predicted time step: the first-column values, then the second-column values.
for step in range(T):
    print('T = ', step)
    first_coords, second_coords = output[0][step].reshape(13, 2).T
    print(first_coords, second_coords)

In [None]:
# Save the session's variables to a checkpoint with base name 'lstm-reg'.
# NOTE(review): global_step is hard-coded to 20000 rather than derived from the
# number of epochs actually run; presumably it matches the 'lstm-reg-20000'
# checkpoint named in the commented-out restore in the training cell - confirm.
saver = tf.train.Saver()
saver.save(sess, 'lstm-reg', global_step=20000)