<a href="https://colab.research.google.com/github/rayane2907/Action_recognition_pose_estimation/blob/main/Joint_2D_and_3D_Pose_Estimation_and_Action_Recognition.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Joint 2D and 3D Pose Estimation and Action Recognition**
This project is based on the research paper *2D/3D Pose Estimation and Action Recognition using Multitask Deep Learning* by Diogo C. Luvizon, David Picard and Hedi Tabia.
Most of the ideas are derived from that paper; however, I changed the network architecture from Inception-V4 to VGG16.

link: https://github.com/dluvizon/deephar/tree/master

In [None]:
!pip install tensorflow[and-cuda]

In [None]:
import copy
import cv2
import gzip
import json
import numpy as np
import os
import queue
import random
import scipy.io as sio
import tensorflow as tf
import threading
import time
from google.colab.patches import cv2_imshow
from keras import backend as K
from keras.callbacks import Callback, LearningRateScheduler, ProgbarLogger, TensorBoard
from keras.constraints import unit_norm
from keras.layers import (Activation, AveragePooling2D, BatchNormalization, Conv1D, Conv2D, Conv2DTranspose, Conv3D,
                          Dense, Dropout, Flatten, GlobalAveragePooling1D, GlobalAveragePooling2D, GlobalAveragePooling3D,
                          GlobalMaxPooling1D, GlobalMaxPooling2D, GlobalMaxPooling3D, Input, Lambda, LeakyReLU, LocallyConnected1D,
                          LSTM, MaxPooling2D, MaxPooling3D, SeparableConv2D, SimpleRNN, TimeDistributed, UpSampling2D, UpSampling3D,
                          ZeroPadding2D, add, average, concatenate, maximum, multiply)
from keras.losses import binary_crossentropy
from keras.models import Model
from keras.optimizers import SGD
from keras.regularizers import l1
from keras.utils import OrderedEnqueuer, Sequence
from multiprocessing import Queue
from PIL import Image
from scipy.cluster.vq import kmeans
from scipy.stats import multivariate_normal
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.layers import (Conv2D, GlobalAveragePooling2D, MaxPooling2D, Reshape, TimeDistributed)
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers.legacy import RMSprop
from tensorflow.keras.preprocessing.image import img_to_array, load_img

# Configure the Keras backend to use the 'channels_last' image data format.
K.set_image_data_format('channels_last')

# **Utilities**

In [None]:

def get_valid_joints(x):
    """Reduce the mask `x > -1e6` along axis 1 with `_func_and`.

    # Arguments
        x: array of coordinates; values not greater than -1e6 are treated
            as "invalid" markers by the mask.

    # Return
        The result of applying `_func_and` to every 1-D slice of the mask
        taken along axis 1.
    """
    above_sentinel = x > -1e6
    return np.apply_along_axis(_func_and, 1, above_sentinel)


# ANSI escape sequences used to colorize console messages.
HEADER = '\033[95m'
OKBLUE = '\033[94m'
OKGREEN = '\033[92m'
WARNING = '\033[93m'
FAIL = '\033[91m'
ENDC = '\033[0m'  # appended after every colored message


def printc(color, vmsg):
    """Print `vmsg` wrapped in `color`/`ENDC` with no trailing newline.

    The standard output is flushed right away. `print(..., flush=True)` is
    used so the helper does not rely on a module-level `sys` import.
    """
    print(color + vmsg + ENDC, end='', flush=True)


def printcn(color, vmsg):
    """Print `vmsg` wrapped in `color`/`ENDC` followed by a newline, flushed."""
    print(color + vmsg + ENDC, flush=True)


def printnl(vmsg):
    """Write `vmsg` plus a newline to the standard output and flush it."""
    print(vmsg + '\n', end='', flush=True)


def warning(vmsg):
    """Write `vmsg` in the WARNING color to the standard error and flush it."""
    # Imported locally so the helper works without a top-level `sys` import.
    import sys

    sys.stderr.write(WARNING + vmsg + ENDC + '\n')
    sys.stderr.flush()


def sprintcn(color, vmsg):
    """Return (instead of printing) the colored message with a newline."""
    return color + vmsg + ENDC + '\n'

def sizeof_fmt(num, suffix='B'):
    """Format `num` with binary (1024-based) prefixes.

    # Arguments
        num: quantity to format.
        suffix: unit appended after the prefix (default 'B').

    # Return
        A string such as '1.5KiB'; values beyond 'Zi' are reported in 'Yi'.
    """
    binary_prefixes = ('', 'Ki', 'Mi', 'Gi', 'Ti', 'Pi', 'Ei', 'Zi')
    for prefix in binary_prefixes:
        if not abs(num) < 1024.0:
            # Too large for this prefix: scale down and try the next one.
            num /= 1024.0
            continue
        return "%3.1f%s%s" % (num, prefix, suffix)
    return "%.1f%s%s" % (num, 'Yi', suffix)

def sizeof_eng_fmt(num):
    """Format `num` with decimal (1000-based) engineering prefixes.

    # Return
        A string such as '1.5M'; values beyond 'Z' are reported in 'Y'.
    """
    for prefix in ('', 'K', 'M', 'G', 'T', 'P', 'E', 'Z'):
        if abs(num) < 1e3:
            break
        num /= 1e3
    else:
        # Every listed prefix was exhausted.
        prefix = 'Y'
    return "%3.1f%s" % (num, prefix)

# Default relative size used as the `relsize` default of the bounding-box
# helpers in this file.
relsize_std = 1.5
# Default value of the `square` flag of the bounding-box helpers.
square_std = True

class PoseBBox():
    """Array-like wrapper that computes bounding boxes from poses on access.

    # Arguments
        poses: array of poses. A 4-D array is treated as a set of clips whose
            axis 1 is the frame axis; any other rank is treated as one pose
            per item.
        relsize: relative size forwarded to `get_valid_bbox`.
        square: `square` flag forwarded to `get_valid_bbox`.
    """

    def __init__(self, poses, relsize=relsize_std, square=square_std):
        self.poses = poses
        self.relsize = relsize
        self.square = square
        if len(poses.shape) == 4:
            self.num_frames = poses.shape[1]
        else:
            self.num_frames = None

    def __getitem__(self, key):
        """Return the box(es) for an integer, a slice or a sequence of indices."""
        p = self.poses[key]
        # NumPy integer scalars (e.g. produced by np.arange or argmax) are not
        # instances of `int`, so they are accepted explicitly.
        if isinstance(key, (int, np.integer)):
            return self._get_bbox(p)
        if isinstance(key, slice):
            indices = key.indices(len(self))
            key = range(*indices)
        x = np.zeros((len(key),) + self.shape[1:])
        for i in range(len(key)):
            x[i,:] = self._get_bbox(p[i])
        return x

    def _get_bbox(self, p):
        """Compute the box of one item (a pose, or one box per frame)."""
        if self.num_frames is None:
            return get_valid_bbox(p, relsize=self.relsize, square=self.square)
        else:
            b = np.zeros(self.shape[1:])
            for f in range(self.num_frames):
                # Keyword arguments are required here: the second positional
                # parameter of get_valid_bbox is `jprob`, not `relsize`.
                b[f, :] = get_valid_bbox(p[f], relsize=self.relsize,
                        square=self.square)
            return b

    def __len__(self):
        return len(self.poses)

    @property
    def shape(self):
        """Shape of the virtual array of boxes (4 values per box)."""
        if self.num_frames is None:
            return (len(self), 4)
        else:
            return (len(self), self.num_frames, 4)

def get_valid_bbox(points, jprob=None, relsize=relsize_std, square=square_std):
    """Compute a bounding box around the valid points.

    # Arguments
        points: array indexed as [point, coordinate]; columns 0 and 1 are
            used as x and y.
        jprob: optional per-point scores; points with a score above 0.5 are
            kept. When None, validity comes from `get_valid_joints`.
        relsize: factor applied to the tight extent of the kept points.
        square: when true, both half-sizes are set to the larger one.

    # Return
        np.array([x1, y1, x2, y2]).

    # Raises
        ValueError: if no point is valid.
    """
    if jprob is None:
        valid = get_valid_joints(points)
    else:
        valid = np.squeeze(jprob > 0.5)

    if not valid.any():
        raise ValueError('get_valid_bbox: all points are invalid!')

    keep = valid == 1
    xs = points[keep, 0]
    ys = points[keep, 1]

    x_lo, x_hi = min(xs), max(xs)
    y_lo, y_hi = min(ys), max(ys)

    center_x = (x_lo + x_hi) / 2.
    center_y = (y_lo + y_hi) / 2.
    half_w = (relsize * (x_hi - x_lo)) / 2.
    half_h = (relsize * (y_hi - y_lo)) / 2.
    if square:
        half_w = half_h = max(half_w, half_h)

    return np.array([center_x - half_w, center_y - half_h,
                     center_x + half_w, center_y + half_h])

def get_valid_bbox_array(pointarray, jprob=None, relsize=relsize_std,
        square=square_std):
    """Compute one bounding box per entry of `pointarray`.

    # Arguments
        pointarray: sequence of point arrays, one per sample.
        jprob: optional sequence of per-sample scores, indexed like
            `pointarray`; forwarded sample by sample to `get_valid_bbox`.
        relsize, square: forwarded to `get_valid_bbox`.

    # Return
        Array of shape (len(pointarray), 4).
    """
    bboxes = np.zeros((len(pointarray), 4))
    for idx in range(len(pointarray)):
        sample_prob = None if jprob is None else jprob[idx]
        bboxes[idx, :] = get_valid_bbox(pointarray[idx], jprob=sample_prob,
                relsize=relsize, square=square)

    return bboxes

def get_objpos_winsize(points, relsize=relsize_std, square=square_std):
    """Return the center and window size enclosing all `points`.

    # Arguments
        points: array indexed as [point, coordinate]; columns 0 and 1 are
            used as x and y.
        relsize: factor applied to the tight extent of the points.
        square: when true, width and height are both set to the larger one.

    # Return
        A tuple (np.array([cx, cy]), (w, h)).
    """
    xs, ys = points[:, 0], points[:, 1]
    x_lo, x_hi = min(xs), max(xs)
    y_lo, y_hi = min(ys), max(ys)

    center = np.array([(x_lo + x_hi) / 2., (y_lo + y_hi) / 2.])
    width = relsize * (x_hi - x_lo)
    height = relsize * (y_hi - y_lo)
    if square:
        width = height = max(width, height)

    return center, (width, height)

def compute_grid_bboxes(frame_size, grid=(3, 2),
        relsize=relsize_std,
        square=square_std):
    """Build a fixed set of candidate bounding boxes for a frame.

    The result holds, in order: one box centered on the frame, the same box
    enlarged by `relsize`, and one box per cell of a `grid[0]` x `grid[1]`
    layout (rows iterated in the outer loop).

    # Arguments
        frame_size: (width, height) of the frame.
        grid: number of grid positions along x and y.
        relsize: scale applied to the second box.
        square: when true, half-sizes are set to the larger of the two.

    # Return
        Array of shape (2 + grid[0]*grid[1], 4) with rows [x1, y1, x2, y2].
    """
    cols, rows = grid[0], grid[1]
    bboxes = np.zeros((2 + cols*rows, 4))

    def _half_sizes(a, b):
        if not square:
            return a, b
        return max(a, b), max(a, b)

    def _box(center_x, center_y, half_w, half_h):
        return np.array([center_x-half_w, center_y-half_h,
                         center_x+half_w, center_y+half_h])

    # First two boxes: the full frame, then the full frame scaled by relsize.
    cx = frame_size[0] / 2
    cy = frame_size[1] / 2
    rw, rh = _half_sizes(cx, cy)
    bboxes[0, :] = _box(cx, cy, rw, rh)

    rw *= relsize
    rh *= relsize
    bboxes[1, :] = _box(cx, cy, rw, rh)

    # Remaining boxes: centers laid out on a regular grid inside the frame.
    winrw = frame_size[0] / (cols+1)
    winrh = frame_size[1] / (rows+1)
    rw, rh = _half_sizes(winrw, winrh)

    for j in range(1, rows+1):
        for i in range(1, cols+1):
            slot = 2 + (j-1)*cols + (i-1)
            bboxes[slot, :] = _box(i * winrw, j * winrh, rw, rh)

    return bboxes

def bbox_to_objposwin(bbox):
    """Convert a box [x1, y1, x2, y2] to (center, window size).

    # Return
        A tuple (np.array([cx, cy]), (width, height)).
    """
    left, top, right, bottom = bbox[0], bbox[1], bbox[2], bbox[3]
    center = np.array([(left + right) / 2, (top + bottom) / 2])
    window = (right - left, bottom - top)

    return center, window

def objposwin_to_bbox(objpos, winsize):
    """Convert a center and window size to a box [x1, y1, x2, y2].

    # Arguments
        objpos: (cx, cy) center of the window.
        winsize: (width, height) of the window.
    """
    half_w = winsize[0]/2
    half_h = winsize[1]/2

    return np.array([objpos[0] - half_w, objpos[1] - half_h,
                     objpos[0] + half_w, objpos[1] + half_h])


# Keys for which the "no ground-truth bounding box" warning was already shown.
logkey_warn = set()
def get_gt_bbox(pose, visible, image_size, scale=1.0, logkey=None):
    """Compute a bounding box covering a clip from its ground-truth poses.

    Only the first, middle and last frames are inspected (or the single frame
    of a one-frame clip); the result is the union of their boxes.

    # Arguments
        pose: array of shape (num_frames, num_joints, dim), with dim >= 2.
        visible: per-frame joint visibility; joints with a value >= 0.5 are
            used. NOTE(review): assumed to be (num_frames, num_joints) so it
            can mask the joint axis -- confirm against callers.
        image_size: (width, height), used when a frame has no usable joint.
        scale: multiplies the 1.5 relative size given to `get_valid_bbox`.
        logkey: identifier used to emit the fallback warning only once.

    # Return
        np.array([x1, y1, x2, y2]).
    """
    assert len(pose.shape) == 3 and pose.shape[-1] >= 2, \
            'Invalid pose shape ({})'.format(pose.shape) \
            + ', expected (num_frames, num_joints, dim) vector'
    assert len(pose) == len(visible), \
            'pose and visible should have the same langth'

    if len(pose) == 1:
        idx = [0]
    else:
        idx = [0, int(len(pose)/2 + 0.5), len(pose)-1]

    clip_bbox = np.array([np.inf, np.inf, -np.inf, -np.inf])

    for i in idx:
        temp = pose[i, visible[i] >= 0.5]
        if len(temp) == 0:
            # Fallback when no joint is flagged visible: keep the joints whose
            # coordinates are all positive. The mask is reduced per joint so
            # `temp` stays 2-D (joints, dim); an element-wise mask would
            # flatten it and break get_valid_bbox.
            temp = pose[i, np.all(pose[i] > 0, axis=-1)]

        if len(temp) > 0:
            b = get_valid_bbox(temp, relsize=1.5*scale)

            clip_bbox[0] = min(b[0], clip_bbox[0])
            clip_bbox[1] = min(b[1], clip_bbox[1])
            clip_bbox[2] = max(b[2], clip_bbox[2])
            clip_bbox[3] = max(b[3], clip_bbox[3])
        else:
            if logkey not in logkey_warn:
                warning('No ground-truth bounding box, ' \
                        'using full image (key {})!'.format(logkey))
            logkey_warn.add(logkey)

            # No usable joint in this frame: extend the box to the full image.
            clip_bbox[0] = min(0, clip_bbox[0])
            clip_bbox[1] = min(0, clip_bbox[1])
            clip_bbox[2] = max(image_size[0], clip_bbox[2])
            clip_bbox[3] = max(image_size[1], clip_bbox[3])

    return clip_bbox


def get_crop_params(rootj, imgsize, f, scale):
    """Compute crop center, window size and depth range from root joints.

    The first, middle and last entries of `rootj` are inspected (or the only
    entry of a single-frame input) and their windows are merged.

    # Arguments
        rootj: array of shape (n, 3) with the root joint (x, y, depth) per
            frame.
        imgsize: (width, height) of the image.
        f: focal length indexed as f[0, 0] and f[0, 1].
        scale: scale factor applied to the window size and the depth range.

    # Return
        A tuple (objpos, winsize, zrange): the center and (w, h) of the merged
        window, and np.array([zmin, zmax]) built from depth -/+ scale*1000.
    """
    assert len(rootj.shape) == 2 and rootj.shape[-1] == 3, 'Invalid rootj ' \
            + 'shape ({}), expected (n, 3) vector'.format(rootj.shape)

    if len(rootj) == 1:
        idx = [0]
    else:
        idx = [0, int(len(rootj)/2 + 0.5), len(rootj)-1]

    x1 = y1 = np.inf
    x2 = y2 = -np.inf
    zrange = np.array([np.inf, -np.inf])
    for i in idx:
        # Use the sampled frame `i`; always reading frame 0 would ignore the
        # middle and last frames the loop is meant to cover.
        objpos = np.array([rootj[i, 0], rootj[i, 1] + scale])
        d = rootj[i, 2]
        winsize = (2.25*scale)*max(imgsize[0]*f[0, 0]/d, imgsize[1]*f[0, 1]/d)
        bo = objposwin_to_bbox(objpos, (winsize, winsize))
        x1 = min(x1, bo[0])
        y1 = min(y1, bo[1])
        x2 = max(x2, bo[2])
        y2 = max(y2, bo[3])
        zrange[0] = min(zrange[0], d - scale*1000.)
        zrange[1] = max(zrange[1], d + scale*1000.)

    objpos, winsize = bbox_to_objposwin([x1, y1, x2, y2])

    return objpos, winsize, zrange

In [None]:

def transform_pose_sequence(A, poses, inverse=True):
    """For each pose in a sequence, apply the given affine transformation.

    The input matrices are never modified: when `inverse` is set, the
    inverse is computed into a new array.

    # Arguments
        A: [3, 3] affine transformation matrix or
           [num_samples, 3, 3] matrices.
        poses: [num_samples, num_points, dim] vector of pose sequences.
        inverse: flag to apply the inverse transformation on A.

    # Return
        The transformed points, as an array with the shape of `poses`.
    """

    assert (len(poses.shape) == 3), \
            'transform_pose_sequence: expected 3D tensor, got ' \
            + str(poses.shape)

    per_sample = len(A.shape) == 3
    if per_sample:
        assert len(A) == len(poses), \
                'A is ' + str(A.shape) + ' and poses is ' + str(poses.shape)

    if inverse:
        # np.linalg.inv inverts a single matrix or a stack of matrices and
        # returns a new array, so the caller's `A` is left untouched
        # (in-place inversion would flip it back on a second call).
        A = np.linalg.inv(A)

    y = np.empty(poses.shape)
    for j in range(len(poses)):
        mat = A[j] if per_sample else A
        y[j, :, :] = transform_2d_points(mat, poses[j], transpose=True)

    return y

class Camera(object):
    """Camera model with projection to and from image coordinates.

    # Arguments
        R: Rotation matrix (3,3)
        t: Translation vector world coordinate system (3, 1)
        f: Focal length (1, 2)
        c: Principal point (1, 2)
        p: (1, 2) coefficients handed to the distortion helper with `k`.
        k: Distortion coefficients (3,), frequently not required.
    """

    def __init__(self, R, t, f, c, p, k=None):
        self.R = R
        self.R_inv = np.linalg.inv(self.R)
        self.t = np.reshape(t, (3, 1))
        self.f = np.reshape(f, (1, 2))
        self.c = np.reshape(c, (1, 2))
        self.p = np.reshape(p, (1, 2))
        self.k = None if k is None else np.reshape(k, (3,))

    def project(self, points_w):
        """Project world 3D points (in millimeters) to the image coordinate
        system (x-y in pixels, plus depth). Input must have shape (n, 3).
        """
        shape = points_w.shape
        assert len(shape) == 2 and shape[1] == 3, \
                'Invalid shape for points_w ({}),'.format(shape) \
                + 'expected (n, 3)'

        cam = np.matmul(self.R, points_w.T - self.t).T
        xy = cam[:, 0:2]  # view: in-place updates below write into `cam`
        xy /= cam[:, 2:3]

        if self.k is not None:
            r2, radial, tan = get_r2_radial_tan(xy, self.k, self.p)
            xy *= np.expand_dims(radial + tan, axis=-1)
            xy += np.dot(np.expand_dims(r2, axis=-1), self.p)

        xy[:] = xy*self.f + self.c

        return cam

    def inverse_project(self, points_uvd):
        """Project points given as x-y in pixels and depth back to world 3D
        coordinates (in millimeters). Input must have shape (n, 3).
        """
        shape = points_uvd.shape
        assert len(shape) == 2 and shape[1] == 3, \
                'Invalid shape for points_uvd ({}),'.format(shape) \
                + ' expected (n, 3)'

        cam = points_uvd.copy()
        xy = cam[:, 0:2]  # view: in-place updates below write into `cam`
        xy[:] = (xy - self.c) / self.f

        if self.k is not None:
            r2, radial, tan = get_r2_radial_tan(xy, self.k, self.p)
            xy -= np.dot(np.expand_dims(r2, axis=-1), self.p)
            xy /= np.expand_dims(radial + tan, axis=-1)

        xy *= cam[:, 2:3]

        return (np.matmul(self.R_inv, cam.T) + self.t).T

    def serialize(self):
        """Flatten the parameters to a 1-D array: R, t, f, c, p and, when
        present, k (in that order).
        """
        parts = [
            np.array(self.R).reshape((9,)),
            np.array(self.t).reshape((3,)),
            np.array(self.f).reshape((2,)),
            np.array(self.c).reshape((2,)),
            np.array(self.p).reshape((2,)),
        ]
        if self.k is not None:
            parts.append(self.k)

        return np.concatenate(parts)

def get_r2_radial_tan(x, k, p):
    """Compute the terms used to apply the distortion coefficients.

    # Arguments
        x: points of shape (num_points, 2).
        k: coefficients of shape (3,).
        p: coefficients of shape (1, 2).

    # Return
        A tuple (r2, radial, tan), each of shape (num_points,): the squared
        norm of each point, the polynomial 1 + k0*r2 + k1*r2^2 + k2*r2^3,
        and the per-point sum of x * p.
    """
    assert x.ndim == 2 and x.shape[1] == 2
    assert k.shape == (3,) and p.shape == (1, 2)

    squared = np.power(x, 2)
    r2 = squared[:, 0] + squared[:, 1]
    radial = 1. + r2*k[0] + np.power(r2, 2)*k[1] + np.power(r2, 3)*k[2]
    tan = (x * p).sum(axis=-1)

    return r2, radial, tan


def camera_deserialize(s):
    """Rebuild a `Camera` from a flat parameter vector.

    The vector is read as R (9 values), t (3), f (2), c (2), p (2) and, if
    anything remains, k (the next 3 values).
    """
    R, t, f, c, p, rest = np.split(s, [9, 12, 14, 16, 18])

    k = rest[:3] if len(rest) > 0 else None

    return Camera(np.reshape(R, (3, 3)), t, f, c, p, k)


def project_pred_to_camera(pred, afmat, resol_z, root_z):
    """Map predictions back to camera coordinates.

    # Arguments
        pred: array of shape (num_samples, num_joints, dim); columns 0-1 are
            transformed with the inverse of `afmat`, column 2 is rescaled.
        afmat: transformation matrix (or matrices) given to
            `transform_pose_sequence`.
        resol_z: factor applied to (pred z - 0.5).
        root_z: per-sample offset added to the rescaled z.

    # Return
        Array with the shape of `pred`.
    """
    # Unpacking also enforces that `pred` is a 3-D array.
    _num_samples, _num_joints, _dim = pred.shape
    depth_offset = np.expand_dims(root_z, axis=-1)

    proj = np.zeros(pred.shape)
    xy = pred[:, :, 0:2]
    proj[:, :, 0:2] = transform_pose_sequence(afmat, xy, inverse=True)
    proj[:, :, 2] = (resol_z * (pred[:, :, 2] - 0.5)) + depth_offset

    return proj


In [None]:

def most_assigned(x, c):
    """Return the index of the center that attracts the most points.

    # Arguments
        x: 2-D points, one per row.
        c: candidate centers of shape (num_centers, 2).

    # Return
        Index (into `c`) of the center that is nearest to the largest number
        of points of `x`.
    """
    votes = np.zeros(len(c))
    for i in range(len(x)):
        # Broadcasting the (1, 2) point against every center.
        offsets = x[i].reshape((1, 2)) - c
        dist = np.sqrt(np.sum(np.power(offsets, 2), axis=1))
        votes[dist.argmin()] += 1
    return votes.argmax()

def mean_on_most_assigned(x, c):
    """Return the mean of the points assigned to the most popular center.

    Each point of `x` is assigned to its nearest center in `c`; the points of
    the center with the most assignments are averaged.

    # Arguments
        x: 2-D points, one per row.
        c: candidate centers of shape (num_centers, 2).
    """
    counts = np.zeros(len(c))
    sums = np.zeros(c.shape)
    for i in range(len(x)):
        offsets = x[i].reshape((1, 2)) - c
        dist = np.sqrt(np.sum(np.power(offsets, 2), axis=1))
        nearest = dist.argmin()
        counts[nearest] += 1
        sums[nearest, :] += x[i]
    winner = counts.argmax()
    return sums[winner, :] / counts[winner]

# def best_kmeans(pred):
    # plt.scatter(pred[:,0], pred[:,1], color='b')
    # c,v = kmeans(pred, 3)
    # plt.scatter(c[:,0], c[:,1], color='g')
    # n = most_assigned(pred, c)
    # plt.scatter(c[n,0], c[n,1], color='r')
    # plt.show()

def clustering_joints(y_pred, k=3):
    """Merge several predictions per joint by k-means voting.

    # Arguments
        y_pred: array of shape (num_predictions, nb_spl, nb_joints, dim).
        k: number of clusters requested from `kmeans`.

    # Return
        Array of shape (nb_spl, nb_joints, dim) holding, for every sample and
        joint, the centroid selected by `most_assigned`.
    """
    _, nb_spl, nb_joints, dim = y_pred.shape
    y = np.zeros((nb_spl, nb_joints, dim))
    for s, j in np.ndindex(nb_spl, nb_joints):
        candidates = y_pred[:, s, j]
        centroids, _distortion = kmeans(candidates, k)
        y[s, j, :] = centroids[most_assigned(candidates, centroids)]
    return y

def clustering_grid(y_pred, size=10):
    """Merge several 2-D predictions per joint using a regular grid.

    For every sample and joint, the bounding rectangle of the predictions is
    divided into `size` x `size` cells; the cell centers are used as
    candidate centers for `mean_on_most_assigned`.

    # Arguments
        y_pred: array of shape (num_predictions, nb_spl, nb_joints, 2).
        size: number of grid cells per axis.

    # Return
        Array of shape (nb_spl, nb_joints, 2).
    """
    _, nb_spl, nb_joints, dim = y_pred.shape
    assert dim == 2
    yp = np.zeros((nb_spl, nb_joints, dim))
    for s, j in np.ndindex(nb_spl, nb_joints):
        d = y_pred[:, s, j, :]
        xmin, xmax = d[:, 0].min(), d[:, 0].max()
        ymin, ymax = d[:, 1].min(), d[:, 1].max()
        xstep = (xmax - xmin) / size
        ystep = (ymax - ymin) / size

        centers = np.zeros((size * size, dim))
        for gy in range(size):
            for gx in range(size):
                cell = gx + size*gy
                centers[cell, 0] = xmin + (gx + 0.5) * xstep
                centers[cell, 1] = ymin + (gy + 0.5) * ystep
        yp[s, j, :] = mean_on_most_assigned(d, centers)
    return yp

def mean_joints(y_pred):
    """Average several 2-D predictions per joint.

    # Arguments
        y_pred: array of shape (num_predictions, nb_spl, 2, nb_joints).

    # Return
        Array of shape (nb_spl, 2, nb_joints) with the mean over axis 0.
    """
    _, nb_spl, dim, nb_joints = y_pred.shape
    assert dim == 2
    yp = np.zeros((nb_spl, dim, nb_joints))
    for s, j in np.ndindex(nb_spl, nb_joints):
        for axis in range(dim):
            yp[s, axis, j] = y_pred[:, s, axis, j].mean()
    return yp


In [None]:

# Named colors mapped to '#RRGGBB' hexadecimal strings.
cnames = {
        'aliceblue':            '#F0F8FF',
        'antiquewhite':         '#FAEBD7',
        'aqua':                 '#00FFFF',
        'aquamarine':           '#7FFFD4',
        'azure':                '#F0FFFF',
        'beige':                '#F5F5DC',
        'bisque':               '#FFE4C4',
        'black':                '#000000',
        'blanchedalmond':       '#FFEBCD',
        'blue':                 '#0000FF',
        'blueviolet':           '#8A2BE2',
        'brown':                '#A52A2A',
        'burlywood':            '#DEB887',
        'cadetblue':            '#5F9EA0',
        'chartreuse':           '#7FFF00',
        'chocolate':            '#D2691E',
        'coral':                '#FF7F50',
        'cornflowerblue':       '#6495ED',
        'cornsilk':             '#FFF8DC',
        'crimson':              '#DC143C',
        'cyan':                 '#00FFFF',
        'darkblue':             '#00008B',
        'darkcyan':             '#008B8B',
        'darkgoldenrod':        '#B8860B',
        'darkgray':             '#A9A9A9',
        'darkgreen':            '#006400',
        'darkkhaki':            '#BDB76B',
        'darkmagenta':          '#8B008B',
        'darkolivegreen':       '#556B2F',
        'darkorange':           '#FF8C00',
        'darkorchid':           '#9932CC',
        'darkred':              '#8B0000',
        'darksalmon':           '#E9967A',
        'darkseagreen':         '#8FBC8F',
        'darkslateblue':        '#483D8B',
        'darkslategray':        '#2F4F4F',
        'darkturquoise':        '#00CED1',
        'darkviolet':           '#9400D3',
        'deeppink':             '#FF1493',
        'deepskyblue':          '#00BFFF',
        'dimgray':              '#696969',
        'dodgerblue':           '#1E90FF',
        'firebrick':            '#B22222',
        'floralwhite':          '#FFFAF0',
        'forestgreen':          '#228B22',
        'fuchsia':              '#FF00FF',
        'gainsboro':            '#DCDCDC',
        'ghostwhite':           '#F8F8FF',
        'gold':                 '#FFD700',
        'goldenrod':            '#DAA520',
        'gray':                 '#808080',
        'green':                '#008000',
        'greenyellow':          '#ADFF2F',
        'honeydew':             '#F0FFF0',
        'hotpink':              '#FF69B4',
        'indianred':            '#CD5C5C',
        'indigo':               '#4B0082',
        'ivory':                '#FFFFF0',
        'khaki':                '#F0E68C',
        'lavender':             '#E6E6FA',
        'lavenderblush':        '#FFF0F5',
        'lawngreen':            '#7CFC00',
        'lemonchiffon':         '#FFFACD',
        'lightblue':            '#ADD8E6',
        'lightcoral':           '#F08080',
        'lightcyan':            '#E0FFFF',
        'lightgoldenrodyellow': '#FAFAD2',
        'lightgreen':           '#90EE90',
        'lightgray':            '#D3D3D3',
        'lightpink':            '#FFB6C1',
        'lightsalmon':          '#FFA07A',
        'lightseagreen':        '#20B2AA',
        'lightskyblue':         '#87CEFA',
        'lightslategray':       '#778899',
        'lightsteelblue':       '#B0C4DE',
        'lightyellow':          '#FFFFE0',
        'lime':                 '#00FF00',
        'limegreen':            '#32CD32',
        'linen':                '#FAF0E6',
        'magenta':              '#FF00FF',
        'maroon':               '#800000',
        'mediumaquamarine':     '#66CDAA',
        'mediumblue':           '#0000CD',
        'mediumorchid':         '#BA55D3',
        'mediumpurple':         '#9370DB',
        'mediumseagreen':       '#3CB371',
        'mediumslateblue':      '#7B68EE',
        'mediumspringgreen':    '#00FA9A',
        'mediumturquoise':      '#48D1CC',
        'mediumvioletred':      '#C71585',
        'midnightblue':         '#191970',
        'mintcream':            '#F5FFFA',
        'mistyrose':            '#FFE4E1',
        'moccasin':             '#FFE4B5',
        'navajowhite':          '#FFDEAD',
        'navy':                 '#000080',
        'oldlace':              '#FDF5E6',
        'olive':                '#808000',
        'olivedrab':            '#6B8E23',
        'orange':               '#FFA500',
        'orangered':            '#FF4500',
        'orchid':               '#DA70D6',
        'palegoldenrod':        '#EEE8AA',
        'palegreen':            '#98FB98',
        'paleturquoise':        '#AFEEEE',
        'palevioletred':        '#DB7093',
        'papayawhip':           '#FFEFD5',
        'peachpuff':            '#FFDAB9',
        'peru':                 '#CD853F',
        'pink':                 '#FFC0CB',
        'plum':                 '#DDA0DD',
        'powderblue':           '#B0E0E6',
        'purple':               '#800080',
        'red':                  '#FF0000',
        'rosybrown':            '#BC8F8F',
        'royalblue':            '#4169E1',
        'saddlebrown':          '#8B4513',
        'salmon':               '#FA8072',
        # Corrected from '#FAA460' to the standard named-color value.
        'sandybrown':           '#F4A460',
        'seagreen':             '#2E8B57',
        'seashell':             '#FFF5EE',
        'sienna':               '#A0522D',
        'silver':               '#C0C0C0',
        'skyblue':              '#87CEEB',
        'slateblue':            '#6A5ACD',
        'slategray':            '#708090',
        'snow':                 '#FFFAFA',
        'springgreen':          '#00FF7F',
        'steelblue':            '#4682B4',
        'tan':                  '#D2B48C',
        'teal':                 '#008080',
        'thistle':              '#D8BFD8',
        'tomato':               '#FF6347',
        'turquoise':            '#40E0D0',
        'violet':               '#EE82EE',
        'wheat':                '#F5DEB3',
        'white':                '#FFFFFF',
        'whitesmoke':           '#F5F5F5',
        'yellow':               '#FFFF00',
        'yellowgreen':          '#9ACD32'
}

# Hex strings of all named colors, in the insertion order of `cnames`.
# Built in one expression so no loop variable is left in the module namespace.
hex_colors = list(cnames.values())

def hexcolor2tuple(s):
    """Convert a '#RRGGBB' string to an (r, g, b) tuple of floats in [0, 1]."""
    return tuple(int(s[start:start + 2], 16)/255. for start in (1, 3, 5))


In [None]:

def mkdir(path):
    """Create the directory `path` if it does not exist yet.

    Missing parent directories are created as well, and an already existing
    directory is not an error. A single `os.makedirs` call replaces the
    check-then-create sequence, which could fail if the directory appeared
    between the check and the creation.

    # Raises
        FileExistsError: if `path` exists and is not a directory.
    """
    os.makedirs(path, exist_ok=True)


In [None]:

def normalpdf2d(numbins, xmean, ymean, var):
    """Build a normalized 2-D Gaussian map of size numbins x numbins.

    # Arguments
        numbins: number of bins per axis; bin positions are 0..numbins-1.
        xmean, ymean: center of the Gaussian along columns and rows.
        var: variance used for both axes.

    # Return
        Array of shape (numbins, numbins) that sums to one, or all zeros when
        the unnormalized sum does not exceed `K.epsilon()`.
    """
    bins = np.linspace(0, numbins-1, numbins)

    # One 1-D Gaussian per axis; their outer product is the 2-D map, with
    # rows following y and columns following x.
    col_profile = multivariate_normal.pdf(bins, mean=xmean, cov=var)
    row_profile = multivariate_normal.pdf(bins, mean=ymean, cov=var)
    g = np.outer(row_profile, col_profile)

    total = g.sum()
    if total > K.epsilon():
        return g / total

    return np.zeros(g.shape)
def get_visible_joints(x, margin=0.0):
    """Flag the rows of `x` lying strictly inside (margin, 1 - margin).

    Both bounds are reduced along axis 1 with `_func_and` and the two results
    are multiplied together.
    """
    above_low = np.apply_along_axis(_func_and, axis=1, arr=(x > margin))
    below_high = np.apply_along_axis(_func_and, axis=1, arr=(x < 1 - margin))
    above_low *= below_high

    return above_low

class HeatMaps2D():
    """Array-like wrapper that renders pose heat maps on access.

    # Arguments
        poses: array of 2D or 3D poses (last axis of size 2 or 3). A 4-D
            array is treated as clips whose axis 1 is the frame axis.
        numbins: side of the square heat maps.
        variance: variance forwarded to `pose_heatmaps`.
    """

    def __init__(self, poses, numbins, variance=0.3):
        assert (poses.shape[-1] == 2) or ((poses.shape[-1] == 3)), \
                'Poses are expected to by 2D or 3D!'
        self.poses = poses
        if len(poses.shape) == 4:
            self.num_frames = poses.shape[1]
        else:
            self.num_frames = None

        self.numbins = numbins
        self.variance = variance
        self.num_joints = int(poses.shape[-2])

    def __getitem__(self, key):
        """Return the maps for an integer, a slice or a sequence of indices."""
        p = self.poses[key]
        # NumPy integer scalars are not instances of `int`, so they are
        # accepted explicitly instead of failing on len(key) below.
        if isinstance(key, (int, np.integer)):
            return pose_heatmaps(p, self.numbins, self.num_joints,
                    variance=self.variance, num_frames=self.num_frames)
        if isinstance(key, slice):
            indices = key.indices(len(self))
            key = range(*indices)
        x = np.zeros((len(key),) + self.shape[1:])
        for i in range(len(key)):
            x[i,:] = pose_heatmaps(p[i], self.numbins, self.num_joints,
                    variance=self.variance, num_frames=self.num_frames)
        return x


    def __len__(self):
        return len(self.poses)

    @property
    def shape(self):
        """Shape of the virtual array of heat maps."""
        if self.num_frames is None:
            return (len(self),) + (self.numbins, self.numbins, self.num_joints)
        else:
            return (len(self),) + (self.num_frames,
                    self.numbins, self.numbins, self.num_joints)


def pose_heatmaps(p, num_bins, num_joints, variance=0.1, num_frames=None):
    """Render one Gaussian heat map per visible joint.

    # Arguments
        p: a single pose indexed as [joint, coordinate], or a sequence of
            `num_frames` poses when `num_frames` is given. Only the first two
            coordinates are used.
        num_bins: side of the square heat maps; coordinates are multiplied
            by it to get bin positions.
        num_joints: number of joints to render.
        variance: variance forwarded to `normalpdf2d`.
        num_frames: number of frames in `p`, or None for a single pose.

    # Return
        Array of shape (num_bins, num_bins, num_joints), with a leading
        `num_frames` axis when `num_frames` is given. Joints not flagged by
        `get_visible_joints` keep an all-zero map.
    """
    def _render(pose, out):
        # `out` is a (num_bins, num_bins, num_joints) array filled in place.
        xy = pose[:, 0:2]
        visible = get_visible_joints(xy)
        scaled = num_bins * xy
        for j in range(num_joints):
            if visible[j]:
                out[:, :, j] = normalpdf2d(num_bins,
                        scaled[j, 0], scaled[j, 1], variance)

    if num_frames is None:
        h = np.zeros((num_bins, num_bins, num_joints))
        _render(p, h)
        return h

    h = np.zeros((num_frames, num_bins, num_bins, num_joints))
    for f in range(num_frames):
        _render(p[f], h[f])
    return h


In [None]:

def linspace_2d(nb_rols, nb_cols, dim=0):
    """Build a (nb_rols, nb_cols) float32 ramp from 0 to 1.

    # Arguments
        nb_rols: number of rows of the result.
        nb_cols: number of columns of the result.
        dim: with 1 the values vary along the rows (axis 0); with any other
            value they vary along the columns (axis 1).
    """
    along_rows = dim == 1
    if along_rows:
        size, nb_repeat = nb_rols, nb_cols
    else:
        size, nb_repeat = nb_cols, nb_rols

    ramp = np.linspace(0.0, 1.0, num=size).astype(np.float32)
    grid = np.tile(ramp, (nb_repeat, 1))

    return grid.T if along_rows else grid


In [None]:

# Integer identifiers for the test, train and validation modes.
TEST_MODE = 0
TRAIN_MODE = 1
VALID_MODE = 2


class BaseParser(object):
    """Line-oriented reader for "parameter: attr, attr, ..." annotation files.

    The class attributes below are shared, class-level accumulators of
    dataset statistics.
    """

    compute_dataset_info = True
    avg_num_frames = 0
    pose_min = np.array([np.inf, np.inf, np.inf])
    pose_max = np.array([-np.inf, -np.inf, -np.inf])

    def __init__(self, fid):
        # `fid` must provide readline() returning bytes.
        self.fid = fid

    def getattr(self):
        """Read the next valid line and return (parameter, attributes).

        # Raises
            EOFError: when the file has no more lines.
        """
        line = self.read_valid_line()
        if line is not None:
            return self.get_par_attr(line)
        raise EOFError('File consumed!')

    def get_par_attr(self, line):
        """Split a "parameter: a, b, c," line into its name and attributes."""
        val = line.split(':')
        assert len(val) == 2, 'Wrong "parameter: attributes," content'

        name, raw_attr = val
        attr = raw_attr.split(',')
        # A trailing comma leaves an empty (or newline-only) last element.
        if attr[-1] in ('', '\n'):
            attr.pop()

        return name, [item.strip() for item in attr]


    def read_valid_line(self):
        """Return the next decoded line that does not start with NUL, a
        newline or '#', or None at the end of the file.
        """
        while True:
            raw = self.fid.readline()
            if not raw:
                return None
            text = raw.decode('utf-8')
            if text[0] not in ('\0', '\n', '#'):
                return text

class BaseElement(object):
    """Node of the annotation tree that keeps a reference to its parent."""

    def __init__(self, parent=None):
        self.parent = parent

    def copy(self):
        """Return a deep copy of this element."""
        return copy.deepcopy(self)

    def get_camera(self):
        """Return this element's `camera`, or the closest ancestor's one.

        Returns None when neither this element nor any ancestor has a
        `camera` attribute.
        """
        if hasattr(self, 'camera'):
            return self.camera
        parent = getattr(self, 'parent', None)
        if parent is None:
            return None
        return parent.get_camera()


class Entity(BaseElement):
    """Annotated entity parsed from the annotation stream.

    Lines are consumed from `parser` until a 'mode' parameter is read.
    Recognized parameters: action_id, viewpoint_id, scale, objpos, head,
    x, y, z, v and mode. Joint positions (`pos`) and visibilities (`vis`)
    start filled with NaN.
    """

    def __init__(self, parser, dim, num_joints, parent):
        BaseElement.__init__(self, parent=parent)
        self.pos = np.full((num_joints, dim), np.nan)
        self.vis = np.full((num_joints, 1), np.nan)
        self.mode = -1

        def _fill_column(target, column, values):
            # One value per joint is expected.
            assert num_joints == len(values)
            for i in range(num_joints):
                target[i, column] = float(values[i])

        pos_columns = {'x': 0, 'y': 1, 'z': 2}

        while True:
            par, attr = parser.getattr()

            if par in pos_columns:
                _fill_column(self.pos, pos_columns[par], attr)

            elif par == 'v':
                _fill_column(self.vis, 0, attr)

            elif par == 'action_id':
                self.action_id = int(attr[0])

            elif par == 'viewpoint_id':
                self.viewpoint_id = int(attr[0])

            elif par == 'scale':
                self.scale = float(attr[0])

            elif par == 'objpos':
                assert 2 == len(attr)
                self.objpos = np.array([float(attr[0]), float(attr[1])])

            elif par == 'head':
                assert 4 == len(attr)
                self.head = np.array([float(attr[0]), float(attr[1]),
                                      float(attr[2]), float(attr[3])])

            elif par == 'mode':
                # 'mode' closes the entity record.
                self.mode = int(attr[0])
                break

        if BaseParser.compute_dataset_info:
            # Update the global per-axis pose bounds, ignoring NaN entries.
            local_min = np.full((3,), np.nan)
            local_max = np.full((3,), np.nan)
            local_min[0:dim] = np.nanmin(self.pos, axis=0)
            local_max[0:dim] = np.nanmax(self.pos, axis=0)
            BaseParser.pose_min = np.nanmin(
                    np.array([local_min, BaseParser.pose_min]), axis=0)
            BaseParser.pose_max = np.nanmax(
                    np.array([local_max, BaseParser.pose_max]), axis=0)


class ImageFrame(BaseElement):
    """Image frame parsed from the annotation stream.

    Lines are consumed from `parser` until a 'mode' parameter is read.
    Recognized parameters: image, res, num_ent (followed by that many
    `Entity` records) and mode.
    """

    def __init__(self, parser, dim, num_joints, parent=None):
        BaseElement.__init__(self, parent=parent)
        self.mode = -1

        while True:
            par, attr = parser.getattr()

            if par == 'image':
                self.image = attr[0]

            elif par == 'res':
                self.res = np.array([float(attr[0]), float(attr[1])])

            elif par == 'num_ent':
                self.num_ent = int(attr[0])
                self.entities = []
                for _ in range(self.num_ent):
                    entity = Entity(parser, dim, num_joints, parent=self)
                    self.entities.append(entity)

            elif par == 'mode':
                # 'mode' closes the frame record.
                self.mode = int(attr[0])
                break


class SequenceOld(BaseElement):
    """Sequence of frames parsed from the annotation stream.

    Lines are consumed from `parser` until a 'mode' parameter is read.
    Recognized parameters: num_frames (followed by that many `ImageFrame`
    records) and mode.
    """

    def __init__(self, parser, dim, num_joints):
        BaseElement.__init__(self)
        self.mode = -1

        while True:
            par, attr = parser.getattr()

            if par == 'num_frames':
                self.num_frames = int(attr[0])
                if BaseParser.compute_dataset_info:
                    # Accumulated here; turned into an average by the caller.
                    BaseParser.avg_num_frames += self.num_frames
                self.frames = []
                for _ in range(self.num_frames):
                    frame = ImageFrame(parser, dim, num_joints, parent=self)
                    self.frames.append(frame)

            elif par == 'mode':
                # 'mode' closes the sequence record.
                self.mode = int(attr[0])
                break


def std_dat_parser(anno_obj, fid):
    """Parse a standard annotation stream and fill `anno_obj` in place.

    # Arguments
        anno_obj: object receiving the parsed attributes (action_labels,
            joint_labels, viewpoint_labels, num_joints, dim,
            num_sequences and sequences).
        fid: open file-like object handed to BaseParser.

    Parsing stops at the first exception raised by `parser.getattr()`;
    the exception message is printed and the function returns normally.
    """
    parser = BaseParser(fid)

    while True:
        try:
            par, attr = parser.getattr()
        except Exception as e:
            # Any parser failure ends the loop; the message is only printed.
            print('std_dat_parser: ' + str(e))
            break

        if par == 'action_labels':
            anno_obj.action_labels = attr
        elif par == 'joint_labels':
            anno_obj.joint_labels = attr
        elif par == 'viewpoint_labels':
            anno_obj.viewpoint_labels = attr
        elif par == 'num_joints':
            anno_obj.num_joints = int(attr[0])
        elif par == 'dim':
            anno_obj.dim = int(attr[0])
        elif par == 'num_sequences':
            # 'dim' and 'num_joints' must already have been parsed, since
            # they are passed to every SequenceOld built here.
            anno_obj.num_sequences = int(attr[0])
            anno_obj.sequences = []
            for _ in range(anno_obj.num_sequences):
                anno_obj.sequences.append(
                        SequenceOld(parser, anno_obj.dim, anno_obj.num_joints))
            # Turn the accumulated frame count into an average. An empty
            # dataset is skipped to avoid a ZeroDivisionError.
            if anno_obj.sequences:
                BaseParser.avg_num_frames /= len(anno_obj.sequences)

class Annotation(object):
    """Container for dataset annotations.

    # Arguments
        dataset_path: directory holding 'annotations.dat.gz' when the
            standard parser is used; passed as-is to `custom_parser`
            otherwise.
        custom_parser: optional callable taking `dataset_path` and
            returning (action_labels, joint_labels, viewpoint_labels,
            sequences).

    Loading is best effort: any exception raised while parsing or printing
    the dataset info is caught and printed, so `sequences` may be left as
    an empty list. Only the missing-path check raises (AssertionError).
    """

    def __init__(self, dataset_path=None, custom_parser=None):
        self.sequences = []
        if custom_parser is None:
            assert dataset_path, \
                    "If a custom parser is not given, dataset_path is required"
        try:
            if custom_parser is not None:
                (self.action_labels,
                 self.joint_labels,
                 self.viewpoint_labels,
                 self.sequences) = custom_parser(dataset_path)
            else:
                # Standard parser. The context manager closes the file even
                # when the parser raises.
                filename = '%s/annotations.dat.gz' % dataset_path
                with gzip.open(filename, 'r') as fid:
                    fid.readline()  # the first line is a header; skip it
                    std_dat_parser(self, fid)

            if BaseParser.compute_dataset_info:
                printcn(HEADER, '## Info on dataset "%s" ##' % dataset_path)
                printcn(OKBLUE, '  Average number of frames: %.0f' % \
                        BaseParser.avg_num_frames)
                printcn(OKBLUE, '  Min pose values on X-Y-Z: {}'.format(
                    BaseParser.pose_min))
                printcn(OKBLUE, '  Max pose values on X-Y-Z: {}'.format(
                    BaseParser.pose_max))

        except Exception as e:
            print('Catch exception in Annotation class: ' + str(e))


def appstr(s, a):
    """Return `s + a`, or None when the operands cannot be added.

    Only ordinary errors are suppressed; KeyboardInterrupt and SystemExit
    still propagate.
    """
    try:
        return s + a
    except Exception:
        return None


In [None]:

class _pa16j():
    """Pose alternated with 16 joints (like Penn Action with three more
    joints on the spine).
    """
    num_joints = 16
    joint_names = ['pelvis', 'thorax', 'neck', 'head',
            'r_shoul', 'l_shoul', 'r_elb', 'l_elb', 'r_wrist', 'l_wrist',
            'r_hip', 'l_hip', 'r_knee', 'l_knee', 'r_ankle', 'l_ankle']

    # Horizontal flip mapping (swaps the right/left joint indices).
    map_hflip = [0, 1, 2, 3, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14]

    # Projections from other layouts to the PA16J standard.
    map_from_mpii = [6, 7, 8, 9, 12, 13, 11, 14, 10, 15, 2, 3, 1, 4, 0, 5]
    map_from_ntu = [0, 20, 2, 3, 4, 8, 5, 9, 6, 10, 12, 16, 13, 17, 14, 18]

    # Projections of PA16J to other formats.
    map_to_pa13j = [3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]
    map_to_jhmdb = [2, 1, 3, 4, 5, 10, 11, 6, 7, 12, 13, 8, 9, 14, 15]
    map_to_mpii = [14, 12, 10, 11, 13, 15, 0, 1, 2, 3, 8, 6, 4, 5, 7, 9]
    map_to_lsp = [14, 12, 10, 11, 13, 15, 8, 6, 4, 5, 7, 9, 2, 3]

    # Color map: `cmap[j]` indexes `color` for joint j; `links` lists the
    # joint pairs connected by a segment when plotting.
    color = ['g', 'r', 'b', 'y', 'm']
    cmap = [0, 0, 0, 0, 1, 2, 1, 2, 1, 2, 3, 4, 3, 4, 3, 4]
    links = [[0, 1], [1, 2], [2, 3], [4, 6], [6, 8], [5, 7], [7, 9],
            [10, 12], [12, 14], [11, 13], [13, 15]]

class _pa17j():
    """Pose alternated with 17 joints: the _pa16j layout plus the middle
    spine as joint 16.
    """
    num_joints = 17

    # Horizontal flip mapping; joint 16 maps to itself.
    map_hflip = [
        0, 1, 2, 3, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, 16]

    # Projections from other layouts to the PA17J standard.
    map_from_h36m = [
        0, 12, 13, 15, 25, 17, 26, 18, 27, 19, 1, 6, 2, 7, 3, 8, 11]
    map_from_ntu = _pa16j.map_from_ntu + [1]
    map_from_mpii3dhp = [
        4, 5, 6, 7, 14, 9, 15, 10, 16, 11, 23, 18, 24, 19, 25, 20, 3]
    map_from_mpii3dhp_te = [
        14, 1, 16, 0, 2, 5, 3, 6, 4, 7, 8, 11, 9, 12, 10, 13, 15]

    # Projections of PA17J to other formats.
    map_to_pa13j = _pa16j.map_to_pa13j
    map_to_mpii = [14, 12, 10, 11, 13, 15, 0, 1, 2, 3, 8, 6, 4, 5, 7, 9]
    map_to_pa16j = list(range(16))

    # Plot colors: `cmap[j]` indexes `color`; `links` are the joint pairs
    # joined by a segment.
    color = ['g', 'r', 'b', 'y', 'm']
    cmap = [0, 0, 0, 0, 1, 2, 1, 2, 1, 2, 3, 4, 3, 4, 3, 4, 0]
    links = [
        [0, 16], [16, 1], [1, 2], [2, 3],
        [4, 6], [6, 8], [5, 7], [7, 9],
        [10, 12], [12, 14], [11, 13], [13, 15]]

class _pa20j():
    """Pose alternated with 20 joints: like _pa16j, with one additional
    joint for each hand and each foot.
    """
    num_joints = 20

    # Horizontal flip mapping.
    map_hflip = [
        0, 1, 2, 3, 5, 4, 7, 6, 9, 8,
        11, 10, 13, 12, 15, 14, 17, 16, 19, 18]

    # Projections from other layouts to the PA20J standard.
    map_from_h36m = [
        0, 12, 13, 15, 25, 17, 26, 18, 27, 19,
        30, 22, 1, 6, 2, 7, 3, 8, 4, 9]
    map_from_ntu = [
        0, 20, 2, 3, 4, 8, 5, 9, 6, 10,
        7, 11, 12, 16, 13, 17, 14, 18, 15, 19]

    # Projections of PA20J to other formats.
    map_to_mpii = [16, 14, 12, 13, 15, 17, 0, 1, 2, 3, 8, 6, 4, 5, 7, 9]
    map_to_pa13j = [3, 4, 5, 6, 7, 8, 9, 12, 13, 14, 15, 16, 17]
    map_to_pa16j = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 12, 13, 14, 15, 16, 17]

    # Plot colors: `cmap[j]` indexes `color`; `links` are the joint pairs
    # joined by a segment.
    color = ['g', 'r', 'b', 'y', 'm']
    cmap = [
        0, 0, 0, 0, 1, 2, 1, 2, 1, 2,
        1, 2, 3, 4, 3, 4, 3, 4, 3, 4]
    links = [
        [0, 1], [1, 2], [2, 3],
        [4, 6], [6, 8], [8, 10],
        [5, 7], [7, 9], [9, 11],
        [12, 14], [14, 16], [16, 18],
        [13, 15], [15, 17], [17, 19]]

class _pa21j():
    """Pose alternated with 21 joints: the _pa20j layout plus joint 20,
    which corresponds to the 16th joint of _pa17j (for compatibility with
    H36M).
    """
    num_joints = 21

    # Horizontal flip mapping; the extra joint maps to itself.
    map_hflip = _pa20j.map_hflip + [20]

    # Projections from other layouts to the PA21J standard.
    map_from_h36m = _pa20j.map_from_h36m + [11]
    map_from_ntu = _pa20j.map_from_ntu + [1]

    # Projections of PA21J to other formats.
    map_to_mpii = _pa20j.map_to_mpii
    map_to_pa13j = _pa20j.map_to_pa13j
    map_to_pa16j = _pa20j.map_to_pa16j
    map_to_pa17j = _pa20j.map_to_pa16j + [20]

    # Plot colors: `cmap[j]` indexes `color`; `links` are the joint pairs
    # joined by a segment.
    color = ['g', 'r', 'b', 'y', 'm']
    cmap = [
        0, 0, 0, 0, 1, 2, 1, 2, 1, 2,
        1, 2, 3, 4, 3, 4, 3, 4, 3, 4, 0]
    links = [
        [0, 20], [20, 1], [1, 2], [2, 3],
        [4, 6], [6, 8], [8, 10],
        [5, 7], [7, 9], [9, 11],
        [12, 14], [14, 16], [16, 18],
        [13, 15], [15, 17], [17, 19]]

class coco17j():
    """Original joint layout of the MS COCO dataset (17 joints, 2D)."""
    num_joints = 17
    dim = 2

    # Horizontal flip mapping.
    map_hflip = [
        0, 2, 1, 4, 3, 6, 5, 8, 7, 10, 9, 12, 11, 14, 13, 16, 15]

    # Plot colors: `cmap[j]` indexes `color`; `links` are the joint pairs
    # joined by a segment.
    color = ['g', 'r', 'b', 'y', 'm', 'w']
    cmap = [0, 0, 0, 5, 5, 0, 0, 2, 1, 2, 1, 0, 0, 4, 3, 4, 3]
    links = [
        [13, 15], [13, 11], [14, 16], [14, 12], [11, 12],
        [5, 11], [6, 12], [5, 6], [7, 5], [8, 6],
        [7, 9], [8, 10], [1, 2], [0, 1], [0, 2],
        [3, 1], [4, 2], [3, 5], [4, 6]]





def _func_and(x):
    """Return 1 when `x.all()` is truthy, 0 otherwise."""
    return 1 if x.all() else 0





def convert_pa17j3d_to_pa16j(p, dim=3):
    """Project one pa17j3d pose onto the 16-joint layout.

    `p` must have shape (pa17j3d.num_joints, pa17j3d.dim). The result is
    a copy holding the joints selected by `pa17j3d.map_to_pa16j` and the
    first `dim` coordinates.
    """
    expected_shape = (pa17j3d.num_joints, pa17j3d.dim)
    assert p.shape == expected_shape
    selected = p[pa17j3d.map_to_pa16j, 0:dim]
    return selected.copy()

def convert_sequence_pa17j3d_to_pa16j(seqp, dim=3):
    """Apply convert_pa17j3d_to_pa16j to every pose of a sequence.

    `seqp` is indexed by frame and each frame must have shape
    (pa17j3d.num_joints, pa17j3d.dim). Returns a new array of shape
    (len(seqp), _pa16j.num_joints, dim).
    """
    frame_shape = (pa17j3d.num_joints, pa17j3d.dim)
    assert seqp.shape[1:] == frame_shape

    num_frames = len(seqp)
    converted = np.zeros((num_frames, _pa16j.num_joints, dim))
    for idx in range(num_frames):
        converted[idx, :] = convert_pa17j3d_to_pa16j(seqp[idx], dim=dim)
    return converted

def write_poselist(filename, poses):
    """Save a list of poses as comma-separated text, one pose per row.

    Each row holds the flattened joint coordinates of one sample, i.e.
    {x1, y1, x2, y2, ...} for 2D poses, written with six decimals.

        Inputs: 'filename'
                'poses' [nb_samples, nb_joints, dim]
    """
    num_samples, num_joints, num_coords = poses.shape
    # Work on a copy so the caller's array is never touched.
    rows = np.reshape(poses.copy(), (num_samples, num_joints * num_coords))
    np.savetxt(filename, rows, fmt='%.6f', delimiter=',')

def assign_knn_confidence(c, num_iter=2):
    """Fill NaN confidences in place from the confidences of neighbors.

    `c` must be a 2D array with a single column. For every NaN entry i,
    the entries listed in `dsl80j3d.neighbors[i]` are inspected: the entry
    stays NaN if all of them are NaN, becomes 0.1 if their NaN-ignoring
    mean is below 0.5, and 0.9 otherwise. The whole pass is repeated
    `num_iter` times. Nothing is returned.
    """
    assert c.ndim == 2 and c.shape[1] == 1, \
            'Invalid confidence shape {}'.format(c.shape)

    def _vote(neighbor_ids):
        neighbor_conf = c[list(neighbor_ids), 0]
        if np.isnan(neighbor_conf).all():
            return np.nan
        return 0.1 if np.nanmean(neighbor_conf) < 0.5 else 0.9

    for _ in range(num_iter):
        for idx in range(len(c)):
            if not np.isnan(c[idx, 0]):
                continue
            c[idx, 0] = _vote(dsl80j3d.neighbors[idx])

# Concrete pose layouts. Each subclass inherits the joint tables of its
# base layout and only fixes `dim`, the number of coordinates per joint.

# 16-joint layout, 2 coordinates per joint.
class pa16j2d(_pa16j):
    dim = 2

# 16-joint layout, 3 coordinates per joint.
class pa16j3d(_pa16j):
    dim = 3

# 17-joint layout, 2 coordinates per joint.
class pa17j2d(_pa17j):
    dim = 2

# 17-joint layout, 3 coordinates per joint.
class pa17j3d(_pa17j):
    dim = 3

# 20-joint layout, 3 coordinates per joint.
class pa20j3d(_pa20j):
    dim = 3

# 21-joint layout, 3 coordinates per joint.
class pa21j3d(_pa21j):
    dim = 3

# 25-joint, 3-coordinate layout; it defines no mapping or color tables.
class ntu25j3d():
    num_joints = 25
    dim = 3

# matplotlib is optional: if the import fails, the error is reported and
# `plt` is set to None so the plotting helpers can detect its absence.
try:
    # NOTE(review): Axes3D is not referenced by name in the code below;
    # presumably it is imported to make the '3d' projection available.
    from mpl_toolkits.mplot3d import Axes3D
    import matplotlib.pyplot as plt
except Exception as e:
    printcn(FAIL, str(e))
    plt = None


def data_to_image(x, gray_scale=False):
    """ Convert 'x' to a RGB Image object.

    The values of 'x' are min-max rescaled to [0, 255]; when 'x' is
    constant it is used as is (no rescaling).

    # Arguments
        x: array with three axes and 3 channels on the last one for RGB
            images, or two axes for gray scale images. If None, return a
            light gray image with size 100x100.
        gray_scale: convert the RGB color space to a RGB gray scale space.

    # Return
        A PIL Image in 'RGB' mode.
    """

    if x is None:
        x = 224 * np.ones((100, 100, 3), dtype=np.uint8)

    if x.max() - x.min() > 0.:
        buf = 255. * (x - x.min()) / (x.max() - x.min())
    else:
        buf = x.copy()

    if len(buf.shape) == 3:
        num_ch = buf.shape[2]
    else:
        (h, w) = buf.shape
        num_ch = 1

    # Compare by value: `is` on integers relies on an interpreter detail
    # and triggers a SyntaxWarning on recent Python versions.
    if (num_ch == 3) and gray_scale:
        # Weighted sum of the three channels, replicated on all of them.
        g = 0.2989*buf[:,:,0] + 0.5870*buf[:,:,1] + 0.1140*buf[:,:,2]
        buf[:,:,0] = g
        buf[:,:,1] = g
        buf[:,:,2] = g
    elif num_ch == 1:
        # Replicate the single channel to obtain a three-channel buffer.
        aux = np.zeros((h, w, 3), dtype=buf.dtype)
        aux[:,:,0] = buf
        aux[:,:,1] = buf
        aux[:,:,2] = buf
        buf = aux

    return Image.fromarray(buf.astype(np.uint8), 'RGB')


def show(x, gray_scale=False, jet_cmap=False, filename=None):
    """ Show 'x' as an image on the screen, or save it to 'filename'.

    # Arguments
        x: image data; converted with data_to_image unless 'jet_cmap' is
            set, in which case it is normalized and mapped through the
            jet color map.
        gray_scale: forwarded to data_to_image.
        jet_cmap: use pyplot's jet color map instead of data_to_image.
        filename: if given, save the image there instead of showing it.

    Prints a warning and returns None when pyplot is not available.
    """
    # Both branches end up calling pyplot, so check for it once up front
    # instead of failing on `None.imsave` / `None.imshow`.
    if plt is None:
        printcn(WARNING, 'pyplot not defined!')
        return

    if jet_cmap is False:
        img = data_to_image(x, gray_scale=gray_scale)
    else:
        cmap = plt.cm.jet
        norm = plt.Normalize(vmin=x.min(), vmax=x.max())
        img = cmap(norm(x))

    if filename:
        plt.imsave(filename, img)
    else:
        plt.imshow(img)
        plt.show()


def draw(x=None,
        skels=[],
        bboxes=[],
        bbox_color='g',
        abs_pos=False,
        plot3d=False,
        single_window=False,
        figsize=(16,9),
        axis='on',
        facecolor='white',
        azimuth=65,
        dpi=100,
        filename=None):
    """ Plot 'x' and draw over it the skeletons and the bounding boxes.

    # Arguments
        x: image data, converted with data_to_image (None gives its
            placeholder image).
        skels: one skeleton array, or a list / 3-axis array of skeletons.
            None disables skeleton plotting.
        bboxes: one bounding box, or a list / 3-axis array of boxes.
            None disables bounding box plotting.
        bbox_color: color for a single box, or an iterable of colors
            paired with the boxes.
        abs_pos: if True, coordinates are used as given; otherwise they
            are multiplied by the image width and height.
        plot3d: also plot every skeleton with plot_3d_pose.
        single_window: with 'plot3d', put image and 3D plot in one figure.
        figsize, axis, facecolor, dpi: forwarded to pyplot.
        azimuth: forwarded to plot_3d_pose.
        filename: if given, save the figure(s) instead of showing them;
            a separate 3D figure is saved as filename + '.eps'.

    Prints a warning and returns None when pyplot is not available.
    """

    # Configure the ploting environment
    if plt is None:
        printcn(WARNING, 'pyplot not defined!')
        return

    img = data_to_image(x)
    if abs_pos:
        w = None
        h = None
    else:
        w,h = img.size

    def add_subimage(f, subplot, img):
        ax = f.add_subplot(subplot)
        plt.imshow(img, zorder=-1)
        return ax

    fig = [plt.figure(figsize=figsize)]
    ax = []

    if plot3d:
        if single_window:
            ax.append(add_subimage(fig[0], 121, img))
            ax.append(fig[0].add_subplot(122, projection='3d'))
        else:
            ax.append(add_subimage(fig[0], 111, img))
            fig.append(plt.figure(figsize=figsize))
            ax.append(fig[1].add_subplot(111, projection='3d'))
    else:
        ax.append(add_subimage(fig[0], 111, img))

    plt.axis(axis)

    # Plotting skeletons if not None
    if skels is not None:
        if isinstance(skels, list) or len(skels.shape) == 3:
            for s in skels:
                plot_skeleton_2d(ax[0], s, h=h, w=w)
                # The 3D plot belongs inside the loop: outside of it, `s`
                # is undefined for an empty `skels` and only the last
                # skeleton would be drawn.
                if plot3d:
                    plot_3d_pose(s, subplot=ax[-1], azimuth=azimuth)
        else:
            plot_skeleton_2d(ax[0], skels, h=h, w=w)
            if plot3d:
                plot_3d_pose(skels, subplot=ax[-1], azimuth=azimuth)

    # Plotting bounding boxes if not None
    if bboxes is not None:
        if isinstance(bboxes, list) or len(bboxes.shape) == 3:
            # NOTE(review): zip stops at the shorter input, so the default
            # one-character 'g' draws only the first box of a list.
            for b, c in zip(bboxes, bbox_color):
                _plot_bbox(ax[0], b, h=h, w=w, c=c, lw=4)
        else:
            _plot_bbox(ax[0], bboxes, h=h, w=w, c=bbox_color, lw=4)

    if filename:
        fig[0].savefig(filename, bbox_inches='tight', pad_inches=0,
                facecolor=facecolor, dpi=dpi)
        if plot3d and (single_window is False):
            fig[-1].savefig(filename + '.eps',
                    bbox_inches='tight', pad_inches=0)
    else:
        plt.show()

    for f in fig:
        plt.close(f)


def _get_poselayout(num_joints):
    """Return (color, cmap, links) of the layout with `num_joints` joints.

    16, 17 and 20 joints are supported; any other value returns None.
    """
    if num_joints == 16:
        layout = pa16j2d
    elif num_joints == 17:
        layout = pa17j3d
    elif num_joints == 20:
        layout = pa20j3d
    else:
        return None
    return layout.color, layout.cmap, layout.links


def plot_3d_pose(pose, subplot=None, filename=None, color=None, lw=3,
        azimuth=65):
    """Plot one pose on a 3D axes (Z on the x-axis, X on y, Y on z).

    # Arguments
        pose: array of shape (num_joints, dim) with dim 2 or 3 and 16, 17
            or 20 joints. Missing depth is taken as 0 and non-positive
            depth is replaced by 0.
        subplot: existing 3D axes to draw on. If None, a new figure is
            created, then shown or saved, and finally closed.
        filename: where to save the figure created by this function.
        color: color list indexed by the layout's cmap; defaults to the
            colors of the layout.
        lw: line width for joints and links.
        azimuth: azimuth angle of the view.

    Joints whose x or y is not above -1e6 are skipped, as are the links
    touching them.
    """
    if plt is None:
        raise Exception('"matplotlib" is required for 3D pose plotting!')

    num_joints, dim = pose.shape
    assert dim in [2, 3], 'Invalid pose dimension (%d)' % dim
    assert num_joints in (16, 17, 20), \
            'Unsupported number of joints (%d)' % num_joints

    layout_colors, cmap, links = _get_poselayout(num_joints)
    if color is None:
        color = layout_colors

    # Copy the available coordinates; the remaining column stays at zero.
    points = np.zeros((num_joints, 3))
    points[:, 0:dim] = pose[:, 0:dim]
    depth = points[:, 2]
    points[:, 2] = np.where(depth > 0, depth, 0)

    # A joint is valid when both x and y are above the marker value.
    valid = (points[:, 0:2] > -1e6).all(axis=1)

    owns_figure = subplot is None
    if owns_figure:
        fig = plt.figure()
        ax = fig.add_subplot(111, projection='3d')
    else:
        fig = None
        ax = subplot

    for j in range(num_joints):
        if not valid[j]:
            continue
        x, y, z = points[j]
        ax.scatter([z], [x], [y], lw=lw, c=color[cmap[j]])

    for link in links:
        if not (valid[link[0]] and valid[link[1]]):
            continue
        link_color = color[cmap[link[0]]]
        ax.plot(points[link, 2], points[link, 0], points[link, 1],
                c=link_color, lw=lw)

    ax.view_init(10, azimuth)
    ax.set_aspect('equal')
    ax.set_xlabel('Z (depth)')
    ax.set_ylabel('X (width)')
    ax.set_zlabel('Y (height)')
    ax.set_xlim([0, 1.])
    ax.set_ylim([0, 1.])
    ax.set_zlim([0, 1.])
    plt.gca().invert_xaxis()
    plt.gca().invert_zaxis()

    if owns_figure:
        if filename:
            fig.savefig(filename, bbox_inches='tight', pad_inches=0)
        else:
            plt.show()
        plt.close(fig)


def _plot_bbox(subplot, bbox, h=None, w=None, scale=16, lw=2, c=None):
    """Draw the outline of a bounding box on `subplot`.

    `bbox` holds four values; entries 0 and 2 are x coordinates and
    entries 1 and 3 are y coordinates. They are multiplied by `w` and `h`
    respectively when those are given. A random entry of `hex_colors` is
    used when no color `c` is passed. `scale` is unused.
    """
    assert len(bbox) == 4

    box = bbox.copy()
    if w is not None:
        for k in (0, 2):
            box[k] *= w
    if h is not None:
        for k in (1, 3):
            box[k] *= h

    if c is None:
        c = hex_colors[np.random.randint(len(hex_colors))]

    # Closed polyline through the four corners, back to the first one.
    x0, y0, x1, y1 = box[0], box[1], box[2], box[3]
    xs = np.array([x0, x1, x1, x0, x0])
    ys = np.array([y0, y0, y1, y1, y0])
    subplot.plot(xs, ys, lw=lw, c=c, zorder=1)


def plot_skeleton_2d(subplot, skel, h=None, w=None,
        joints=True, links=True, scale=16, lw=4):
    """Draw a 2D skeleton (joints and links) on `subplot`.

    # Arguments
        subplot: axes object providing `scatter` and `plot`.
        skel: array with one row per joint (16, 17 or 20 rows); columns 0
            and 1 are x and y. The input is copied, not modified.
        h, w: if given, y is multiplied by `h` and x by `w`.
        joints: draw the joints.
        links: draw the segments between linked joints.
        scale: marker size of the joints.
        lw: line width.

    A joint is drawn when at least one of its coordinates is above -1e6;
    a link is drawn when both of its joints are.
    """

    s = skel.copy()
    num_joints = len(s)
    assert ((num_joints == 16) or (num_joints == 17)) or (num_joints == 20), \
            'Unsupported number of joints (%d)' % num_joints

    # Keep the layout's link list under its own name: binding it to
    # `links` would overwrite the caller's flag, so links=False would
    # have no effect.
    color, cmap, skel_links = _get_poselayout(num_joints)

    x = s[:,0]
    y = s[:,1]
    v = s > -1e6
    v = v.any(axis=1).astype(np.float32)

    # Convert normalized skeletons to image coordinates.
    if w is not None:
        x *= w
    if h is not None:
        y *= h

    if joints:
        for i in range(len(v)):
            if v[i] > 0:
                c = color[cmap[i]]
                subplot.scatter(x=x[i], y=y[i], c=c, lw=lw, s=scale, zorder=2)

    if links:
        for i in skel_links:
            if ((v[i[0]] > 0) and (v[i[1]] > 0)):
                c = color[cmap[i[0]]]
                subplot.plot(x[i], y[i], lw=lw, c=c, zorder=1)

In [None]:



# Repeats the optional matplotlib import found earlier in this file: if the
# import fails, the error is reported and `plt` is set to None so the
# plotting helpers can detect its absence.
try:
    # NOTE(review): Axes3D is not referenced by name in the code below;
    # presumably it is imported to make the '3d' projection available.
    from mpl_toolkits.mplot3d import Axes3D
    import matplotlib.pyplot as plt
except Exception as e:
    printcn(FAIL, str(e))
    plt = None


def data_to_image(x, gray_scale=False):
    """ Convert 'x' to a RGB Image object.

    The values of 'x' are min-max rescaled to [0, 255]; when 'x' is
    constant it is used as is (no rescaling).

    # Arguments
        x: array with three axes and 3 channels on the last one for RGB
            images, or two axes for gray scale images. If None, return a
            light gray image with size 100x100.
        gray_scale: convert the RGB color space to a RGB gray scale space.

    # Return
        A PIL Image in 'RGB' mode.
    """

    if x is None:
        x = 224 * np.ones((100, 100, 3), dtype=np.uint8)

    if x.max() - x.min() > 0.:
        buf = 255. * (x - x.min()) / (x.max() - x.min())
    else:
        buf = x.copy()

    if len(buf.shape) == 3:
        num_ch = buf.shape[2]
    else:
        (h, w) = buf.shape
        num_ch = 1

    # Compare by value: `is` on integers relies on an interpreter detail
    # and triggers a SyntaxWarning on recent Python versions.
    if (num_ch == 3) and gray_scale:
        # Weighted sum of the three channels, replicated on all of them.
        g = 0.2989*buf[:,:,0] + 0.5870*buf[:,:,1] + 0.1140*buf[:,:,2]
        buf[:,:,0] = g
        buf[:,:,1] = g
        buf[:,:,2] = g
    elif num_ch == 1:
        # Replicate the single channel to obtain a three-channel buffer.
        aux = np.zeros((h, w, 3), dtype=buf.dtype)
        aux[:,:,0] = buf
        aux[:,:,1] = buf
        aux[:,:,2] = buf
        buf = aux

    return Image.fromarray(buf.astype(np.uint8), 'RGB')


def show(x, gray_scale=False, jet_cmap=False, filename=None):
    """ Show 'x' as an image on the screen, or save it to 'filename'.

    # Arguments
        x: image data; converted with data_to_image unless 'jet_cmap' is
            set, in which case it is normalized and mapped through the
            jet color map.
        gray_scale: forwarded to data_to_image.
        jet_cmap: use pyplot's jet color map instead of data_to_image.
        filename: if given, save the image there instead of showing it.

    Prints a warning and returns None when pyplot is not available.
    """
    # Both branches end up calling pyplot, so check for it once up front
    # instead of failing on `None.imsave` / `None.imshow`.
    if plt is None:
        printcn(WARNING, 'pyplot not defined!')
        return

    if jet_cmap is False:
        img = data_to_image(x, gray_scale=gray_scale)
    else:
        cmap = plt.cm.jet
        norm = plt.Normalize(vmin=x.min(), vmax=x.max())
        img = cmap(norm(x))

    if filename:
        plt.imsave(filename, img)
    else:
        plt.imshow(img)
        plt.show()


def draw(x=None,
        skels=[],
        bboxes=[],
        bbox_color='g',
        abs_pos=False,
        plot3d=False,
        single_window=False,
        figsize=(16,9),
        axis='on',
        facecolor='white',
        azimuth=65,
        dpi=100,
        filename=None):
    """ Plot 'x' and draw over it the skeletons and the bounding boxes.

    # Arguments
        x: image data, converted with data_to_image (None gives its
            placeholder image).
        skels: one skeleton array, or a list / 3-axis array of skeletons.
            None disables skeleton plotting.
        bboxes: one bounding box, or a list / 3-axis array of boxes.
            None disables bounding box plotting.
        bbox_color: color for a single box, or an iterable of colors
            paired with the boxes.
        abs_pos: if True, coordinates are used as given; otherwise they
            are multiplied by the image width and height.
        plot3d: also plot every skeleton with plot_3d_pose.
        single_window: with 'plot3d', put image and 3D plot in one figure.
        figsize, axis, facecolor, dpi: forwarded to pyplot.
        azimuth: forwarded to plot_3d_pose.
        filename: if given, save the figure(s) instead of showing them;
            a separate 3D figure is saved as filename + '.eps'.

    Prints a warning and returns None when pyplot is not available.
    """

    # Configure the ploting environment
    if plt is None:
        printcn(WARNING, 'pyplot not defined!')
        return

    img = data_to_image(x)
    if abs_pos:
        w = None
        h = None
    else:
        w,h = img.size

    def add_subimage(f, subplot, img):
        ax = f.add_subplot(subplot)
        plt.imshow(img, zorder=-1)
        return ax

    fig = [plt.figure(figsize=figsize)]
    ax = []

    if plot3d:
        if single_window:
            ax.append(add_subimage(fig[0], 121, img))
            ax.append(fig[0].add_subplot(122, projection='3d'))
        else:
            ax.append(add_subimage(fig[0], 111, img))
            fig.append(plt.figure(figsize=figsize))
            ax.append(fig[1].add_subplot(111, projection='3d'))
    else:
        ax.append(add_subimage(fig[0], 111, img))

    plt.axis(axis)

    # Plotting skeletons if not None
    if skels is not None:
        if isinstance(skels, list) or len(skels.shape) == 3:
            for s in skels:
                plot_skeleton_2d(ax[0], s, h=h, w=w)
                # The 3D plot belongs inside the loop: outside of it, `s`
                # is undefined for an empty `skels` and only the last
                # skeleton would be drawn.
                if plot3d:
                    plot_3d_pose(s, subplot=ax[-1], azimuth=azimuth)
        else:
            plot_skeleton_2d(ax[0], skels, h=h, w=w)
            if plot3d:
                plot_3d_pose(skels, subplot=ax[-1], azimuth=azimuth)

    # Plotting bounding boxes if not None
    if bboxes is not None:
        if isinstance(bboxes, list) or len(bboxes.shape) == 3:
            # NOTE(review): zip stops at the shorter input, so the default
            # one-character 'g' draws only the first box of a list.
            for b, c in zip(bboxes, bbox_color):
                _plot_bbox(ax[0], b, h=h, w=w, c=c, lw=4)
        else:
            _plot_bbox(ax[0], bboxes, h=h, w=w, c=bbox_color, lw=4)

    if filename:
        fig[0].savefig(filename, bbox_inches='tight', pad_inches=0,
                facecolor=facecolor, dpi=dpi)
        if plot3d and (single_window is False):
            fig[-1].savefig(filename + '.eps',
                    bbox_inches='tight', pad_inches=0)
    else:
        plt.show()

    for f in fig:
        plt.close(f)


def _get_poselayout(num_joints):
    """Return (color, cmap, links) of the layout with `num_joints` joints.

    16, 17 and 20 joints are supported; any other value returns None.
    """
    if num_joints == 16:
        layout = pa16j2d
    elif num_joints == 17:
        layout = pa17j3d
    elif num_joints == 20:
        layout = pa20j3d
    else:
        return None
    return layout.color, layout.cmap, layout.links


def plot_3d_pose(pose, subplot=None, filename=None, color=None, lw=3,
        azimuth=65):
    """Plot one pose on a 3D axes (Z on the x-axis, X on y, Y on z).

    # Arguments
        pose: array of shape (num_joints, dim) with dim 2 or 3 and 16, 17
            or 20 joints. Missing depth is taken as 0 and non-positive
            depth is replaced by 0.
        subplot: existing 3D axes to draw on. If None, a new figure is
            created, then shown or saved, and finally closed.
        filename: where to save the figure created by this function.
        color: color list indexed by the layout's cmap; defaults to the
            colors of the layout.
        lw: line width for joints and links.
        azimuth: azimuth angle of the view.

    Joints whose x or y is not above -1e6 are skipped, as are the links
    touching them.
    """
    if plt is None:
        raise Exception('"matplotlib" is required for 3D pose plotting!')

    num_joints, dim = pose.shape
    assert dim in [2, 3], 'Invalid pose dimension (%d)' % dim
    assert num_joints in (16, 17, 20), \
            'Unsupported number of joints (%d)' % num_joints

    layout_colors, cmap, links = _get_poselayout(num_joints)
    if color is None:
        color = layout_colors

    # Copy the available coordinates; the remaining column stays at zero.
    points = np.zeros((num_joints, 3))
    points[:, 0:dim] = pose[:, 0:dim]
    depth = points[:, 2]
    points[:, 2] = np.where(depth > 0, depth, 0)

    # A joint is valid when both x and y are above the marker value.
    valid = (points[:, 0:2] > -1e6).all(axis=1)

    owns_figure = subplot is None
    if owns_figure:
        fig = plt.figure()
        ax = fig.add_subplot(111, projection='3d')
    else:
        fig = None
        ax = subplot

    for j in range(num_joints):
        if not valid[j]:
            continue
        x, y, z = points[j]
        ax.scatter([z], [x], [y], lw=lw, c=color[cmap[j]])

    for link in links:
        if not (valid[link[0]] and valid[link[1]]):
            continue
        link_color = color[cmap[link[0]]]
        ax.plot(points[link, 2], points[link, 0], points[link, 1],
                c=link_color, lw=lw)

    ax.view_init(10, azimuth)
    ax.set_aspect('equal')
    ax.set_xlabel('Z (depth)')
    ax.set_ylabel('X (width)')
    ax.set_zlabel('Y (height)')
    ax.set_xlim([0, 1.])
    ax.set_ylim([0, 1.])
    ax.set_zlim([0, 1.])
    plt.gca().invert_xaxis()
    plt.gca().invert_zaxis()

    if owns_figure:
        if filename:
            fig.savefig(filename, bbox_inches='tight', pad_inches=0)
        else:
            plt.show()
        plt.close(fig)


def _plot_bbox(subplot, bbox, h=None, w=None, scale=16, lw=2, c=None):
    """Draw the outline of a bounding box on `subplot`.

    `bbox` holds four values; entries 0 and 2 are x coordinates and
    entries 1 and 3 are y coordinates. They are multiplied by `w` and `h`
    respectively when those are given. A random entry of `hex_colors` is
    used when no color `c` is passed. `scale` is unused.
    """
    assert len(bbox) == 4

    box = bbox.copy()
    if w is not None:
        for k in (0, 2):
            box[k] *= w
    if h is not None:
        for k in (1, 3):
            box[k] *= h

    if c is None:
        c = hex_colors[np.random.randint(len(hex_colors))]

    # Closed polyline through the four corners, back to the first one.
    x0, y0, x1, y1 = box[0], box[1], box[2], box[3]
    xs = np.array([x0, x1, x1, x0, x0])
    ys = np.array([y0, y0, y1, y1, y0])
    subplot.plot(xs, ys, lw=lw, c=c, zorder=1)


def plot_skeleton_2d(subplot, skel, h=None, w=None,
        joints=True, links=True, scale=16, lw=4):
    """Draw a 2D skeleton (joints and links) on `subplot`.

    # Arguments
        subplot: axes object providing `scatter` and `plot`.
        skel: array with one row per joint (16, 17 or 20 rows); columns 0
            and 1 are x and y. The input is copied, not modified.
        h, w: if given, y is multiplied by `h` and x by `w`.
        joints: draw the joints.
        links: draw the segments between linked joints.
        scale: marker size of the joints.
        lw: line width.

    A joint is drawn when at least one of its coordinates is above -1e6;
    a link is drawn when both of its joints are.
    """

    s = skel.copy()
    num_joints = len(s)
    assert ((num_joints == 16) or (num_joints == 17)) or (num_joints == 20), \
            'Unsupported number of joints (%d)' % num_joints

    # Keep the layout's link list under its own name: binding it to
    # `links` would overwrite the caller's flag, so links=False would
    # have no effect.
    color, cmap, skel_links = _get_poselayout(num_joints)

    x = s[:,0]
    y = s[:,1]
    v = s > -1e6
    v = v.any(axis=1).astype(np.float32)

    # Convert normalized skeletons to image coordinates.
    if w is not None:
        x *= w
    if h is not None:
        y *= h

    if joints:
        for i in range(len(v)):
            if v[i] > 0:
                c = color[cmap[i]]
                subplot.scatter(x=x[i], y=y[i], c=c, lw=lw, s=scale, zorder=2)

    if links:
        for i in skel_links:
            if ((v[i[0]] > 0) and (v[i[1]] > 0)):
                c = color[cmap[i[0]]]
                subplot.plot(x[i], y[i], lw=lw, c=c, zorder=1)


In [None]:
class AffineTransform(object):
    """Accumulates 2D affine operations in a 3x3 homogeneous matrix.

    `afmat` starts as the identity; every operation left-multiplies it,
    so operations apply to points in the order they were called.
    """

    def __init__(self):
        self.afmat = np.eye(3)

    def _apply(self, t):
        """Compose `t` after the transformations accumulated so far."""
        self.afmat = np.dot(t, self.afmat)

    def scale(self, w, h):
        """Scale x by `w` and y by `h`."""
        self._apply(np.diag([w, h, 1.]))

    def translate(self, x, y):
        """Shift by (`x`, `y`)."""
        shift = np.eye(3)
        shift[0:2, 2] = (x, y)
        self._apply(shift)

    def rotate(self, angle, center):
        """Rotate by `angle` degrees around the point `center`."""
        cx, cy = center[0], center[1]
        self.translate(-cx, -cy)
        self.rotate_center(angle)
        self.translate(cx, cy)

    def rotate_center(self, angle):
        """Rotate by `angle` degrees around the origin."""
        angle *= np.pi / 180
        cos_a = np.cos(angle)
        sin_a = np.sin(angle)
        rot = np.eye(3)
        rot[0, 0:2] = (cos_a, sin_a)
        rot[1, 0:2] = (-sin_a, cos_a)
        self._apply(rot)

    def affine_hflip(self):
        """Mirror the x coordinate (x -> -x)."""
        self._apply(np.diag([-1., 1., 1.]))


class T(AffineTransform):
    """An image, or just an image size, together with the affine map of
    all the geometric operations applied to it.

    # Arguments
        img: image object providing `size`, `resize`, `crop`, `rotate`
            and `transpose` (presumably a PIL Image), or None to track
            only the geometry.
        img_size: size used when `img` is None, in the same convention as
            `img.size`.
    """

    def __init__(self, img, img_size=None):
        self.img = img
        self.img_size = tuple(img_size) if img_size is not None else None
        self.hflip = False
        AffineTransform.__init__(self)

    def resize(self, size, resample=Image.BILINEAR):
        """Resize to `size`, scaling the affine map accordingly."""
        self.scale(size[0] / self.size[0], size[1] / self.size[1])
        if self.img is not None:
            self.img = self.img.resize(size, resample)
        else:
            self.img_size = tuple(size)

    def normalize_affinemap(self):
        """Scale the affine map by the inverse of the current size."""
        self.scale(1 / self.size[0], 1 / self.size[1])

    def crop(self, box):
        """Crop to `box` = (x0, y0, x1, y1) and shift the affine map."""
        self.translate(-box[0], -box[1])
        if self.img is not None:
            self.img = self.img.crop(box)
        else:
            self.img_size = (box[2] - box[0], box[3] - box[1])

    def rotate_crop(self, angle, center, winsize,
            resample=Image.BILINEAR):
        """Rotate, crop, and resize the image.

        # Arguments
            angle: angle to rotate in degrees.
            winsize: (w, h) window size to crop in the input image.
            center: center point (x,y) to rotate from, None to use the
                image center.
            resample: rescaling method, according to PIL.Image.
        """

        if center is None:
            center = (self.size[0]/2, self.size[1]/2)

        if angle != 0:
            self.rotate(angle, center)

        # Compute the margins after rotation
        corners = np.array([
            [0, 0],
            [self.size[0], 0],
            [0, self.size[1]],
            [self.size[0], self.size[1]]
            ]).transpose()
        corners = transform_2d_points(self.afmat, corners)

        # Translate to zero margin
        self.translate(-min(corners[0,:]), -min(corners[1,:]))

        # Rotate image
        if (self.img is not None) and (angle != 0):
            self.img = self.img.rotate(angle, resample, expand=True)

        # Position of the rotation center under the accumulated map.
        center = np.array([center[0], center[1]])
        center = transform_2d_points(self.afmat, center)

        crop = np.array([center[0] - winsize[0]/2, center[1] - winsize[1]/2,
            center[0] + winsize[0]/2, center[1] + winsize[1]/2], dtype=int)
        self.crop(crop)

    def horizontal_flip(self):
        """Mirror horizontally and toggle the `hflip` flag."""
        self.affine_hflip()
        self.translate(self.size[0], 0)
        if self.img is not None:
            self.img = self.img.transpose(Image.FLIP_LEFT_RIGHT)
        self.hflip = not self.hflip

    def asarray(self, dtype=np.float32):
        """Return the image as an array of type `dtype`.

        Without an image, a zero array of shape img_size + (3,) is
        returned, also with the requested `dtype`.
        """
        if self.img is not None:
            return np.asarray(self.img, dtype=dtype)
        # NOTE(review): crop() stores img_size as (x extent, y extent), so
        # this placeholder is (w, h, 3), while an array made from a PIL
        # image is (h, w, 3). Confirm whether callers rely on this.
        return np.zeros(self.img_size + (3,), dtype=dtype)

    @property
    def size(self):
        """Size of the image, or the tracked size when there is none."""
        if self.img is not None:
            return self.img.size
        return self.img_size


def transform_2d_points(A, x, transpose=False, inverse=False):
    """Map 2D points through an affine transformation.

    # Arguments
        A: [3, 3] affine transformation map: T(x) = Ax.
        x: [2, N] points, [N, 2] points when 'transpose' is set, or a
            single point given as a 1D array.
        transpose: set when 'x' is [N, 2]; ignored for a 1D 'x'.
        inverse: apply the inverse of 'A' instead of 'A'.

    # Return
        The transformed points, in the same layout as the input.
    """

    single_point = len(x.shape) == 1
    if single_point:
        pts = np.expand_dims(x, axis=-1)
    elif transpose:
        pts = np.transpose(x)
    else:
        pts = x

    (dim, N) = pts.shape
    assert (dim == 2), \
            'transform_2d_points: Only 2D points are supported, get ' +str(dim)

    mat = np.linalg.inv(A) if inverse else A

    # Homogeneous coordinates: append a row of ones to the points.
    homogeneous = np.vstack([pts[0:dim, :], np.ones((1, N))])
    mapped = np.dot(mat, homogeneous)[0:dim]

    if single_point:
        return np.squeeze(mapped)
    return np.transpose(mapped) if transpose else mapped





def normalize_channels(frame, channel_power=1):
    """Map pixel values from [0, 255] to [-1, 1], with an optional power.

    The frame is divided by 255, raised to `channel_power` and finally
    mapped with 2 * (v - 0.5). `frame` is modified in place by the
    division; with an int power other than 1 the result is a new array.

    # Arguments
        frame: float array; it must be indexable as frame[:, :, c] when
            `channel_power` is a sequence.
        channel_power: an int applied to the whole frame, or a tuple/list
            of 3 exponents, one per channel. An exponent of 1 is skipped.
    """
    single_power = type(channel_power) is int

    if not single_power:
        assert len(channel_power) == 3, 'channel_power expected to be int or ' \
                + 'tuple/list with len=3, {} given.'.format(channel_power)

    frame /= 255.

    if single_power:
        if channel_power != 1:
            frame = np.power(frame, channel_power)
    else:
        for ch in (0, 1, 2):
            exponent = channel_power[ch]
            if exponent != 1:
                frame[:, :, ch] = np.power(frame[:, :, ch], exponent)

    frame -= .5
    frame *= 2.

    return frame


# **Into the Data**


*   Refer to the link above to find the datasets: MPII for pose estimation and Penn Action for action recognition.




In [None]:

def get_clip_frame_index(sequence_size, subsample, num_frames,
        random_clip=False):
    """Select the frame indexes of a clip inside a sequence.

    # Arguments
        sequence_size: number of frames in the sequence (positive).
        subsample: step between selected frames (positive int). It is
            reduced when the sequence is too short for the requested step.
        num_frames: number of frames in the clip (positive).
        random_clip: pick the clip start at random instead of centering
            the clip in the sequence.

    # Return
        A list with `num_frames` frame indexes. When the sequence is
        shorter than the clip, indexes are repeated so that the clip
        still has `num_frames` entries.

    # Raises
        AssertionError: if `subsample` is not a positive int.
        ValueError: if `sequence_size` or `num_frames` is not positive.
    """

    # Assert that subsample is integer and positive
    assert (type(subsample) == int) and subsample > 0

    # A non-positive sequence size would never satisfy the loop below and
    # a non-positive clip size would divide by zero, so reject both.
    if sequence_size <= 0 or num_frames <= 0:
        raise ValueError(
                'sequence_size and num_frames must be positive, got '
                '{} and {}'.format(sequence_size, num_frames))

    # Virtually stretch sequences shorter than the clip; the resulting
    # indexes are mapped back to the real sequence at the end.
    idx_coef = 1.
    while idx_coef*sequence_size < num_frames:
        idx_coef *= 1.5
    sequence_size *= idx_coef

    # Check if the given subsample value is feasible, otherwise, reduce
    # it to the maximum acceptable value.
    max_subsample = int(sequence_size / num_frames)
    if subsample > max_subsample:
        subsample = max_subsample

    vidminf = subsample * (num_frames - 1) + 1 # Video min num of frames
    maxs = sequence_size - vidminf # Maximum start
    if random_clip:
        # `maxs` is a float here; pass an explicit integer bound.
        start = np.random.randint(int(maxs) + 1)
    else:
        start = int(maxs / 2)

    frames = list(range(start, start + vidminf, subsample))
    if idx_coef > 1:
        frames = [int(f / idx_coef) for f in frames]

    return frames

In [None]:


# Module-level copy of the action labels; None until
# Human36M.load_annotations assigns the labels it has loaded.
ACTION_LABELS = None

def load_h36m_mat_annotation(filename):
    """Load the annotations stored in a .mat file.

    # Return
        sequences: list with the 'sequences_te', 'sequences_tr' and
            'sequences_val' entries, in the order TEST (0), TRAIN (1),
            VALID (2).
        action_labels: the 'action_labels' entry.
        joint_labels: the 'joint_labels' entry.
    """
    data = sio.loadmat(filename, struct_as_record=False, squeeze_me=True)

    # The position in the list is the mode index, so the order matters.
    split_keys = ('sequences_te', 'sequences_tr', 'sequences_val')
    sequences = [data[key] for key in split_keys]

    return sequences, data['action_labels'], data['joint_labels']


def serialize_index_sequences(seq):
    """Return every (sequence index, frame index) pair of `seq`.

    Pairs are ordered by sequence and then by frame; each element of
    `seq` must expose a `frames` attribute supporting len().
    """
    return [(s, f)
            for s in range(len(seq))
            for f in range(len(seq[s].frames))]


class Human36M(object):
    """Implementation of the Human3.6M dataset for 3D pose estimation and
    action recognition.

    Samples are addressed by a `(key, mode)` pair. With the 'sequences'
    topology a key selects one video sequence and `get_data` returns a clip
    of `clip_size` frames; with the 'frames' topology a key selects a single
    frame and the leading clip axis is squeezed out of the returned arrays.

    # Arguments
        dataset_path: Directory holding 'annotations.mat' and the 'images'
            folder.
        dataconf: Data configuration object; `random_data_generator()`,
            `get_fixed_config()`, `input_shape` and `crop_resolution` are
            read from it.
        poselayout: Pose layout object; `num_joints`, `dim`, `map_hflip`
            and `map_from_h36m` are read from it.
        topology: Either 'sequences' or 'frames'.
        clip_size: Number of frames per clip ('sequences' topology only).
    """

    def __init__(self, dataset_path, dataconf, poselayout=pa17j3d,
            topology='sequences', clip_size=16):

        assert topology in ['sequences', 'frames'], \
                'Invalid topology ({})'.format(topology)

        self.dataset_path = dataset_path
        self.dataconf = dataconf
        self.poselayout = poselayout
        self.topology = topology
        self.clip_size = clip_size
        self.load_annotations(os.path.join(dataset_path, 'annotations.mat'))

    @staticmethod
    def _index_frames(split):
        """Return one (sequence index, frame index) pair per frame of a split."""
        return [(s, f)
                for s in range(len(split))
                for f in range(len(split[s].frames))]

    def load_annotations(self, filename):
        """Load the annotation file and build the per-mode frame index.

        Besides filling `sequences`, `action_labels`, `joint_labels` and
        `frame_idx` on the instance, this rebinds the module-level
        `ACTION_LABELS` to the labels just loaded.
        """
        try:
            self.sequences, self.action_labels, self.joint_labels = \
                    load_h36m_mat_annotation(filename)
            # The index is built by a private helper rather than by the
            # module-level serialize_index_sequences(): that name is defined
            # several times in this file with different element layouts, so
            # a call-time lookup may resolve to the wrong variant.
            self.frame_idx = [self._index_frames(self.sequences[m])
                    for m in range(3)]

            global ACTION_LABELS
            ACTION_LABELS = self.action_labels

        except Exception:
            warning('Error loading Human3.6M dataset!')
            raise


    def get_data(self, key, mode, frame_list=None, fast_crop=False):
        """Load, augment and crop one sample.

        # Arguments
            key: Sequence index ('sequences' topology) or index into the
                serialized frame list ('frames' topology).
            mode: Dataset split; random augmentation and random clips are
                used only when it equals TRAIN_MODE.
            frame_list: Optional explicit frame indices for the clip
                ('sequences' topology). When None, a clip is selected with
                `get_clip_frame_index`.
            fast_crop: Accepted for signature compatibility; not used here.

        # Returns
            A dictionary with the keys 'camera', 'action', 'pose_w',
            'pose_uvd', 'pose', 'frame' and 'afmat'. The last channel of
            'pose' is the joint visibility flag.
        """
        output = {}

        if mode == TRAIN_MODE:
            dconf = self.dataconf.random_data_generator()
            random_clip = True
        else:
            dconf = self.dataconf.get_fixed_config()
            random_clip = False

        if self.topology == 'sequences':
            seq = self.sequences[mode][key]
            # Identity test: `== None` is evaluated element-wise when an
            # ndarray is given, which makes the `if` ambiguous.
            if frame_list is None:
                frame_list = get_clip_frame_index(len(seq.frames),
                        dconf['subspl'], self.clip_size,
                        random_clip=random_clip)
            objframes = seq.frames[frame_list]
        else:
            seq_idx, frame_idx = self.frame_idx[mode][key]
            seq = self.sequences[mode][seq_idx]
            objframes = seq.frames[[frame_idx]]

        num_frames = len(objframes)

        # Build a Camera object
        cpar = seq.camera_parameters
        cam = Camera(cpar.R, cpar.T, cpar.f, cpar.c, cpar.p, cpar.k)

        # Load and project the poses
        pose_w = self.load_pose_annot(objframes)
        pose_uvd = cam.project(np.reshape(pose_w, (-1, 3)))
        pose_uvd = np.reshape(pose_uvd,
                (num_frames, self.poselayout.num_joints, 3))

        # Compute the ground-truth bounding box from joint 0 of every frame
        imgsize = (objframes[0].w, objframes[0].h)
        objpos, winsize, zrange = get_crop_params(pose_uvd[:, 0, :],
                imgsize, cam.f, dconf['scale'])

        objpos += dconf['scale'] * np.array([dconf['transx'], dconf['transy']])
        frames = np.empty((num_frames,) + self.dataconf.input_shape)
        pose = np.empty((num_frames, self.poselayout.num_joints,
            self.poselayout.dim))

        for i in range(num_frames):
            image = 'images/%s/%05d.jpg' % (seq.name, objframes[i].f)
            imgt = T(Image.open(os.path.join(self.dataset_path, image)))

            imgt.rotate_crop(dconf['angle'], objpos, winsize)
            if dconf['hflip'] == 1:
                imgt.horizontal_flip()

            imgt.resize(self.dataconf.crop_resolution)
            imgt.normalize_affinemap()
            frames[i, :, :, :] = normalize_channels(imgt.asarray(),
                    channel_power=dconf['chpower'])

            pose[i, :, 0:2] = transform_2d_points(imgt.afmat,
                    pose_uvd[i, :,0:2], transpose=True)
            # Depth is rescaled linearly so that zrange maps to [0, 1]
            pose[i, :, 2] = \
                    (pose_uvd[i, :, 2] - zrange[0]) / (zrange[1] - zrange[0])

            if imgt.hflip:
                pose[i, :, :] = pose[i, self.poselayout.map_hflip, :]

        # Set outsider body joints to invalid (-1e9).
        pose = np.reshape(pose, (-1, self.poselayout.dim))
        pose[np.isnan(pose)] = -1e9
        v = np.expand_dims(get_visible_joints(pose[:,0:2]), axis=-1)
        pose[(v==0)[:,0],:] = -1e9
        pose = np.reshape(pose, (num_frames, self.poselayout.num_joints,
            self.poselayout.dim))
        v = np.reshape(v, (num_frames, self.poselayout.num_joints, 1))

        pose = np.concatenate((pose, v), axis=-1)
        if self.topology != 'sequences':
            # Single-frame samples carry no clip axis.
            pose_w = np.squeeze(pose_w, axis=0)
            pose_uvd = np.squeeze(pose_uvd, axis=0)
            pose = np.squeeze(pose, axis=0)
            frames = np.squeeze(frames, axis=0)

        output['camera'] = cam.serialize()
        output['action'] = int(seq.name[1:3]) - 1
        output['pose_w'] = pose_w
        output['pose_uvd'] = pose_uvd
        output['pose'] = pose
        output['frame'] = frames

        # Take the last transformation matrix, it should not change
        output['afmat'] = imgt.afmat.copy()

        return output

    def load_pose_annot(self, frames):
        """Stack the `pose3d` annotation of each frame, reordered to the
        pose layout, into a (len(frames), num_joints, dim) array."""
        p = np.empty((len(frames), self.poselayout.num_joints,
            self.poselayout.dim))
        for i in range(len(frames)):
            p[i,:] = frames[i].pose3d.T[self.poselayout.map_from_h36m,
                    0:self.poselayout.dim].copy()

        return p

    def clip_length(self):
        """Return the clip size for 'sequences' topology, otherwise None."""
        if self.topology == 'sequences':
            return self.clip_size
        else:
            return None

    def clip_shape(self):
        """Return the leading clip axis as a tuple (empty for 'frames')."""
        if self.topology == 'sequences':
            return (self.clip_size,)
        else:
            return ()

    def get_shape(self, dictkey):
        """Return the shape of one sample for the given output key.

        # Raises
            Exception: If `dictkey` is not a known key.
        """
        if dictkey == 'frame':
            return self.clip_shape() + self.dataconf.input_shape
        if dictkey == 'pose':
            return self.clip_shape() \
                    + (self.poselayout.num_joints, self.poselayout.dim+1)
        if dictkey == 'pose_w':
            return self.clip_shape() \
                    + (self.poselayout.num_joints, self.poselayout.dim)
        if dictkey == 'pose_uvd':
            return self.clip_shape() \
                    + (self.poselayout.num_joints, self.poselayout.dim)
        if dictkey == 'action':
            return (1,)
        if dictkey == 'camera':
            return (21,)
        if dictkey == 'afmat':
            return (3, 3)
        raise Exception('Invalid dictkey on get_shape!')

    def get_length(self, mode):
        """Return the number of sequences or frames available in `mode`."""
        if self.topology == 'sequences':
            return len(self.sequences[mode])
        else:
            return len(self.frame_idx[mode])

In [None]:
class BatchLoader(Sequence):
    """Loader class for generic datasets, based on the Sequence class from
    Keras.

    One (or more) object(s) implementing a dataset should be provided.
    The required functions are 'get_length(self, mode)', 'get_shape(self,
    dictkey)' and 'get_data(self, key, mode)'. The first returns an integer,
    the second a shape tuple, and the last returns a dictionary containing
    the data for a given pair of (key, mode).

    # Arguments
        dataset: A dataset object, or a list of dataset objects (for multiple
            datasets), which are merged by this class.
        x_dictkeys: Key names (strings) to constitute the batches of X data
            (input).
        y_dictkeys: Identical to x_dictkeys, but for Y data (labels).
            All given datasets must provide those keys. The given list is
            not modified.
        batch_size: Number of samples in each batch. If multiple datasets, it
            can be a list with the same length of 'dataset', where each value
            corresponds to the number of samples from the respective dataset,
            or it can be a single value, which corresponds to the number of
            samples from *each* dataset.
        num_predictions: number of predictions (y) that should be repeated for
            training.
        mode: TRAIN_MODE, TEST_MODE, or VALID_MODE.
        shuffle: boolean to shuffle *samples* (not batches!) or not.
        custom_dummy_dictkey: Allows to generate dummy outputs for each batch.
            Should be defined as a list of tuples, each with three values:
            (dictkey, shape, value). It is useful to include an action label
            for a sequence of poses from pose-only datasets, e.g., when
            mixing MPII and Human3.6M for training with action recognition at
            the same time (for further merging with an action dataset).
            Defaults to no dummy outputs.
    """
    BATCH_HOLD = 4

    def __init__(self, dataset, x_dictkeys, y_dictkeys, mode,
            batch_size=24, num_predictions=1, shuffle=True,
            custom_dummy_dictkey=None):

        if not isinstance(dataset, list):
            dataset = [dataset]
        self.datasets = dataset
        self.x_dictkeys = x_dictkeys
        # Keys actually fetched from the datasets (dummy keys excluded).
        self.allkeys = x_dictkeys + y_dictkeys
        # Private copy, so that appending the dummy keys below does not
        # modify the list object owned by the caller.
        self.y_dictkeys = list(y_dictkeys)

        # Include custom dictkeys into the output list.
        if custom_dummy_dictkey is None:
            custom_dummy_dictkey = []
        self.custom_dummy_dictkey = custom_dummy_dictkey
        self.custom_dictkeys = []
        for dummyout in self.custom_dummy_dictkey:
            assert dummyout[0] not in self.y_dictkeys, \
                    'dummy key {} already in y_dictkeys!'.format(dummyout[0])
            self.custom_dictkeys.append(dummyout[0])
        self.y_dictkeys = self.y_dictkeys + self.custom_dictkeys

        # Make sure that all datasets have the same shapes for all dictkeys
        for dkey in self.allkeys:
            for i in range(1, len(self.datasets)):
                assert self.datasets[i].get_shape(dkey) == \
                        self.datasets[i-1].get_shape(dkey), \
                        'Incompatible dataset shape for dictkey {}'.format(dkey)

        self.batch_sizes = batch_size
        if not isinstance(self.batch_sizes, list):
            self.batch_sizes = len(self.datasets)*[self.batch_sizes]

        assert len(self.datasets) == len(self.batch_sizes), \
                'dataset and batch_size should be lists with the same length.'

        if isinstance(num_predictions, int):
            self.num_predictions = len(self.y_dictkeys)*[num_predictions]
        elif isinstance(num_predictions, list):
            self.num_predictions = num_predictions
        else:
            raise ValueError(
                'Invalid num_predictions ({})'.format(num_predictions))

        assert len(self.num_predictions) == len(self.y_dictkeys), \
                'num_predictions and y_dictkeys not matching'

        self.mode = mode
        self.shuffle = shuffle

        # Create one key queue and one lock object for each dataset in case
        # of data shuffle.
        if self.shuffle:
            self.qkey = []
            self.lock = []
            for d in range(self.num_datasets):
                maxsize = self.datasets[d].get_length(self.mode) \
                        + BatchLoader.BATCH_HOLD*self.batch_sizes[d]
                self.qkey.append(Queue(maxsize=maxsize))
                self.lock.append(threading.Lock())

    def __len__(self):
        """Number of batches per epoch, dictated by the largest dataset."""
        dataset_len = []
        for d in range(self.num_datasets):
            dataset_len.append(
                    int(np.ceil(self.datasets[d].get_length(self.mode) /
                        float(self.batch_sizes[d]))))

        return max(dataset_len)


    def __getitem__(self, idx):
        """Return batch `idx` as a pair of lists `(x_batch, y_batch)`."""
        data_dict = self.get_data(idx, self.mode)

        # Convert the dictionary of samples to a list for x and y.
        x_batch = [data_dict[dkey] for dkey in self.x_dictkeys]

        y_batch = []
        for i, dkey in enumerate(self.y_dictkeys):
            # Each label is repeated once per requested prediction.
            for _ in range(self.num_predictions[i]):
                y_batch.append(data_dict[dkey])

        return x_batch, y_batch

    def get_batch_size(self):
        """Total number of samples per batch, summed over all datasets."""
        return sum(self.batch_sizes)

    def get_data(self, idx, mode):
        """Get the required data by merging all the datasets as specified
        by the object's parameters.

        # Returns
            A dictionary mapping every data key (plus the custom dummy keys)
            to an array holding the whole batch.
        """
        data_dict = {}
        for dkey in self.allkeys:
            data_dict[dkey] = np.empty((sum(self.batch_sizes),) \
                    + self.datasets[0].get_shape(dkey))

        # Add custom dummy data.
        for dummyout in self.custom_dummy_dictkey:
            dkey, dshape, dvalue = dummyout
            data_dict[dkey] = dvalue * np.ones(dshape)

        batch_cnt = 0
        for d in range(len(self.datasets)):
            for i in range(self.batch_sizes[d]):
                if self.shuffle:
                    key = self.get_shuffled_key(d)
                else:
                    key = idx*self.batch_sizes[d] + i
                    # __len__ follows the largest dataset, so a smaller one
                    # may have to wrap around more than once; a single
                    # subtraction is not enough to keep the key in range.
                    num_samples = self.datasets[d].get_length(mode)
                    if num_samples > 0:
                        key %= num_samples

                data = self.datasets[d].get_data(key, mode)
                for dkey in self.allkeys:
                    data_dict[dkey][batch_cnt, :] = data[dkey]

                batch_cnt += 1

        return data_dict

    def get_shape(self, dictkey):
        """Inception of get_shape method.
        First check if it is a custom key.
        """
        for dummyout in self.custom_dummy_dictkey:
            if dictkey == dummyout[0]:
                return dummyout[1]
        return (sum(self.batch_sizes),) + self.datasets[0].get_shape(dictkey)

    def get_length(self, mode):
        """Return the number of batches; `mode` must match the loader's."""
        assert mode == self.mode, \
                'You are mixturing modes! {} with {}'.format(mode, self.mode)
        return len(self)

    def get_shuffled_key(self, dataset_idx):
        """Pop the next sample key of one dataset from its shuffled queue.

        The queue is refilled with a fresh random permutation of all keys
        whenever it holds no more than BATCH_HOLD batches.

        # Raises
            ValueError: If the dataset has no sample in the loader's mode
                (the queue could never be filled and `get` would block
                forever).
        """
        assert self.shuffle, \
                'There is not sense in calling this function if shuffle=False!'

        key = None
        with self.lock[dataset_idx]:
            min_samples = BatchLoader.BATCH_HOLD*self.batch_sizes[dataset_idx]
            if self.qkey[dataset_idx].qsize() <= min_samples:
                # Need to fill that list.
                num_samples = self.datasets[dataset_idx].get_length(self.mode)
                if num_samples == 0:
                    raise ValueError(
                        'Dataset {} has no samples for mode {}'.format(
                            dataset_idx, self.mode))
                newlist = list(range(num_samples))
                random.shuffle(newlist)
                try:
                    for j in newlist:
                        self.qkey[dataset_idx].put(j, False)
                except queue.Full:
                    # Best effort: keys that do not fit are dropped and
                    # come back with the next refill.
                    pass
            key = self.qkey[dataset_idx].get()

        return key

    @property
    def num_datasets(self):
        """Number of merged datasets."""
        return len(self.datasets)



In [None]:
def load_mpii_mat_annotation(filename):
    """Read the MPII annotation .mat file.

    # Arguments
        filename: Path to the Matlab annotation file.

    # Returns
        A tuple `(rectidxs, images, annorect)`. Each element is a list
        indexed by mode; the test entry is None because only the training
        and validation annotations are read.
    """
    annot = sio.loadmat(filename)
    train = annot['annot_tr']
    valid = annot['annot_val']

    # Respect the order of TEST (0), TRAIN (1), and VALID (2).
    # Rows of the annotation arrays: 0 -> rectidxs, 1 -> images,
    # 2 -> annorect.
    rectidxs, images, annorect = (
            [None, train[row, :], valid[row, :]] for row in range(3))

    return rectidxs, images, annorect


def serialize_annorect(rectidxs, annorect):
    """Flatten the per-image person annotations into a list of samples.

    # Arguments
        rectidxs: Per-image arrays of (Matlab, 1-based) person indexes.
        annorect: Per-image arrays of person annotations, aligned with
            `rectidxs`.

    # Returns
        A list of dictionaries with the keys 'head', 'objpos', 'scale',
        'pose' and 'imgidx' (the position of the image in the input).
    """
    assert len(rectidxs) == len(annorect)

    samples = []
    for img_idx, (rec, rects) in enumerate(zip(rectidxs, annorect)):
        for j in range(rec.size):
            # Convert idx from Matlab (1-based) before the lookup
            ann = rects[rec[j, 0] - 1, 0]
            samples.append({
                'head': ann['head'][0, 0][0],
                'objpos': ann['objpos'][0, 0][0],
                'scale': ann['scale'][0, 0][0, 0],
                'pose': ann['pose'][0, 0],
                'imgidx': img_idx,
            })

    return samples


def calc_head_size(head_annot):
    """Return 0.6 times the distance between the two corner points stored
    in the first four values of `head_annot` (x1, y1, x2, y2)."""
    x1, y1, x2, y2 = (float(head_annot[k]) for k in range(4))
    diagonal = np.array([x1 - x2, y1 - y2])
    return 0.6 * np.linalg.norm(diagonal)


class MpiiSinglePerson(object):
    """Implementation of the MPII dataset for single person.

    # Arguments
        dataset_path: Directory holding 'annotations.mat' and the 'images'
            folder.
        dataconf: Data configuration object; `random_data_generator()`,
            `get_fixed_config()`, `input_shape` and `crop_resolution` are
            read from it.
        poselayout: Pose layout object; `num_joints`, `dim`, `map_to_mpii`
            and `map_hflip` are read from it.
        remove_outer_joints: If True, joints flagged as not visible by
            `get_visible_joints` get their coordinates set to -1e9.
    """

    def __init__(self, dataset_path, dataconf,
            poselayout=pa16j2d,
            remove_outer_joints=True):

        self.dataset_path = dataset_path
        self.dataconf = dataconf
        self.poselayout = poselayout
        self.remove_outer_joints = remove_outer_joints
        self.load_annotations(os.path.join(dataset_path, 'annotations.mat'))

    def load_annotations(self, filename):
        """Load the annotation file and serialize the samples per mode."""
        try:
            rectidxs, images, annorect = load_mpii_mat_annotation(filename)
            self.samples = {}
            self.samples[TEST_MODE] = [] # No samples for test
            self.samples[TRAIN_MODE] = serialize_annorect(
                    rectidxs[TRAIN_MODE], annorect[TRAIN_MODE])
            self.samples[VALID_MODE] = serialize_annorect(
                    rectidxs[VALID_MODE], annorect[VALID_MODE])
            self.images = images

        except Exception:
            warning('Error loading the MPII dataset!')
            raise

    def load_image(self, key, mode):
        """Open the image of sample `key` and wrap it in a `T` object."""
        try:
            annot = self.samples[mode][key]
            image = self.images[mode][annot['imgidx']][0]
            imgt = T(Image.open(os.path.join(
                self.dataset_path, 'images', image)))
        except Exception:
            warning('Error loading sample key/mode: %d/%d' % (key, mode))
            raise

        return imgt

    def get_data(self, key, mode, fast_crop=False):
        """Load, augment and crop one sample.

        # Arguments
            key: Index of the sample inside the given mode.
            mode: Dataset split; random augmentation is used only when it
                equals TRAIN_MODE.
            fast_crop: If True, crop, resize and rotate in a single step
                (slightly faster, lower precision).

        # Returns
            A dictionary with the keys 'bbox', 'frame', 'pose', 'headsize'
            and 'afmat'. The last channel of 'pose' is the joint visibility
            flag.
        """
        output = {}

        if mode == TRAIN_MODE:
            dconf = self.dataconf.random_data_generator()
        else:
            dconf = self.dataconf.get_fixed_config()

        imgt = self.load_image(key, mode)
        annot = self.samples[mode][key]

        scale = 1.25*annot['scale']
        objpos = np.array([annot['objpos'][0], annot['objpos'][1] + 12*scale])
        objpos += scale * np.array([dconf['transx'], dconf['transy']])
        winsize = 200 * dconf['scale'] * scale
        winsize = (winsize, winsize)
        output['bbox'] = objposwin_to_bbox(objpos, winsize)

        if fast_crop:
            # Slightly faster method, but gives lower precision.
            imgt.crop_resize_rotate(objpos, winsize,
                    self.dataconf.crop_resolution, dconf['angle'])
        else:
            imgt.rotate_crop(dconf['angle'], objpos, winsize)
            imgt.resize(self.dataconf.crop_resolution)

        if dconf['hflip'] == 1:
            imgt.horizontal_flip()

        imgt.normalize_affinemap()
        output['frame'] = normalize_channels(imgt.asarray(),
                channel_power=dconf['chpower'])

        # Joints without an MPII counterpart stay NaN until the clean-up below
        p = np.empty((self.poselayout.num_joints, self.poselayout.dim))
        p[:] = np.nan

        p[self.poselayout.map_to_mpii, 0:2] = \
                transform_2d_points(imgt.afmat, annot['pose'].T, transpose=True)
        if imgt.hflip:
            p = p[self.poselayout.map_hflip, :]

        # Set invalid joints and NaN values as an invalid value
        p[np.isnan(p)] = -1e9
        v = np.expand_dims(get_visible_joints(p[:,0:2]), axis=-1)
        if self.remove_outer_joints:
            p[(v==0)[:,0],:] = -1e9

        output['pose'] = np.concatenate((p, v), axis=-1)
        output['headsize'] = calc_head_size(annot['head'])
        output['afmat'] = imgt.afmat.copy()

        return output

    def get_shape(self, dictkey):
        """Return the shape of one sample for the given output key.

        # Raises
            Exception: If `dictkey` is not a known key.
        """
        if dictkey == 'frame':
            return self.dataconf.input_shape
        if dictkey == 'pose':
            return (self.poselayout.num_joints, self.poselayout.dim+1)
        if dictkey == 'headsize':
            return (1,)
        if dictkey == 'afmat':
            return (3, 3)
        raise Exception('Invalid dictkey on get_shape!')

    def get_length(self, mode):
        """Return the number of samples available in `mode`."""
        return len(self.samples[mode])


In [None]:

# Module-level settings read by the NTU loading code below.
# When True, the 'images-small' folder is used and the NTU code halves the
# frame size and the 2D joint coordinates.
use_small_images = True
image_prefix = 'images-small' if use_small_images else 'images'
# Keep one annotation row out of every `video_subsample` rows when loading
# (None disables the subsampling).
video_subsample = 2

# Action names; a one-based action number `a` maps to ACTION_LABELS[a - 1].
ACTION_LABELS = ['drink water', 'eat meal/snack', 'brushing teeth',
        'brushing hair', 'drop', 'pickup', 'throw', 'sitting down',
        'standing up (from sitting position)', 'clapping', 'reading',
        'writing', 'tear up paper', 'wear jacket', 'take off jacket',
        'wear a shoe', 'take off a shoe', 'wear on glasses',
        'take off glasses', 'put on a hat/cap', 'take off a hat/cap',
        'cheer up', 'hand waving', 'kicking something',
        'put something inside pocket / take out something from pocket',
        'hopping (one foot jumping)', 'jump up',
        'make a phone call/answer phone', 'playing with phone/tablet',
        'typing on a keyboard', 'pointing to something with finger',
        'taking a selfie', 'check time (from watch)', 'rub two hands together',
        'nod head/bow', 'shake head', 'wipe face', 'salute',
        'put the palms together', 'cross hands in front (say stop)',
        'sneeze/cough', 'staggering', 'falling', 'touch head (headache)',
        'touch chest (stomachache/heart pain)', 'touch back (backache)',
        'touch neck (neckache)', 'nausea or vomiting condition',
        'use a fan (with hand or paper)/feeling warm',
        'punching/slapping other person', 'kicking other person',
        'pushing other person', 'pat on back of other person',
        'point finger at the other person', 'hugging other person',
        'giving something to other person', 'touch other person s pocket',
        'handshaking', 'walking towards each other',
        'walking apart from each other']

# Joint names; the length of this list is used as the number of joints
# stored per frame in the annotation rows.
JOINT_LABELS = ['base of the spine', 'middle of the spine', 'neck', 'head',
        'left shoulder', 'left elbow', 'left wrist', 'left hand',
        'right shoulder', 'right elbow', 'right wrist', 'right hand',
        'left hip', 'left knee', 'left ankle', 'left foot', 'right hip',
        'right knee', 'right ankle', 'right foot', 'spine',
        'tip of the left hand', 'left thumb', 'tip of the right hand',
        'right thumb']

# Camera view names.
VIEWPOINT_LABELS = ['cam1', 'cam2', 'cam3']


def serialize_index_sequences(sequences):
    """Flatten a list of sequences into (sequence, frame) index pairs.

    # Arguments
        sequences: Collection of sequences; the length of each item is
            taken as its number of frames.

    # Returns
        A list with one `(sequence index, frame index)` tuple per frame,
        ordered by sequence and then by frame.
    """
    return [(seq_pos, frame_pos)
            for seq_pos, item in enumerate(sequences)
            for frame_pos in range(len(item))]


def ntu_load_annotations(dataset_path, eval_mode='cs',
        num_S=17, num_C=3, num_P=40, num_R=2, num_A=60):
    """Load the NTU RGB+D numpy annotations and split them by mode.

    Every 'S###C###P###R###A###.npy' file found under
    '<dataset_path>/nturgb+d_numpy' is loaded, optionally subsampled with the
    module-level `video_subsample`, and assigned to TRAIN_MODE or TEST_MODE.

    # Arguments
        dataset_path: Root directory of the dataset.
        eval_mode: 'cs' splits on the P field of the file name (see
            `cs_train`), 'cv' splits on the C field (see `cv_train`).
        num_S, num_C, num_P, num_R, num_A: Upper bound (inclusive) of each
            one-based field of the sequence identifier.

    # Returns
        A tuple `(sequences, frame_idx, seq_ids, actions)` of three-entry
        lists indexed by mode. The third entry of each list is left empty.

    # Raises
        AssertionError: On an invalid `eval_mode` or a missing dataset
            directory.
    """
    from itertools import product

    # Sanity checks
    assert eval_mode in ['cs', 'cv'], \
        'Invalid evaluation mode {}'.format(eval_mode)

    ntud_numpy_dir = os.path.join(dataset_path, 'nturgb+d_numpy')
    ntud_images_dir = os.path.join(dataset_path, image_prefix)
    for d in [ntud_numpy_dir, ntud_images_dir]:
        assert os.path.isdir(d), \
            f'Error: check your NTU dataset! `{d}` not found!'

    min_num_frames = np.inf
    max_num_frames = -np.inf
    num_videos = [0, 0, 0]

    cs_train = [1, 2, 4, 5, 8, 9, 13, 14, 15, 16, 17, 18, 19,
            25, 27, 28, 31, 34, 35, 38]
    cv_train = [2, 3]

    sequences = [[], [], []]
    seq_ids = [[], [], []]
    actions = [[], [], []]

    # Same visiting order as five nested loops (last field varies fastest).
    field_ranges = [range(1, n + 1)
            for n in (num_S, num_C, num_P, num_R, num_A)]
    for s, c, p, r, a in product(*field_ranges):
        sequence_id = 'S%03dC%03dP%03dR%03dA%03d' % (s, c, p, r, a)
        filename = os.path.join(ntud_numpy_dir, sequence_id + '.npy')
        if not os.path.isfile(filename):
            continue # Ignore missing annotation files

        if eval_mode == 'cs':
            mode = TRAIN_MODE if p in cs_train else TEST_MODE
        else:
            mode = TRAIN_MODE if c in cv_train else TEST_MODE

        data = np.load(filename)
        if video_subsample is not None:
            data = data[0::video_subsample, :]

        # Compute some stats about the dataset.
        min_num_frames = min(min_num_frames, len(data))
        max_num_frames = max(max_num_frames, len(data))
        num_videos[mode] += 1

        sequences[mode].append(data)
        seq_ids[mode].append(sequence_id)
        actions[mode].append(a)

    # The frame index is built inline on purpose: the module-level name
    # serialize_index_sequences is defined more than once in this file, and
    # the later definition reads a `.frames` attribute that the arrays
    # stored here do not have.
    frame_idx = [
        [(s, f) for s, seq in enumerate(sequences[m]) for f in range(len(seq))]
        for m in (0, 1)]
    frame_idx.append([])

    printcn('', 'Max/Min number of frames: {}/{}'.format(
        max_num_frames, min_num_frames))
    printcn('', 'Number of videos: {}'.format(num_videos))

    return sequences, frame_idx, seq_ids, actions


class Ntu(object):
    """Implementation of the NTU RGB+D dataset for pose estimation and
    action recognition.

    # Arguments
        dataset_path: Root directory of the dataset.
        dataconf: Data configuration object; `random_data_generator()`,
            `get_fixed_config()`, `input_shape` and `crop_resolution` are
            read from it.
        poselayout: Pose layout object; `num_joints`, `dim`, `map_hflip`
            and `map_from_ntu` are read from it.
        topology: 'sequences' to return clips of `clip_size` frames; any
            other value selects the single-frame behaviour.
            NOTE(review): the default value 'sequence' (singular) does not
            match 'sequences', so the default is single-frame; confirm that
            this is intended before relying on it.
        use_gt_bbox: If True, the crop is computed from the annotated pose;
            otherwise a window centred on the full image is used.
        remove_outer_joints: Stored on the instance; not read by this class.
        clip_size: Number of frames per clip.
        pose_only: If True, no image is read and 'frame' is None.
        num_S: Forwarded to `ntu_load_annotations`.
    """

    # Label tables captured when the class is defined. The module-level
    # ACTION_LABELS is rebound elsewhere in this file (to None, and by
    # Human36M.load_annotations), so reading the global when an instance is
    # created may pick up another dataset's labels.
    _ACTION_LABELS = ACTION_LABELS
    _JOINT_LABELS = JOINT_LABELS

    def __init__(self, dataset_path, dataconf, poselayout=pa20j3d,
            topology='sequence', use_gt_bbox=False, remove_outer_joints=True,
            clip_size=16, pose_only=False, num_S=17):

        self.dataset_path = dataset_path
        self.dataconf = dataconf
        self.poselayout = poselayout
        self.topology = topology
        self.use_gt_bbox = use_gt_bbox
        self.clip_size = clip_size
        self.remove_outer_joints = remove_outer_joints
        self.pose_only = pose_only
        self.action_labels = self._ACTION_LABELS
        self.joint_labels = self._JOINT_LABELS

        try:
            self.sequences, self.frame_idx, self.seq_ids, self.actions = \
                    ntu_load_annotations(dataset_path, num_S=num_S)
        except Exception:
            warning('Error loading NTU RGB+D dataset!')
            raise

    def get_data(self, key, mode, frame_list=None, bbox=None):
        """Method to load NTU samples specified by mode and key,
        do data augmentation and bounding box cropping.

        # Arguments
            key: Sequence index ('sequences' topology) or index into the
                serialized frame list (otherwise).
            mode: Dataset split; random augmentation and random clips are
                used only when it equals TRAIN_MODE.
            frame_list: Optional explicit frame indices for the clip
                ('sequences' topology). When None, a clip is selected with
                `get_clip_frame_index`.
            bbox: Optional bounding box; computed here when None.

        # Returns
            A dictionary with the keys 'seq_idx', 'frame_list', 'ntuaction'
            (one-hot), 'pennaction' (zeros), 'pose', 'frame' and 'afmat'.
            The last channel of 'pose' is the joint visibility flag.
        """
        output = {}

        if mode == TRAIN_MODE:
            dconf = self.dataconf.random_data_generator()
            random_clip = True
        else:
            dconf = self.dataconf.get_fixed_config()
            random_clip = False

        if self.topology == 'sequences':
            seq_idx = key
        else:
            seq_idx, frame_idx = self.frame_idx[mode][key]

        seq = self.sequences[mode][seq_idx]
        seq_id = self.seq_ids[mode][seq_idx]
        act = self.actions[mode][seq_idx]

        if self.topology == 'sequences':
            # Identity test: `== None` is evaluated element-wise when an
            # ndarray is given, which makes the `if` ambiguous.
            if frame_list is None:
                frame_list = get_clip_frame_index(len(seq), dconf['subspl'],
                        self.clip_size, random_clip=random_clip)
        else:
            frame_list = [frame_idx]

        objframes = seq[frame_list]

        # Load pose annotation
        pose, visible = self.get_pose_annot(objframes)

        if use_small_images:
            w, h = (int(1920/2), int(1080/2))
        else:
            w, h = (1920, 1080)

        # Compute the ground truth bounding box, if not given
        if bbox is None:
            if self.use_gt_bbox:
                bbox = get_gt_bbox(pose[:, :, 0:2], visible, (w, h),
                        scale=dconf['scale'], logkey=key)
            else:
                bbox = objposwin_to_bbox(np.array([w / 2, h / 2]),
                        (dconf['scale']*max(w, h), dconf['scale']*max(w, h)))

        # Depth reference: mean depth of joint 0, or of all joints when
        # joint 0 is NaN in every frame.
        rootz = np.nanmean(pose[:, 0, 2])
        if np.isnan(rootz):
            rootz = np.nanmean(pose[:, :, 2], axis=(0, 1))

        zrange = np.array([rootz - dconf['scale']*1000,
            rootz + dconf['scale']*1000])

        objpos, winsize = bbox_to_objposwin(bbox)
        if min(winsize) < 32:
            winsize = (32, 32)
        objpos += dconf['scale'] * np.array([dconf['transx'], dconf['transy']])

        # Pre-process data for each frame
        if self.pose_only:
            frames = None
        else:
            frames = np.zeros((len(objframes),) + self.dataconf.input_shape)

        for i in range(len(objframes)):
            if self.pose_only:
                imgt = T(None, img_size=(w, h))
            else:
                imagepath = os.path.join(self.dataset_path, image_prefix,
                        seq_id, '%05d.jpg' % objframes[i][0])
                imgt = T(Image.open(imagepath))

            imgt.rotate_crop(dconf['angle'], objpos, winsize)
            imgt.resize(self.dataconf.crop_resolution)

            if dconf['hflip'] == 1:
                imgt.horizontal_flip()

            imgt.normalize_affinemap()
            if not self.pose_only:
                frames[i, :, :, :] = normalize_channels(imgt.asarray(),
                        channel_power=dconf['chpower'])

            pose[i, :, 0:2] = transform_2d_points(imgt.afmat, pose[i, :, 0:2],
                    transpose=True)
            # Depth is rescaled linearly so that zrange maps to [0, 1]
            pose[i, :, 2] = (pose[i, :, 2] -zrange[0]) / (zrange[1] -zrange[0])

            if imgt.hflip:
                pose[i, :, :] = pose[i, self.poselayout.map_hflip, :]

        # Set outsider body joints to invalid (-1e9).
        pose = np.reshape(pose, (-1, self.poselayout.dim))
        pose[np.isnan(pose)] = -1e9
        v = np.expand_dims(get_visible_joints(pose[:,0:2]), axis=-1)
        pose[(v==0)[:,0],:] = -1e9
        pose = np.reshape(pose, (len(objframes), self.poselayout.num_joints,
            self.poselayout.dim))
        v = np.reshape(v, (len(objframes), self.poselayout.num_joints, 1))

        pose = np.concatenate((pose, v), axis=-1)
        if self.topology != 'sequences':
            # Single-frame samples carry no clip axis.
            pose = np.squeeze(pose, axis=0)
            if not self.pose_only:
                frames = np.squeeze(frames, axis=0)

        # One-hot action label (action numbers are one-based).
        action = np.zeros(self.get_shape('ntuaction'))
        action[act - 1] = 1.

        output['seq_idx'] = seq_idx
        output['frame_list'] = frame_list
        output['ntuaction'] = action
        output['pennaction'] = np.zeros((15,))
        output['pose'] = pose
        output['frame'] = frames

        # Take the last transformation matrix, it should not change
        output['afmat'] = imgt.afmat.copy()

        return output


    def get_pose_annot(self, frames):
        """Extract the pose and joint validity from raw annotation rows.

        The columns after the first `1 + 3*num_joints` hold, in this order,
        the x values, the y values and (when the layout is 3D) the third
        coordinate of every joint. x and y are halved when
        `use_small_images` is set.

        # Returns
            A tuple `(p, v)`: `p` is the pose reordered to the pose layout,
            with NaN on invalid joints; `v` is 1 where all coordinates of a
            joint are strictly positive and 0 otherwise.
        """
        num_joints = len(self._JOINT_LABELS)
        pose = frames[:, 1+3*num_joints:]

        p = np.zeros((len(frames), num_joints, self.poselayout.dim))

        if use_small_images:
            p[:, :, 0] = pose[:, 0:num_joints] / 2.
            p[:, :, 1] = pose[:, num_joints:2*num_joints] / 2.
        else:
            p[:, :, 0] = pose[:, 0:num_joints]
            p[:, :, 1] = pose[:, num_joints:2*num_joints]

        if self.poselayout.dim == 3:
            p[:, :, 2] = pose[:, 2*num_joints:]

        p = p[:, self.poselayout.map_from_ntu, :].copy()
        # Vectorized validity test (replaces a per-joint Python callback).
        v = (p > 0).all(axis=2).astype(int)
        p[v==0, :] = np.nan

        return p, v

    def get_clip_index(self, key, mode, subsamples=(2,)):
        """List the overlapping clips that cover one sequence.

        For every subsampling step `sub`, clips of `clip_size` frames are
        laid out from frame 0, each starting
        `int(clip_size / 2) + (sub - 1)` frames after the previous one,
        until a clip would run past the end of the sequence.

        # Returns
            A list of `range` objects, one per clip.
        """
        assert self.topology == 'sequences', 'Topology not supported'

        seq = self.sequences[mode][key]
        index_list = []
        for sub in subsamples:
            start_frame = 0
            while True:
                last_frame = start_frame + self.clip_size * sub
                if last_frame > len(seq):
                    break
                index_list.append(range(start_frame, last_frame, sub))
                start_frame += int(self.clip_size / 2) + (sub - 1)

        return index_list


    def clip_length(self):
        """Return the clip size for 'sequences' topology, otherwise None."""
        if self.topology == 'sequences':
            return self.clip_size
        else:
            return None

    def clip_shape(self):
        """Return the leading clip axis as a tuple (empty if not a clip)."""
        if self.topology == 'sequences':
            return (self.clip_size,)
        else:
            return ()

    def get_shape(self, dictkey):
        """Return the shape of one sample for the given output key.

        # Raises
            Exception: If `dictkey` is not a known key.
        """
        if dictkey == 'frame':
            return self.clip_shape() + self.dataconf.input_shape
        if dictkey == 'pose':
            return self.clip_shape() \
                    + (self.poselayout.num_joints, self.poselayout.dim+1)
        if dictkey == 'ntuaction':
            return (len(self.action_labels),)
        if dictkey == 'pennaction':
            return (15,)
        if dictkey == 'afmat':
            return (3, 3)
        raise Exception('Invalid dictkey on get_shape!')

    def get_length(self, mode):
        """Return the number of sequences or frames available in `mode`."""
        if self.topology == 'sequences':
            return len(self.sequences[mode])
        else:
            return len(self.frame_idx[mode])


In [None]:

# NOTE(review): this rebinds the module-level ACTION_LABELS, which an earlier
# section of this file sets to the NTU action names; code that reads the
# global after this line sees None until something assigns it again.
ACTION_LABELS = None

def load_pennaction_mat_annotation(filename):
    """Read the Penn Action annotation ``.mat`` file.

    Returns a tuple ``(sequences, action_labels, joint_labels)`` where
    ``sequences`` is a three-element list indexed by split.
    """
    annot = sio.loadmat(filename, struct_as_record=False, squeeze_me=True)

    # Split order is TEST (0), TRAIN (1); the third slot (validation) is
    # always empty because the file provides no validation set.
    test_split = annot['sequences_te']
    train_split = annot['sequences_tr']

    return ([test_split, train_split, []],
            annot['action_labels'],
            annot['joint_labels'])


def serialize_index_sequences(sequences):
    """Flatten a collection of sequences into ``(sequence, frame)`` index
    pairs, one pair per frame, in sequence-major order."""
    return [(seq_no, frame_no)
            for seq_no in range(len(sequences))
            for frame_no in range(len(sequences[seq_no].frames))]


def compute_clip_bbox(bbox_dict, seq_idx, frame_list):
    """Return the union ``[x1, y1, x2, y2]`` of the per-frame boxes of a clip.

    Boxes are looked up in ``bbox_dict`` under the key
    ``'<seq_idx>.<frame>'``. With an empty ``frame_list`` the result is
    ``[inf, inf, -inf, -inf]``.
    """
    boxes = [bbox_dict['%d.%d' % (seq_idx, f)] for f in frame_list]

    # Seeding each fold with +/-inf keeps the empty-clip result and the
    # left-to-right comparison order of a running min/max.
    left = min([np.inf] + [b[0] for b in boxes])
    top = min([np.inf] + [b[1] for b in boxes])
    right = max([-np.inf] + [b[2] for b in boxes])
    bottom = max([-np.inf] + [b[3] for b in boxes])

    return np.array([left, top, right, bottom])


class PennAction(object):
    """Loader for Penn Action samples, as single frames or as clips.

    Annotations are read from ``annotations.mat`` inside ``dataset_path``;
    images are read from ``frames/<sequence>/<frame>.jpg`` under the same
    directory.

    Args:
        dataset_path: Root directory of the dataset.
        dataconf: Data configuration object. This class uses its
            ``random_data_generator()``, ``get_fixed_config()``,
            ``input_shape`` and ``crop_resolution`` members.
        poselayout: Pose layout object. This class uses its ``num_joints``,
            ``dim``, ``map_to_pa13j`` and ``map_hflip`` members.
        topology: ``'sequences'`` to sample clips or ``'frames'`` to sample
            single frames.
        use_gt_bbox: Derive the crop box from the annotated pose.
        remove_outer_joints: Stored on the instance; not read by any method
            of this class.
        clip_size: Number of frames per clip (``'sequences'`` topology).
        pose_only: Skip image loading and return only pose data.
        output_fullframe: Also return the uncropped, normalized frames.
        pred_bboxes_file: Optional JSON file (relative to ``dataset_path``)
            holding predicted bounding boxes.
    """

    def __init__(self, dataset_path, dataconf, poselayout=pa16j2d,
            topology='sequences', use_gt_bbox=False, remove_outer_joints=True,
            clip_size=16, pose_only=False, output_fullframe=False,
            pred_bboxes_file=None):

        # The default used to be 'sequence', which this very assertion
        # rejects; it now matches one of the accepted values.
        assert topology in ['sequences', 'frames'], \
                'Invalid topology ({})'.format(topology)

        self.dataset_path = dataset_path
        self.dataconf = dataconf
        self.poselayout = poselayout
        self.topology = topology
        self.use_gt_bbox = use_gt_bbox
        self.remove_outer_joints = remove_outer_joints
        self.clip_size = clip_size
        self.pose_only = pose_only
        self.output_fullframe = output_fullframe
        self.load_annotations(os.path.join(dataset_path, 'annotations.mat'))
        if pred_bboxes_file:
            filepath = os.path.join(dataset_path, pred_bboxes_file)
            with open(filepath, 'r') as fid:
                self.pred_bboxes = json.load(fid)
        else:
            self.pred_bboxes = None

    def load_annotations(self, filename):
        """Load sequences and labels from ``filename`` and build the flat
        per-frame index. Also publishes the action labels in the
        module-level ``ACTION_LABELS``. Any loading error is reported and
        re-raised.
        """
        try:
            self.sequences, self.action_labels, self.joint_labels = \
                    load_pennaction_mat_annotation(filename)
            self.frame_idx = [serialize_index_sequences(self.sequences[0]),
                    serialize_index_sequences(self.sequences[1]), []]

            global ACTION_LABELS
            ACTION_LABELS = self.action_labels

        except Exception:
            warning('Error loading PennAction dataset!')
            raise

    def get_data(self, key, mode, frame_list=None, bbox=None):
        """Load the Penn Action sample selected by ``mode`` and ``key``,
        apply data augmentation and crop it to a bounding box.

        Args:
            key: Sequence index (``'sequences'`` topology) or index into
                the flat frame list (``'frames'`` topology).
            mode: Data split index; ``TRAIN_MODE`` enables random
                augmentation and random clip selection.
            frame_list: Optional explicit frame indices for the clip. Only
                honoured in the ``'sequences'`` topology.
            bbox: Optional crop box; computed when omitted.

        Returns:
            A dict with the keys 'seq_idx', 'frame_list', 'pennaction',
            'ntuaction', 'pose', 'frame', 'bbox' and 'afmat', plus
            'fullframe' when full frames were requested and loaded.
        """
        output = {}

        if mode == TRAIN_MODE:
            dconf = self.dataconf.random_data_generator()
            random_clip = True
        else:
            dconf = self.dataconf.get_fixed_config()
            random_clip = False

        if self.topology == 'sequences':
            seq_idx = key
            seq = self.sequences[mode][seq_idx]
            # Identity test: `== None` is evaluated element-wise when the
            # caller passes an ndarray, which cannot be used as a condition.
            if frame_list is None:
                frame_list = get_clip_frame_index(len(seq.frames),
                        dconf['subspl'], self.clip_size,
                        random_clip=random_clip)
        else:
            seq_idx, frame_idx = self.frame_idx[mode][key]
            seq = self.sequences[mode][seq_idx]
            frame_list = [frame_idx]

        objframes = seq.frames[frame_list]

        # Load pose annotation.
        pose, visible = self.get_pose_annot(objframes)
        w, h = (objframes[0].w, objframes[0].h)

        # Compute the cropping bounding box, if not given.
        if bbox is None:

            if self.use_gt_bbox:
                bbox = get_gt_bbox(pose[:, :, 0:2], visible, (w, h),
                        scale=dconf['scale'], logkey=key)

            elif self.pred_bboxes:
                bbox = compute_clip_bbox(
                        self.pred_bboxes[mode], seq_idx, frame_list)

            else:
                # Fall back to a square window centred on the image.
                bbox = objposwin_to_bbox(np.array([w / 2, h / 2]),
                        (dconf['scale']*max(w, h), dconf['scale']*max(w, h)))

        objpos, winsize = bbox_to_objposwin(bbox)
        if min(winsize) < 32:
            winsize = (32, 32)
        objpos += dconf['scale'] * np.array([dconf['transx'], dconf['transy']])

        # Pre-process data for each frame.
        if self.pose_only:
            frames = None
        else:
            frames = np.zeros((len(objframes),) + self.dataconf.input_shape)
            if self.output_fullframe:
                fullframes = np.zeros((len(objframes), h, w,
                    self.dataconf.input_shape[-1]))

        for i in range(len(objframes)):
            if self.pose_only:
                imgt = T(None, img_size=(w, h))
            else:
                image = 'frames/%04d/%06d.jpg' % (seq.idx, objframes[i].f)
                imgt = T(Image.open(os.path.join(self.dataset_path, image)))
                if self.output_fullframe:
                    fullframes[i, :, :, :] = normalize_channels(imgt.asarray(),
                            channel_power=dconf['chpower'])

            imgt.rotate_crop(dconf['angle'], objpos, winsize)
            imgt.resize(self.dataconf.crop_resolution)

            if dconf['hflip'] == 1:
                imgt.horizontal_flip()

            imgt.normalize_affinemap()
            if not self.pose_only:
                frames[i, :, :, :] = normalize_channels(imgt.asarray(),
                        channel_power=dconf['chpower'])

            # Apply the image transform to the 2D joint coordinates too.
            pose[i, :, 0:2] = transform_2d_points(imgt.afmat, pose[i, :, 0:2],
                    transpose=True)
            if imgt.hflip:
                pose[i, :, :] = pose[i, self.poselayout.map_hflip, :]

        # Set outsider body joints to invalid (-1e9).
        pose = np.reshape(pose, (-1, self.poselayout.dim))
        pose[np.isnan(pose)] = -1e9
        v = np.expand_dims(get_visible_joints(pose[:,0:2]), axis=-1)
        pose[(v==0)[:,0],:] = -1e9
        pose = np.reshape(pose, (len(objframes), self.poselayout.num_joints,
            self.poselayout.dim))
        v = np.reshape(v, (len(objframes), self.poselayout.num_joints, 1))

        # The visibility flag becomes the last pose channel.
        pose = np.concatenate((pose, v), axis=-1)
        if self.topology != 'sequences':
            pose = np.squeeze(pose, axis=0)
            if not self.pose_only:
                frames = np.squeeze(frames, axis=0)

        # One-hot action vector; action_id is used as a 1-based index.
        action = np.zeros(self.get_shape('pennaction'))
        action[seq.action_id - 1] = 1.

        output['seq_idx'] = seq_idx
        output['frame_list'] = frame_list
        output['pennaction'] = action
        output['ntuaction'] = np.zeros((60,))
        output['pose'] = pose
        output['frame'] = frames
        if self.output_fullframe and not self.pose_only:
            output['fullframe'] = fullframes

        output['bbox'] = bbox

        # Take the last transformation matrix, it should not change.
        output['afmat'] = imgt.afmat.copy()

        return output

    def get_clip_index(self, key, mode, subsamples=(2,)):
        """Enumerate the sliding clip windows available in one sequence.

        Returns a list of ``range`` objects, one per clip, for every
        subsampling factor in ``subsamples``. Windows that would run past
        the last frame are not emitted.
        """
        assert self.topology == 'sequences', 'Topology not supported'

        seq = self.sequences[mode][key]
        index_list = []
        for sub in subsamples:
            # The window must advance by at least one frame. With
            # clip_size == 1 and sub == 1 the raw stride evaluates to 0,
            # which previously made this loop run forever.
            stride = max(1, int(self.clip_size / 2) + (sub - 1))
            start_frame = 0
            while True:
                last_frame = start_frame + self.clip_size * sub
                if last_frame > len(seq.frames):
                    break
                index_list.append(range(start_frame, last_frame, sub))
                start_frame += stride

        return index_list

    def get_pose_annot(self, frames):
        """Build pose and visibility arrays for ``frames``.

        Returns ``(p, v)``: ``p`` has shape
        ``(len(frames), num_joints, dim)`` and ``v`` has shape
        ``(len(frames), num_joints)``. Joints that are not annotated, not
        visible, or have a zero coordinate are NaN in ``p``.
        """
        p = np.nan * np.ones((len(frames), self.poselayout.num_joints,
            self.poselayout.dim))
        v = np.zeros((len(frames), self.poselayout.num_joints))
        for i in range(len(frames)):
            p[i, self.poselayout.map_to_pa13j, 0:2] = frames[i].pose.copy().T
            v[i, self.poselayout.map_to_pa13j] = frames[i].visible.copy()
            p[i, v[i] == 0, :] = np.nan
            p[i, p[i] == 0] = np.nan

        return p, v

    def clip_length(self):
        """Return the clip size, or ``None`` outside the ``'sequences'``
        topology."""
        if self.topology == 'sequences':
            return self.clip_size
        else:
            return None

    def clip_shape(self):
        """Return ``(clip_size,)`` for the ``'sequences'`` topology and an
        empty tuple otherwise."""
        if self.topology == 'sequences':
            return (self.clip_size,)
        else:
            return ()

    def get_shape(self, dictkey):
        """Return the array shape associated with an output dictionary key
        ('frame', 'pose', 'pennaction', 'ntuaction' or 'afmat')."""
        if dictkey == 'frame':
            return self.clip_shape() + self.dataconf.input_shape
        if dictkey == 'pose':
            return self.clip_shape() \
                    + (self.poselayout.num_joints, self.poselayout.dim+1)
        if dictkey == 'pennaction':
            return (len(self.action_labels),)
        if dictkey == 'ntuaction':
            return (60,)
        if dictkey == 'afmat':
            return (3, 3)
        raise Exception('Invalid dictkey ({}) on get_shape!'.format(dictkey))

    def get_length(self, mode):
        """Return the number of samples in split ``mode``: sequences for
        the ``'sequences'`` topology, individual frames otherwise."""
        if self.topology == 'sequences':
            return len(self.sequences[mode])
        else:
            return len(self.frame_idx[mode])


# **Basic Models before training**

In [None]:

def channel_softmax_1d():
    """Return an activation that applies a softmax along axis 1 of a 3D
    tensor. The returned function raises ``ValueError`` for any other rank.
    """

    def _channel_softmax_1d(x):
        ndim = K.ndim(x)
        if ndim != 3:
            raise ValueError('This function is specific for 3D tensors. '
                    'Here, ndim=' + str(ndim))

        # Subtract the per-channel maximum before exponentiating.
        exps = K.exp(x - K.max(x, axis=(1,), keepdims=True))
        total = K.sum(exps, axis=(1,), keepdims=True)
        return exps / total

    return _channel_softmax_1d



def relu(x, leakyrelu=False, name=None):
    """Apply a ReLU activation, or a LeakyReLU (alpha 0.1) when
    ``leakyrelu`` is set. ``name`` is only applied to the plain ReLU."""
    if not leakyrelu:
        return Activation('relu', name=name)(x)
    return LeakyReLU(alpha=0.1)(x)


def localconv1d(x, filters, kernel_size, strides=1, use_bias=True, name=None):
    """Apply a LocallyConnected1D layer; 4D inputs get it through a
    TimeDistributed wrapper carrying the same name."""
    layer = LocallyConnected1D(filters, kernel_size, strides=strides,
            use_bias=use_bias, name=name)

    if K.ndim(x) == 4:
        return TimeDistributed(layer, name=name)(x)
    return layer(x)


def conv2d(x, filters, kernel_size, strides=(1, 1), padding='same', name=None):
    """Apply a bias-free Conv2D layer; 5D inputs get it through a
    TimeDistributed wrapper carrying the same name."""
    layer = Conv2D(filters, kernel_size, strides=strides, padding=padding,
            use_bias=False, name=name)

    if K.ndim(x) == 5:
        return TimeDistributed(layer, name=name)(x)
    return layer(x)


def sepconv2d(x, filters, kernel_size, strides=(1, 1), padding='same',
        name=None):
    """Apply a bias-free SeparableConv2D layer; 5D inputs get it through a
    TimeDistributed wrapper carrying the same name."""
    layer = SeparableConv2D(filters, kernel_size, strides=strides,
            padding=padding, use_bias=False, name=name)

    if K.ndim(x) == 5:
        return TimeDistributed(layer, name=name)(x)
    return layer(x)


def conv2dtranspose(x, filters, kernel_size, strides=(1, 1), padding='same',
        name=None):
    """Apply a bias-free Conv2DTranspose layer; 5D inputs get it through a
    TimeDistributed wrapper carrying the same name."""
    layer = Conv2DTranspose(filters, kernel_size, strides=strides,
            padding=padding, use_bias=False, name=name)

    if K.ndim(x) == 5:
        return TimeDistributed(layer, name=name)(x)
    return layer(x)


def maxpooling2d(x, kernel_size=(2, 2), strides=(2, 2), padding='same',
        name=None):
    """Apply a MaxPooling2D layer; 5D inputs get it through a
    TimeDistributed wrapper carrying the same name."""
    layer = MaxPooling2D(kernel_size, strides=strides, padding=padding,
            name=name)

    if K.ndim(x) == 5:
        return TimeDistributed(layer, name=name)(x)
    return layer(x)


def upsampling2d(x, kernel_size=(2, 2), name=None):
    """Apply an UpSampling2D layer; 5D inputs get it through a
    TimeDistributed wrapper carrying the same name."""
    layer = UpSampling2D(kernel_size, name=name)

    if K.ndim(x) == 5:
        return TimeDistributed(layer, name=name)(x)
    return layer(x)


def keypoint_confidence(x, name=None):
    """Compute a per-keypoint confidence from a set of probability maps.

    Each map is average-pooled with a 2x2 window (stride 1), scaled by 4
    and reduced to its global maximum; a trailing unit axis is appended.
    The layer has no parameters. 5D inputs are handled through a
    TimeDistributed wrapper.
    """
    def _keypoint_confidence(x):
        pooled = AveragePooling2D((2, 2), strides=(1, 1))(x)
        scaled = 4 * pooled
        peak = GlobalMaxPooling2D()(scaled)
        return K.expand_dims(peak, axis=-1)

    layer = Lambda(_keypoint_confidence, name=name)

    if K.ndim(x) == 5:
        return TimeDistributed(layer, name=name)(x)
    return layer(x)


def softargmax2d(x, limits=(0, 0, 1, 1), name=None):
    """Concatenate the axis-0 and axis-1 outputs of
    ``lin_interpolation_2d`` applied to ``x``.

    ``limits`` is read as ``(vmin_x, vmin_y, vmax_x, vmax_y)`` and forwarded
    to the two interpolation calls, which are named ``name`` + '_x' / '_y'
    via ``appstr``.
    """
    coords = [
        lin_interpolation_2d(x, axis=0, vmin=limits[0], vmax=limits[2],
            name=appstr(name, '_x')),
        lin_interpolation_2d(x, axis=1, vmin=limits[1], vmax=limits[3],
            name=appstr(name, '_y')),
    ]

    return concatenate(coords, name=name)


def lin_interpolation_1d(inp):
    """Reduce the depth axis of ``inp`` with a fixed, non-trainable Conv1D.

    The kernel spans the whole depth axis and holds, per channel, evenly
    spaced weights from 1/(2*depth) to 1 - 1/(2*depth); channels are not
    mixed. The singleton depth axis is then dropped and a trailing unit
    axis is appended.
    """
    depth, num_filters = K.int_shape(inp)[1:]
    interp = Conv1D(num_filters, depth, use_bias=False)
    # The layer must be called once so that its weights exist.
    out = interp(inp)

    weights = interp.get_weights()
    kernel = weights[0]
    kernel.fill(0)

    half_bin = 1/(2*depth)
    ramp = np.linspace(half_bin, 1 - half_bin, num=depth)

    # Only the (channel, channel) entries are non-zero.
    for ch in range(num_filters):
        kernel[:, ch, ch] = ramp[:]

    interp.set_weights(weights)
    interp.trainable = False

    def _traspose(x):
       return K.expand_dims(K.squeeze(x, axis=-2), axis=-1)

    return Lambda(_traspose)(out)


def lin_interpolation_2d(x, axis, vmin=0., vmax=1., name=None):
    """Implements a 2D linear interpolation using a depth size separable
    convolution (non trainable).

    The depthwise kernel covers the full map and is filled with the values
    of ``linspace_2d(num_rows, num_cols, dim=axis)``; the pointwise kernel
    is the identity. The two singleton spatial axes are squeezed out and a
    trailing unit axis is appended. ``vmin`` and ``vmax`` are accepted but
    not referenced in this function. Unnamed layers get the name
    'custom_sam_<n>' from a module-level counter.
    """
    global global_sam_cnt

    rank = K.ndim(x)
    assert rank in [4, 5], \
            'Input tensor must have ndim 4 or 5 ({})'.format(K.ndim(x))

    # Lazily create the counter used to name anonymous layers.
    if 'global_sam_cnt' not in globals():
        global_sam_cnt = 0

    if name is None:
        name = 'custom_sam_%d' % global_sam_cnt
        global_sam_cnt += 1

    # Skip the batch axis (and the time axis for 5D inputs).
    first_spatial = 1 if rank == 4 else 2
    num_rows, num_cols, num_filters = K.int_shape(x)[first_spatial:]

    sam = SeparableConv2D(num_filters, (num_rows, num_cols), use_bias=False,
            name=name)
    if rank == 5:
        x = TimeDistributed(sam, name=name)(x)
    else:
        x = sam(x)

    weights = sam.get_weights()
    depthwise, pointwise = weights[0], weights[1]
    depthwise.fill(0)
    pointwise.fill(0)
    grid = linspace_2d(num_rows, num_cols, dim=axis)

    for ch in range(num_filters):
        depthwise[:,:, ch, 0] = grid[:,:]
        pointwise[0, 0, ch, ch] = 1.

    sam.set_weights(weights)
    sam.trainable = False

    x = Lambda(lambda x: K.squeeze(x, axis=-2))(x)
    x = Lambda(lambda x: K.squeeze(x, axis=-2))(x)
    x = Lambda(lambda x: K.expand_dims(x, axis=-1))(x)

    return x

def conv_bn(x, filters, size, strides=(1, 1), padding='same', name=None):
    """Convolution followed by batch normalization (no scale).

    The batch norm layer takes ``name``; the convolution is named
    ``name + '_conv'`` when a name is given.
    """
    conv_name = None if name is None else name + '_conv'

    y = conv(x, filters, size, strides, padding, conv_name)
    return BatchNormalization(axis=-1, scale=False, name=name)(y)


def deconv(x, filters, size, strides=(1, 1), padding='same', name=None):
    """Apply a bias-free Conv2DTranspose using the backend's current image
    data format."""
    layer = Conv2DTranspose(filters, size, strides=strides, padding=padding,
            data_format=K.image_data_format(), use_bias=False, name=name)
    return layer(x)


def conv_act(x, filters, size, strides=(1, 1), padding='same', name=None):
    """Convolution followed by a ReLU activation.

    The activation takes ``name``; the convolution is named
    ``name + '_conv'`` when a name is given.
    """
    conv_name = None if name is None else name + '_conv'

    y = conv(x, filters, size, strides, padding, conv_name)
    return Activation('relu', name=name)(y)


def conv_bn_act(x, filters, size, strides=(1, 1), padding='same', name=None):
    """Convolution, batch normalization (no scale), then ReLU.

    The activation takes ``name``; the other layers are named with the
    '_conv' and '_bn' suffixes when a name is given.
    """
    named = name is not None
    conv_name = name + '_conv' if named else None
    bn_name = name + '_bn' if named else None

    y = conv(x, filters, size, strides, padding, conv_name)
    y = BatchNormalization(axis=-1, scale=False, name=bn_name)(y)
    return Activation('relu', name=name)(y)


def bn_act_conv(x, filters, size, strides=(1, 1), padding='same', name=None):
    """Batch normalization (no scale), ReLU, then convolution.

    The convolution takes ``name``; the other layers are named with the
    '_bn' and '_act' suffixes when a name is given.
    """
    named = name is not None
    bn_name = name + '_bn' if named else None
    act_name = name + '_act' if named else None

    y = BatchNormalization(axis=-1, scale=False, name=bn_name)(x)
    y = Activation('relu', name=act_name)(y)
    return conv(y, filters, size, strides, padding, name)


def act_conv_bn(x, filters, size, strides=(1, 1), padding='same', name=None):
    """ReLU, convolution, then batch normalization (no scale).

    The batch norm layer takes ``name``; the other layers are named with
    the '_conv' and '_act' suffixes when a name is given.
    """
    named = name is not None
    conv_name = name + '_conv' if named else None
    act_name = name + '_act' if named else None

    y = Activation('relu', name=act_name)(x)
    y = conv(y, filters, size, strides, padding, conv_name)
    return BatchNormalization(axis=-1, scale=False, name=name)(y)


def separable_conv_bn_act(x, filters, size, strides=(1, 1), padding='same',
        name=None):
    """Bias-free SeparableConv2D, batch normalization (no scale), then ReLU.

    The activation takes ``name``; the other layers are named with the
    '_conv' and '_bn' suffixes when a name is given.
    """
    named = name is not None
    conv_name = name + '_conv' if named else None
    bn_name = name + '_bn' if named else None

    y = SeparableConv2D(filters, size, strides=strides, padding=padding,
            use_bias=False, name=conv_name)(x)
    y = BatchNormalization(axis=-1, scale=False, name=bn_name)(y)
    return Activation('relu', name=name)(y)


def separable_act_conv_bn(x, filters, size, strides=(1, 1), padding='same',
        name=None):
    """ReLU, bias-free SeparableConv2D, then batch normalization (no scale).

    The batch norm layer takes ``name``; the other layers are named with
    the '_conv' and '_act' suffixes when a name is given.
    """
    named = name is not None
    conv_name = name + '_conv' if named else None
    act_name = name + '_act' if named else None

    y = Activation('relu', name=act_name)(x)
    y = SeparableConv2D(filters, size, strides=strides, padding=padding,
            use_bias=False, name=conv_name)(y)
    return BatchNormalization(axis=-1, scale=False, name=name)(y)


def separable_conv_bn(x, filters, size, strides=(1, 1), padding='same',
        name=None):
    """Bias-free SeparableConv2D followed by batch normalization (no scale).

    The batch norm layer takes ``name``; the convolution is named
    ``name + '_conv'`` when a name is given.
    """
    conv_name = None if name is None else name + '_conv'

    y = SeparableConv2D(filters, size, strides=strides, padding=padding,
            use_bias=False, name=conv_name)(x)
    return BatchNormalization(axis=-1, scale=False, name=name)(y)


def act_conv(x, filters, size, strides=(1, 1), padding='same', name=None):
    """ReLU followed by a convolution.

    The convolution takes ``name``; the activation is named
    ``name + '_act'`` when a name is given.
    """
    act_name = None if name is None else name + '_act'

    y = Activation('relu', name=act_name)(x)
    return conv(y, filters, size, strides, padding, name)

def bn_act_conv3d(x, filters, size, strides=(1, 1, 1), padding='same',
        name=None):
    """Batch normalization (no scale), ReLU, then a bias-free Conv3D.

    The convolution takes ``name``; the other layers are named with the
    '_bn' and '_act' suffixes when a name is given.
    """
    named = name is not None
    bn_name = name + '_bn' if named else None
    act_name = name + '_act' if named else None

    y = BatchNormalization(axis=-1, scale=False, name=bn_name)(x)
    y = Activation('relu', name=act_name)(y)
    return Conv3D(filters, size, strides=strides, padding=padding,
            use_bias=False, name=name)(y)


def dense(x, filters, name=None):
    """Apply a Dense layer with ``filters`` units and an L1 kernel
    regularizer of 0.001."""
    layer = Dense(filters, kernel_regularizer=l1(0.001), name=name)
    return layer(x)


def bn_act_dense(x, filters, name=None):
    """Batch normalization (no scale), ReLU, then an L1-regularized Dense.

    The Dense layer takes ``name``; the other layers are named with the
    '_bn' and '_act' suffixes when a name is given.
    """
    named = name is not None
    bn_name = name + '_bn' if named else None
    act_name = name + '_act' if named else None

    y = BatchNormalization(axis=-1, scale=False, name=bn_name)(x)
    y = Activation('relu', name=act_name)(y)
    return Dense(filters, kernel_regularizer=l1(0.001), name=name)(y)


def act_channel_softmax(x, name=None):
    """Apply the ``channel_softmax_2d`` activation to ``x``."""
    activation = Activation(channel_softmax_2d(), name=name)
    return activation(x)


def act_depth_softmax(x, name=None):
    """Apply the ``channel_softmax_1d`` activation to ``x``."""
    activation = Activation(channel_softmax_1d(), name=name)
    return activation(x)


def aggregate_position_probability(inp):
    """Split a prediction into probability-weighted parts.

    ``inp`` is a pair ``(y, p)``. ``p`` is duplicated along the last axis,
    and the result is the concatenation of ``p * y`` and ``(1 - p) * y``
    along the last axis.
    """
    y, p = inp

    weight = concatenate([p, p], axis=-1)
    weighted = weight * y
    complement = (1 - weight) * y

    return concatenate([weighted, complement], axis=-1)


def fc_aggregation_block(y, p, name=None):
    """Aggregate a prediction ``y`` with its probability ``p`` through two
    fully connected layers.

    The inputs are first combined by ``aggregate_position_probability``,
    then passed through Dense(2*dim, no bias) -> ReLU -> Dense(dim), where
    ``dim`` is the size of the last axis of ``y``. Both Dense layers use an
    L1 kernel regularizer of 0.0002.

    Args:
        y: Prediction tensor.
        p: Probability tensor combined with ``y``.
        name: Optional base name. The Lambda layer takes it as is; the
            other layers get the '_fc1', '_act' and '_fc2' suffixes. When
            omitted, all layers are left unnamed.

    Returns:
        The output tensor of the second Dense layer.
    """
    dim = K.int_shape(y)[-1]

    def _suffixed(suffix):
        # `name + suffix` raised TypeError for the default name=None.
        return None if name is None else name + suffix

    x = Lambda(aggregate_position_probability, name=name)([y, p])
    x = Dense(2*dim, use_bias=False, kernel_regularizer=l1(0.0002),
            name=_suffixed('_fc1'))(x)
    x = Activation('relu', name=_suffixed('_act'))(x)
    x = Dense(dim, kernel_regularizer=l1(0.0002), name=_suffixed('_fc2'))(x)

    return x


def sparse_fc_mapping(x, input_idxs):
    """Select input features with a frozen, bias-free Dense layer.

    Output unit ``i`` copies input feature ``input_idxs[i]``: the kernel is
    zero everywhere except for a 1 at ``(input_idxs[i], i)``.
    """
    num_units = len(input_idxs)
    selector = Dense(num_units, use_bias=False)
    selector.trainable = False
    # Call the layer first so that its kernel exists.
    out = selector(x)

    weights = selector.get_weights()
    kernel = weights[0]
    kernel.fill(0)
    for unit in range(num_units):
        kernel[input_idxs[unit], unit] = 1.
    selector.set_weights(weights)

    return out

def max_min_pooling(x, strides=(2, 2), padding='same', name=None):
    """Return max-pooled ``x`` minus max-pooled ``-x`` as one Lambda layer.

    ``strides`` is passed as the first positional argument of both
    MaxPooling2D layers. Unnamed layers get the name 'MaxMinPooling2D_<n>'
    from a module-level counter.
    """
    global max_min_pool_cnt

    # Lazily create the counter used to name anonymous layers.
    if 'max_min_pool_cnt' not in globals():
        max_min_pool_cnt = 0

    if name is None:
        name = 'MaxMinPooling2D_%d' % max_min_pool_cnt
        max_min_pool_cnt += 1

    def _max_plus_min(x):
        pooled_pos = MaxPooling2D(strides, padding=padding)(x)
        pooled_neg = MaxPooling2D(strides, padding=padding)(-x)
        return pooled_pos - pooled_neg

    return Lambda(_max_plus_min, name=name)(x)


def global_max_min_pooling(x, name=None):
    """Return the global max of ``x`` minus the global max of ``-x`` as one
    Lambda layer.

    Unnamed layers get the name 'GlobalMaxMinPooling2D_<n>' from a
    module-level counter.
    """
    global global_max_min_pool_cnt

    # Lazily create the counter used to name anonymous layers.
    if 'global_max_min_pool_cnt' not in globals():
        global_max_min_pool_cnt = 0

    if name is None:
        name = 'GlobalMaxMinPooling2D_%d' % global_max_min_pool_cnt
        global_max_min_pool_cnt += 1

    def _global_max_plus_min(x):
        pooled_pos = GlobalMaxPooling2D()(x)
        pooled_neg = GlobalMaxPooling2D()(-x)
        return pooled_pos - pooled_neg

    return Lambda(_global_max_plus_min, name=name)(x)


def kl_divergence_regularizer(x, rho=0.01):
    """Pass ``x`` through an identity 1x1 convolution that carries an
    activity regularizer.

    The regularizer compares each map with an indicator of its maximum
    locations: it sums ``y * (log(y) - log(indicator))`` (both clipped to
    [epsilon, 1]), divided by the number of map positions and scaled by
    ``rho``.
    """

    def _kl_regularizer(y_pred):
        _, rows, cols, _ = K.int_shape(y_pred)

        # Broadcast the per-map maximum back to the full map size.
        peak = K.max(y_pred, axis=(1, 2))
        peak = K.expand_dims(peak, axis=(1))
        peak = K.expand_dims(peak, axis=(1))
        peak = K.tile(peak, [1, rows, cols, 1])

        y_delta = K.cast(K.greater_equal(y_pred, peak), 'float32')
        log_gap = K.log(K.clip(y_pred, K.epsilon(), 1.)) \
                - K.log(K.clip(y_delta, K.epsilon(), 1.))
        return rho * K.sum(y_pred * log_gap / (rows * cols))

    # Build an auxiliary non trainable layer, just to use the activity reg.
    num_filters = K.int_shape(x)[-1]
    identity_conv = Conv2D(num_filters, (1, 1), use_bias=False,
            activity_regularizer=_kl_regularizer)
    identity_conv.trainable = False
    out = identity_conv(x)

    # Set identity weights
    weights = identity_conv.get_weights()
    kernel = weights[0]
    kernel.fill(0)
    for ch in range(num_filters):
        kernel[0, 0, ch, ch] = 1.
    identity_conv.set_weights(weights)

    return out


def kronecker_prod(h, f, name='Kronecker_prod'):
    """Combine heatmaps ``h`` with visual features ``f`` in a Lambda layer.

    Every heatmap channel is multiplied with every feature channel and the
    product is summed over axes 2 and 3. Inputs must be 4D or 5D; other
    ranks raise ``ValueError`` inside the layer.
    """
    def _combine_heatmaps_visual(inp):
        hm, x = inp[0], inp[1]
        nj = K.int_shape(hm)[-1]
        nf = K.int_shape(x)[-1]

        # Heatmaps: append a feature axis and repeat along it.
        hm = K.expand_dims(hm, axis=-1)
        hm_rank = len(K.int_shape(hm))
        if hm_rank not in (5, 6):
            raise ValueError(f'Invalid heatmap shape {hm}')
        hm = K.tile(hm, [1] * (hm_rank - 1) + [nf])

        # Features: insert a joint axis before the last one and repeat.
        x = K.expand_dims(x, axis=-2)
        x_rank = len(K.int_shape(x))
        if x_rank not in (5, 6):
            raise ValueError(f'Invalid featuremap shape {x}')
        x = K.tile(x, [1] * (x_rank - 2) + [nj, 1])

        return K.sum(hm * x, axis=(2, 3))

    return Lambda(_combine_heatmaps_visual, name=name)([h, f])


# Aliases.
# `conv` is the name the conv_* / *_conv helper blocks call; it resolves to
# conv2d, so those helpers also accept 5D (time-distributed) inputs.
conv = conv2d

In [None]:

def channel_softmax_2d(alpha=1):
    """Return an activation that applies a softmax over axes -3 and -2 of a
    4D or 5D tensor, after scaling the input by ``alpha``."""

    def _channel_softmax_2d(x):
        assert K.ndim(x) in [4, 5], \
                'Input tensor must have ndim 4 or 5 ({})'.format(K.ndim(x))

        scaled = x if alpha == 1 else alpha * x

        # Subtract the per-map maximum before exponentiating and keep the
        # denominator at or above epsilon.
        exps = K.exp(scaled - K.max(scaled, axis=(-3, -2), keepdims=True))
        total = K.sum(exps, axis=(-3, -2), keepdims=True)
        return exps / K.clip(total, K.epsilon(), None)

    return _channel_softmax_2d


def build_context_aggregation(num_joints, num_context, alpha,
        num_frames=1, name=None):
    """Build a frozen model that blends direct and contextual joint
    predictions.

    The model takes three inputs ``[ys, yc, pc]``:
    ``ys`` with shape (num_joints, 2), ``yc`` with shape
    (num_joints * num_context, 2) and ``pc`` with shape
    (num_joints * num_context, 1); each gains a leading ``num_frames`` axis
    when ``num_frames`` is not 1. For every joint, the contextual x and y
    values are averaged with weights ``pc`` over that joint's
    ``num_context`` consecutive entries, and the output is
    ``alpha * ys + (1 - alpha) * weighted_average``.

    Returns:
        A Keras ``Model`` named ``name`` with ``trainable`` set to False.
    """

    inp = Input(shape=(num_joints * num_context, 1))
    d = Dense(num_joints, use_bias=False)

    x = Lambda(lambda x: K.squeeze(x, axis=-1))(inp)
    x = d(x)
    x = Lambda(lambda x: K.expand_dims(x, axis=-1))(x)

    # Fixed 0/1 kernel: output j is the sum of the num_context consecutive
    # inputs that belong to joint j.
    w = d.get_weights()
    w[0].fill(0)
    for j in range(num_joints):
        # NOTE(review): `start` is assigned but not used below.
        start = j*num_context
        w[0][j * num_context : (j + 1) * num_context, j] = 1.
    d.set_weights(w)
    d.trainable = False

    # Wrap the per-joint summation in its own frozen model so it can be
    # reused on several tensors (and time-distributed for clips).
    ctx_sum = Model(inputs=inp, outputs=x)
    ctx_sum.trainable = False
    if num_frames > 1:
        ctx_sum = TimeDistributed(ctx_sum,
                input_shape=(num_frames,) + K.int_shape(inp)[1:])

    # Define auxiliary layers.
    mul_alpha = Lambda(lambda x: alpha * x)
    mul_1alpha = Lambda(lambda x: (1 - alpha) * x)

    # This depends on TensorFlow because keras does not implement divide.
    tf_div = Lambda(lambda x: tf.divide(x[0], x[1]))

    if num_frames == 1:
        # Define inputs
        ys = Input(shape=(num_joints, 2))
        yc = Input(shape=(num_joints * num_context, 2))
        pc = Input(shape=(num_joints * num_context, 1))

        # Split contextual predictions in x and y and do computations separately
        xi = Lambda(lambda x: x[:,:, 0:1])(yc)
        yi = Lambda(lambda x: x[:,:, 1:2])(yc)
    else:
        ys = Input(shape=(num_frames, num_joints, 2))
        yc = Input(shape=(num_frames, num_joints * num_context, 2))
        pc = Input(shape=(num_frames, num_joints * num_context, 1))

        # Split contextual predictions in x and y and do computations separately
        xi = Lambda(lambda x: x[:,:,:, 0:1])(yc)
        yi = Lambda(lambda x: x[:,:,:, 1:2])(yc)

    # Weight each contextual coordinate by its probability.
    pxi = multiply([xi, pc])
    pyi = multiply([yi, pc])

    # Per-joint sums of the weights and of the weighted coordinates.
    pc_sum = ctx_sum(pc)
    pxi_sum = ctx_sum(pxi)
    pyi_sum = ctx_sum(pyi)
    # NOTE(review): `pc_div` is computed but does not feed the model output.
    pc_div = Lambda(lambda x: x / num_context)(pc_sum)
    # Weighted averages; there is no guard here against pc_sum being zero.
    pxi_div = tf_div([pxi_sum, pc_sum])
    pyi_div = tf_div([pyi_sum, pc_sum])
    yc_div = concatenate([pxi_div, pyi_div])

    # Blend: alpha * direct prediction + (1 - alpha) * contextual average.
    ys_alpha = mul_alpha(ys)
    yc_div_1alpha = mul_1alpha(yc_div)

    y = add([ys_alpha, yc_div_1alpha])

    model = Model(inputs=[ys, yc, pc], outputs=y, name=name)
    model.trainable = False

    return model

def _reset_invalid_joints(y_true, y_pred):
    """Zero out the entries of both tensors where ``y_true`` is not
    strictly positive, and count the remaining valid entries.

    Returns ``(y_true, y_pred, num_joints)``; the count is summed over the
    last two axes and clipped to a minimum of 1.
    """
    valid = K.cast(K.greater(y_true, 0.), 'float32')
    count = K.clip(K.sum(valid, axis=(-1, -2)), 1, None)
    return valid * y_true, valid * y_pred, count


def elasticnet_loss_on_valid_joints(y_true, y_pred):
    """Sum of the L1 and L2 errors over valid entries, each normalized by
    the valid count returned by ``_reset_invalid_joints``."""
    y_true, y_pred, num_joints = _reset_invalid_joints(y_true, y_pred)
    diff = y_pred - y_true
    abs_term = K.sum(K.abs(diff), axis=(-1, -2)) / num_joints
    sq_term = K.sum(K.square(diff), axis=(-1, -2)) / num_joints
    return abs_term + sq_term

def action_top(x, name=None):
    """Apply ``global_max_min_pooling`` followed by a softmax activation
    named ``name``."""
    pooled = global_max_min_pooling(x)
    return Activation('softmax', name=name)(pooled)


def build_act_pred_block(x, num_out, name=None, last=False, include_top=True):
    """Build one action prediction block.

    Returns ``(x, y)``: ``y`` is the ``num_out``-channel action map (passed
    through ``action_top`` when ``include_top`` is set) and ``x`` is the
    feature tensor for the next block. When ``last`` is set, ``x`` is the
    pooled feature tensor and no re-injection branch is built. ``name`` is
    accepted but not used inside this function.
    """
    channels = K.int_shape(x)[-1]

    # Residual bottleneck.
    shortcut = x
    x = act_conv_bn(x, int(channels/2), (1, 1))
    x = act_conv_bn(x, channels, (3, 3))
    x = add([shortcut, x])

    # Prediction branch.
    shortcut = x
    features = act_conv_bn(x, channels, (3, 3))
    x = max_min_pooling(features, (2, 2))
    action_hm = act_conv(x, num_out, (3, 3))
    y = action_top(action_hm) if include_top else action_hm

    if last:
        return x, y

    # Re-inject the action map into the feature stream.
    upsampled = UpSampling2D((2, 2))(action_hm)
    upsampled = act_conv_bn(upsampled, channels, (3, 3))
    x = add([shortcut, features, upsampled])

    return x, y


def build_pose_model(num_joints, num_actions, num_temp_frames=None, pose_dim=2,
        name=None, include_top=True, network_version='v1'):
    """Build the pose-based action recognition model.

    The model takes two inputs: poses with shape
    (num_temp_frames, num_joints, pose_dim) and joint probabilities with
    shape (num_temp_frames, num_joints, 1). Poses are multiplied by the
    probabilities, passed through a multi-branch convolutional entry, and
    then through four action prediction blocks.

    Args:
        num_joints: Number of body joints.
        num_actions: Number of action classes.
        num_temp_frames: Number of frames per clip (may be None).
        pose_dim: Number of pose coordinates per joint.
        name: Name of the returned model.
        include_top: Forwarded to ``build_act_pred_block``.
        network_version: 'v1' or 'v2'; selects the entry filter counts.

    Returns:
        A Keras ``Model`` with four outputs, one per prediction block.

    Raises:
        Exception: If ``network_version`` is not 'v1' or 'v2'.
    """
    # The two versions share one topology and differ only in width:
    # (three entry branch filters), (merge filters, bottleneck filters).
    entry_filters = {
        'v1': ((8, 16, 24), (56, 32)),
        'v2': ((12, 24, 36), (112, 64)),
    }
    if network_version not in entry_filters:
        raise Exception('Unknown network version "{}"'.format(network_version))
    (fa, fb, fc), (merge_f, bottleneck_f) = entry_filters[network_version]

    y = Input(shape=(num_temp_frames, num_joints, pose_dim))
    p = Input(shape=(num_temp_frames, num_joints, 1))

    ## Pose information
    # Repeat the probability over the pose coordinates and use it as a mask.
    mask = Lambda(lambda x: K.tile(x, [1, 1, 1, pose_dim]))(p)
    x = Lambda(lambda x: x[0] * x[1])([y, mask])

    # Entry: three parallel kernels of different widths.
    a = conv_bn_act(x, fa, (3, 1))
    b = conv_bn_act(x, fb, (3, 3))
    c = conv_bn_act(x, fc, (3, 5))
    x = concatenate([a, b, c])
    a = conv_bn(x, merge_f, (3, 3))
    b = conv_bn(x, bottleneck_f, (1, 1))
    b = conv_bn(b, merge_f, (3, 3))
    x = concatenate([a, b])
    x = max_min_pooling(x, (2, 2))

    x, y1 = build_act_pred_block(x, num_actions, name='y1',
            include_top=include_top)
    x, y2 = build_act_pred_block(x, num_actions, name='y2',
            include_top=include_top)
    x, y3 = build_act_pred_block(x, num_actions, name='y3',
            include_top=include_top)
    _, y4 = build_act_pred_block(x, num_actions, name='y4',
            include_top=include_top, last=True)
    x = [y1, y2, y3, y4]

    model = Model(inputs=[y, p], outputs=x, name=name)

    return model


def build_visual_model(num_joints, num_actions, num_features,
        num_temp_frames=None, name=None, include_top=True):
    """Build the appearance-based action recognition model.

    The input has shape (num_temp_frames, num_joints, num_features). It is
    projected to 256 channels, max-pooled, and passed through four action
    prediction blocks. The returned ``Model`` has four outputs, one per
    block.
    """
    inp = Input(shape=(num_temp_frames, num_joints, num_features))
    x = conv_bn(inp, 256, (1, 1))
    x = MaxPooling2D((2, 2))(x)

    num_blocks = 4
    outputs = []
    for block in range(1, num_blocks + 1):
        # Only the final block is built without the re-injection branch.
        x, y = build_act_pred_block(x, num_actions, name='y%d' % block,
                include_top=include_top, last=(block == num_blocks))
        outputs.append(y)

    return Model(inp, outputs, name=name)


def _get_2d_pose_estimation_from_model(inp, model_pe, num_joints, num_blocks,
        num_context_per_joint, full_trainable=False):
    """Rewire the layers of a 2D pose estimation model to run on clips.

    Layers are fetched from ``model_pe`` by name ('Stem', 'rBlock<i>',
    'SepConv<i>', 'RegMap<i>', 'fReMap<i>', 'sSAM', 'cSAM', 'sjProb',
    'cjProb') and applied per frame through TimeDistributed wrappers.

    Args:
        inp: Input tensor; axis 1 is taken as the number of frames.
        model_pe: Model providing the named layers listed above.
        num_joints: Number of body joints.
        num_blocks: Number of prediction blocks to chain.
        num_context_per_joint: Number of context maps per joint; 0 disables
            the contextual branch.
        full_trainable: Trainable flag applied to the stem, the first
            'rBlock', the 'PoseReg' sub-model and the SAM layers. The
            joint probability layers are always frozen.

    Returns:
        A tuple ``(y, p, hs, xb1)``: pose prediction, joint probability,
        softmax-normalized heatmaps, and the output of the first 'rBlock'.
    """

    num_frames = K.int_shape(inp)[1]

    stem = model_pe.get_layer('Stem')
    stem.trainable = full_trainable

    i = 1
    recep_block = model_pe.get_layer('rBlock%d' % i)
    recep_block.trainable = full_trainable

    # Stem and first block run per frame on the clip input.
    x1 = TimeDistributed(stem, name='td_%s' % stem.name)(inp)
    xb1 = TimeDistributed(recep_block, name='td_%s' % recep_block.name)(x1)

    # The remaining blocks are assembled as a single-frame sub-model whose
    # input matches the per-frame shape of xb1.
    inp_pe = Input(shape=K.int_shape(xb1)[2:])
    sep_conv = model_pe.get_layer('SepConv%d' % i)
    reg_map = model_pe.get_layer('RegMap%d' % i)
    fre_map = model_pe.get_layer('fReMap%d' % i)
    x2 = sep_conv(inp_pe)
    x3 = fre_map(reg_map(x2))
    x = add([inp_pe, x2, x3])

    # Intermediate blocks: each one adds its features, its separable
    # convolution output and the re-mapped regression maps.
    for i in range(2, num_blocks):
        recep_block = model_pe.get_layer('rBlock%d' % i)
        sep_conv = model_pe.get_layer('SepConv%d' % i)
        reg_map = model_pe.get_layer('RegMap%d' % i)
        fre_map = model_pe.get_layer('fReMap%d' % i)
        x1 = recep_block(x)
        x2 = sep_conv(x1)
        x3 = fre_map(reg_map(x2))
        x = add([x1, x2, x3])

    # Final block: its output is the 'RegMap' result, with no re-injection.
    recep_block = model_pe.get_layer('rBlock%d' % num_blocks)
    sep_conv = model_pe.get_layer('SepConv%d' % num_blocks)
    reg_map = model_pe.get_layer('RegMap%d' % num_blocks)
    x = recep_block(x)
    x = sep_conv(x)
    x = reg_map(x)

    model1 = Model(inp_pe, x, name='PoseReg')
    model1.trainable = full_trainable

    # One map per joint plus num_context_per_joint context maps per joint.
    num_heatmaps = (num_context_per_joint + 1) * num_joints
    num_rows = K.int_shape(model1.output)[1]
    num_cols = K.int_shape(model1.output)[2]

    sams_input_shape = (num_frames, num_rows, num_cols, num_joints)
    samc_input_shape = \
            (num_frames, num_rows, num_cols, num_heatmaps - num_joints)

    # Build the time distributed models
    model_pe.get_layer('sSAM').trainable = full_trainable
    sam_s_model = TimeDistributed(model_pe.get_layer('sSAM'),
            input_shape=sams_input_shape, name='sSAM')

    if num_context_per_joint > 0:
        model_pe.get_layer('cSAM').trainable = full_trainable
        sam_c_model = TimeDistributed(model_pe.get_layer('cSAM'),
                input_shape=samc_input_shape, name='cSAM')

    model_pe.get_layer('sjProb').trainable = False
    jprob_s_model = TimeDistributed(model_pe.get_layer('sjProb'),
            input_shape=sams_input_shape, name='sjProb')

    if num_context_per_joint > 0:
        model_pe.get_layer('cjProb').trainable = False
        jprob_c_model = TimeDistributed(model_pe.get_layer('cjProb'),
                input_shape=samc_input_shape, name='cjProb')

    # The aggregation model is built even when there is no context; it is
    # only applied below when num_context_per_joint > 0.
    agg_model = build_context_aggregation(num_joints,
            num_context_per_joint, 0.8, num_frames=num_frames, name='Agg')

    h = TimeDistributed(model1, name='td_Model1')(xb1)
    if num_context_per_joint > 0:
        # The first num_joints channels are the joint maps, the rest are
        # the context maps.
        hs = Lambda(lambda x: x[:,:,:,:, :num_joints])(h)
        hc = Lambda(lambda x: x[:,:,:,:, num_joints:])(h)
    else:
        hs = h

    ys = sam_s_model(hs)
    if num_context_per_joint > 0:
        yc = sam_c_model(hc)
        pc = jprob_c_model(hc)
        y = agg_model([ys, yc, pc])
    else:
        y = ys

    # The joint probability is computed on the joint maps scaled by 4.
    p = jprob_s_model(Lambda(lambda x: 4*x)(hs))

    hs = TimeDistributed(Activation(channel_softmax_2d()),
            name='td_ChannelSoftmax')(hs)

    return y, p, hs, xb1


def _get_3d_pose_estimation_from_model(inp, model_pe, num_joints, num_blocks,
        depth_maps, full_trainable=False):
    """Build a per-frame 3D pose estimation graph from the layers of `model_pe`.

    Layers are fetched from `model_pe` by name ('Stem', 'rBlock%d',
    'SepConv%d', 'RegMap%d', 'fReMap%d', 'sSAM', 'zSAM') and applied to every
    frame of `inp` through `TimeDistributed` wrappers.

    Args:
        inp: Input tensor; axis 1 is read as the number of frames.
        model_pe: Model that owns the named layers listed above.
        num_joints: Number of joints per depth map.
        num_blocks: Index of the last block taken from `model_pe`.
        depth_maps: Number of depth maps; the regression output must have
            `depth_maps * num_joints` channels (enforced by an assert).
        full_trainable: Value assigned to the `trainable` attribute of the
            reused stem, first block, 'PoseReg', 'sSAM' and 'zSAM'.

    Returns:
        Tuple `(pose, visible, hxy, xb1)`: the concatenated 'sSAM' and 'zSAM'
        outputs, the sigmoid visibility score, the per-frame channel softmax
        of the depth-averaged heatmaps, and the time-distributed output of
        'rBlock1'.
    """

    num_frames = K.int_shape(inp)[1]

    # Side effect: prints the summary of model_pe on every call.
    model_pe.summary()

    stem = model_pe.get_layer('Stem')
    stem.trainable = full_trainable

    i = 1
    recep_block = model_pe.get_layer('rBlock%d' % i)
    recep_block.trainable = full_trainable

    # Stem and first block run directly on the video tensor, frame by frame.
    x1 = TimeDistributed(stem, name='td_%s' % stem.name)(inp)
    xb1 = TimeDistributed(recep_block, name='td_%s' % recep_block.name)(x1)

    # The remaining blocks are rebuilt as a single-frame model ('PoseReg')
    # whose input has the per-frame shape of xb1.
    inp_pe = Input(shape=K.int_shape(xb1)[2:])
    sep_conv = model_pe.get_layer('SepConv%d' % i)
    reg_map = model_pe.get_layer('RegMap%d' % i)
    fre_map = model_pe.get_layer('fReMap%d' % i)
    x2 = sep_conv(inp_pe)
    x3 = fre_map(reg_map(x2))
    x = add([inp_pe, x2, x3])

    # Intermediate blocks 2 .. num_blocks - 1, each with its own re-injection.
    for i in range(2, num_blocks):
        recep_block = model_pe.get_layer('rBlock%d' % i)
        sep_conv = model_pe.get_layer('SepConv%d' % i)
        reg_map = model_pe.get_layer('RegMap%d' % i)
        fre_map = model_pe.get_layer('fReMap%d' % i)
        x1 = recep_block(x)
        x2 = sep_conv(x1)
        x3 = fre_map(reg_map(x2))
        x = add([x1, x2, x3])

    # Last block: no 'fReMap' and no residual sum, the regression map is the
    # output.
    recep_block = model_pe.get_layer('rBlock%d' % num_blocks)
    sep_conv = model_pe.get_layer('SepConv%d' % num_blocks)
    reg_map = model_pe.get_layer('RegMap%d' % num_blocks)
    x = recep_block(x)
    x = sep_conv(x)
    x = reg_map(x)

    model1 = Model(inp_pe, x, name='PoseReg')
    model1.trainable = full_trainable

    num_rows = K.int_shape(model1.output)[1]
    num_cols = K.int_shape(model1.output)[2]

    sams_input_shape = (num_frames, num_rows, num_cols, num_joints)
    samz_input_shape = (num_frames, depth_maps, num_joints)

    # Build the time distributed models
    model_pe.get_layer('sSAM').trainable = full_trainable
    sam_s_model = TimeDistributed(model_pe.get_layer('sSAM'),
            input_shape=sams_input_shape, name='sSAM')

    model_pe.get_layer('zSAM').trainable = full_trainable
    sam_z_model = TimeDistributed(model_pe.get_layer('zSAM'),
            input_shape=samz_input_shape, name='zSAM')

    h = TimeDistributed(model1, name='td_Model1')(xb1)
    assert K.int_shape(h)[-1] == depth_maps * num_joints

    def _reshape_heatmaps(x):
        # Split the channel axis into (depth_maps, num_joints); axes 1-3 of
        # the input are kept as they are.
        x = K.expand_dims(x, axis=-1)
        x = K.reshape(x, (-1, K.int_shape(x)[1], K.int_shape(x)[2],
            K.int_shape(x)[3], depth_maps, num_joints))

        return x

    h = Lambda(_reshape_heatmaps)(h)
    # Average over the depth axis (4) for the xy heatmaps, and over the two
    # spatial axes (2, 3) for the depth profile.
    hxy = Lambda(lambda x: K.mean(x, axis=4))(h)
    hz = Lambda(lambda x: K.mean(x, axis=(2, 3)))(h)

    pxy = sam_s_model(hxy)
    pz = sam_z_model(hz)
    pose = concatenate([pxy, pz])

    # Visibility: sigmoid of twice the sum of the per-joint maxima of both
    # responses, with a trailing singleton axis.
    vxy = TimeDistributed(GlobalMaxPooling2D(), name='td_GlobalMaxPooling2D',
            input_shape=K.int_shape(hxy)[1:])(hxy)
    vz = TimeDistributed(GlobalMaxPooling1D(), name='td_GlobalMaxPooling1D',
            input_shape=K.int_shape(hz)[1:])(hz)
    v = add([vxy, vz])
    v = Lambda(lambda x: 2*K.expand_dims(x, axis=-1))(v)
    visible = Activation('sigmoid')(v)

    hxy = TimeDistributed(Activation(channel_softmax_2d()),
            name='td_ChannelSoftmax')(hxy)

    return pose, visible, hxy, xb1


def build_guided_visual_model(model_pe, num_actions, input_shape, num_frames,
        num_joints, num_blocks, num_context_per_joint=2):
    """Build an action recognition model driven by pose-guided visual features.

    Args:
        model_pe: Pose estimation model whose layers are reused.
        num_actions: Number of action classes.
        input_shape: Shape of a single frame (tuple), without the frame axis.
        num_frames: Number of frames per clip.
        num_joints: Number of body joints.
        num_blocks: Index of the last block taken from `model_pe`.
        num_context_per_joint: Number of context heatmaps per joint.

    Returns:
        A `Model` mapping a clip of shape `(num_frames,) + input_shape` to the
        outputs of the 'GuidedVisAR' visual model.
    """

    inp = Input(shape=(num_frames,) + input_shape)
    # num_context_per_joint is passed exactly once: supplying it both
    # positionally and by keyword raises
    # "TypeError: got multiple values for argument".
    _, _, hs, xb1 = _get_2d_pose_estimation_from_model(inp, model_pe,
            num_joints, num_blocks, num_context_per_joint)

    # Combine the per-joint probability maps with the visual features.
    f = kronecker_prod(hs, xb1)
    num_features = K.int_shape(f)[-1]
    model_ar = build_visual_model(num_joints, num_actions, num_features,
            num_temp_frames=num_frames, name='GuidedVisAR')

    x = model_ar(f)
    model = Model(inp, x)

    return model


def build_merge_model(model_pe,
        num_actions,
        input_shape,
        num_frames,
        num_joints,
        num_blocks,
        pose_dim=2,
        depth_maps=8,
        num_context_per_joint=2,
        pose_net_version='v1',
        output_poses=False,
        weighted_merge=True,
        ar_pose_weights=None,
        ar_visual_weights=None,
        full_trainable=False):
    """Build the model that merges pose-based and visual action recognition.

    Args:
        model_pe: Pose estimation model whose layers are reused.
        num_actions: Number of action classes.
        input_shape: Shape of a single frame (tuple), without the frame axis.
        num_frames: Number of frames per clip.
        num_joints: Number of body joints.
        num_blocks: Index of the last block taken from `model_pe`.
        pose_dim: 2 or 3; selects the 2D or the 3D pose estimation wiring.
        depth_maps: Number of depth maps (only used when `pose_dim == 3`).
        num_context_per_joint: Context heatmaps per joint (only used when
            `pose_dim == 2`).
        pose_net_version: Forwarded to `build_pose_model` as `network_version`.
        output_poses: When true, the pose and the joint probability tensors
            are prepended to the model outputs.
        weighted_merge: When true, both branches go through a 1x1 separable
            convolution initialised as identity before being summed.
        ar_pose_weights: Optional weights file loaded into the 'PoseAR' model.
        ar_visual_weights: Optional weights file loaded into 'GuidedVisAR'.
        full_trainable: Forwarded to the pose estimation wiring.

    Returns:
        A `Model` whose outputs are, in order: the optional pose outputs, the
        pose-branch predictions 'p1'.., the visual-branch predictions 'v1'..,
        and the merged prediction 'm'.

    Raises:
        ValueError: If `pose_dim` is neither 2 nor 3.
    """

    # Fail early with a clear message: an unsupported pose_dim used to leave
    # y, p, hs and xb1 unbound and surfaced later as a NameError.
    if pose_dim not in (2, 3):
        raise ValueError('pose_dim must be 2 or 3, got {}'.format(pose_dim))

    inp = Input(shape=(num_frames,) + input_shape)
    outputs = []

    if pose_dim == 2:
        y, p, hs, xb1 = _get_2d_pose_estimation_from_model(inp, model_pe,
                num_joints, num_blocks, num_context_per_joint,
                full_trainable=full_trainable)
    else:
        y, p, hs, xb1 = _get_3d_pose_estimation_from_model(inp, model_pe,
                num_joints, num_blocks, depth_maps,
                full_trainable=full_trainable)

    if output_poses:
        outputs.append(y)
        outputs.append(p)

    model_pose = build_pose_model(num_joints, num_actions, num_frames,
            pose_dim=pose_dim, include_top=False, name='PoseAR',
            network_version=pose_net_version)
    # model_pose.trainable = False
    if ar_pose_weights is not None:
        model_pose.load_weights(ar_pose_weights)
    out_pose = model_pose([y, p])

    f = kronecker_prod(hs, xb1)
    num_features = K.int_shape(f)[-1]
    model_vis = build_visual_model(num_joints, num_actions, num_features,
            num_temp_frames=num_frames, include_top=False, name='GuidedVisAR')
    # model_vis.trainable = False
    if ar_visual_weights is not None:
        model_vis.load_weights(ar_visual_weights)
    out_vis = model_vis(f)

    # Both sub-models were built without their top, so it is added here with
    # explicit output names.
    for idx, action_hm in enumerate(out_pose, start=1):
        outputs.append(action_top(action_hm, name='p%d' % idx))

    for idx, action_hm in enumerate(out_vis, start=1):
        outputs.append(action_top(action_hm, name='v%d' % idx))

    p = out_pose[-1]
    v = out_vis[-1]

    def _heatmap_weighting(inp):
        """Apply a 1x1 separable convolution initialised as identity."""
        num_filters = K.int_shape(inp)[-1]
        conv = SeparableConv2D(num_filters, (1, 1),
                use_bias=False)
        # The layer is called first so that its weights exist before they are
        # overwritten below.
        x = conv(inp)
        w = conv.get_weights()
        # w[0] is filled with ones and w[1] becomes an identity matrix over
        # its last two axes (presumably the depthwise and pointwise kernels).
        w[0].fill(1.)
        w[1].fill(0)
        for i in range(num_filters):
            w[1][0, 0, i, i] = 1.
        conv.set_weights(w)

        return x

    if weighted_merge:
        p = _heatmap_weighting(p)
        v = _heatmap_weighting(v)

    m = add([p, v])
    outputs.append(action_top(m, name='m'))

    model = Model(inp, outputs)

    return model


def compile(model, lr=0.001, momentum=0.95, loss_weights=None,
        pose_predicted=False):
    """Compile `model` in place with a Nesterov-momentum SGD optimizer.

    NOTE: this function shadows the built-in `compile` inside this module.

    Args:
        model: Keras model to compile.
        lr: Learning rate of the SGD optimizer.
        momentum: Momentum of the SGD optimizer.
        loss_weights: Forwarded to `model.compile`.
        pose_predicted: When true, the first two outputs are treated as pose
            outputs (elastic-net loss and binary cross-entropy) and every
            remaining output uses categorical cross-entropy. Otherwise all
            outputs use categorical cross-entropy and accuracy is tracked.
    """

    # Keras optimizers take `learning_rate`; `lr` is a deprecated alias that
    # newer releases no longer honour.
    optimizer = SGD(learning_rate=lr, momentum=momentum, nesterov=True)

    if pose_predicted:
        num_action_outputs = len(model.output) - 2
        losses = [elasticnet_loss_on_valid_joints, 'binary_crossentropy']
        losses.extend(['categorical_crossentropy'] * num_action_outputs)

        model.compile(loss=losses,
                optimizer=optimizer,
                loss_weights=loss_weights)
    else:
        model.compile(loss='categorical_crossentropy',
                optimizer=optimizer,
                metrics=['acc'], loss_weights=loss_weights)



In [None]:

def channel_softmax_2d(alpha=1):
    """Return a function computing a softmax over axes -3 and -2.

    The returned function scales its input by `alpha` (when `alpha != 1`),
    subtracts the maximum over the two axes before exponentiating, and
    divides by the sum clipped from below at `K.epsilon()`.
    """
    reduce_axes = (-3, -2)

    def _channel_softmax_2d(x):
        rank = K.ndim(x)
        assert rank in [4, 5], \
                'Input tensor must have ndim 4 or 5 ({})'.format(rank)

        scaled = x if alpha == 1 else alpha * x
        peak = K.max(scaled, axis=reduce_axes, keepdims=True)
        expo = K.exp(scaled - peak)
        total = K.sum(expo, axis=reduce_axes, keepdims=True)

        return expo / K.clip(total, K.epsilon(), None)

    return _channel_softmax_2d


def build_context_aggregation(num_joints, num_context, alpha,
        num_frames=1, name=None):
    """Build the non-trainable model that blends direct and contextual poses.

    The model takes `[ys, yc, pc]` and returns
    `alpha * ys + (1 - alpha) * yc_div`, where `yc_div` is, for every joint,
    the sum of its `num_context` contextual coordinates weighted by `pc`,
    divided by the sum of those `pc` values.

    Args:
        num_joints: Number of joints.
        num_context: Number of contextual predictions per joint.
        alpha: Weight given to the direct prediction `ys`.
        num_frames: When greater than 1, every input carries an extra frame
            axis and the per-joint summation is applied frame by frame.
        name: Name of the returned model.

    Returns:
        A `Model` with `trainable = False`.
    """

    inp = Input(shape=(num_joints * num_context, 1))
    d = Dense(num_joints, use_bias=False)

    x = Lambda(lambda x: K.squeeze(x, axis=-1))(inp)
    x = d(x)
    x = Lambda(lambda x: K.expand_dims(x, axis=-1))(x)

    # Fixed weights: column j is 1 on the rows of joint j's context entries
    # and 0 elsewhere, so the dense layer sums each group of num_context
    # consecutive values.
    w = d.get_weights()
    w[0].fill(0)
    for j in range(num_joints):
        w[0][j * num_context : (j + 1) * num_context, j] = 1.
    d.set_weights(w)
    d.trainable = False

    ctx_sum = Model(inputs=inp, outputs=x)
    ctx_sum.trainable = False
    if num_frames > 1:
        ctx_sum = TimeDistributed(ctx_sum,
                input_shape=(num_frames,) + K.int_shape(inp)[1:])

    # Define auxiliary layers.
    mul_alpha = Lambda(lambda x: alpha * x)
    mul_1alpha = Lambda(lambda x: (1 - alpha) * x)

    # This depends on TensorFlow because keras does not implement divide.
    tf_div = Lambda(lambda x: tf.divide(x[0], x[1]))

    if num_frames == 1:
        # Define inputs
        ys = Input(shape=(num_joints, 2))
        yc = Input(shape=(num_joints * num_context, 2))
        pc = Input(shape=(num_joints * num_context, 1))

        # Split contextual predictions in x and y and do computations separately
        xi = Lambda(lambda x: x[:,:, 0:1])(yc)
        yi = Lambda(lambda x: x[:,:, 1:2])(yc)
    else:
        ys = Input(shape=(num_frames, num_joints, 2))
        yc = Input(shape=(num_frames, num_joints * num_context, 2))
        pc = Input(shape=(num_frames, num_joints * num_context, 1))

        # Split contextual predictions in x and y and do computations separately
        xi = Lambda(lambda x: x[:,:,:, 0:1])(yc)
        yi = Lambda(lambda x: x[:,:,:, 1:2])(yc)

    # Weight every contextual coordinate by its probability.
    pxi = multiply([xi, pc])
    pyi = multiply([yi, pc])

    # Per-joint sums, then normalisation by the summed probabilities.
    pc_sum = ctx_sum(pc)
    pxi_sum = ctx_sum(pxi)
    pyi_sum = ctx_sum(pyi)
    pxi_div = tf_div([pxi_sum, pc_sum])
    pyi_div = tf_div([pyi_sum, pc_sum])
    yc_div = concatenate([pxi_div, pyi_div])

    ys_alpha = mul_alpha(ys)
    yc_div_1alpha = mul_1alpha(yc_div)

    y = add([ys_alpha, yc_div_1alpha])

    model = Model(inputs=[ys, yc, pc], outputs=y, name=name)
    model.trainable = False

    return model

def _reset_invalid_joints(y_true, y_pred):
    """Zero out the entries flagged as invalid by `y_true` and count the rest.

    An entry is valid when its `y_true` value is strictly positive. Returns
    the masked `y_true`, the masked `y_pred`, and the number of valid entries
    summed over the last two axes, clipped from below at 1.
    """
    valid = K.cast(K.greater(y_true, 0.), 'float32')
    valid_count = K.clip(K.sum(valid, axis=(-1, -2)), 1, None)
    return valid * y_true, valid * y_pred, valid_count


def elasticnet_loss_on_valid_joints(y_true, y_pred):
    """Sum of the L1 and squared-L2 errors over valid entries.

    Both terms are summed over the last two axes and divided by the count
    returned by `_reset_invalid_joints`.
    """
    y_true, y_pred, num_joints = _reset_invalid_joints(y_true, y_pred)
    err = y_pred - y_true
    abs_term = K.sum(K.abs(err), axis=(-1, -2)) / num_joints
    sq_term = K.sum(K.square(err), axis=(-1, -2)) / num_joints
    return abs_term + sq_term

def action_top(x, name=None):
    """Pool `x` with `global_max_min_pooling` and apply a softmax named `name`."""
    pooled = global_max_min_pooling(x)
    return Activation('softmax', name=name)(pooled)


def build_act_pred_block(x, num_out, name=None, last=False, include_top=True):
    """Build one action prediction block on top of `x`.

    Args:
        x: Input feature tensor.
        num_out: Number of channels of the action heatmaps.
        name: Accepted but not used by this function.
        last: When true, the heatmaps are not fed back into the features.
        include_top: When true, `action_top` is applied to the heatmaps.

    Returns:
        Tuple `(features, prediction)`. For the last block `features` is the
        pooled tensor; otherwise it is the sum of the block's features and
        the upsampled, re-projected heatmaps.
    """

    channels = K.int_shape(x)[-1]

    # Residual unit with a half-width 1x1 bottleneck.
    squeezed = act_conv_bn(x, channels // 2, (1, 1))
    expanded = act_conv_bn(squeezed, channels, (3, 3))
    trunk = add([x, expanded])

    features = act_conv_bn(trunk, channels, (3, 3))
    pooled = max_min_pooling(features, (2, 2))
    action_hm = act_conv(pooled, num_out, (3, 3))
    y = action_top(action_hm) if include_top else action_hm

    if last:
        return pooled, y

    # Re-inject the heatmaps at the resolution of the block input.
    feedback = UpSampling2D((2, 2))(action_hm)
    feedback = act_conv_bn(feedback, channels, (3, 3))

    return add([trunk, features, feedback]), y


def build_pose_model(num_joints, num_actions, num_temp_frames=None, pose_dim=2,
        name=None, include_top=True, network_version='v1'):
    """Build the pose-based action recognition model.

    Args:
        num_joints: Number of joints per pose.
        num_actions: Number of action classes.
        num_temp_frames: Number of frames per clip (may be None).
        pose_dim: Number of coordinates per joint.
        name: Name of the returned model.
        include_top: Forwarded to every prediction block.
        network_version: 'v1' or 'v2'; selects the filter counts of the
            entry convolutions.

    Returns:
        A `Model` with inputs `[y, p]` (poses and per-joint probabilities)
        and four outputs, one per prediction block.

    Raises:
        ValueError: If `network_version` is not 'v1' or 'v2'.
    """

    # (branch a, branch b, branch c, second-stage output, 1x1 bottleneck)
    entry_filters = {
        'v1': (8, 16, 24, 56, 32),
        'v2': (12, 24, 36, 112, 64),
    }
    # Validate before any layer is created so a typo does not leave
    # half-built layers behind.
    if network_version not in entry_filters:
        raise ValueError(
                'Unknown network version "{}"'.format(network_version))
    f_a, f_b, f_c, f_out, f_mid = entry_filters[network_version]

    y = Input(shape=(num_temp_frames, num_joints, pose_dim))
    p = Input(shape=(num_temp_frames, num_joints, 1))

    ## Pose information
    # Repeat the probability along the coordinate axis and use it to weight
    # the pose.
    mask = Lambda(lambda x: K.tile(x, [1, 1, 1, pose_dim]))(p)
    x = Lambda(lambda x: x[0] * x[1])([y, mask])

    a = conv_bn_act(x, f_a, (3, 1))
    b = conv_bn_act(x, f_b, (3, 3))
    c = conv_bn_act(x, f_c, (3, 5))
    x = concatenate([a, b, c])
    a = conv_bn(x, f_out, (3, 3))
    b = conv_bn(x, f_mid, (1, 1))
    b = conv_bn(b, f_out, (3, 3))
    x = concatenate([a, b])
    x = max_min_pooling(x, (2, 2))

    x, y1 = build_act_pred_block(x, num_actions, name='y1',
            include_top=include_top)
    x, y2 = build_act_pred_block(x, num_actions, name='y2',
            include_top=include_top)
    x, y3 = build_act_pred_block(x, num_actions, name='y3',
            include_top=include_top)
    _, y4 = build_act_pred_block(x, num_actions, name='y4',
            include_top=include_top, last=True)
    x = [y1, y2, y3, y4]

    model = Model(inputs=[y, p], outputs=x, name=name)

    return model


def build_visual_model(num_joints, num_actions, num_features,
        num_temp_frames=None, name=None, include_top=True):
    """Build the visual action recognition model.

    The input of shape `(num_temp_frames, num_joints, num_features)` goes
    through a 256-filter 1x1 convolution, a 2x2 max pooling, and four chained
    prediction blocks; the model returns the four block predictions.
    """

    inp = Input(shape=(num_temp_frames, num_joints, num_features))
    feat = conv_bn(inp, 256, (1, 1))
    feat = MaxPooling2D((2, 2))(feat)

    num_stages = 4
    predictions = []
    for stage in range(1, num_stages + 1):
        # Only the final stage is built as the last block.
        feat, pred = build_act_pred_block(feat, num_actions,
                name='y%d' % stage, include_top=include_top,
                last=(stage == num_stages))
        predictions.append(pred)

    return Model(inp, predictions, name=name)


def _get_2d_pose_estimation_from_model(inp, model_pe, num_joints, num_blocks,
        num_context_per_joint, full_trainable=False):
    """Build a per-frame 2D pose estimation graph from the layers of `model_pe`.

    Layers are fetched from `model_pe` by name ('Stem', 'rBlock%d',
    'SepConv%d', 'RegMap%d', 'fReMap%d', 'sSAM', 'cSAM', 'sjProb', 'cjProb')
    and applied to every frame of `inp` through `TimeDistributed` wrappers.

    Args:
        inp: Input tensor; axis 1 is read as the number of frames.
        model_pe: Model that owns the named layers listed above.
        num_joints: Number of joints.
        num_blocks: Index of the last block taken from `model_pe`.
        num_context_per_joint: Number of context heatmaps per joint; when 0
            the contextual branch is skipped.
        full_trainable: Value assigned to the `trainable` attribute of the
            reused stem, first block, 'PoseReg', 'sSAM' and 'cSAM'. The
            probability layers are always frozen.

    Returns:
        Tuple `(y, p, hs, xb1)`: the pose (aggregated with the contextual
        predictions when there are any), the joint probabilities, the
        per-frame channel softmax of the joint heatmaps, and the
        time-distributed output of 'rBlock1'.
    """

    num_frames = K.int_shape(inp)[1]

    stem = model_pe.get_layer('Stem')
    stem.trainable = full_trainable

    i = 1
    recep_block = model_pe.get_layer('rBlock%d' % i)
    recep_block.trainable = full_trainable

    # Stem and first block run directly on the video tensor, frame by frame.
    x1 = TimeDistributed(stem, name='td_%s' % stem.name)(inp)
    xb1 = TimeDistributed(recep_block, name='td_%s' % recep_block.name)(x1)

    # The remaining blocks are rebuilt as a single-frame model ('PoseReg')
    # whose input has the per-frame shape of xb1.
    inp_pe = Input(shape=K.int_shape(xb1)[2:])
    sep_conv = model_pe.get_layer('SepConv%d' % i)
    reg_map = model_pe.get_layer('RegMap%d' % i)
    fre_map = model_pe.get_layer('fReMap%d' % i)
    x2 = sep_conv(inp_pe)
    x3 = fre_map(reg_map(x2))
    x = add([inp_pe, x2, x3])

    # Intermediate blocks 2 .. num_blocks - 1, each with its own re-injection.
    for i in range(2, num_blocks):
        recep_block = model_pe.get_layer('rBlock%d' % i)
        sep_conv = model_pe.get_layer('SepConv%d' % i)
        reg_map = model_pe.get_layer('RegMap%d' % i)
        fre_map = model_pe.get_layer('fReMap%d' % i)
        x1 = recep_block(x)
        x2 = sep_conv(x1)
        x3 = fre_map(reg_map(x2))
        x = add([x1, x2, x3])

    # Last block: no 'fReMap' and no residual sum, the regression map is the
    # output.
    recep_block = model_pe.get_layer('rBlock%d' % num_blocks)
    sep_conv = model_pe.get_layer('SepConv%d' % num_blocks)
    reg_map = model_pe.get_layer('RegMap%d' % num_blocks)
    x = recep_block(x)
    x = sep_conv(x)
    x = reg_map(x)

    model1 = Model(inp_pe, x, name='PoseReg')
    model1.trainable = full_trainable

    # One heatmap per joint plus num_context_per_joint context maps per joint.
    num_heatmaps = (num_context_per_joint + 1) * num_joints
    num_rows = K.int_shape(model1.output)[1]
    num_cols = K.int_shape(model1.output)[2]

    sams_input_shape = (num_frames, num_rows, num_cols, num_joints)
    samc_input_shape = \
            (num_frames, num_rows, num_cols, num_heatmaps - num_joints)

    # Build the time distributed models
    model_pe.get_layer('sSAM').trainable = full_trainable
    sam_s_model = TimeDistributed(model_pe.get_layer('sSAM'),
            input_shape=sams_input_shape, name='sSAM')

    if num_context_per_joint > 0:
        model_pe.get_layer('cSAM').trainable = full_trainable
        sam_c_model = TimeDistributed(model_pe.get_layer('cSAM'),
                input_shape=samc_input_shape, name='cSAM')

    model_pe.get_layer('sjProb').trainable = False
    jprob_s_model = TimeDistributed(model_pe.get_layer('sjProb'),
            input_shape=sams_input_shape, name='sjProb')

    if num_context_per_joint > 0:
        model_pe.get_layer('cjProb').trainable = False
        jprob_c_model = TimeDistributed(model_pe.get_layer('cjProb'),
                input_shape=samc_input_shape, name='cjProb')

    # NOTE(review): the aggregation model is built even when
    # num_context_per_joint == 0, although it is only used in the branch
    # below that requires num_context_per_joint > 0.
    agg_model = build_context_aggregation(num_joints,
            num_context_per_joint, 0.8, num_frames=num_frames, name='Agg')

    h = TimeDistributed(model1, name='td_Model1')(xb1)
    if num_context_per_joint > 0:
        # The first num_joints channels are the joint heatmaps, the remaining
        # ones the context heatmaps.
        hs = Lambda(lambda x: x[:,:,:,:, :num_joints])(h)
        hc = Lambda(lambda x: x[:,:,:,:, num_joints:])(h)
    else:
        hs = h

    ys = sam_s_model(hs)
    if num_context_per_joint > 0:
        yc = sam_c_model(hc)
        pc = jprob_c_model(hc)
        y = agg_model([ys, yc, pc])
    else:
        y = ys

    # The joint probability is computed on the heatmaps scaled by 4.
    p = jprob_s_model(Lambda(lambda x: 4*x)(hs))

    hs = TimeDistributed(Activation(channel_softmax_2d()),
            name='td_ChannelSoftmax')(hs)

    return y, p, hs, xb1


def _get_3d_pose_estimation_from_model(inp, model_pe, num_joints, num_blocks,
        depth_maps, full_trainable=False):
    """Build a per-frame 3D pose estimation graph from the layers of `model_pe`.

    Layers are fetched from `model_pe` by name ('Stem', 'rBlock%d',
    'SepConv%d', 'RegMap%d', 'fReMap%d', 'sSAM', 'zSAM') and applied to every
    frame of `inp` through `TimeDistributed` wrappers.

    Args:
        inp: Input tensor; axis 1 is read as the number of frames.
        model_pe: Model that owns the named layers listed above.
        num_joints: Number of joints per depth map.
        num_blocks: Index of the last block taken from `model_pe`.
        depth_maps: Number of depth maps; the regression output must have
            `depth_maps * num_joints` channels (enforced by an assert).
        full_trainable: Value assigned to the `trainable` attribute of the
            reused stem, first block, 'PoseReg', 'sSAM' and 'zSAM'.

    Returns:
        Tuple `(pose, visible, hxy, xb1)`: the concatenated 'sSAM' and 'zSAM'
        outputs, the sigmoid visibility score, the per-frame channel softmax
        of the depth-averaged heatmaps, and the time-distributed output of
        'rBlock1'.
    """

    num_frames = K.int_shape(inp)[1]

    # Side effect: prints the summary of model_pe on every call.
    model_pe.summary()

    stem = model_pe.get_layer('Stem')
    stem.trainable = full_trainable

    i = 1
    recep_block = model_pe.get_layer('rBlock%d' % i)
    recep_block.trainable = full_trainable

    # Stem and first block run directly on the video tensor, frame by frame.
    x1 = TimeDistributed(stem, name='td_%s' % stem.name)(inp)
    xb1 = TimeDistributed(recep_block, name='td_%s' % recep_block.name)(x1)

    # The remaining blocks are rebuilt as a single-frame model ('PoseReg')
    # whose input has the per-frame shape of xb1.
    inp_pe = Input(shape=K.int_shape(xb1)[2:])
    sep_conv = model_pe.get_layer('SepConv%d' % i)
    reg_map = model_pe.get_layer('RegMap%d' % i)
    fre_map = model_pe.get_layer('fReMap%d' % i)
    x2 = sep_conv(inp_pe)
    x3 = fre_map(reg_map(x2))
    x = add([inp_pe, x2, x3])

    # Intermediate blocks 2 .. num_blocks - 1, each with its own re-injection.
    for i in range(2, num_blocks):
        recep_block = model_pe.get_layer('rBlock%d' % i)
        sep_conv = model_pe.get_layer('SepConv%d' % i)
        reg_map = model_pe.get_layer('RegMap%d' % i)
        fre_map = model_pe.get_layer('fReMap%d' % i)
        x1 = recep_block(x)
        x2 = sep_conv(x1)
        x3 = fre_map(reg_map(x2))
        x = add([x1, x2, x3])

    # Last block: no 'fReMap' and no residual sum, the regression map is the
    # output.
    recep_block = model_pe.get_layer('rBlock%d' % num_blocks)
    sep_conv = model_pe.get_layer('SepConv%d' % num_blocks)
    reg_map = model_pe.get_layer('RegMap%d' % num_blocks)
    x = recep_block(x)
    x = sep_conv(x)
    x = reg_map(x)

    model1 = Model(inp_pe, x, name='PoseReg')
    model1.trainable = full_trainable

    num_rows = K.int_shape(model1.output)[1]
    num_cols = K.int_shape(model1.output)[2]

    sams_input_shape = (num_frames, num_rows, num_cols, num_joints)
    samz_input_shape = (num_frames, depth_maps, num_joints)

    # Build the time distributed models
    model_pe.get_layer('sSAM').trainable = full_trainable
    sam_s_model = TimeDistributed(model_pe.get_layer('sSAM'),
            input_shape=sams_input_shape, name='sSAM')

    model_pe.get_layer('zSAM').trainable = full_trainable
    sam_z_model = TimeDistributed(model_pe.get_layer('zSAM'),
            input_shape=samz_input_shape, name='zSAM')

    h = TimeDistributed(model1, name='td_Model1')(xb1)
    assert K.int_shape(h)[-1] == depth_maps * num_joints

    def _reshape_heatmaps(x):
        # Split the channel axis into (depth_maps, num_joints); axes 1-3 of
        # the input are kept as they are.
        x = K.expand_dims(x, axis=-1)
        x = K.reshape(x, (-1, K.int_shape(x)[1], K.int_shape(x)[2],
            K.int_shape(x)[3], depth_maps, num_joints))

        return x

    h = Lambda(_reshape_heatmaps)(h)
    # Average over the depth axis (4) for the xy heatmaps, and over the two
    # spatial axes (2, 3) for the depth profile.
    hxy = Lambda(lambda x: K.mean(x, axis=4))(h)
    hz = Lambda(lambda x: K.mean(x, axis=(2, 3)))(h)

    pxy = sam_s_model(hxy)
    pz = sam_z_model(hz)
    pose = concatenate([pxy, pz])

    # Visibility: sigmoid of twice the sum of the per-joint maxima of both
    # responses, with a trailing singleton axis.
    vxy = TimeDistributed(GlobalMaxPooling2D(), name='td_GlobalMaxPooling2D',
            input_shape=K.int_shape(hxy)[1:])(hxy)
    vz = TimeDistributed(GlobalMaxPooling1D(), name='td_GlobalMaxPooling1D',
            input_shape=K.int_shape(hz)[1:])(hz)
    v = add([vxy, vz])
    v = Lambda(lambda x: 2*K.expand_dims(x, axis=-1))(v)
    visible = Activation('sigmoid')(v)

    hxy = TimeDistributed(Activation(channel_softmax_2d()),
            name='td_ChannelSoftmax')(hxy)

    return pose, visible, hxy, xb1


def build_guided_visual_model(model_pe, num_actions, input_shape, num_frames,
        num_joints, num_blocks, num_context_per_joint=2):
    """Build an action recognition model driven by pose-guided visual features.

    Args:
        model_pe: Pose estimation model whose layers are reused.
        num_actions: Number of action classes.
        input_shape: Shape of a single frame (tuple), without the frame axis.
        num_frames: Number of frames per clip.
        num_joints: Number of body joints.
        num_blocks: Index of the last block taken from `model_pe`.
        num_context_per_joint: Number of context heatmaps per joint.

    Returns:
        A `Model` mapping a clip of shape `(num_frames,) + input_shape` to the
        outputs of the 'GuidedVisAR' visual model.
    """

    inp = Input(shape=(num_frames,) + input_shape)
    # num_context_per_joint is passed exactly once: supplying it both
    # positionally and by keyword raises
    # "TypeError: got multiple values for argument".
    _, _, hs, xb1 = _get_2d_pose_estimation_from_model(inp, model_pe,
            num_joints, num_blocks, num_context_per_joint)

    # Combine the per-joint probability maps with the visual features.
    f = kronecker_prod(hs, xb1)
    num_features = K.int_shape(f)[-1]
    model_ar = build_visual_model(num_joints, num_actions, num_features,
            num_temp_frames=num_frames, name='GuidedVisAR')

    x = model_ar(f)
    model = Model(inp, x)

    return model


def build_merge_model(model_pe,
        num_actions,
        input_shape,
        num_frames,
        num_joints,
        num_blocks,
        pose_dim=2,
        depth_maps=8,
        num_context_per_joint=2,
        pose_net_version='v1',
        output_poses=False,
        weighted_merge=True,
        ar_pose_weights=None,
        ar_visual_weights=None,
        full_trainable=False):
    """Build the model that merges pose-based and visual action recognition.

    Args:
        model_pe: Pose estimation model whose layers are reused.
        num_actions: Number of action classes.
        input_shape: Shape of a single frame (tuple), without the frame axis.
        num_frames: Number of frames per clip.
        num_joints: Number of body joints.
        num_blocks: Index of the last block taken from `model_pe`.
        pose_dim: 2 or 3; selects the 2D or the 3D pose estimation wiring.
        depth_maps: Number of depth maps (only used when `pose_dim == 3`).
        num_context_per_joint: Context heatmaps per joint (only used when
            `pose_dim == 2`).
        pose_net_version: Forwarded to `build_pose_model` as `network_version`.
        output_poses: When true, the pose and the joint probability tensors
            are prepended to the model outputs.
        weighted_merge: When true, both branches go through a 1x1 separable
            convolution initialised as identity before being summed.
        ar_pose_weights: Optional weights file loaded into the 'PoseAR' model.
        ar_visual_weights: Optional weights file loaded into 'GuidedVisAR'.
        full_trainable: Forwarded to the pose estimation wiring.

    Returns:
        A `Model` whose outputs are, in order: the optional pose outputs, the
        pose-branch predictions 'p1'.., the visual-branch predictions 'v1'..,
        and the merged prediction 'm'.

    Raises:
        ValueError: If `pose_dim` is neither 2 nor 3.
    """

    # Fail early with a clear message: an unsupported pose_dim used to leave
    # y, p, hs and xb1 unbound and surfaced later as a NameError.
    if pose_dim not in (2, 3):
        raise ValueError('pose_dim must be 2 or 3, got {}'.format(pose_dim))

    inp = Input(shape=(num_frames,) + input_shape)
    outputs = []

    if pose_dim == 2:
        y, p, hs, xb1 = _get_2d_pose_estimation_from_model(inp, model_pe,
                num_joints, num_blocks, num_context_per_joint,
                full_trainable=full_trainable)
    else:
        y, p, hs, xb1 = _get_3d_pose_estimation_from_model(inp, model_pe,
                num_joints, num_blocks, depth_maps,
                full_trainable=full_trainable)

    if output_poses:
        outputs.append(y)
        outputs.append(p)

    model_pose = build_pose_model(num_joints, num_actions, num_frames,
            pose_dim=pose_dim, include_top=False, name='PoseAR',
            network_version=pose_net_version)
    # model_pose.trainable = False
    if ar_pose_weights is not None:
        model_pose.load_weights(ar_pose_weights)
    out_pose = model_pose([y, p])

    f = kronecker_prod(hs, xb1)
    num_features = K.int_shape(f)[-1]
    model_vis = build_visual_model(num_joints, num_actions, num_features,
            num_temp_frames=num_frames, include_top=False, name='GuidedVisAR')
    # model_vis.trainable = False
    if ar_visual_weights is not None:
        model_vis.load_weights(ar_visual_weights)
    out_vis = model_vis(f)

    # Both sub-models were built without their top, so it is added here with
    # explicit output names.
    for idx, action_hm in enumerate(out_pose, start=1):
        outputs.append(action_top(action_hm, name='p%d' % idx))

    for idx, action_hm in enumerate(out_vis, start=1):
        outputs.append(action_top(action_hm, name='v%d' % idx))

    p = out_pose[-1]
    v = out_vis[-1]

    def _heatmap_weighting(inp):
        """Apply a 1x1 separable convolution initialised as identity."""
        num_filters = K.int_shape(inp)[-1]
        conv = SeparableConv2D(num_filters, (1, 1),
                use_bias=False)
        # The layer is called first so that its weights exist before they are
        # overwritten below.
        x = conv(inp)
        w = conv.get_weights()
        # w[0] is filled with ones and w[1] becomes an identity matrix over
        # its last two axes (presumably the depthwise and pointwise kernels).
        w[0].fill(1.)
        w[1].fill(0)
        for i in range(num_filters):
            w[1][0, 0, i, i] = 1.
        conv.set_weights(w)

        return x

    if weighted_merge:
        p = _heatmap_weighting(p)
        v = _heatmap_weighting(v)

    m = add([p, v])
    outputs.append(action_top(m, name='m'))

    model = Model(inp, outputs)

    return model


def compile(model, lr=0.001, momentum=0.95, loss_weights=None,
        pose_predicted=False):
    """Compile `model` in place with a Nesterov-momentum SGD optimizer.

    NOTE: this function shadows the built-in `compile` inside this module.

    Args:
        model: Keras model to compile.
        lr: Learning rate of the SGD optimizer.
        momentum: Momentum of the SGD optimizer.
        loss_weights: Forwarded to `model.compile`.
        pose_predicted: When true, the first two outputs are treated as pose
            outputs (elastic-net loss and binary cross-entropy) and every
            remaining output uses categorical cross-entropy. Otherwise all
            outputs use categorical cross-entropy and accuracy is tracked.
    """

    # Keras optimizers take `learning_rate`; `lr` is a deprecated alias that
    # newer releases no longer honour.
    optimizer = SGD(learning_rate=lr, momentum=momentum, nesterov=True)

    if pose_predicted:
        num_action_outputs = len(model.output) - 2
        losses = [elasticnet_loss_on_valid_joints, 'binary_crossentropy']
        losses.extend(['categorical_crossentropy'] * num_action_outputs)

        model.compile(loss=losses,
                optimizer=optimizer,
                loss_weights=loss_weights)
    else:
        model.compile(loss='categorical_crossentropy',
                optimizer=optimizer,
                metrics=['acc'], loss_weights=loss_weights)



In [None]:
def conv_block(inp, kernel_size, filters, last_act=True):
    """Bottleneck residual block whose shortcut is a 1x1 conv + batch norm.

    `filters` holds the three filter counts of the main branch (1x1,
    `kernel_size`, 1x1). A ReLU follows the sum unless `last_act` is false.
    """
    f_reduce, f_mid, f_out = filters

    branch = conv_bn_act(inp, f_reduce, (1, 1))
    branch = conv_bn_act(branch, f_mid, kernel_size)
    branch = conv_bn(branch, f_out, (1, 1))

    # Project the input so its channel count matches the main branch.
    projected = conv_bn(inp, f_out, (1, 1))
    merged = add([branch, projected])
    if not last_act:
        return merged

    return Activation('relu')(merged)

def identity_block(inp, kernel_size, filters, last_act=True):
    """Bottleneck residual block with an identity shortcut.

    `filters` holds the three filter counts of the main branch (1x1,
    `kernel_size`, 1x1). A ReLU follows the sum unless `last_act` is false.
    """
    f_reduce, f_mid, f_out = filters

    branch = conv_bn_act(inp, f_reduce, (1, 1))
    branch = conv_bn_act(branch, f_mid, kernel_size)
    branch = conv_bn(branch, f_out, (1, 1))

    merged = add([branch, inp])
    if not last_act:
        return merged

    return Activation('relu')(merged)


def stem_inception_v4(x, image_div=8):
    """Entry-flow network (stem) *based* on Inception_v4.

    Args:
        x: Input tensor.
        image_div: One of 4, 8, 16 or 32; selects how many stride-2 stages
            are applied (presumably the overall input/output resolution
            ratio -- confirm against the padding used by `conv_bn_act`).

    Returns:
        The output tensor of the final 'residual0' unit.
    """

    assert image_div in [4, 8, 16, 32], \
            'Invalid image_div ({}).'.format(image_div)

    x = conv_bn_act(x, 32, (3, 3), strides=(2, 2))
    x = conv_bn_act(x, 32, (3, 3))
    # Value comparison: `is` on an int literal tests object identity, which
    # only works by accident of small-int caching.
    if image_div == 32:
        x = MaxPooling2D((2, 2))(x)
    x = conv_bn_act(x, 64, (3, 3))

    a = conv_bn_act(x, 96, (3, 3), strides=(2, 2))
    b = MaxPooling2D((3, 3), strides=(2, 2), padding='same')(x)
    x = concatenate([a, b])

    a = conv_bn_act(x, 64, (1, 1))
    a = conv(a, 96, (3, 3))
    b = conv_bn_act(x, 64, (1, 1))
    b = conv_bn_act(b, 64, (5, 1))
    b = conv_bn_act(b, 64, (1, 5))
    b = conv(b, 96, (3, 3))
    x = concatenate([a, b])
    x = BatchNormalization(axis=-1, scale=False)(x)

    if image_div != 4:
        a = act_conv_bn(x, 192, (3, 3), strides=(2, 2))
        b = MaxPooling2D((3, 3), strides=(2, 2), padding='same')(x)
        x = concatenate([a, b])

    if image_div in [16, 32]:
        a = act_conv_bn(x, 192, (3, 3), strides=(2, 2))
        b = MaxPooling2D((3, 3), strides=(2, 2), padding='same')(x)
        x = concatenate([a, b])

    if image_div == 4:
        x = residual(x, int_size=112, out_size=2*192+64, convtype='normal',
                name='residual0')
    else:
        x = residual(x, int_size=144, out_size=3*192, convtype='normal',
                name='residual0')

    return x


def stem_residual_eccv(x, image_div=8):
    """Entry-flow network (stem) *based* on ResNet ('residual' option).

    Args:
        x: Input tensor.
        image_div: One of 4, 8, 16 or 32; selects how many pooling stages
            follow the first two residual units.

    Returns:
        The output tensor of the stem.
    """

    assert image_div in [4, 8, 16, 32], \
            'Invalid image_div ({}).'.format(image_div)

    x = conv_bn_act(x, 64, (7, 7), strides=(2, 2), padding='same')
    a = conv_bn_act(x, 128, (3, 3), padding='same')
    b = conv_bn_act(x, 128, (1, 1), padding='same')
    x = add([a, b])
    x = MaxPooling2D((3, 3), strides=(2, 2), padding='same')(x)
    x = residual(x, int_size=128, out_size=256, convtype='normal', name='rn0')
    x = residual(x, int_size=128, out_size=256, convtype='normal', name='rn1')

    # Value comparisons: `is` on an int literal tests object identity, which
    # only works by accident of small-int caching.
    if image_div == 4:
        x = residual(x, out_size=256, convtype='normal', name='rn3')

    else:
        x = MaxPooling2D((3, 3), strides=(2, 2), padding='same')(x)
        x = residual(x, int_size=192, out_size=384, convtype='normal',
                name='rn3')
        x = residual(x, int_size=192, out_size=384, convtype='normal',
                name='rn4')

        if image_div in [16, 32]:
            x = MaxPooling2D((3, 3), strides=(2, 2), padding='same')(x)
            x = residual(x, int_size=256, out_size=512, convtype='normal',
                    name='rn5')
            x = residual(x, int_size=256, out_size=512, convtype='normal',
                    name='rn6')

            if image_div == 32:
                x = MaxPooling2D((2, 2), strides=(2, 2), padding='same')(x)

    return x



def reception_block(x, num_levels, kernel_size, int_size=None,
        convtype='depthwise', name=None):
    """Recursive hourglass of residual units with `num_levels` levels.

    Each level keeps a full-resolution residual branch and adds to it the
    upsampled result of a pooled branch. At the outermost level the pooled
    branch first halves the channel count with a 1x1 convolution and finally
    restores the input channel count. `name` is accepted but not used.
    """
    res_kwargs = dict(kernel_size=kernel_size, int_size=int_size,
            convtype=convtype)

    def hourglass(feat, level):
        outermost = level == num_levels

        skip = residual(feat, **res_kwargs)

        down = MaxPooling2D((2, 2))(feat)
        if outermost:
            down = act_conv_bn(down, int(K.int_shape(feat)[-1] / 2), (1, 1))
        down = residual(down, **res_kwargs)

        # Recurse until level 2, where a plain residual unit closes the stack.
        if level > 2:
            down = hourglass(down, level - 1)
        else:
            down = residual(down, **res_kwargs)

        if outermost:
            down = residual(down, out_size=K.int_shape(feat)[-1],
                    **res_kwargs)
        else:
            down = residual(down, **res_kwargs)

        return add([skip, UpSampling2D((2, 2))(down)])

    return hourglass(x, num_levels)


def build_keypoints_regressor(input_shape, dim, num_maps, sam_model, prob_model,
        name=None, verbose=0):
    """Build a model regressing keypoints from one or more heatmap inputs.

    Args:
        input_shape: Shape of every heatmap input (and of every depth input
            when `dim == 3`).
        dim: Number of coordinates per keypoint; when 3, one extra depth
            input per map supplies the third coordinate.
        num_maps: Number of heatmap inputs, at least 1.
        sam_model: Model applied to the softmax-normalised heatmaps to get
            the coordinates.
        prob_model: Model applied to the same heatmaps to get the
            per-keypoint confidence.
        name: Name of the returned model.
        verbose: When truthy, the model summary is printed.

    Returns:
        A `Model` taking the heatmap inputs followed by the depth inputs and
        returning `[p, v]`. With several maps, `p` is the confidence-weighted
        average of the per-map predictions and `v` the element-wise maximum
        confidence.
    """

    assert num_maps >= 1, \
            'The number of maps should be at least 1 (%d given)' % num_maps

    inputs = []
    inputs3d = []
    p_concat = []
    v_concat = []

    # Auxiliary functions
    v_tile = Lambda(lambda x: K.tile(x, (1, 1, dim)))
    # This depends on TensorFlow because keras does not implement divide.
    tf_div = Lambda(lambda x: tf.divide(x[0], x[1]))

    for _ in range(num_maps):
        h = Input(shape=input_shape)
        inputs.append(h)
        h_s = act_channel_softmax(h)
        p = sam_model(h_s)
        v = prob_model(h_s)

        if dim == 3:
            # Depth: sigmoid of the depth input, weighted by the heatmap and
            # summed over axes 1 and 2.
            d = Input(shape=input_shape)
            inputs3d.append(d)
            d_s = Activation('sigmoid')(d)
            dm = multiply([d_s, h_s])
            z = Lambda(lambda x: K.sum(x, axis=(1, 2)))(dm)
            z = Lambda(lambda x: K.expand_dims(x, axis=-1))(z)
            p = concatenate([p, z])

        if num_maps > 1:
            # Weight the prediction by its confidence; the tiled confidence
            # is computed once.
            p = multiply([p, v_tile(v)])

        p_concat.append(p)
        v_concat.append(v)

    if num_maps > 1:
        p = add(p_concat)
        v_sum = add(v_concat)
        p = tf_div([p, v_tile(v_sum)])
        v = maximum(v_concat)
    else:
        p = p_concat[0]
        v = v_concat[0]

    model = Model(inputs+inputs3d, [p, v], name=name)
    if verbose:
        model.summary()

    return model





def build_softargmax_1d(input_shape, name=None):
    """Build a non-trainable model: depth softmax then 1D linear interpolation.

    When `name` is given, the softmax step is named `name + '_softmax'`.
    """
    name_sm = None if name is None else name + '_softmax'

    inp = Input(shape=input_shape)
    out = lin_interpolation_1d(act_depth_softmax(inp, name=name_sm))

    model = Model(inputs=inp, outputs=out, name=name)
    model.trainable = False

    return model


def build_softargmax_2d(input_shape, rho=0., name=None):
    """Build a non-trainable model: channel softmax then 2D linear interpolation.

    The interpolations along axis 0 and axis 1 are concatenated. A KL
    divergence regularizer is applied to the softmax output when `rho > 0`.
    When `name` is given, the softmax step is named `name + '_softmax'`.
    """
    name_sm = None if name is None else name + '_softmax'

    inp = Input(shape=input_shape)
    prob = act_channel_softmax(inp, name=name_sm)
    if rho > 0:
        prob = kl_divergence_regularizer(prob, rho=rho)

    coords = concatenate([
        lin_interpolation_2d(prob, axis=0),
        lin_interpolation_2d(prob, axis=1),
    ])

    model = Model(inputs=inp, outputs=coords, name=name)
    model.trainable = False

    return model


def build_joints_probability(input_shape, name=None, verbose=0):
    """Build a model giving one score per channel of its input.

    The input goes through a 2x2 average pooling with stride 1, is multiplied
    by 4, reduced with a global max pooling, and gets a trailing singleton
    axis. The model summary is printed when `verbose` is truthy.
    """
    inp = Input(shape=input_shape)

    pooled = AveragePooling2D((2, 2), strides=(1, 1))(inp)
    scaled = Lambda(lambda t: 4*t)(pooled)
    peak = GlobalMaxPooling2D()(scaled)
    prob = Lambda(lambda t: K.expand_dims(t, axis=-1))(peak)

    model = Model(inputs=inp, outputs=prob, name=name)
    if verbose:
        model.summary()

    return model

In [None]:


def concat_tensorlist(t):
    """Concatenate a list of tensors; a single-element list yields its element."""
    assert isinstance(t, list), 't should be a list, got ({})'.format(t)

    return t[0] if len(t) <= 1 else concatenate(t)


def add_tensorlist(t):
    """Sum a list of tensors; a single-element list yields its element."""
    assert isinstance(t, list), 't should be a list, got ({})'.format(t)

    return t[0] if len(t) <= 1 else add(t)


def residual_unit(x, kernel_size, strides=(1, 1), out_size=None,
        convtype='depthwise', shortcut_act=True,
        features_div=2, name=None):
    """Residual unit with a separable or a two-convolution main branch.

    The main branch applies batch normalization and ReLU before its
    convolution(s). When the number of channels changes or `strides` is not
    `(1, 1)`, the shortcut is taken after the batch normalization and is
    projected by a 1x1 convolution; otherwise the shortcut is the raw input.

    # Arguments
        x: Input tensor.
        kernel_size: Kernel size of the main convolution.
        strides: Strides of the main convolution and of the shortcut
            projection.
        out_size: Number of output channels; defaults to the input's.
        convtype: 'depthwise' (one `sepconv2d`) or 'normal' (a 1x1 `conv2d`
            with `int(out_size / features_div)` filters, batch
            normalization, ReLU and a `conv2d` with `out_size` filters).
        shortcut_act: Whether a ReLU precedes the shortcut projection.
        features_div: Channel divisor, used only when `convtype='normal'`.
        name: Optional prefix combined with layer suffixes through `appstr`.

    # Return
        The sum of the shortcut and the main branch.
    """
    assert convtype in ['depthwise', 'normal'], \
            'Invalid convtype ({}).'.format(convtype)

    in_size = K.int_shape(x)[-1]
    if out_size is None:
        out_size = in_size

    needs_projection = (in_size != out_size) or (strides != (1, 1))

    if needs_projection:
        # Normalize first so that both paths start from the same tensor.
        x = BatchNormalization(name=appstr(name, '_bn1'))(x)
        shortcut = x
        if shortcut_act:
            shortcut = relu(shortcut, name=appstr(name, '_shortcut_act'))
        shortcut = conv2d(shortcut, out_size, (1, 1), strides=strides,
                name=appstr(name, '_shortcut_conv'))
    else:
        # Identity shortcut: keep the raw input and normalize only the
        # main branch.
        shortcut = x
        x = BatchNormalization(name=appstr(name, '_bn1'))(x)

    x = relu(x, name=appstr(name, '_act1'))

    if convtype == 'depthwise':
        x = sepconv2d(x, out_size, kernel_size, strides=strides,
                name=appstr(name, '_conv1'))
    else:
        x = conv2d(x, int(out_size / features_div), (1, 1),
                name=appstr(name, '_conv1'))
        x = BatchNormalization(name=appstr(name, '_bn2'))(x)
        x = relu(x, name=appstr(name, '_act2'))
        x = conv2d(x, out_size, kernel_size, strides=strides,
                name=appstr(name, '_conv2'))

    return add([shortcut, x])


def downscaling_unit(x, cfg, out_size=None, name=None):
    """Apply the configured downsampling followed by one residual unit.

    With `cfg.downsampling_type == 'conv'` the residual unit uses strides
    `(2, 2)`; with `'maxpooling'` the input first goes through
    `maxpooling2d(x, (2, 2))` and the residual unit keeps strides `(1, 1)`.
    Any other value applies the residual unit with no downsampling.

    # Arguments
        x: Input tensor.
        cfg: Object exposing `kernel_size` and `downsampling_type`.
        out_size: Number of output channels; defaults to the input's.
        name: Optional prefix; the residual unit is named with `'_r0'`.

    # Return
        The output tensor of the residual unit.
    """
    kernel_size, mode = cfg.kernel_size, cfg.downsampling_type

    if out_size is None:
        out_size = K.int_shape(x)[-1]

    unit_strides = (2, 2) if mode == 'conv' else (1, 1)
    if mode == 'maxpooling':
        x = maxpooling2d(x, (2, 2))

    return residual_unit(x, kernel_size, out_size=out_size,
            strides=unit_strides, name=appstr(name, '_r0'))


def upscaling_unit(x, cfg, out_size=None, name=None):
    """Apply the upscaling step that mirrors `cfg.downsampling_type`.

    With `cfg.downsampling_type == 'maxpooling'` the input goes through
    `upsampling2d(x, (2, 2))` and one residual unit. For any other value it
    goes through batch normalization, ReLU and a 2x2 transposed convolution
    with strides `(2, 2)`.

    # Arguments
        x: Input tensor.
        cfg: Object exposing `kernel_size` and `downsampling_type`.
        out_size: Number of output channels; defaults to the input's.
        name: Optional prefix combined with layer suffixes through `appstr`.

    # Return
        The upscaled tensor.
    """
    kernel_size, mode = cfg.kernel_size, cfg.downsampling_type

    if out_size is None:
        out_size = K.int_shape(x)[-1]

    if mode != 'maxpooling':
        x = BatchNormalization(name=appstr(name, '_bn1'))(x)
        x = relu(x, name=appstr(name, '_act1'))
        return conv2dtranspose(x, out_size, (2, 2), strides=(2, 2),
                name=appstr(name, '_convtrans1'))

    x = upsampling2d(x, (2, 2))
    return residual_unit(x, kernel_size, out_size=out_size,
            name=appstr(name, '_r0'))


def set_trainable_layers(model, keyword, pos_trainable, neg_trainable=None):
    """Set the `trainable` flag of layers according to a name keyword.

    Layers whose name contains `keyword` receive `pos_trainable`; all other
    layers receive `neg_trainable`. A value that is neither True nor False
    (e.g. None) leaves the layer's current flag as it is. Layers whose name
    contains '_xy_x', '_xy_y', '_xy2_x' or '_xy2_y' are never modified and
    only reported through `warning`.

    # Arguments
        model: Object exposing a `layers` sequence.
        keyword: Substring looked up in each layer name.
        pos_trainable: Flag for matching layers.
        neg_trainable: Flag for non-matching layers.
    """
    untouched_tags = ('_xy_x', '_xy_y', '_xy2_x', '_xy2_y')

    for layer in model.layers:
        layer_name = layer.name
        if any(tag in layer_name for tag in untouched_tags):
            warning('Unchanged layer {}'.format(layer_name))
            continue

        requested = pos_trainable if keyword in layer_name else neg_trainable
        current = layer.trainable
        # The flag is always re-assigned, even when it keeps its value.
        layer.trainable = requested if requested in [True, False] else current


def copy_replica_layers(model):
    """Copy the first weight array of each layer into its replica.

    For every layer whose name contains '_replica', the source layer is the
    one named by the text before '_replica'. Only the weight array at index
    0 is overwritten; the replica's remaining weights are written back
    unchanged.

    # Arguments
        model: Object exposing `layers` and `get_layer(name)`.
    """
    for layer in model.layers:
        replica_name = layer.name
        if '_replica' not in replica_name:
            continue

        source_name = replica_name.split('_replica')[0]
        source_weights = model.get_layer(source_name).get_weights()

        replica = model.get_layer(replica_name)
        replica_weights = replica.get_weights()
        replica_weights[0][:] = source_weights[0][:]
        replica.set_weights(replica_weights)


def compile_model(model, loss, optimizer, loss_weights=None):
    """Check loss arguments against the model outputs and compile it.

    # Arguments
        model: Object exposing `outputs` and `compile(...)`.
        loss: A single loss or a list with one loss per model output.
        optimizer: Optimizer forwarded to `model.compile`.
        loss_weights: None, a list with one weight per model output, or a
            single value that is replicated for every output.
    """
    num_outputs = len(model.outputs)

    if isinstance(loss_weights, list):
        assert len(loss_weights) == num_outputs, \
                'loss_weights incompatible with model'
    elif loss_weights is not None:
        # A single weight is shared by all the outputs.
        loss_weights = [loss_weights] * num_outputs

    if isinstance(loss, list):
        assert num_outputs == len(loss), 'loss not corresponding to the model outputs'

    model.compile(loss=loss, optimizer=optimizer, loss_weights=loss_weights)


# Aliases: shorter names bound to the same function objects defined above.
residual = residual_unit
downscaling = downscaling_unit
upscaling = upscaling_unit

In [None]:
# -*- coding: utf-8 -*-
"""Define the ReceptionNet for human pose estimation for Keras and TensorFlow.

The network is defined as:

-------   ------
|Input|-->|Stem|--> [...],
-------   ------

and every block:

                     -----------------------------------------------
                     |             --------------------------------|
           --------- |  ---------- |  ---------      ---------     |
    [...]->|rBlockN|--->|SepConvN|--->|RegMapN|-(H)->|fReMapN|--->(+)-->[...]
           ---------    ----------    ---------      ---------

For dim = 2 (2D poses):

                  |-->(sSAM)-------------------
         |--(Hs)--|                           |
         |        |-->(sjProb)--> *visible*   |
    H -> |                                    |
         |        |-->(cSAM)----------------(Agg)--> *pose*
         |--(Hc)--|                           |
                  |-->(cjProb)----------------|
"""


def _sepconv_residual(x, out_size, name, kernel_size=(3, 3)):
    """Residual block built around one `separable_act_conv_bn`.

    The shortcut is the input itself when it already has `out_size`
    channels, otherwise a 1x1 `act_conv_bn` projection named
    `name + '_shortcut'`. When the input has more channels than `out_size`,
    the main branch is first reduced by a 1x1 `act_conv_bn` named
    `name + '_reduce'`.

    # Arguments
        x: Input tensor.
        out_size: Number of output channels.
        name: Base name for the layers of the block.
        kernel_size: Kernel size of the separable convolution.

    # Return
        The sum of the shortcut and the main branch.
    """
    shortcut_name = name + '_shortcut'
    reduce_name = name + '_reduce'
    in_size = K.int_shape(x)[-1]

    if in_size != out_size:
        ident = act_conv_bn(x, out_size, (1, 1), name=shortcut_name)
    else:
        ident = x

    if in_size > out_size:
        x = act_conv_bn(x, out_size, (1, 1), name=reduce_name)

    main_branch = separable_act_conv_bn(x, out_size, kernel_size, name=name)

    return add([ident, main_branch])

def _stem(inp, old_model=False):
    """Build the entry network ('Stem') and apply it to `inp`.

    The layers are wrapped in a Keras `Model` named 'Stem'. The statement
    order below defines the order in which layers are created, so it should
    not be rearranged lightly.

    # Arguments
        inp: Input tensor; its shape without the batch axis defines the
            input of the 'Stem' model.
        old_model: Selects an alternative layout: the second 32-filter
            convolution is skipped, the first strided branch uses 32 instead
            of 96 filters, and the final separable residual block uses 512
            channels and is applied outside of the 'Stem' model.

    # Return
        The output tensor of the stem applied to `inp`.
    """

    xi = Input(shape=K.int_shape(inp)[1:]) # 256 x 256 x 3

    # First reduction: strided 3x3 convolution followed by plain ones.
    x = conv_bn_act(xi, 32, (3, 3), strides=(2, 2))
    if not old_model:
        x = conv_bn_act(x, 32, (3, 3))
    x = conv_bn_act(x, 64, (3, 3))

    # Second reduction: strided convolution and max pooling in parallel,
    # concatenated along the last axis.
    if old_model:
        a = conv_bn_act(x, 32, (3, 3), strides=(2, 2))
    else:
        a = conv_bn_act(x, 96, (3, 3), strides=(2, 2))
    b = MaxPooling2D((3, 3), strides=(2, 2), padding='same')(x)
    x = concatenate([a, b])

    # Two parallel branches at the same stride: 1x1 -> 3x3, and
    # 1x1 -> 5x1 -> 1x5 -> 3x3, each ending with 96 filters.
    a = conv_bn_act(x, 64, (1, 1))
    a = conv_bn(a, 96, (3, 3))
    b = conv_bn_act(x, 64, (1, 1))
    b = conv_bn_act(b, 64, (5, 1))
    b = conv_bn_act(b, 64, (1, 5))
    b = conv_bn(b, 96, (3, 3))
    x = concatenate([a, b])

    # Third reduction: strided convolution and max pooling in parallel.
    a = act_conv_bn(x, 192, (3, 3), strides=(2, 2))
    b = MaxPooling2D((2, 2), strides=(2, 2))(x)
    x = concatenate([a, b])

    # Current layout: the separable residual block is part of 'Stem'.
    if not old_model:
        x = _sepconv_residual(x, 3*192, name='sepconv1')

    model = Model(xi, x, name='Stem')
    x = model(inp)

    # Old layout: the separable residual block is created outside of 'Stem'.
    if old_model:
        x = _sepconv_residual(x, 512, name='sepconv1')

    return x


def build_reception_block(inp, name, ksize=(3, 3)):
    """Build a three-resolution residual block and apply it to `inp`.

    The block holds one branch at the input resolution, one after a 2x2 max
    pooling and one after a second 2x2 max pooling. The lower branches use
    `int(size/2)` channels, are upsampled by 2 and added back to the branch
    above them. The layers are wrapped in a Keras `Model` called `name`.

    # Arguments
        inp: Input tensor.
        name: Name of the wrapping model.
        ksize: Kernel size forwarded to every `_sepconv_residual`.

    # Return
        The output tensor of the block applied to `inp`.
    """
    input_shape = K.int_shape(inp)[1:]
    size = input_shape[-1]

    xi = Input(shape=input_shape)

    # Branch at the input resolution.
    top = _sepconv_residual(xi, size, name='sepconv_l1', kernel_size=ksize)

    # Branch after the first pooling.
    mid_in = MaxPooling2D((2, 2))(xi)
    half = int(size/2)
    mid_in = act_conv_bn(mid_in, half, (1, 1))
    mid_in = _sepconv_residual(mid_in, half, name='sepconv_l2_1',
            kernel_size=ksize)
    mid = _sepconv_residual(mid_in, half, name='sepconv_l2_2',
            kernel_size=ksize)

    # Branch after the second pooling: three residual blocks in sequence.
    low = MaxPooling2D((2, 2))(mid_in)
    for step in (1, 2, 3):
        low = _sepconv_residual(low, half, name='sepconv_l3_%d' % step,
                kernel_size=ksize)
    low = UpSampling2D((2, 2))(low)

    # Merge the branches from the lowest resolution upwards.
    mid = add([mid, low])
    mid = _sepconv_residual(mid, size, name='sepconv_l2_3', kernel_size=ksize)
    mid = UpSampling2D((2, 2))(mid)
    merged = add([top, mid])

    block = Model(inputs=xi, outputs=merged, name=name)

    return block(inp)


def build_sconv_block(inp, name=None, ksize=(3, 3)):
    """Apply one `separable_act_conv_bn`, wrapped as a model, to `inp`.

    The number of filters equals the number of input channels.

    # Arguments
        inp: Input tensor.
        name: Optional name of the wrapping model.
        ksize: Kernel size of the separable convolution.

    # Return
        The output tensor of the block applied to `inp`.
    """
    shape = K.int_shape(inp)[1:]

    block_in = Input(shape=shape)
    block_out = separable_act_conv_bn(block_in, shape[-1], ksize)

    return Model(inputs=block_in, outputs=block_out, name=name)(inp)


def build_regmap_block(inp, num_maps, name=None):
    """Apply a 1x1 `act_conv` with `num_maps` filters, wrapped as a model.

    # Arguments
        inp: Input tensor.
        num_maps: Number of output maps (filters).
        name: Optional name of the wrapping model.

    # Return
        The output tensor of the block applied to `inp`.
    """
    block_in = Input(shape=K.int_shape(inp)[1:])
    block_out = act_conv(block_in, num_maps, (1, 1))

    return Model(inputs=block_in, outputs=block_out, name=name)(inp)


def build_fremap_block(inp, num_filters, name=None):
    """Apply a 1x1 `act_conv_bn` with `num_filters` filters, as a model.

    # Arguments
        inp: Input tensor.
        num_filters: Number of output filters.
        name: Optional name of the wrapping model.

    # Return
        The output tensor of the block applied to `inp`.
    """
    block_in = Input(shape=K.int_shape(inp)[1:])
    block_out = act_conv_bn(block_in, num_filters, (1, 1))

    return Model(inputs=block_in, outputs=block_out, name=name)(inp)


def pose_regression_2d_context(h, num_joints, sam_s_model,
        sam_c_model, jprob_c_model, agg_model, jprob_s_model):
    """Regress a 2D pose from specialized and contextual heat maps.

    The first `num_joints` channels of `h` are the specialized maps and the
    remaining channels are the contextual maps.

    # Arguments
        h: Heat map tensor, sliced along its fourth (last) axis.
        num_joints: Number of specialized maps.
        sam_s_model: Model applied to the specialized maps.
        sam_c_model: Model applied to the contextual maps.
        jprob_c_model: Model applied to the contextual maps; its output is
            passed to `agg_model` together with both `sam_*` outputs.
        agg_model: Model called on `[ps, pc, vc]` to produce the pose.
        jprob_s_model: Model applied to the specialized maps to produce the
            visibility output.

    # Return
        Tuple `(pose, visible, hs)`, where `hs` are the specialized maps.
    """
    # Split heatmaps for specialized and contextual information
    specialized = Lambda(lambda t: t[:,:,:,:num_joints])(h)
    contextual = Lambda(lambda t: t[:,:,:,num_joints:])(h)

    # Soft-argmax and joint probability for each heatmap
    pose_s = sam_s_model(specialized)
    pose_c = sam_c_model(contextual)
    prob_c = jprob_c_model(contextual)

    pose = agg_model([pose_s, pose_c, prob_c])
    visible = jprob_s_model(specialized)

    return pose, visible, specialized


def pose_regression_2d(h, sam_s_model, jprob_s_model):
    """Regress a 2D pose from heat maps without contextual information.

    # Arguments
        h: Heat map tensor.
        sam_s_model: Callable applied to `h` to produce the pose.
        jprob_s_model: Callable applied to `h` to produce the visibility.

    # Return
        Tuple `(pose, visible, h)`.
    """
    return sam_s_model(h), jprob_s_model(h), h


def pose_regression_3d(h, num_joints, depth_maps, sam_s_model, sam_z_model):
    """Regress a 3D pose from heat maps holding `depth_maps` maps per joint.

    The last axis of `h` (of size `depth_maps * num_joints`) is reshaped
    into `(depth_maps, num_joints)`. Averaging over the depth axis gives the
    maps fed to `sam_s_model`; averaging over the two spatial axes gives the
    maps fed to `sam_z_model`. Both outputs are concatenated into the pose.
    The visibility is the sigmoid of the sum of the global maxima of both
    averaged maps, with a trailing axis of size 1.

    # Arguments
        h: Heat map tensor with `depth_maps * num_joints` channels.
        num_joints: Number of joints.
        depth_maps: Number of depth maps per joint.
        sam_s_model: Model applied to the depth-averaged maps.
        sam_z_model: Model applied to the spatially-averaged maps.

    # Return
        Tuple `(pose, visible, hxy)`, where `hxy` are the depth-averaged
        maps.
    """
    assert K.int_shape(h)[-1] == depth_maps * num_joints

    def _reshape_heatmaps(x):
        expanded = K.expand_dims(x, axis=-1)
        rows = K.int_shape(expanded)[1]
        cols = K.int_shape(expanded)[2]

        return K.reshape(expanded, (-1, rows, cols, depth_maps, num_joints))

    volume = Lambda(_reshape_heatmaps)(h)
    hxy = Lambda(lambda t: K.mean(t, axis=3))(volume)
    hz = Lambda(lambda t: K.mean(t, axis=(1, 2)))(volume)

    pose = concatenate([sam_s_model(hxy), sam_z_model(hz)])

    peak_xy = GlobalMaxPooling2D()(hxy)
    peak_z = GlobalMaxPooling1D()(hz)
    score = add([peak_xy, peak_z])
    score = Lambda(lambda t: K.expand_dims(t, axis=-1))(score)
    visible = Activation('sigmoid')(score)

    return pose, visible, hxy


def build(input_shape, num_joints, dim,
        num_context_per_joint=None,
        alpha=0.8,
        num_blocks=4,
        depth_maps=16,
        ksize=(3, 3),
        export_heatmaps=False,
        export_vfeat_block=None,
        old_model=False,
        concat_pose_confidence=True):
    """Build the multi-block pose estimation model.

    The network is a stem followed by `num_blocks` prediction blocks. Each
    block regresses heat maps and produces one pose and one visibility
    prediction; all blocks but the last feed their heat maps back into the
    features of the next block.

    # Arguments
        input_shape: Shape of the input frames, without the batch axis.
        num_joints: Number of joints to be predicted.
        dim: 2 for 2D poses or 3 for 3D poses.
        num_context_per_joint: Number of contextual heat maps per joint.
            For `dim == 2` it defaults to 2; for `dim == 3` it must be None.
        alpha: Value forwarded to `build_context_aggregation`.
        num_blocks: Number of prediction blocks.
        depth_maps: Number of depth heat maps per joint (used for
            `dim == 3` only).
        ksize: Kernel size forwarded to the reception and separable blocks.
        export_heatmaps: If True, the heat maps returned by the pose
            regression are appended to the outputs of each block.
        export_vfeat_block: 1-based index of the block whose reception
            output is appended as the last model output, or None.
        old_model: Forwarded to `_stem`.
        concat_pose_confidence: If True, each block outputs one tensor with
            pose and visibility concatenated; otherwise it outputs them as
            two separate tensors (pose first).

    # Return
        A Keras `Model` mapping the input frame to the list of outputs.

    # Raises
        ValueError: If `dim` is neither 2 nor 3.
    """

    if dim == 2:
        if num_context_per_joint is None:
            num_context_per_joint = 2

        # One specialized map plus the contextual maps, for every joint.
        num_heatmaps = (num_context_per_joint + 1) * num_joints

    elif dim == 3:
        assert num_context_per_joint == None, \
                'For 3D pose estimation, contextual heat maps are not allowed.'
        num_heatmaps = depth_maps * num_joints
    else:
        raise ValueError('"dim" must be 2 or 3 and not (%d)' % dim)

    inp = Input(shape=input_shape)
    outputs = []
    vfeat = None

    x = _stem(inp, old_model=old_model)

    num_rows, num_cols, num_filters = K.int_shape(x)[1:]

    # Build the soft-argmax models (no parameters) for specialized and
    # contextual maps.
    sams_input_shape = (num_rows, num_cols, num_joints)
    sam_s_model = build_softargmax_2d(sams_input_shape, rho=0, name='sSAM')
    jprob_s_model = build_joints_probability(sams_input_shape, name='sjProb')

    # Build the aggregation model (no parameters)
    if num_context_per_joint is not None:
        samc_input_shape = (num_rows, num_cols, num_heatmaps - num_joints)
        sam_c_model = build_softargmax_2d(samc_input_shape, rho=0,
                name='cSAM')
        jprob_c_model = build_joints_probability(samc_input_shape,
                name='cjProb')
        agg_model = build_context_aggregation(num_joints,
                num_context_per_joint, alpha, name='Agg')

    if dim == 3:
        samz_input_shape = (depth_maps, num_joints)
        sam_z_model = build_softargmax_1d(samz_input_shape, name='zSAM')

    for bidx in range(num_blocks):
        # Number of channels entering the block, reused by the re-mapping
        # at the end of the iteration.
        block_shape = K.int_shape(x)[1:]
        x = build_reception_block(x, name='rBlock%d' % (bidx + 1), ksize=ksize)

        if export_vfeat_block == (bidx+1):
            vfeat = x

        ident_map = x
        x = build_sconv_block(x, name='SepConv%d' % (bidx + 1), ksize=ksize)
        h = build_regmap_block(x, num_heatmaps, name='RegMap%d' % (bidx + 1))

        if dim == 2:
            # NOTE(review): for dim == 2 `num_context_per_joint` is always
            # set above, so the `else` branch below is never taken.
            if num_context_per_joint is not None:
                pose, visible, hm = pose_regression_2d_context(h, num_joints,
                        sam_s_model, sam_c_model, jprob_c_model, agg_model,
                        jprob_s_model)
            else:
                pose, visible, hm = pose_regression_2d(h, sam_s_model,
                        jprob_s_model)
        else:
            pose, visible, hm = pose_regression_3d(h, num_joints, depth_maps,
                    sam_s_model, sam_z_model)

        if concat_pose_confidence:
            outputs.append(concatenate([pose, visible]))
        else:
            outputs.append(pose)
            outputs.append(visible)

        if export_heatmaps:
            outputs.append(hm)

        # Every block but the last one maps its heat maps back to the
        # feature width and adds them to the features of the next block.
        if bidx < num_blocks - 1:
            h = build_fremap_block(h, block_shape[-1],
                    name='fReMap%d' % (bidx + 1))
            x = add([ident_map, x, h])

    if vfeat is not None:
        outputs.append(vfeat)

    model = Model(inputs=inp, outputs=outputs)

    return model


def compile(model, ptr, vtr, num_y_per_branch=1):
    """Compile `model` for pose training and list its ground-truth holders.

    The outputs of `model` are expected to be grouped per block as
    `num_y_per_branch` pose outputs followed by one visibility output. Pose
    outputs use `elasticnet_loss_on_valid_joints` with weight 1.0 and
    visibility outputs use binary cross-entropy with weight 0.01. The model
    is compiled with an RMSprop optimizer using its default settings.

    # Arguments
        model: Keras model with a list of outputs; compiled in place.
        ptr: Ground-truth entry appended once per pose output.
        vtr: Ground-truth entry appended once per visibility output.
        num_y_per_branch: Number of pose outputs per block.

    # Return
        List with one ground-truth entry (`ptr` or `vtr`) per expected
        model output.
    """
    # NOTE(review): this name shadows the builtin `compile` at module level;
    # it is kept because callers use it.
    yholder_tr = []
    losses = []
    loss_weights = []
    num_blocks = int(len(model.output) / (num_y_per_branch + 1))

    printcn(OKBLUE,
            'Compiling model with %d outputs per branch and %d branches.' %
            (num_y_per_branch, num_blocks))

    for _ in range(num_blocks):
        for _ in range(num_y_per_branch):
            yholder_tr.append(ptr)
            losses.append(elasticnet_loss_on_valid_joints)
            loss_weights.append(1.)
        yholder_tr.append(vtr)
        losses.append('binary_crossentropy')
        loss_weights.append(0.01)

    printcn(OKBLUE, 'loss_weights: ' + str(loss_weights))
    # Keras expects an optimizer instance (or its string name), not the
    # optimizer class itself.
    model.compile(loss=losses,
            optimizer=tf.keras.optimizers.legacy.RMSprop(),
            loss_weights=loss_weights)

    return yholder_tr

# **General filing**

In [None]:

class SaveModel(Callback):
    """Keras callback that saves model weights at the end of each epoch.

    # Arguments
        filepath: Format string of the output file; it is expanded with
            `filepath.format(epoch=epoch + 1)`.
        model_to_save: Model whose weights are saved. When None, the model
            attached to this callback (`self.model`) is used.
        save_best_only: If True, weights are saved only when the monitored
            callback reports the current epoch as its best one.
        callback_to_monitor: Object exposing a `best_epoch` attribute.
        verbose: If truthy, print a message before each save.
    """

    def __init__(self, filepath, model_to_save=None, save_best_only=False,
            callback_to_monitor=None, verbose=1):
        # Let the Keras base class set up its own state first.
        super().__init__()

        if save_best_only and callback_to_monitor is None:
            warning('Cannot save the best model with no callback monitor')

        self.filepath = filepath
        self.model_to_save = model_to_save
        self.save_best_only = save_best_only
        self.callback_to_monitor = callback_to_monitor
        self.verbose = verbose

    def on_epoch_end(self, epoch, logs=None):
        """Save the weights for `epoch` (0-based) when the policy allows."""
        if self.model_to_save is not None:
            model = self.model_to_save
        else:
            model = self.model

        # File names and `best_epoch` use 1-based epoch numbers.
        filename = self.filepath.format(epoch=epoch + 1)

        if self.best_epoch == epoch + 1 or not self.save_best_only:
            if self.verbose:
                printnl('Saving model @epoch=%05d to %s' \
                        % (epoch + 1, filename))
            model.save_weights(filename)

    @property
    def best_epoch(self):
        """Best epoch of the monitored callback, or None if there is none."""
        if self.callback_to_monitor is not None:
            return self.callback_to_monitor.best_epoch
        else:
            return None

In [None]:


class DataConfig(object):
    """Input frame configuration and data augmentation setup.

    Every augmentation parameter has a sequence of candidate values, from
    which `random_data_generator` draws one uniformly at random, and a
    `fixed_*` value, returned by `get_fixed_config`.

    # Arguments
        crop_resolution: Tuple with the two spatial sizes of the crop.
        image_channels: Tuple with the number of image channels.
        angles: Candidate angles (default `[0]`).
        scales: Candidate scales (default `[1]`).
        trans_x, trans_y: Candidate translations (default `[0]`).
        hflips: Candidate horizontal flip flags (default `[0, 1]`).
        chpower: Candidate channel power values, drawn independently three
            times per sample (default: 0.90 to 1.10 in steps of 0.02).
        geoocclusion: Candidate occlusion box sizes, or None to disable the
            occlusion.
        subsampling: Candidate subsampling values (default `[1]`).
        fixed_*: Value reported by `get_fixed_config` for each parameter.
    """

    def __init__(self, crop_resolution=(256, 256), image_channels=(3,),
            angles=None, fixed_angle=0,
            scales=None, fixed_scale=1,
            trans_x=None, fixed_trans_x=0,
            trans_y=None, fixed_trans_y=0,
            hflips=None, fixed_hflip=0,
            chpower=None, fixed_chpower=1,
            geoocclusion=None, fixed_geoocclusion=None,
            subsampling=None, fixed_subsampling=1):

        self.crop_resolution = crop_resolution
        self.image_channels = image_channels
        if K.image_data_format() == 'channels_last':
            self.input_shape = crop_resolution + image_channels
        else:
            self.input_shape = image_channels + crop_resolution

        # Candidate sequences default to None in the signature and are
        # created here, so that instances never share a mutable default.
        self.angles = [0] if angles is None else angles
        self.fixed_angle = fixed_angle
        self.scales = [1] if scales is None else scales
        self.fixed_scale = fixed_scale
        self.trans_x = [0] if trans_x is None else trans_x
        self.trans_y = [0] if trans_y is None else trans_y
        self.fixed_trans_x = fixed_trans_x
        self.fixed_trans_y = fixed_trans_y
        self.hflips = [0, 1] if hflips is None else hflips
        self.fixed_hflip = fixed_hflip
        if chpower is None:
            chpower = 0.01*np.array(range(90, 110+1, 2))
        self.chpower = chpower
        self.fixed_chpower = fixed_chpower
        self.geoocclusion = geoocclusion
        self.fixed_geoocclusion = fixed_geoocclusion
        self.subsampling = [1] if subsampling is None else subsampling
        self.fixed_subsampling = fixed_subsampling

    def get_fixed_config(self):
        """Return the dictionary with the fixed value of each parameter."""
        return {'angle': self.fixed_angle,
                'scale': self.fixed_scale,
                'transx': self.fixed_trans_x,
                'transy': self.fixed_trans_y,
                'hflip': self.fixed_hflip,
                'chpower': self.fixed_chpower,
                'geoocclusion': self.fixed_geoocclusion,
                'subspl': self.fixed_subsampling}

    def random_data_generator(self):
        """Return a dictionary with one random draw of each parameter.

        'chpower' is a tuple of three independent draws and 'geoocclusion'
        is a bounding box `(x0, y0, x1, y1)` or None.
        """
        angle = DataConfig._getrand(self.angles)
        scale = DataConfig._getrand(self.scales)
        trans_x = DataConfig._getrand(self.trans_x)
        trans_y = DataConfig._getrand(self.trans_y)
        hflip = DataConfig._getrand(self.hflips)
        chpower = (DataConfig._getrand(self.chpower),
                DataConfig._getrand(self.chpower),
                DataConfig._getrand(self.chpower))
        geoocclusion = self.__get_random_geoocclusion()
        subsampling = DataConfig._getrand(self.subsampling)

        return {'angle': angle,
                'scale': scale,
                'transx': trans_x,
                'transy': trans_y,
                'hflip': hflip,
                'chpower': chpower,
                'geoocclusion': geoocclusion,
                'subspl': subsampling}

    def __get_random_geoocclusion(self):
        """Draw a random occlusion box inside the crop, or return None."""
        if self.geoocclusion is None:
            return None

        # Half sizes of the box, drawn independently for each direction.
        w = int(DataConfig._getrand(self.geoocclusion) / 2)
        h = int(DataConfig._getrand(self.geoocclusion) / 2)

        # The center is drawn on a grid with step 5 that keeps the whole
        # box inside the crop.
        xmin = w + 1
        xmax = self.crop_resolution[0] - xmin
        ymin = h + 1
        ymax = self.crop_resolution[1] - ymin

        x = DataConfig._getrand(range(xmin, xmax, 5))
        y = DataConfig._getrand(range(ymin, ymax, 5))

        return (x-w, y-h, x+w, y+h)

    @staticmethod
    def _getrand(x):
        """Return one element of the sequence `x`, chosen uniformly."""
        return x[np.random.randint(0, len(x))]


# Data generation and configuration setup

# Single-person MPII: angles from -40 to 40 in steps of 5, three scales.
mpii_sp_dataconf = DataConfig(
        crop_resolution=(256, 256),
        angles=np.arange(-40, 41, 5),
        scales=np.array([0.7, 1., 1.3]),
        )

# Penn Action clips: adds translations and a choice of subsampling values.
pennaction_dataconf = DataConfig(
        crop_resolution=(256, 256),
        angles=np.arange(-30, 31, 5),
        scales=np.array([0.7, 1.0, 1.3]),
        trans_x=np.arange(-40, 41, 5),
        trans_y=np.arange(-10, 11, 5),
        subsampling=[4, 6, 8],
        fixed_subsampling=6
        )

# Penn Action pose estimation: wider angles and one extra scale (2.0).
pennaction_pe_dataconf = DataConfig(
        crop_resolution=(256, 256),
        angles=np.arange(-40, 41, 5),
        scales=np.array([0.7, 1.0, 1.3, 2.0]),
        trans_x=np.arange(-40, 41, 5),
        trans_y=np.arange(-10, 11, 5),
        )

# Human3.6M: small angles and translations, plus occlusion boxes with
# candidate sizes from 20 to 89.
human36m_dataconf = DataConfig(
        crop_resolution=(256, 256),
        angles=np.arange(-10, 11, 5),
        scales=np.array([0.8, 1.0, 1.2]),
        trans_x=np.arange(-20, 21, 5),
        trans_y=np.arange(-4, 5, 1),
        geoocclusion=np.arange(20, 90),
        )

# NTU clips: no rotation; translations are given as plain `range` objects.
ntu_dataconf = DataConfig(
        crop_resolution=(256, 256),
        angles=[0],
        scales=np.array([0.7, 1.0, 1.3]),
        trans_x=range(-40, 41, 5),
        trans_y=range(-10, 11, 5),
        subsampling=[3, 4, 5],
        fixed_subsampling=4
        )

# NTU pose estimation: small angles and one extra scale (2.0).
ntu_pe_dataconf = DataConfig(
        crop_resolution=(256, 256),
        angles=np.arange(-10, 11, 5),
        scales=np.array([0.7, 1.0, 1.3, 2.0]),
        trans_x=np.arange(-40, 41, 5),
        trans_y=np.arange(-10, 11, 5),
        )

class ModelConfig(object):
    """Hyperparameters for models.

    The constructor only stores its arguments as attributes of the same
    name; `num_joints` and `dim` are read from `poselayout`.

    # Arguments
        input_shape: Shape of the input frames.
        poselayout: Object exposing `num_joints` and `dim`.
        num_actions: List with the number of actions (default: empty list).
        num_pyramids: Number of pyramids.
        action_pyramids: List of pyramids used to perform action
            recognition (default `[1, 2]`).
        num_levels, kernel_size, growth, image_div, predict_rootz,
        downsampling_type, pose_replica, num_pose_features,
        num_visual_features, sam_alpha: Stored as given.
        dbg_decoupled_pose, dbg_decoupled_h: Debugging flags.
    """

    def __init__(self, input_shape, poselayout,
            num_actions=None,
            num_pyramids=8,
            action_pyramids=None, # list of pyramids to perform AR
            num_levels=4,
            kernel_size=(5, 5),
            growth=96,
            image_div=8,
            predict_rootz=False,
            downsampling_type='maxpooling',
            pose_replica=False,
            num_pose_features=128,
            num_visual_features=128,
            sam_alpha=1,
            dbg_decoupled_pose=False,
            dbg_decoupled_h=False):

        self.input_shape = input_shape
        self.num_joints = poselayout.num_joints
        self.dim = poselayout.dim

        # List defaults are created per instance, so that changing one
        # configuration never leaks into another one.
        if num_actions is None:
            num_actions = []
        if action_pyramids is None:
            action_pyramids = [1, 2]

        assert isinstance(num_actions, list), 'num_actions should be a list'
        self.num_actions = num_actions

        self.num_pyramids = num_pyramids
        self.action_pyramids = action_pyramids
        self.num_levels = num_levels
        self.kernel_size = kernel_size
        self.growth = growth
        self.image_div = image_div
        self.predict_rootz = predict_rootz
        self.downsampling_type = downsampling_type
        self.pose_replica = pose_replica
        self.num_pose_features = num_pose_features
        self.num_visual_features = num_visual_features
        self.sam_alpha = sam_alpha

        # Debugging flags.
        self.dbg_decoupled_pose = dbg_decoupled_pose
        self.dbg_decoupled_h = dbg_decoupled_h

# Aliases: `mpii_dataconf` refers to the same DataConfig object as
# `mpii_sp_dataconf`.
mpii_dataconf = mpii_sp_dataconf

In [None]:
def elasticnet_bincross_loss_on_valid_joints(y_true, y_pred):
    """Elastic net (L1 + L2) plus weighted binary cross-entropy loss.

    Only the elements where `y_true > 0` contribute. The sum over the last
    two axes is divided by the number of contributing elements, clipped to
    a minimum of 1. The cross-entropy term is weighted by 0.01.

    # Arguments
        y_true: Ground-truth tensor.
        y_pred: Predicted tensor with the same shape as `y_true`.

    # Return
        The loss reduced over the last two axes.
    """
    selected = K.cast(K.greater(y_true, 0.), 'float32')
    normalizer = K.clip(K.sum(selected, axis=(-1, -2)), 1, None)

    error = y_pred - y_true
    elastic = K.abs(error) + K.square(error)
    bincross = 0.01*K.binary_crossentropy(y_true, y_pred)
    ignored = 0. * y_pred

    per_element = tf.where(K.cast(selected, 'bool'), elastic + bincross,
            ignored)

    return K.sum(per_element, axis=(-1, -2)) / normalizer


def l1_loss_on_valid_joints(y_true, y_pred):
    """Sum of absolute errors over the last two axes, per valid joint.

    The tensors and the normalizer come from `_reset_invalid_joints`, which
    is defined outside of this function (presumably it neutralizes the
    invalid joints and counts the valid ones; confirm against its
    definition).
    """
    y_true, y_pred, num_joints = _reset_invalid_joints(y_true, y_pred)
    abs_error = K.abs(y_pred - y_true)
    return K.sum(abs_error, axis=(-1, -2)) / num_joints


def l2_loss_on_valid_joints(y_true, y_pred):
    """Sum of squared errors over the last two axes, per valid joint.

    The tensors and the normalizer come from `_reset_invalid_joints`, which
    is defined outside of this function (presumably it neutralizes the
    invalid joints and counts the valid ones; confirm against its
    definition).
    """
    y_true, y_pred, num_joints = _reset_invalid_joints(y_true, y_pred)
    squared_error = K.square(y_pred - y_true)
    return K.sum(squared_error, axis=(-1, -2)) / num_joints


def pose_regression_loss(pose_loss, visibility_weight):
    """Create a loss that combines pose and visibility errors.

    The returned function expects tensors whose last axis holds the pose
    coordinates followed by one visibility value. The pose part is scored
    by the loss selected with `pose_loss` and the visibility part by binary
    cross-entropy, scaled by `visibility_weight`.

    # Arguments
        pose_loss: One of 'l1l2', 'l1', 'l2' or 'l1l2bincross'. Any other
            value raises an `Exception` when the loss is evaluated.
        visibility_weight: Weight of the visibility term.

    # Return
        A function `(y_true, y_pred) -> loss`.
    """

    def _pose_regression_loss(y_true, y_pred):
        is_clip = K.ndim(y_true) == 4
        if is_clip:
            # The model was time-distributed, so there is one additional
            # dimension.
            p_true, p_pred = y_true[:, :, :, 0:-1], y_pred[:, :, :, 0:-1]
            v_true, v_pred = y_true[:, :, :, -1], y_pred[:, :, :, -1]
        else:
            p_true, p_pred = y_true[:, :, 0:-1], y_pred[:, :, 0:-1]
            v_true, v_pred = y_true[:, :, -1], y_pred[:, :, -1]

        if pose_loss == 'l1l2':
            ploss = elasticnet_loss_on_valid_joints(p_true, p_pred)
        elif pose_loss == 'l1':
            ploss = l1_loss_on_valid_joints(p_true, p_pred)
        elif pose_loss == 'l2':
            ploss = l2_loss_on_valid_joints(p_true, p_pred)
        elif pose_loss == 'l1l2bincross':
            ploss = elasticnet_bincross_loss_on_valid_joints(p_true, p_pred)
        else:
            raise Exception('Invalid pose_loss option ({})'.format(pose_loss))

        vloss = binary_crossentropy(v_true, v_pred)

        if is_clip:
            # If time-distributed, average the error on video frames.
            vloss = K.mean(vloss, axis=-1)
            ploss = K.mean(ploss, axis=-1)

        return ploss + visibility_weight*vloss

    return _pose_regression_loss

In [None]:
def _norm(x, axis=None):
    return np.sqrt(np.sum(np.power(x, 2), axis=axis))

def _valid_joints(y, min_valid=-1e6):
    def and_all(x):
        if x.all():
            return 1
        return 0

    return np.apply_along_axis(and_all, axis=1, arr=(y > min_valid))

def mean_distance_error(y_true, y_pred):
    """Average Euclidean distance between predicted and true joints.

    Only the joints flagged as valid in `y_true` (see `_valid_joints`) are
    taken into account.

    # Arguments
        y_true: [num_samples, nb_joints, dim]
        y_pred: [num_samples, nb_joints, dim]

    # Return
        The sum of the distances on valid joints divided by the number of
        valid joints.
    """
    assert y_true.shape == y_pred.shape

    grid_shape = y_true.shape[0:2]
    dist = np.zeros(grid_shape)
    valid = np.zeros(grid_shape)

    for i, (sample_true, sample_pred) in enumerate(zip(y_true, y_pred)):
        valid[i, :] = _valid_joints(sample_true)
        dist[i, :] = _norm(sample_true - sample_pred, axis=1)

    return (dist * valid).sum() / valid.sum()

def pckh(y_true, y_pred, head_size, refp=0.5):
    """Compute the PCKh score of the predictions.

    A joint counts as correct when its distance to the ground truth,
    divided by the head size of the sample, is at most `refp`. Only the
    valid joints of `y_true` are considered.

    # Arguments
        y_true: [num_samples, nb_joints, 2]
        y_pred: [num_samples, nb_joints, 2]
        head_size: [num_samples, 1]
        refp: Threshold, as a fraction of the head size.

    # Return
        The PCKh score.
    """
    assert y_true.shape == y_pred.shape
    assert len(y_true) == len(head_size)
    num_samples = len(y_true)

    # Only the joints listed here are scored, in this order; indices 0 and
    # 1 are left out.
    # NOTE(review): the previous comment described this as ignoring "the
    # joints 6 and 7 (pelvis and thorax respectively), according to the
    # file 'annolist2matrix.m'"; presumably those numbers follow the
    # numbering of the source annotations - confirm against the layout.
    used_joints = [2, 3, 4, 5, 6, 7, 10, 11, 12, 13, 14, 15, 8, 9]
    gt = y_true[:, used_joints, :]
    pred = y_pred[:, used_joints, :]

    grid_shape = (num_samples, len(used_joints))
    dist = np.zeros(grid_shape)
    valid = np.zeros(grid_shape)

    for i in range(num_samples):
        valid[i, :] = _valid_joints(gt[i])
        dist[i, :] = _norm(gt[i] - pred[i], axis=1) / head_size[i]

    hits = (dist <= refp) * valid

    return hits.sum() / valid.sum()


def pck3d(y_true, y_pred, refp=150):
    """Compute the PCK3D score of the predictions.

    A joint counts as correct when its distance to the ground truth is at
    most `refp`. Only the valid joints of `y_true` are considered.

    # Arguments
        y_true: [num_samples, nb_joints, 3]
        y_pred: [num_samples, nb_joints, 3]
        refp: Distance threshold, in the units of the poses.

    # Return
        The PCK3D score.
    """
    assert y_true.shape == y_pred.shape
    num_samples = len(y_true)

    # Joint 0 is left out; joints 1 to 16 are scored.
    used_joints = list(range(1, 17))
    gt = y_true[:, used_joints, :]
    pred = y_pred[:, used_joints, :]

    grid_shape = (num_samples, len(used_joints))
    dist = np.zeros(grid_shape)
    valid = np.zeros(grid_shape)

    for i in range(num_samples):
        valid[i, :] = _valid_joints(gt[i])
        dist[i, :] = _norm(gt[i] - pred[i], axis=1)

    hits = (dist <= refp) * valid

    return hits.sum() / valid.sum()


def pckh_per_joint(y_true, y_pred, head_size, pose_layout, refp=0.5, verbose=1):
    """Compute the PCKh score of every joint and print it as a table.

    The first printed row holds the joint names and the second one the
    scores in percent. Nothing is returned; with a falsy `verbose` nothing
    is printed either.

    # Arguments
        y_true: [num_samples, nb_joints, 2]
        y_pred: [num_samples, nb_joints, 2]
        head_size: [num_samples, 1]
        pose_layout: Object exposing `num_joints` and `joint_names`.
        refp: Threshold, as a fraction of the head size.
        verbose: If truthy, print the table.
    """
    assert y_true.shape == y_pred.shape
    assert len(y_true) == len(head_size)

    num_samples = len(y_true)
    num_joints = pose_layout.num_joints
    grid_shape = (num_samples, num_joints)
    dist = np.zeros(grid_shape)
    valid = np.zeros(grid_shape)

    for i in range(num_samples):
        valid[i, :] = _valid_joints(y_true[i])
        dist[i, :] = _norm(y_true[i] - y_pred[i], axis=1) / head_size[i]

    # Header row: joint names padded towards a width of 7 characters.
    padding = 7*' '
    for j in range(num_joints):
        jname = pose_layout.joint_names[j]
        pad_len = len(padding) - len(jname)
        if verbose:
            printc(HEADER, jname + padding[0:pad_len] + '| ')
    if verbose:
        print ('')

    # Score row: correct joints over valid joints, for every column.
    hits = (dist <= refp) * valid
    scores = hits.sum(axis=0) / valid.sum(axis=0)
    for j in range(num_joints):
        if verbose:
            printc(OKBLUE, ' %.2f | ' % (100 * scores[j]))
    if verbose:
        print ('')


def pck_torso(y_true, y_pred, refp=0.2):
    """Compute the PCK score using a fraction of the torso size.

    A joint counts as correct when its distance to the ground truth,
    divided by the torso size of the sample, is at most `refp`. The torso
    size is the distance between joints 5 and 10 of the ground truth. Only
    the valid joints of `y_true` are considered.

    The arrays are read as [nb_samples, nb_joints, 2], which is the layout
    documented here and used by the other measures. The previous code
    unpacked the shape and indexed the joints as [nb_samples, 2, nb_joints],
    which fails for arrays in the documented layout.

    # Arguments
        y_true: [nb_samples, nb_joints, 2]
        y_pred: [nb_samples, nb_joints, 2]
        refp: Threshold, as a fraction of the torso size.

    # Return
        The PCK score.
    """
    assert y_true.shape == y_pred.shape
    nb_samples, nb_joints, _ = y_true.shape

    dist = np.zeros((nb_samples, nb_joints))
    valid = np.zeros((nb_samples, nb_joints))
    # NOTE(review): joints 5 and 10 are assumed to delimit the torso in the
    # pose layout in use - confirm against the layout definition.
    torso = _norm(y_true[:, 5, :] - y_true[:, 10, :], axis=1)

    for i in range(nb_samples):
        valid[i, :] = _valid_joints(y_true[i])
        dist[i, :] = _norm(y_true[i] - y_pred[i], axis=1) / torso[i]
    match = (dist <= refp) * valid

    return match.sum() / valid.sum()

In [None]:

def elasticnet_loss_on_valid_joints(y_true, y_pred):
    """Elastic net loss (L1 + L2) restricted to the valid elements.

    Elements where `y_true` is not greater than -1e6 are zeroed in both
    tensors before the errors are summed over the last two axes.

    # Arguments
        y_true: Ground-truth tensor.
        y_pred: Predicted tensor with the same shape as `y_true`.

    # Return
        The sum of absolute errors plus the sum of squared errors.
    """
    mask = K.cast(K.greater(y_true, -1e6), 'float32')
    error = mask * y_pred - mask * y_true

    abs_term = K.sum(K.abs(error), axis=(-2, -1))
    square_term = K.sum(K.square(error), axis=(-2, -1))

    return abs_term + square_term


In [None]:
class TrainerOnGenerator(object):
    """Thin wrapper around the Keras method 'fit_generator'.

    Besides calling 'fit_generator', it assembles the list of callbacks:
    user validation callbacks, a learning rate scheduler (built-in or
    custom) and, optionally, TensorBoard.

    The built-in learning rate scheduler is installed only when validation
    callbacks are given and no custom scheduler is set; it relies on the
    'best_epoch' attribute of the last validation callback.

    # Arguments
        model: Model used to save and reload weights.
        gen_tr: Training data generator.
        gpu_model: Model that is actually trained; defaults to `model`.
        steps_per_epoch, epochs, verbose, workers, shuffle, initial_epoch:
            Forwarded to 'fit_generator'.
        initial_lr: Learning rate returned for the initial epoch.
        lr_factor: Multiplier applied to the learning rate when reduced.
        lr_patience: Number of non-best epochs before a reduction.
        minimum_lr: Training is stopped when the learning rate drops below
            this value.
        validation_callbacks: One callback or a list of callbacks.
        custom_lr_scheduler: Schedule function replacing the built-in one.
        save_tensor_board: If True and `logdir` is set, add TensorBoard.
        weights_fname: File name of the best weights inside `logdir`.
        logdir: Path to where all the logs and weights will be saved.
    """

    def __init__(self, model, gen_tr, gpu_model=None, steps_per_epoch=None,
            initial_lr=1e-3, lr_factor=0.1, lr_patience=10, minimum_lr=1e-7,
            epochs=1, verbose=1, workers=1, shuffle=True, initial_epoch=0,
            validation_callbacks=None, custom_lr_scheduler=None,
            save_tensor_board=False, weights_fname='weights.hdf5', logdir=None):

        self.model = model
        self.gpu_model = model if gpu_model is None else gpu_model

        self.gen_tr = gen_tr
        self.steps_per_epoch = steps_per_epoch

        self.initial_lr = initial_lr
        self.lr_factor = lr_factor
        self.lr_patience = lr_patience
        self.lr_wait = 0
        self.minimum_lr = minimum_lr

        self.epochs = epochs
        self.verbose = verbose
        self.workers = workers
        self.shuffle = shuffle
        self.initial_epoch = initial_epoch

        self.val_cb = validation_callbacks
        self.weights_fname = weights_fname
        self.logdir = logdir

        callbacks = []

        if self.val_cb is not None:
            if not isinstance(self.val_cb, list):
                self.val_cb = [self.val_cb]
            callbacks.extend(self.val_cb)

            # The built-in scheduler needs the validation callbacks.
            if custom_lr_scheduler is None:
                callbacks.append(
                        LearningRateScheduler(self.learningrate_scheduler))

        if custom_lr_scheduler is not None:
            callbacks.append(LearningRateScheduler(custom_lr_scheduler))

        if (self.logdir is not None) and save_tensor_board:
            callbacks.append(TensorBoard(log_dir=self.logdir))

        # Reset if not used
        self.callbacks = callbacks if len(callbacks) > 0 else None

    def learningrate_scheduler(self, epoch, lr):
        """Return the learning rate for `epoch`, given the current `lr`.

        On the initial epoch the rate is set to `initial_lr`. On a best
        epoch the waiting counter is reset and, if `logdir` is set, the
        weights are saved. Otherwise the counter is increased. Once the
        counter reaches `lr_patience`, the rate is multiplied by
        `lr_factor` and, if `logdir` is set, the saved weights are
        reloaded; training is stopped when the new rate is below
        `minimum_lr`.
        """
        best_epoch = self.val_cb[-1].best_epoch

        if epoch == self.initial_epoch:
            lr = self.initial_lr
        elif best_epoch == epoch:
            self.lr_wait = 0
            if self.logdir is not None:
                self.model.save_weights(
                        os.path.join(self.logdir, self.weights_fname))
        else:
            # Increase the waiting time if it was not the best epoch.
            self.lr_wait += 1

        if self.lr_wait < self.lr_patience:
            return lr

        # Reduce the learning rate and (re)load the best model.
        self.lr_wait = 0
        lr *= self.lr_factor

        if self.logdir is not None:
            printcn(OKGREEN,
                    'Reloading weights from epoch %03d' % best_epoch)
            self.model.load_weights(
                    os.path.join(self.logdir, self.weights_fname))

        if lr < self.minimum_lr:
            printcn(FAIL, 'Minimum learning rate reached!')
            self.gpu_model.stop_training = True
        else:
            printcn(OKGREEN, 'Setting learning rate to: %g' % lr)

        return lr

    def train(self):
        """Run 'fit_generator' on `gpu_model` with the stored settings."""
        # NOTE(review): 'fit_generator' is kept as in the original code;
        # confirm that the installed Keras version still provides it.
        fit_args = {
            'steps_per_epoch': self.steps_per_epoch,
            'epochs': self.epochs,
            'verbose': self.verbose,
            'callbacks': self.callbacks,
            'workers': self.workers,
            'use_multiprocessing': False,
            'shuffle': self.shuffle,
            'initial_epoch': self.initial_epoch,
        }
        self.gpu_model.fit_generator(self.gen_tr, **fit_args)


class MultiModelTrainer(object):
    """Train a list of models in lockstep, one batch per model per step.

    This class is much more than a wrapper to the method 'fit_generator'
    from Keras. It is able to train a list of models, given a corresponding
    list of data generators (one per model), by training each model with one
    batch of its corresponding data. Actually, it is supposed that each model
    here is a small part of a bigger model (the full model), which is actually
    used for saving weights.

    # Attributes
        models: the models being trained.
        enqueuers: one started ``OrderedEnqueuer`` per generator.
        output_generators: the batch iterators obtained from the enqueuers.
        model_metric_names: per-model list of log keys (``'loss<i>'`` first,
            then one key per model output when ``print_full_losses`` is set).
        metric_names: flat concatenation of ``model_metric_names``.
        batch_logs: dict passed to the progress logger on every batch.
    """

    def __init__(self, models, generators, workers=1, shuffle=True,
            max_queue_size=10, print_full_losses=False):
        """Start one background enqueuer per generator.

        # Arguments
            models: list of compiled models.
            generators: list of BatchLoader objects, one per model.
            workers: number of workers per enqueuer; a single value is
                replicated for all models, a list/tuple is used per model.
            shuffle: forwarded to each ``OrderedEnqueuer``.
            max_queue_size: forwarded to ``OrderedEnqueuer.start``.
            print_full_losses: if True, log every output loss of each model
                in addition to its total loss.
        """
        assert len(models) == len(generators), \
                'ValueError: models and generators should be lists of same size'

        if not isinstance(workers, (list, tuple)):
            workers = len(models)*[workers]

        self.models = models
        self.enqueuers = []
        self.output_generators = []
        self.batch_logs = {}
        self.print_full_losses = print_full_losses

        # Metric names are tracked per model so that the values returned by
        # train_on_batch for model i are never written under another
        # model's keys.
        self.model_metric_names = []

        batch_size = 0
        for i, (model, generator) in enumerate(zip(models, generators)):
            assert isinstance(generator, BatchLoader), \
                    'Only BatchLoader class is supported'
            batch_size += generator.get_batch_size()
            enqueuer = OrderedEnqueuer(generator, shuffle=shuffle)
            enqueuer.start(workers=workers[i], max_queue_size=max_queue_size)
            self.enqueuers.append(enqueuer)
            self.output_generators.append(enqueuer.get())

            names = ['loss%d' % i]
            if self.print_full_losses:
                names += [out.name.split('/')[0] for out in model.outputs]
            self.model_metric_names.append(names)

        self.batch_logs['size'] = batch_size
        self.metric_names = [
                n for names in self.model_metric_names for n in names]

    def stop(self, timeout=None):
        """Stop the background enqueuers started by the constructor."""
        for enqueuer in self.enqueuers:
            enqueuer.stop(timeout=timeout)

    def train(self, epochs, steps_per_epoch, initial_epoch=0,
            end_of_epoch_callback=None, verbose=1):
        """Run the alternating training loop.

        # Arguments
            epochs: index of the last epoch (exclusive).
            steps_per_epoch: number of steps per epoch; in each step every
                model is trained on one batch from its own generator.
            initial_epoch: index of the first epoch to run.
            end_of_epoch_callback: optional callable invoked as
                ``callback(epoch)`` after each epoch.
            verbose: verbosity forwarded to the progress logger.
        """
        logger = ProgbarLogger(count_mode='steps')
        logger.set_params({
            'epochs': epochs,
            'steps': steps_per_epoch,
            'verbose': verbose,
            'metrics': self.metric_names})
        logger.on_train_begin()

        for epoch in range(initial_epoch, epochs):
            logger.on_epoch_begin(epoch)

            for batch in range(steps_per_epoch):
                self.batch_logs['batch'] = batch
                logger.on_batch_begin(batch, self.batch_logs)

                for i, model in enumerate(self.models):
                    x, y = next(self.output_generators[i])
                    outs = model.train_on_batch(x, y)

                    # train_on_batch returns a scalar for single-loss models.
                    if not isinstance(outs, list):
                        outs = [outs]
                    if self.print_full_losses:
                        for key, value in zip(
                                self.model_metric_names[i], outs):
                            self.batch_logs[key] = value
                    else:
                        self.batch_logs[self.model_metric_names[i][0]] = \
                                outs[0]

                logger.on_batch_end(batch, self.batch_logs)

            logger.on_epoch_end(epoch)
            if end_of_epoch_callback is not None:
                end_of_epoch_callback(epoch)

# **Action Recognition Task**

In [None]:
def eval_singleclip_gt_bbox(model, x_te, action_te, batch_size=1, verbose=1):
    """Compute single-clip action accuracy for every output of `model`.

    # Arguments
        model: object exposing ``outputs`` and ``predict``.
        x_te: input clips passed to ``model.predict``.
        action_te: ground-truth action scores; the class is taken as the
            argmax over the last axis.
        batch_size: batch size forwarded to ``model.predict``.
        verbose: if truthy, print the accuracies and the throughput.

    # Returns
        A list with one accuracy (fraction in [0, 1]) per model output.
        An accuracy is NaN when there are no samples.
    """
    num_blocks = len(model.outputs)
    start = time.time()

    pred = model.predict(x_te, batch_size=batch_size, verbose=verbose)
    dt = time.time() - start

    # A single-output model returns a bare array instead of a list; wrap it
    # so that pred[b] always selects an output block, never a sample.
    if not isinstance(pred, (list, tuple)):
        pred = [pred]

    if verbose:
        printc(WARNING, 'PennAction, single-clip, action acc.%:')

    true_labels = np.argmax(action_te, axis=-1)
    scores = []
    for b in range(num_blocks):
        # No dtype= here: comparison ufuncs only accept bool/object dtypes
        # on current NumPy, and the mean of a bool array is the accuracy.
        correct = np.equal(true_labels, np.argmax(pred[b], axis=-1))
        scores.append(float(np.mean(correct)))

        if verbose:
            printc(WARNING, ' %.1f' % (100*scores[-1]))

    if verbose:
        rate = len(x_te) / dt if dt > 0 else float('inf')
        printcn('', '\n%d samples in %.1f sec: %.1f clips per sec' \
                % (len(x_te), dt, rate))

    return scores


def eval_singleclip_gt_bbox_generator(model, datagen, verbose=1, logdir=None):
    """Compute single-clip action accuracy by iterating over a data generator.

    Each item ``datagen[i]`` must unpack as ``[x], [y]``; one row of the
    result arrays is filled per item, so the generator is expected to yield
    one clip per index.

    # Arguments
        model: object exposing ``outputs`` and ``predict``.
        datagen: indexable, sized batch provider.
        verbose: if truthy, print the accuracies and the throughput.
        logdir: if given, the per-sample correctness vector of every output
            block is saved under ``<logdir>/single-clip/<block>.npy``.

    # Returns
        A list with one accuracy (fraction in [0, 1]) per model output.

    # Raises
        ValueError: if `datagen` is empty.
    """
    num_blocks = len(model.outputs)
    num_samples = len(datagen)
    if num_samples == 0:
        raise ValueError(
                'eval_singleclip_gt_bbox_generator: datagen has no samples')

    start = time.time()

    # Result buffers are allocated on the first sample, once the label
    # shape is known.
    y_true = None
    y_pred = None
    for i in range(num_samples):
        [x], [y] = datagen[i]
        if y_true is None:
            y_true = np.zeros((num_samples,) + y.shape[1:])
            y_pred = np.zeros((num_samples, num_blocks) + y.shape[1:])

        y_true[i, :] = y
        pred = model.predict(x)
        # A single-output model returns a bare array instead of a list.
        if not isinstance(pred, (list, tuple)):
            pred = [pred]
        for b in range(num_blocks):
            y_pred[i, b, :] = pred[b]

    dt = time.time() - start
    if verbose:
        printc(WARNING, 'PennAction, single-clip, action acc.%:')

    if logdir is not None:
        logpath = os.path.join(logdir, 'single-clip')
        mkdir(logpath)

    true_labels = np.argmax(y_true, axis=-1)
    scores = []
    for b in range(num_blocks):
        correct = np.equal(true_labels, np.argmax(y_pred[:, b, :], axis=-1))
        scores.append(float(np.mean(correct)))
        if verbose:
            printc(WARNING, ' %.1f ' % (100*scores[-1]))

        if logdir is not None:
            np.save(os.path.join(logpath, '%02d.npy' % b), correct)

    if verbose:
        rate = num_samples / dt if dt > 0 else float('inf')
        printcn('', '\n%d samples in %.1f sec: %.1f clips per sec' \
                % (num_samples, dt, rate))

    return scores


def eval_multiclip_dataset(model, penn, subsampling, bboxes_file=None,
        logdir=None, verbose=1):
    """Evaluate multi-clip action accuracy on the test split of `penn`.

    For every test sample, all clips returned by ``penn.get_clip_index`` are
    predicted twice (without and with forced horizontal flip) and the
    per-clip predictions are multiplied together to form the sample-level
    prediction of each output block.

    # Arguments
        model: object exposing ``outputs`` and ``predict``.
        penn: dataset object (``get_length``, ``get_shape``,
            ``get_clip_index``, ``get_data`` and ``dataconf.fixed_hflip``
            are used).
        subsampling: temporal subsampling passed to ``get_clip_index``.
        bboxes_file: currently not used by this function.
        logdir: if given, ``allpred.npy`` (clip predictions of the last
            processed sample only), ``a_true.npy`` and
            ``missing-clips.json`` are written there.
        verbose: if truthy, print progress and final scores.

    # Returns
        Array with one accuracy (in percent) per model output.
    """
    num_samples = penn.get_length(TEST_MODE)
    num_blocks = len(model.outputs)

    # Remember the dataset flip setting: it is forced per clip below and
    # restored in the `finally` clause even if evaluation aborts.
    org_hflip = penn.dataconf.fixed_hflip

    cnt_corr = 0
    cnt_total = 0

    action_shape = (num_samples,) + penn.get_shape('pennaction')
    a_true = np.zeros(action_shape)
    a_pred = np.ones((num_blocks,) + action_shape)
    missing_clips = {}
    allpred = None

    try:
        for i in range(num_samples):
            if verbose:
                printc(OKBLUE, '%04d/%04d\t' % (i, num_samples))

            frame_list = penn.get_clip_index(i, TEST_MODE,
                    subsamples=[subsampling])

            # Holds all predictions for this sequence;
            # 2x frame_list due to hflip.
            allpred = np.ones(
                    (num_blocks, 2*len(frame_list)) + action_shape[1:])

            for f in range(len(frame_list)):
                for hflip in range(2):
                    try:
                        # Force horizontal flip
                        penn.dataconf.fixed_hflip = hflip

                        # Load clip and predict action.
                        data = penn.get_data(i, TEST_MODE,
                                frame_list=frame_list[f])
                        a_true[i, :] = data['pennaction']

                        pred = model.predict(
                                np.expand_dims(data['frame'], axis=0))
                        # A single-output model returns a bare array.
                        if not isinstance(pred, (list, tuple)):
                            pred = [pred]
                        for b in range(num_blocks):
                            allpred[b, 2*f+hflip, :] = pred[b][0]
                            a_pred[b, i, :] *= pred[b][0]

                        if np.argmax(a_true[i]) != np.argmax(a_pred[-1, i]):
                            missing_clips['%04d.%03d.%d' % (i, f, hflip)] = [
                                    int(np.argmax(a_true[i])),
                                    int(np.argmax(a_pred[-1, i]))]

                    except Exception as e:
                        # Best effort: a failing clip is reported and skipped.
                        warning('eval_multiclip, exception on sample ' \
                                + str(i) + ' frame ' + str(f) + ': ' + str(e))

            if verbose:
                cor = int(np.argmax(a_true[i]) == np.argmax(a_pred[-1, i]))

                cnt_total += 1
                cnt_corr += cor
                printnl('%d : %.1f' % (cor, 100 * cnt_corr / cnt_total))
    finally:
        penn.dataconf.fixed_hflip = org_hflip

    if logdir is not None:
        # allpred stays None when the test split is empty.
        if allpred is not None:
            np.save('%s/allpred.npy' % logdir, allpred)
        np.save('%s/a_true.npy' % logdir, a_true)
        with open(os.path.join(logdir, 'missing-clips.json'), 'w') as fid:
            json.dump(missing_clips, fid)

    a_true = np.expand_dims(a_true, axis=0)
    a_true = np.tile(a_true, (num_blocks, 1, 1))
    correct = np.argmax(a_true, axis=-1) == np.argmax(a_pred, axis=-1)
    scores = 100*np.sum(correct, axis=-1) / num_samples
    if verbose:
        printcn(WARNING, 'PennAction, multi-clip.\n')
        printcn(WARNING, np.array2string(np.array(scores), precision=2))
        printcn(WARNING, 'PennAction best: %.2f' % max(scores))

    return scores


class PennActionEvalCallback(Callback):
    """Keras callback that evaluates single-clip action accuracy per epoch.

    After every epoch the model is evaluated on `data`; the best score over
    the model's output blocks is stored in ``self.scores`` under the
    1-based epoch number, and all block scores are dumped to
    ``<logdir>/penn_val.json`` when a log directory is given.
    """

    def __init__(self, data, batch_size=1, eval_model=None,
            logdir=None):
        """
        # Arguments
            data: either a BatchLoader, or an indexable ``(x, action)`` pair
                of pre-loaded samples.
            batch_size: prediction batch size (pre-loaded data only).
            eval_model: model to evaluate; defaults to the model Keras
                attaches to the callback.
            logdir: optional directory for the JSON score log.
        """
        # Let the Keras base class set up its own state first.
        super(PennActionEvalCallback, self).__init__()

        self.data = data
        self.batch_size = batch_size
        self.eval_model = eval_model
        self.scores = {}
        self.logarray = {}
        self.logdir = logdir

    def on_epoch_end(self, epoch, logs=None):
        """Evaluate the model and record the scores of epoch ``epoch + 1``."""
        if self.eval_model is not None:
            model = self.eval_model
        else:
            model = self.model

        if isinstance(self.data, BatchLoader):
            scores = eval_singleclip_gt_bbox_generator(model, self.data)
        else:
            scores = eval_singleclip_gt_bbox(model, self.data[0],
                    self.data[1], batch_size=self.batch_size)

        # Scores are stored under the 1-based epoch number.
        epoch += 1
        if self.logdir is not None:
            # Plain floats keep the log JSON-serializable whatever numeric
            # type the evaluation returned.
            self.logarray[epoch] = [float(s) for s in scores]
            with open(os.path.join(self.logdir, 'penn_val.json'), 'w') as f:
                json.dump(self.logarray, f)

        cur_best = max(scores)
        self.scores[epoch] = cur_best

        printcn(OKBLUE, 'Best score is %.1f at epoch %d' % \
                (100*self.best_score, self.best_epoch))

    @property
    def best_epoch(self):
        """Epoch (1-based) with the highest score, or ``np.inf`` if none yet."""
        if len(self.scores) > 0:
            # Get the key of the maximum value from a dict
            return max(self.scores, key=self.scores.get)
        else:
            return np.inf

    @property
    def best_score(self):
        """Highest score recorded so far, or 0 if none yet."""
        if len(self.scores) > 0:
            # Get the maximum value from a dict
            return self.scores[self.best_epoch]
        else:
            return 0

# Aliases: shorter public names bound to the ground-truth-bounding-box
# evaluation functions defined above.
eval_singleclip = eval_singleclip_gt_bbox
eval_singleclip_generator = eval_singleclip_gt_bbox_generator

# **Training both datasets**

## **Changing the neural network backbone (announced as VGG-16; the code below builds a ResNet50 stem)**

In [None]:
class sppnet():
  def prediction_branch(self, x, cfg, pred_activate=True, replica=None,
        forward_maps=True, name=None):
    """Build one prediction branch: a 1x1 map head plus a re-injection path.

    # Arguments
        x: input feature tensor.
        cfg: configuration object; ``cfg.num_joints`` sets the number of
            prediction maps.
        pred_activate: apply a ReLU before the final 1x1 convolution.
        replica: if truthy, also create a replica 1x1 head on the same
            activated features.
        forward_maps: if True, concatenate an extra 1x1 projection with the
            prediction maps; otherwise use the prediction maps alone.
        name: optional prefix for the layer names.

    # Returns
        Tuple ``(features, pred_maps, replica)`` where ``features`` has the
        same number of channels as `x`, and ``replica`` is the replica
        tensor when requested or the original `replica` value otherwise.
    """
    n_maps = cfg.num_joints
    in_channels = K.int_shape(x)[-1]
    unit_kernel = (1, 1)

    activated = relu(x, name=appstr(name, '_act1'))
    pred_maps = conv2d(activated, n_maps, unit_kernel,
            name=appstr(name, '_conv1'))

    # The truthy flag is swapped for the replica tensor itself; a falsy
    # value is handed back to the caller untouched.
    replica_out = replica
    if replica:
        replica_out = conv2d(activated, n_maps, unit_kernel,
                name=appstr(name, '_conv1_replica'))

    if not forward_maps:
        merged = pred_maps
    else:
        forwarded = conv2d(activated, n_maps, unit_kernel,
                name=appstr(name, '_fw_maps'))
        merged = concatenate([forwarded, pred_maps])

    if pred_activate:
        merged = relu(merged, name=appstr(name, '_act2'))
    features = conv2d(merged, in_channels, unit_kernel,
            name=appstr(name, '_conv2'))

    return features, pred_maps, replica_out


  def action_prediction_early_fusion(self,xa, p, c, af, cfg, name=None):
      """Build the action head that fuses pose and appearance features.

      The pose tensor `p` is masked by `c` (tiled along the last axis),
      passed through three parallel convolutions with kernels (3, 1), (3, 3)
      and (3, 5), a residual unit, optional zero padding and max pooling.
      The appearance tensor `af` goes through a 1x1 convolution followed by
      the same padding and pooling. Both results (plus `xa` when it is not
      None) are concatenated, refined by a residual unit and fed to the
      inner `_prediction` block.

      # Arguments
          xa: action features from a previous block, or None.
          p: pose tensor; axes 1 and 2 are read as (num_frames, num_joints).
          c: tensor multiplied with `p` after tiling (the caller in this
              class passes the keypoint confidence).
          af: appearance feature tensor.
          cfg: configuration providing ``num_actions`` (a list),
              ``num_pose_features`` and ``num_visual_features``.
          name: optional prefix for the layer names.

      # Returns
          Tuple ``(action, xa)``: the list of softmax outputs (one per entry
          of ``cfg.num_actions``) and the output feature tensor of
          `_prediction`.
      """

      num_actions = cfg.num_actions

      num_features = max(cfg.num_pose_features, cfg.num_visual_features)
      num_pose_features = cfg.num_pose_features
      num_visual_features = cfg.num_visual_features

      # Only the first 7 characters of `name` are kept for the output names.
      shortname = name[0:7] if name is not None else None

      # Filled in by _individual_action_prediction (closure side effect).
      action = []

      """Apply individual softmax per dataset (set of actions)."""
      def _individual_action_prediction(hlist, name=None):
          for i in range(len(hlist)):
              x = global_max_min_pooling(hlist[i])
              x = Activation('softmax', name=appstr(name, '%d' % i))(x)
              action.append(x)

      """Generic prediction block for both pose and apperance features."""
      def _prediction(x, name=None, shortname=None):
          # Local num_features shadows the outer one: here it is the channel
          # count of the incoming tensor.
          num_features = K.int_shape(x)[-1]

          ident = x
          x = BatchNormalization(name=appstr(name, '_bn1'))(x)
          x = relu(x, name=appstr(name, '_act1'))
          x1 = conv2d(x, num_features, (3, 3), name=appstr(name, '_conv1'))

          x = max_min_pooling(x1, (2, 2))
          x = BatchNormalization(name=appstr(name, '_bn2'))(x)
          x = relu(x, name=appstr(name, '_act2'))
          # One convolutional head per action set in cfg.num_actions.
          hlist = []
          for i in range(len(num_actions)):
              nact = num_actions[i]
              h = conv2d(x, nact, (3, 3), name=appstr(name, '_conv2h%d' % i))
              hlist.append(h)

          _individual_action_prediction(hlist, name=shortname)
          h = concat_tensorlist(hlist)

          # Upsample by 2 and project back to num_features channels before
          # the three-way sum with the block input and x1.
          x = UpSampling2D((2, 2))(h)
          x = relu(x, name=appstr(name, '_act3'))
          x = conv2d(x, num_features, (3, 3), name=appstr(name, '_conv3'))
          x = add([ident, x1, x])

          return x

      """Define padding strategy."""
      num_frames, num_joints = K.int_shape(p)[1:3]
      time_stride = 2 if num_frames >= 16 else 1
      # get_pad(div, n): amount to add so that n becomes a multiple of div.
      get_pad = lambda div, n: int(div*np.ceil(n / div) - n)
      joints_pad = get_pad(4, num_joints)
      frames_pad = get_pad(2 * time_stride, num_frames)
      # Padding is split between both sides; the odd unit goes to the
      # bottom/right.
      top_pad = frames_pad // 2
      bottom_pad = (frames_pad + 1) // 2
      left_pad = joints_pad // 2
      right_pad = (joints_pad + 1) // 2

      """Pose features."""
      # Tile c along the last axis to match p's last dimension, then use it
      # as a multiplicative mask on the pose tensor.
      mask = Lambda(lambda x: K.tile(x, (1, 1, 1, K.int_shape(p)[-1])))(c)
      x = Lambda(lambda x: x[0] * x[1])([p, mask])

      a = conv2d(x, num_pose_features // 16, (3, 1),
              name=appstr(name, '_p_conv0a'))
      b = conv2d(x, num_pose_features // 8, (3, 3),
              name=appstr(name, '_p_conv0b'))
      # NOTE: rebinds the `c` parameter; the original tensor was already
      # consumed by `mask` above and is not needed afterwards.
      c = conv2d(x, num_pose_features // 4, (3, 5),
              name=appstr(name, '_p_conv0c'))
      x = concatenate([a, b, c])

      x = residual(x, (3, 3), out_size=num_pose_features, convtype='normal',
              features_div=2, name=appstr(name, '_r1'))

      if top_pad + bottom_pad + left_pad + right_pad > 0:
          x = ZeroPadding2D(((top_pad, bottom_pad), (left_pad, right_pad)))(x)
      x1 = maxpooling2d(x, (2, 2), strides=(time_stride, 2))

      """Appearance features."""
      x = conv2d(af, num_visual_features, (1, 1), name=appstr(name, '_v_conv0'))

      if top_pad + bottom_pad + left_pad + right_pad > 0:
          x = ZeroPadding2D(((top_pad, bottom_pad), (left_pad, right_pad)))(x)
      x2 = maxpooling2d(x, (2, 2), strides=(time_stride, 2))

      """Feature fusion."""
      fusion = [x1, x2]
      if xa is not None:
          fusion.append(xa)

      x = concat_tensorlist(fusion)
      # x = add_tensorlist(fusion)
      x = residual(x, (3, 3), out_size=num_features, convtype='normal',
              features_div=4, name=appstr(name, '_r2'))

      xa = _prediction(x, name=appstr(name, '_pred'),
              shortname=appstr(shortname, '_a'))

      return action, xa


  def prediction_block(self,xp, xa, zp, outlist, cfg, do_action, name=None):
      """Build one prediction block: pose (2D or 3D) and, optionally, action.

      # Arguments
          xp: pose feature tensor entering the block.
          xa: action feature tensor from the previous block, or None.
          zp: tensor combined with the probability maps by `kronecker_prod`
              to form the appearance features (action path only).
          outlist: list of output lists, extended in place: index 0 receives
              the pose output, index 1 the action outputs, and indices 2 and
              3 the debug pose and debug maps when decoupled-pose debugging
              is active.
          cfg: configuration object (``dim``, ``kernel_size``, ``xmin``,
              ``ymin``, ``sam_alpha``, ``pose_replica``,
              ``dbg_decoupled_pose``, ``dbg_decoupled_h`` are read).
          do_action: whether this block also builds the action head.
          name: optional prefix for the layer names.

      # Returns
          Tuple ``(xp, xa)``: the re-injected pose features and the action
          features (unchanged `xa` when `do_action` is false).
      """

      dim = cfg.dim
      kernel_size = cfg.kernel_size
      xmin = cfg.xmin
      ymin = cfg.ymin
      sam_alpha = cfg.sam_alpha
      num_features = K.int_shape(xp)[-1]
      # Replica heads and debug outputs are only enabled on action blocks.
      replica = cfg.pose_replica and do_action
      dbg_decoupled_pose = cfg.dbg_decoupled_pose and do_action
      # NOTE(review): dbg_decoupled_h is computed but not used below.
      dbg_decoupled_h = cfg.dbg_decoupled_h and do_action

      xp = residual(xp, kernel_size, name=appstr(name, '_r1'))
      # Tensors collected here are combined by add_tensorlist at the end.
      reinject = [xp]

      xp = BatchNormalization(name=appstr(name, '_bn1'))(xp)
      xp = relu(xp, name=appstr(name, '_act1'))
      xp = sepconv2d(xp, num_features, kernel_size, name=appstr(name, '_conv1'))
      reinject.append(xp)

      xp = BatchNormalization(name=appstr(name, '_bn2'))(xp)

      """2D pose estimation."""
      # rep_h is a tensor only when `replica` is truthy (see
      # prediction_branch); otherwise it is the falsy flag passed in.
      x1, org_h, rep_h = self.prediction_branch(xp, cfg, pred_activate=True,
              replica=replica, name=appstr(name, '_heatmaps'))
      reinject.append(x1)

      h = Activation(channel_softmax_2d(alpha=sam_alpha),
              name=appstr(name, '_probmaps'))(org_h)

      p = softargmax2d(h, limits=(xmin, ymin, 1-xmin, 1-ymin),
              name=appstr(name, '_xy'))
      c = keypoint_confidence(h, name=appstr(name, '_vis'))

      if dbg_decoupled_pose:
          """Output decoupled poses in debug mode."""
          dbg_h = Activation(channel_softmax_2d(alpha=sam_alpha),
                  name=appstr(name, '_dbg_h'))(rep_h)
          dbg_p = softargmax2d(dbg_h, limits=(xmin, ymin, 1-xmin, 1-ymin),
                  name=appstr(name, '_dbg_xy'))

          dbg_c = keypoint_confidence(dbg_h, name=appstr(name, '_dbg_vis'))

      """Depth estimation."""
      if dim == 3:
          x1, org_d, rep_d = self.prediction_branch(xp, cfg, pred_activate=False,
                  replica=replica, forward_maps=False,
                  name=appstr(name, '_depthmaps'))
          reinject.append(x1)

          # Depth per joint: sigmoid depth maps weighted by the probability
          # maps and summed over axes -2 and -3, appended as the z coordinate.
          d = Activation('sigmoid')(org_d)
          z = multiply([d, h])
          z = Lambda(lambda x: K.sum(x, axis=(-2, -3)))(z)
          z = Lambda(lambda x: K.expand_dims(x, axis=-1))(z)
          p = concatenate([p, z], name=appstr(name, '_xyz'))

      """Visual features (for action only)."""
      action = []
      if do_action:
          # Module-level counter: every action block gets a fresh 'act<N>'
          # prefix, so layer names stay unique across blocks and builds.
          if 'act_cnt' not in globals():
              global act_cnt
              act_cnt = 0
          act_cnt += 1
          act_name = 'act%d' % act_cnt

          # The action path uses the replica heads when they exist.
          act_h = rep_h if replica else org_h
          act_h = Activation(channel_softmax_2d(alpha=sam_alpha),
                  name=appstr(act_name, '_probmaps2'))(act_h)
          act_p = softargmax2d(act_h, limits=(xmin, ymin, 1-xmin, 1-ymin),
                  name=appstr(act_name, '_xy2'))
          act_c = keypoint_confidence(act_h, name=appstr(act_name, '_vis2'))

          if dim == 3:
              act_d = rep_d if replica else org_d
              act_d = Activation('sigmoid')(act_d)
              act_z = multiply([act_d, act_h])
              act_z = Lambda(lambda x: K.sum(x, axis=(-2, -3)))(act_z)
              act_z = Lambda(lambda x: K.expand_dims(x, axis=-1))(act_z)
              act_p = concatenate([act_p, act_z],
                      name=appstr(act_name, '_xyz2'))

          af = kronecker_prod(act_h, zp, name=appstr(act_name, '_kron'))

          action, xa = self.action_prediction_early_fusion(xa, act_p, act_c, af, cfg,
                  name=appstr(act_name, '_action'))

      xp = add_tensorlist(reinject)
      # Pose output: coordinates concatenated with the confidence.
      outlist[0].append(concatenate([p, c], name=name))
      if do_action:
          outlist[1] += action

      if dbg_decoupled_pose:
          outlist[2].append(concatenate([dbg_p, dbg_c]))
          outlist[3].append(dbg_h)

      sys.stdout.flush()

      return xp, xa


  def downscaling_pyramid(self, lp, la, lzp, outlist, cfg, do_action, name=None):
      """Walk the pyramid from level 0 downwards, one prediction block per level.

      The lists `lp` (pose laterals), `la` (action laterals) and `lzp`
      are updated in place; `outlist` is extended by each prediction block.
      At every level the channel count grows by ``cfg.growth``.
      """
      assert len(lp) == len(la), \
              'Pose and action must have the same number of levels!'

      pose, act = lp[0], la[0]
      if lzp[0] is None:
          lzp[0] = pose

      for level in range(1, len(lp)):
          tag = '_du%d' % level
          wider = K.int_shape(pose)[-1] + cfg.growth

          pose = downscaling(pose, cfg, out_size=wider,
                  name=appstr(name, tag))

          # First visit of this level: remember the downscaled features.
          if lzp[level] is None:
              lzp[level] = pose

          # Merge with the lateral pose connection of a previous pyramid.
          lateral_pose = lp[level]
          if lateral_pose is not None:
              pose = add([pose, lateral_pose])

          if do_action and act is not None:
              act = residual(act, (3, 3),
                      name=appstr(name, tag + '_action_r0'))
              lateral_act = la[level]
              if lateral_act is not None:
                  act = add([act, lateral_act])

          pose, act = self.prediction_block(pose, act, lzp[level], outlist,
                  cfg, do_action, name=appstr(name, '_pb%d' % level))

          # Store the lateral pose and action connections for the next pyramid.
          lp[level] = pose
          la[level] = act


  def upscaling_pyramid(self, lp, la, lzp, outlist, cfg, do_action, name=None):
      """Walk the pyramid from the last level upwards, one prediction block per level.

      The lists `lp` (pose laterals), `la` (action laterals) and `lzp`
      are updated in place; `outlist` is extended by each prediction block.
      At every level the channel count shrinks by ``cfg.growth``.
      """
      assert len(lp) == len(la), \
              'Pose and action must have the same number of levels!'

      pose, act = lp[-1], la[-1]
      if lzp[0] is None:
          lzp[0] = pose

      # Levels are visited from the second-to-last down to level 0.
      for level in reversed(range(len(lp) - 1)):
          tag = '_uu%d' % level
          narrower = K.int_shape(pose)[-1] - cfg.growth

          pose = upscaling(pose, cfg, out_size=narrower,
                  name=appstr(name, tag))

          if lzp[level] is None:
              lzp[level] = pose

          # Merge with the lateral pose connection of a previous pyramid.
          lateral_pose = lp[level]
          if lateral_pose is not None:
              pose = add([pose, lateral_pose])

          if do_action and act is not None:
              act = residual(act, (3, 3),
                      name=appstr(name, tag + '_action_r0'))
              lateral_act = la[level]
              if lateral_act is not None:
                  act = add([act, lateral_act])

          pose, act = self.prediction_block(pose, act, lzp[level], outlist,
                  cfg, do_action, name=appstr(name, '_pb%d' % level))

          # Store the lateral pose and action connections for the next pyramid.
          lp[level] = pose
          la[level] = act

  def entry_flow(self, x, cfg):
      """Stem of the network: a frozen, truncated ResNet50 applied per frame.

      A headless ImageNet ResNet50 is cut at layer 'conv4_block6_out',
      wrapped in TimeDistributed and applied to `x`; its layers are marked
      non-trainable and the result is upsampled by 2 in both spatial
      dimensions. `cfg` is accepted but not read here.
      """
      backbone = ResNet50(include_top=False, weights='imagenet')
      cut_layer = backbone.get_layer('conv4_block6_out')
      stem = Model(inputs=backbone.input, outputs=cut_layer.output)

      # One stem pass per frame of the input sequence.
      per_frame_features = TimeDistributed(stem)(x)

      # Freeze every layer of the truncated backbone.
      for stem_layer in stem.layers:
          stem_layer.trainable = False

      # Upsample to increase the spatial dimensions.
      upsampler = tf.keras.layers.UpSampling2D(size=(2, 2))
      return TimeDistributed(upsampler)(per_frame_features)


  def build(self, cfg, stop_grad_stem=False):
      """Sequential Pyramid Networks for 3D human pose estimation and
      action recognition.

      # Arguments
          cfg: a ModelConfig instance; ``cfg.xmin`` and ``cfg.ymin`` are set
              here from the input resolution.
          stop_grad_stem: if True, gradients are stopped after the stem.

      # Returns
          A Keras model named 'SPNet' whose outputs are the concatenation of
          all output groups (poses first, then the remaining groups).
      """
      assert type(cfg) == ModelConfig, \
              'type(cfg) ({}) is not ModelConfig'.format(type(cfg))

      input_shape = cfg.input_shape
      assert len(input_shape) in [3, 4], \
              'Invalid input_shape ({})'.format(input_shape)

      inp = Input(shape=input_shape)

      # Holds [[poses], [dbg1], [action1], [actions2], ...]
      num_groups = len(cfg.num_actions) + 1 + 2*cfg.dbg_decoupled_pose
      outlist = [[] for _ in range(num_groups)]

      # Rows and columns sit just before the channel axis, with or without a
      # leading frame axis.
      num_rows, num_cols = input_shape[-3], input_shape[-2]
      cfg.xmin = 1 / (2 * num_cols)
      cfg.ymin = 1 / (2 * num_rows)

      stem_out = self.entry_flow(inp, cfg)
      print(stem_out)
      if stop_grad_stem:
          stem_out = Lambda(lambda x: K.stop_gradient(x))(stem_out)

      # Per-level lateral connections, shared by all pyramids.
      lp = [None] * cfg.num_levels
      la = [None] * cfg.num_levels
      lzp = [None] * cfg.num_levels
      lp[0] = stem_out

      for pyr in range(cfg.num_pyramids):
          pyramid_id = pyr + 1
          do_action = pyramid_id in cfg.action_pyramids

          if pyr % 2:
              # Odd pyramids (1, 3, ...) go up.
              self.upscaling_pyramid(lp, la, lzp, outlist, cfg, do_action,
                      name='up%d' % pyramid_id)
          else:
              # Even pyramids (0, 2, ...) go down.
              self.downscaling_pyramid(lp, la, lzp, outlist, cfg, do_action,
                      name='dp%d' % pyramid_id)

      outputs = [tensor for group in outlist for tensor in group]

      return Model(inputs=inp, outputs=outputs, name='SPNet')


  def get_num_predictions(self,num_pyramids, num_levels):
      return num_pyramids * (num_levels - 1)


  def split_model(self, full_model, cfg, interlaced=False, model_names=(None, None)):
      """Split the full model into a pose model and an action model.

      # Arguments
          full_model: model whose outputs are pose predictions and action
              predictions.
          cfg: configuration (``num_pyramids``, ``num_levels``,
              ``action_pyramids`` and ``num_actions`` are read).
          interlaced: if True, outputs are expected as one pose output
              followed by ``len(cfg.num_actions)`` action outputs, repeated
              until all action outputs are consumed; otherwise all pose
              outputs come first.
          model_names: pair ``(pose_name, action_name)`` for the two models.

      # Returns
          The list ``[pose_model, action_model]``; both share the input of
          `full_model`.
      """
      num_pose_pred = self.get_num_predictions(cfg.num_pyramids, cfg.num_levels)
      num_act_pred = self.get_num_predictions(len(cfg.action_pyramids), cfg.num_levels)
      num_act_outputs = len(cfg.num_actions)*num_act_pred
      assert len(full_model.outputs) == num_pose_pred + num_act_outputs, \
              'The given model and config are not compatible!'
      assert num_act_pred > 0, 'You are trying to split a "pose only" model.'

      if interlaced:
          out_p = []
          out_a = []

          idx = 0
          for _ in range(num_pose_pred):
              out_p.append(full_model.outputs[idx])
              idx += 1
              # Action outputs follow their pose output until all of them
              # have been collected.
              if len(out_a) < num_act_outputs:
                  for _ in range(len(cfg.num_actions)):
                      out_a.append(full_model.outputs[idx])
                      idx += 1
      else:
          out_p = full_model.outputs[:num_pose_pred]
          out_a = full_model.outputs[num_pose_pred:]

      modelp = Model(full_model.input, out_p, name=model_names[0])
      modela = Model(full_model.input, out_a, name=model_names[1])

      return [modelp, modela]


  def compile_split_models(self, full_model, cfg, optimizer,
            pose_trainable=False,
            copy_replica=False,
            ar_loss_weights=0.01,
            interlaced=False,
            verbose=0):
        """Split `full_model` into pose/action models and compile both.

        # Arguments
            full_model: the full network.
            cfg: model configuration, forwarded to `split_model`.
            optimizer: optimizer instance used for both compiled models.
            pose_trainable: forwarded to `set_trainable_layers`.
            copy_replica: if True, call `copy_replica_layers` first.
            ar_loss_weights: loss weight applied to every action output.
            interlaced: forwarded to `split_model`.
            verbose: if truthy, print every layer of both models with its
                trainable status and parameter count.

        # Returns
            The list ``[pose_model, action_model]`` of compiled models.
        """
        if copy_replica:
            copy_replica_layers(full_model)

        # Split the model into pose estimation and action recognition parts.
        models = self.split_model(full_model, cfg, interlaced=interlaced,
                model_names=['Pose', 'Action'])

        pose_loss = pose_regression_loss('l1l2bincross', 0.01)
        action_loss = 'categorical_crossentropy'

        set_trainable_layers(full_model, 'action', None, pose_trainable)
        loss_weights_pe = len(models[0].outputs) * [1.0]
        loss_weights_ar = len(models[1].outputs) * [ar_loss_weights]

        models[0].compile(loss=pose_loss, optimizer=optimizer,
                loss_weights=loss_weights_pe)
        models[1].compile(loss=action_loss, optimizer=optimizer,
                loss_weights=loss_weights_ar)

        # Plain local helper: it must not take `self`, since it is called
        # below with the layer as its first argument.
        def print_layer(layer, prefix=''):
            c = FAIL if layer.trainable else OKGREEN
            printc(c, prefix + '%s\t| ' % (layer.name))
            try:
                nparam = np.sum([np.prod(K.int_shape(p))
                    for p in layer._trainable_weights])
                printcn(c, prefix + '%s\t| %s\t| %d' % (str(type(layer)),
                    str(layer.output_shape), nparam))
            except Exception:
                # Some layers do not expose the queried attributes; just
                # terminate the line started above.
                print('')

        if verbose:
            for split in models:
                printcn(HEADER, 'Model %s trainable layers:' % split.name)
                for m in split.layers:
                    print_layer(m)
                    if isinstance(m, TimeDistributed):
                        print_layer(m.layer, prefix='td:\t')
                    elif isinstance(m, Model):
                        for n in m.layers:
                            print_layer(n, prefix='>> \t')

        return models

## **CPU/GPU backend**

In [None]:
# Expose every physical GPU TensorFlow can find; with none available the
# visible GPU list is set to empty and execution falls back to the CPU.
_gpu_devices = tf.config.list_physical_devices('GPU')
tf.config.set_visible_devices(_gpu_devices, 'GPU')
print("GPU available. Using GPU." if _gpu_devices
      else "No GPU available. Using CPU.")

# **Model building**

In [None]:

spnet = sppnet()
logdir = '/home/jovyan/workspace/log'
# A log directory may be given as the first command-line argument. Inside a
# Jupyter/Colab kernel sys.argv[1] is a launcher flag (e.g. '-f'), not a
# path, so arguments starting with '-' are ignored; otherwise the notebook
# would create a directory named '-f' and redirect all cell output into it.
if len(sys.argv) > 1 and not sys.argv[1].startswith('-'):
    logdir = sys.argv[1]
    mkdir(logdir)
    sys.stdout = open(str(logdir) + '/log.txt', 'w')

num_frames = 4
cfg = ModelConfig((num_frames,) + pennaction_dataconf.input_shape, pa16j2d,
        num_actions=[15], num_pyramids=2, action_pyramids=[1, 2],
        num_levels=4, pose_replica=False,
        num_pose_features=160, num_visual_features=160)

# Number of pose outputs and of action outputs produced by the network.
num_predictions = spnet.get_num_predictions(cfg.num_pyramids, cfg.num_levels)
num_action_predictions = \
        spnet.get_num_predictions(len(cfg.action_pyramids), cfg.num_levels)

start_lr = 0.001
action_weight = 0.01
batch_size_mpii = int(0.8 * num_frames)
# batch_size_penn = num_frames - batch_size_mpii
batch_size_penn = num_frames
batch_clips = 4 # 8/4

# Load datasets.
mpii = MpiiSinglePerson('/home/jovyan/workspace/deephar/datasets/MPII', dataconf=mpii_dataconf,
        poselayout=pa16j2d)

penn_sf = PennAction('/home/jovyan/workspace/deephar/datasets/PennAction', pennaction_pe_dataconf,
        poselayout=pa16j2d, topology='frames', use_gt_bbox=True)

penn_seq = PennAction('/home/jovyan/workspace/deephar/datasets/PennAction', pennaction_dataconf,
        poselayout=pa16j2d, topology='sequences', use_gt_bbox=True,
        clip_size=num_frames)

# Pose data: frames are first batched into pseudo-clips, which are then
# batched again into `batch_clips` clips per training step.
# pe_data_tr = BatchLoader([mpii, penn_sf], ['frame'], ['pose'], TRAIN_MODE,
pe_data_tr = BatchLoader([mpii], ['frame'], ['pose'], TRAIN_MODE,
        # batch_size=[batch_size_mpii, batch_size_penn], shuffle=True)
        batch_size=[batch_size_penn], shuffle=True)
pe_data_tr = BatchLoader(pe_data_tr, ['frame'], ['pose'], TRAIN_MODE,
        batch_size=batch_clips, num_predictions=num_predictions, shuffle=False)

# Action data: PennAction sequences.
ar_data_tr = BatchLoader(penn_seq, ['frame'], ['pennaction'], TRAIN_MODE,
        batch_size=batch_clips, num_predictions=num_action_predictions,
        shuffle=True)

# Build the full model.
full_model = spnet.build(cfg)


# Trick to pre-load validation samples and generate the eval. callback.
mpii_val = BatchLoader(mpii, ['frame'], ['pose', 'afmat', 'headsize'],
        VALID_MODE, batch_size=mpii.get_length(VALID_MODE), shuffle=False)
printnl('Pre-loading MPII validation data...')
[x_val], [p_val, afmat_val, head_val] = mpii_val[0]

penn_te = BatchLoader(penn_seq, ['frame'], ['pennaction'], TEST_MODE,
        batch_size=1, shuffle=False)

# Save model callback.
save_model = SaveModel(os.path.join(logdir,
    'weights_mpii+penn_ar_{epoch:03d}.hdf5'), model_to_save=full_model)

# **Training**

In [None]:
def prepare_training(pose_trainable, lr):
    """Compile the split models and build the end-of-epoch hook.

    Args:
        pose_trainable: Forwarded to ``spnet.compile_split_models`` as its
            ``pose_trainable`` argument.
        lr: Initial learning rate of the RMSprop optimizer.

    Returns:
        A pair ``(end_of_epoch_callback, models)``: a function that takes the
        epoch number, and the models returned by
        ``spnet.compile_split_models``.
    """
    rmsprop = tf.keras.optimizers.legacy.RMSprop(learning_rate=lr)
    models = spnet.compile_split_models(
        full_model, cfg, rmsprop,
        pose_trainable=pose_trainable,
        ar_loss_weights=action_weight,
        copy_replica=cfg.pose_replica,
    )
    full_model.summary()

    # Validation callbacks: models[0] is evaluated on the pre-loaded MPII
    # arrays, models[1] on the PennAction test loader.
    mpii_callback = MpiiEvalCallback(
        x_val, p_val, afmat_val, head_val,
        eval_model=models[0], pred_per_block=1, batch_size=1, logdir=logdir)
    penn_callback = PennActionEvalCallback(
        penn_te, eval_model=models[1], logdir=logdir)

    # Epochs after which the learning rate is divided by ten.
    lr_drop_epochs = (15, 25)

    def end_of_epoch_callback(epoch):
        """Save weights, run both evaluations, then apply the LR schedule."""
        for hook in (save_model, mpii_callback, penn_callback):
            hook.on_epoch_end(epoch)

        if epoch not in lr_drop_epochs:
            return

        old_lr = float(K.get_value(rmsprop.lr))
        new_lr = 0.1 * old_lr
        K.set_value(rmsprop.lr, new_lr)
        printcn(WARNING,
                'lr_scheduler: lr %g -> %g @ %d' % (old_lr, new_lr, epoch))

    return end_of_epoch_callback, models

# One epoch covers the MPII training split in chunks of `batch_size_mpii`.
# NOTE(review): the MPII loader is built with `batch_size_penn`, not
# `batch_size_mpii` -- confirm which one should drive the step count.
# NOTE(review): batch_size_mpii is int(0.8 * num_frames), which is 0 when
# num_frames == 1 and would make this division fail.
steps_per_epoch = mpii.get_length(TRAIN_MODE) // batch_size_mpii

# First stage setup: compile with pose_trainable=False.
fcallback, models = prepare_training(pose_trainable=False, lr=start_lr)

In [None]:
# Stage 1: train everything but models[0] on the action data for 2 epochs.
trainer = MultiModelTrainer(
    models[1:], [ar_data_tr],
    workers=12,
    print_full_losses=True,
)
trainer.train(
    2,
    steps_per_epoch=steps_per_epoch,
    initial_epoch=0,
    end_of_epoch_callback=fcallback,
)

# Stage 2: recompile with pose_trainable=True and train all models jointly
# on both loaders, resuming the epoch counter at 2 and running up to 30.
fcallback, models = prepare_training(True, start_lr)
trainer = MultiModelTrainer(
    models, [pe_data_tr, ar_data_tr],
    workers=12,
    print_full_losses=True,
)
trainer.train(
    30,
    steps_per_epoch=steps_per_epoch,
    initial_epoch=2,
    end_of_epoch_callback=fcallback,
)

# **Evaluating and testing results**

In [None]:
import sys  # imported here so this cell runs on its own

spnet = sppnet()

# Optional log directory taken from the command line.
# Inside a Jupyter/Colab kernel, sys.argv holds the kernel launcher's own
# arguments (e.g. '-f <connection file>'), so argv[1] is not a log directory
# there. Honouring it would create a directory named '-f' and silently
# redirect all cell output into a file. Only use argv when running as a
# plain script.
logdir = './'
if len(sys.argv) > 1 and 'ipykernel' not in sys.modules:
    logdir = sys.argv[1]
    mkdir(logdir)
    # From here on, everything printed goes to the evaluation log file.
    sys.stdout = open(str(logdir) + '/log_eval.txt', 'w')

# Model configuration for evaluation: input is a clip of `num_frames` frames
# with the PennAction input shape.
num_frames = 1
cfg = ModelConfig((num_frames,) + pennaction_dataconf.input_shape, pa16j2d,
        num_actions=[15], num_pyramids=6, action_pyramids=[5, 6],
        num_levels=4, pose_replica=True,
        num_pose_features=160, num_visual_features=160)

# Prediction counts for the pose branch and for the action branch.
num_predictions = spnet.get_num_predictions(cfg.num_pyramids, cfg.num_levels)
num_action_predictions = \
        spnet.get_num_predictions(len(cfg.action_pyramids), cfg.num_levels)




"""Build the full model"""
full_model = spnet.build(cfg)

weights_file = '/content/drive/MyDrive/project_data/datasets/weights_mpii+penn_ar_010.hdf5'
if not os.path.isfile(weights_file):
    print(f'Error: file {weights_file} not found!')
    print('\nPlease download it from https://drive.google.com/file/d/106yIhqNN-TrI34SX81q2xbU-NczcQj6I/view?usp=sharing')
    sys.stdout.flush()
    # This is an error path: exit with a non-zero status so callers and
    # shells do not mistake the abort for a successful run.
    sys.exit(1)

# Load the trained weights into the full model, matching layers by name.
full_model.load_weights(weights_file, by_name=True)

# Split the full model into its pose model (models[0]) and its action model
# (models[1]).
models = spnet.split_model(full_model, cfg, interlaced=False,
        model_names=['2DPose', '2DAction'])


"""Evaluate on 2D action recognition (PennAction)."""
# Action recognition on PennAction, one clip per test sample.
# NOTE(review): `penn_te` and `penn_seq` are the loader/dataset created in the
# training section -- confirm their clip size matches this cell's num_frames.
s = eval_singleclip_generator(models[1], penn_te)
print('Best score on PennAction (single-clip): ' + str(s))

# Action recognition on PennAction, multi-clip evaluation using the dataset's
# fixed subsampling.
s = eval_multiclip_dataset(
    models[1], penn_seq,
    subsampling=pennaction_dataconf.fixed_subsampling,
)
print('Best score on PennAction (multi-clip): ' + str(s))

# 2D pose estimation on MPII (PCKh), using only the first two pose channels
# of the pre-loaded validation poses.
s = eval_singleperson_pckh(
    models[0], x_val, p_val[:, :, :2], afmat_val, head_val)
print('Best score on MPII: ' + str(s))

