# Prerequisites

In [0]:
# Installing Facebook's Detectron2, which is used as the person detector.
# torch / torchvision are pinned to their cu100 builds, matching the cu100 Detectron2 wheel index used below.
!pip install -U torch==1.4+cu100 torchvision==0.5+cu100 -f https://download.pytorch.org/whl/torch_stable.html 
!pip install cython pyyaml==5.1
!pip install -U 'git+https://github.com/cocodataset/cocoapi.git#subdirectory=PythonAPI'
!pip install detectron2 -f https://dl.fbaipublicfiles.com/detectron2/wheels/cu100/index.html

# Cloning the Detectron2 sources as well: the detector config is read from ./detectron2/configs later on.
!git clone https://github.com/facebookresearch/detectron2.git

# This is my fork of HRNet; it is the same as the official repo, minus some dependencies.
# The official repo will work just as well, you'll just have to make sure to install its requirements.
!git clone https://github.com/ramarlina/Higher-HRNet-Human-Pose-Estimation.git

# Downloading the pretrained weights from the official Google Drive repository
# (presumably the pose_higher_hrnet_w32_512.pth file that is loaded later -- confirm).
!gdown https://drive.google.com/uc?id=1V9Iz0ZYy9m8VeaspfKECDW0NKlGsYmO1


In [0]:
# Adding the cloned repo to Python's import path since we're not going to install it.
import sys

HRNET_LIB_DIR = "Higher-HRNet-Human-Pose-Estimation/lib"

# Re-running this cell must not keep appending the same entry to sys.path.
if HRNET_LIB_DIR not in sys.path:
    sys.path.append(HRNET_LIB_DIR)

# Creating an HRNet Pose Estimation model

Some custom code for parsing the YAML config file:

In [0]:
import json 
import yaml

# Loading the yaml file
config_file = "Higher-HRNet-Human-Pose-Estimation/experiments/coco/higher_hrnet/w32_512_adam_lr1e-3.yaml"

# The `with` block closes the file handle once parsing is done.
# safe_load is used because calling yaml.load without an explicit Loader is
# deprecated, and the experiment config only needs plain YAML types.
with open(config_file) as config_fp:
    config_json = yaml.safe_load(config_fp)
 
def walk(node):
    """Build a new dict from the mapping ``node``.

    Values that are themselves dicts are wrapped in ``ConfigParser`` (which
    in turn walks them); every other value is kept unchanged.
    """
    return {
        name: ConfigParser(value) if isinstance(value, dict) else value
        for name, value in node.items()
    }

# Config wrapper: exposes the entries of a (nested) dict as attributes
class ConfigParser():
    """Attribute- and item-style access to a parsed config dict.

    The top-level keys of ``cfg_json`` become instance attributes; nested
    dicts are converted by ``walk``. Entries can be read with ``cfg.KEY`` or
    ``cfg["KEY"]`` and written with ``cfg["KEY"] = value``.
    """

    def __init__(self, cfg_json):
        entries = walk(cfg_json)
        self.__dict__ = entries

    def __getitem__(self, idx):
        # item access reads straight from the attribute dict
        return vars(self)[idx]

    def __setitem__(self, key, value):
        vars(self)[key] = value

    def __repr__(self):
        # only the key names are shown, as a JSON list
        return json.dumps(list(vars(self)))

# Wrapping the parsed YAML so that nested sections can be read as attributes.
config = ConfigParser(config_json)
 
# Quick check that nested attribute access works on the loaded config.
print("Num Joints: ", config.MODEL.NUM_JOINTS)

Instantiating the model

In [0]:
from models.pose_higher_hrnet import PoseHigherResolutionNet 
import torch

# Use the GPU when one is available and fall back to the CPU otherwise.
# Set this to "cuda" or "cpu" explicitly to force a device.
device = "cuda" if torch.cuda.is_available() else "cpu"

# creating the model
model = PoseHigherResolutionNet(config).to(device)

Loading pre-trained weights from the official Google Drive repo

In [0]:
# loading weights
# map_location puts the tensors on the device chosen above, so the checkpoint
# loads regardless of the device it was saved from.
# NOTE: torch.load unpickles the file -- only load checkpoints from a source you trust.
state_dict = torch.load("./pose_higher_hrnet_w32_512.pth", map_location=device)
model.load_state_dict(state_dict)

# Inference

Helper functions for loading and preprocessing an image, predicting the pose with the model, and visualizing the result

In [0]:
from utils.transforms import resize_align_multi_scale 
from utils.transforms import get_multi_scale_size
import cv2
import torchvision
import numpy as np
from matplotlib import pyplot as plt 
from scipy.ndimage import gaussian_filter

# Preprocessing pipeline applied to every image before it is fed to the model:
# conversion to a tensor followed by per-channel normalization.
_channel_mean = [0.485, 0.456, 0.406]
_channel_std = [0.229, 0.224, 0.225]

transforms = torchvision.transforms.Compose([
    torchvision.transforms.ToTensor(),
    torchvision.transforms.Normalize(mean=_channel_mean, std=_channel_std),
])
 
def preproc_image(image, resolution=(512,512)):
    """Load (if needed) and preprocess an image for the pose model.

    Args:
        image: either a path to an image file (read with ``cv2.imread``) or an
            already loaded image array in BGR channel order.
        resolution: target size handed to ``cv2.resize`` (width, height).

    Returns:
        A tuple ``(image, image_resized)``: the RGB image at its original
        size, and the resized image after the module-level ``transforms``
        pipeline, with a leading batch dimension of size 1 added.

    Raises:
        OSError: if ``image`` is a path that OpenCV could not read.
    """
    if isinstance(image, str):
        path = image
        image = cv2.imread(path)
        # cv2.imread reports failure by returning None instead of raising,
        # which would otherwise surface as an obscure error in cvtColor.
        if image is None:
            raise OSError("Could not read image: {!r}".format(path))

    # OpenCV images are BGR; the normalization pipeline is applied to RGB.
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

    image_resized = cv2.resize(image, resolution)
    image_resized = transforms(image_resized)

    # add the batch dimension
    image_resized = image_resized.unsqueeze(0)
    return image, image_resized

def predict(model, X, original_size):
    """Run the pose model and locate the strongest response of every joint.

    Args:
        model: network that returns a sequence of heatmap tensors when called
            on ``X``. The channel count of the last output is taken as the
            number of joints; extra channels in the other outputs are ignored.
        X: input batch; only the first sample of the batch is decoded.
        original_size: ``(height, width)`` the heatmaps are resized to, so the
            returned coordinates are expressed at that size.

    Returns:
        ``numpy`` array of shape ``(n_joints, 3)`` holding ``(x, y, score)``
        per joint, where ``score`` is the averaged heatmap value at the peak.
    """
    model.eval()

    # Pure inference: do not record an autograd graph.
    with torch.no_grad():
        outputs = model(X)

        n_joints = outputs[-1].shape[1]

        hm = 0
        for output in outputs:
            # bring every output to the same spatial size before summing
            output = torch.nn.functional.interpolate(
                output,
                size=(original_size[0], original_size[1]),
                mode='bilinear',
                align_corners=False
            )
            hm += output[:, :n_joints].detach().cpu().numpy()

    # average over however many outputs the model produced
    hm /= len(outputs)

    pts = np.zeros((n_joints, 3))

    for i, joint in enumerate(hm[0]):
        # argmax gives (row, col); points are stored as (x, y) = (col, row)
        pt = np.unravel_index(np.argmax(joint), joint.shape)
        pts[i, :2] = pt[::-1]
        pts[i, 2] = joint[pt]

    return pts

def visualize_pose(image, pts):
    """
    Draw the predicted pose onto ``image`` and return the image.

    ``pts`` holds one ``(x, y, score)`` row per joint. A limb is drawn only
    when both of its joints score above the threshold, a joint marker only
    when that joint does. Limbs are drawn first, markers on top.
    """
    min_score = 0.1

    # pairs of joint indices that are connected by a limb
    limbs = [
        [15, 13], [13, 11], [16, 14], [14, 12], [11, 12], [5, 11], [6, 12], [5, 6], [5, 7],
        [6, 8], [7, 9], [8, 10], [1, 2], [0, 1], [0, 2], [1, 3], [2, 4],  # [3, 5], [4, 6]
        [0, 5], [0, 6]
    ]

    for limb in limbs:
        start, end = pts[limb]
        if not (start[2] > min_score and end[2] > min_score):
            continue
        start_xy = (int(start[0]), int(start[1]))
        end_xy = (int(end[0]), int(end[1]))
        image = cv2.line(image, start_xy, end_xy, (0,255,0), 5)

    for joint in pts:
        if not joint[2] > min_score:
            continue
        center = (int(joint[0]), int(joint[1]))
        image = cv2.circle(image, center, 10, (255,0,0), -1)

    return image
 

# Predicting body pose in video

In [0]:
# import some common detectron2 utilities
from detectron2.engine import DefaultPredictor
from detectron2.config import get_cfg 

# Detector setup: config file from the cloned detectron2 repo plus the
# matching pretrained COCO-Detection weights.
detector_config_path = "./detectron2/configs/COCO-Detection/faster_rcnn_R_101_FPN_3x.yaml"
detector_weights = "detectron2://COCO-Detection/faster_rcnn_R_101_FPN_3x/137851257/model_final_f6e8b1.pkl"

cfg = get_cfg()
cfg.merge_from_file(detector_config_path)
cfg.MODEL.WEIGHTS = detector_weights
# score threshold used by the ROI heads at test time
cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST = 0.5

# Build the predictor from the finished config
detector = DefaultPredictor(cfg)

Video by Wolfgang Sauerwald from Pexels:

https://www.pexels.com/video/dancing-on-the-street-3608987/

In [0]:
# Download the demo clip; the processing loop opens it as 'dancing-on-the-street-3608987.mp4'.
!wget https://static.haizaha.com/dancing-on-the-street-3608987.mp4

In [0]:
class TempPoseAggregator():
    """Sliding-window smoother for per-frame pose estimates.

    Keeps the most recent frames together with their keypoints. Once ``span``
    entries are buffered, ``add`` returns the frame at the middle of the
    window along with the element-wise mean of all buffered keypoints, then
    discards the oldest entry.
    """

    def __init__(self, span):
        self.span = span
        self.frames = []
        self.pts = []

    def add(self, frame, pts):
        """Buffer one frame; return ``(frame, mean_pts)`` or ``(None, None)``.

        ``(None, None)`` is returned as long as the buffer does not hold
        exactly ``span`` entries.
        """
        self.frames.append(frame)
        self.pts.append(pts)

        if len(self.frames) != self.span:
            return None, None

        middle_frame = self.frames[self.span // 2]
        mean_pts = np.mean(np.array(self.pts), axis=0)

        # slide the window forward by dropping the oldest entry
        self.frames = self.frames[1:]
        self.pts = self.pts[1:]
        return middle_frame, mean_pts

In [0]:
from tqdm import tqdm

# Pose estimation over the whole clip: detect the person in each frame, crop
# around them, estimate the pose on the crop, smooth the keypoints over a
# short window of frames and write the annotated frames to output.avi.
cap = cv2.VideoCapture('dancing-on-the-street-3608987.mp4')
if not cap.isOpened():
    raise OSError("Could not open video: dancing-on-the-street-3608987.mp4")

fps = cap.get(cv2.CAP_PROP_FPS)

# The writer must be created with the size of the frames it will receive,
# so take it from the clip instead of assuming a fixed resolution.
frame_size = (
    int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)),
    int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)),
)

fourcc = cv2.VideoWriter_fourcc(*'XVID')
out = cv2.VideoWriter('output.avi', fourcc, fps, frame_size)

model.to(device)

PERSON_CLASS_ID = 0  # "person" in the class ordering of Detectron2's COCO models
BOX_MARGIN = 100     # extra pixels kept around the detected person

pbar = tqdm(total = int(cap.get(cv2.CAP_PROP_FRAME_COUNT)))

aggregator = TempPoseAggregator(span=5)

try:
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break

        pbar.update(1)

        viz = np.array(frame)

        # detecting people within frame: keep only person detections
        instances = detector(frame)["instances"]
        instances = instances[instances.pred_classes == PERSON_CLASS_ID]
        if len(instances) == 0:
            # nobody found in this frame: there is no pose to estimate, skip it
            continue

        # use the most confident person detection
        best = int(instances.scores.argmax())
        box = instances.pred_boxes.tensor[best]

        # centering person box: reorder (x1, y1, x2, y2) -> (y1, x1, y2, x2),
        # then build a square around the box centre, clipped to the frame
        box = box.detach().cpu().numpy().astype("i")[[1,0,3,2]]
        h, w, c = box[2]-box[0], box[3]-box[1], (box[[0,2]].sum() // 2, box[[1,3]].sum() // 2)
        r = max(w, h) // 2 + BOX_MARGIN
        box = np.array([c[0] - r, c[1] - r, c[0] + r, c[1] + r])
        box[[0,2]] = np.clip(box[[0,2]], 0, frame.shape[0])
        box[[1,3]] = np.clip(box[[1,3]], 0, frame.shape[1])

        # cropping, transforming, and tensorifying image
        image, X = preproc_image(frame[box[0]:box[2], box[1]:box[3]])

        # detecting pose
        pts = predict(model, X.to(device), image.shape[:-1])

        # offsetting point coordinates with respect to box
        pts[:,:2] += box[[1,0]]

        smoothed_frame, smoothed_pts = aggregator.add(viz, pts)

        if smoothed_pts is None:
            # the smoothing window is still filling up: nothing to write yet
            continue

        # drawing points and skeleton
        viz = visualize_pose(smoothed_frame, smoothed_pts)
        out.write(viz)
finally:
    # release the progress bar, the clip and the writer even if a frame fails
    pbar.close()
    cap.release()
    out.release()
    cv2.destroyAllWindows()

In [0]:
# converting the video to mp4
!ffmpeg -i output.avi output_ma.mp4

In [0]:
# Scratch check: a quarter of a 1920 px width, shown next to the shape of the
# `viz` frame left over from the processing loop.
1920//4, viz.shape