# Baseline

Reference Link: https://www.analyticsvidhya.com/blog/2021/10/human-pose-estimation-using-machine-learning-in-python/

In [None]:
import mediapipe as mp
import cv2
import time
import numpy as np
import pandas as pd
import os

# Handles into MediaPipe's pose solution used by the baseline.
mpPose = mp.solutions.pose
pose = mpPose.Pose()
mpDraw = mp.solutions.drawing_utils # For drawing keypoints
points = mpPose.PoseLandmark # Landmarks
path = "DATASET/TRAIN/plank" # enter dataset path

# Build four columns per landmark: <NAME>_x, <NAME>_y, <NAME>_z, <NAME>_vis.
# The landmark name is taken from the enum member's `.name` instead of slicing
# `str(p)`: the slice only works while str(p) looks like "PoseLandmark.NOSE".
# On Python 3.11+ str() of an IntEnum member is its integer value, so the
# slice would produce empty names and every landmark would get the same
# four column labels. (PoseLandmark is presumably an IntEnum -- verify against
# the installed mediapipe version.)
landmark_columns = [
    f"{p.name}_{suffix}"
    for p in points
    for suffix in ("x", "y", "z", "vis")
]
data = pd.DataFrame(columns=landmark_columns) # Empty dataset

In [None]:
# Show the landmark table built in the previous cell to inspect its columns.
data

Unnamed: 0,NOSE_x,NOSE_y,NOSE_z,NOSE_vis,LEFT_EYE_INNER_x,LEFT_EYE_INNER_y,LEFT_EYE_INNER_z,LEFT_EYE_INNER_vis,LEFT_EYE_x,LEFT_EYE_y,...,RIGHT_HEEL_z,RIGHT_HEEL_vis,LEFT_FOOT_INDEX_x,LEFT_FOOT_INDEX_y,LEFT_FOOT_INDEX_z,LEFT_FOOT_INDEX_vis,RIGHT_FOOT_INDEX_x,RIGHT_FOOT_INDEX_y,RIGHT_FOOT_INDEX_z,RIGHT_FOOT_INDEX_vis


# Implementing YOLOX

In [2]:
import cv2
from pathlib import Path

import numpy as np
import torch
from torch import nn

from yolox.data_augment import preproc
from yolox.yolox import YOLOX, get_model, IdentityModule

In [3]:
# YOLOX Configuration
class dotdict(dict):
    """
    Dotdict is just a dictionary whose elements can be referenced with a dot operation.
    I.e. dotdict['x'] == dotdict.x

    This is useful because the original YOLOX used a custom class to hold a lot of extra configuration that
    we do not need.

    Attribute reads, writes and deletes are all routed to the underlying
    dictionary, so `d.x = 1` and `d['x'] = 1` are interchangeable. A missing
    name raises AttributeError (not KeyError) so that `hasattr`, `getattr`
    with a default, `copy` and `pickle` behave as they do for ordinary objects.
    """
    def __getattr__(self, x):
        # Only invoked when normal attribute lookup fails, so real dict
        # methods (keys, items, ...) are never shadowed by stored entries.
        try:
            return self[x]
        except KeyError:
            raise AttributeError(x) from None

    def __setattr__(self, x, value):
        # Store in the mapping itself rather than the instance __dict__,
        # otherwise `d.x = 1` would not be visible as `d['x']`.
        self[x] = value

    def __delattr__(self, x):
        try:
            del self[x]
        except KeyError:
            raise AttributeError(x) from None


opt = dotdict()

# --- Input geometry -------------------------------------------------------
# Every image is rescaled to this size before going through YOLOX; use
# `yolox.data_augment.preproc` to rescale an image of any size.
# Changing it is not recommended: 640x640 loads quickly, even on CPU.
opt.input_size = (640, 640)
opt.test_size = (640, 640)
# Multi-size training range; set to None to disable it.
# NOTE(review): the upstream comment described 448 (14*32) to 832 (26*32),
# which corresponds to (14, 26). With (10, 20) the range is presumably
# 320 (10*32) to 640 (20*32) -- confirm against the training code.
opt.random_size = (10, 20)

# --- Normalisation --------------------------------------------------------
# Per-channel mean / std handed to `preproc` (the values coincide with the
# widely used ImageNet statistics).
opt.rgb_means = [0.485, 0.456, 0.406]
opt.std = [0.229, 0.224, 0.225]

# --- Architecture ---------------------------------------------------------
opt.backbone = "CSPDarknet-nano"
opt.depth_wise = True

# --- Runtime --------------------------------------------------------------
# Use the first CUDA device when one is available, otherwise fall back to CPU.
opt.device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
opt.use_amp = False  # set True for automatic mixed precision

In [11]:
# Load YOLOX (including weights pretrained on COCO)

# The head (i.e. the connection between the YOLOX backbone and neck to the rest of the model) is by default just an IdentityModule.
# This head should be exchanged with some torch module that performs the rest of the function (in this case classification).
# The head module should be a torch module expecting an input that is a list of 3 tensors of sizes:
#        [torch.Size([BATCH_SIZE, 64, 80, 80]), torch.Size([BATCH_SIZE, 128, 40, 40]), torch.Size([BATCH_SIZE, 256, 20, 20])]
# Note: These sizes may change if `opt.input_size` or `opt.test_size` are changed.
# Each of these inputs is a different output of the YOLOX neck and represents the features learned at various scales.

# The YOLOX model expects a single tensor input of size: [BATCH_SIZE, 3, opt.test_size[0], opt.test_size[1]]
# BATCH_SIZE is the batch size.
# 3 is the number of color channels (YOLOX is pretrained on 3 channels; even if the image is grayscale, convert it to RGB).
# opt.test_size[0] and opt.test_size[1] are the two spatial dimensions of the input.
# NOTE(review): the original note called [0] "horizontal" and [1] "vertical"; in PyTorch's usual
# NCHW layout dim 2 is height and dim 3 is width. Immaterial while test_size is square --
# confirm before using a non-square size.

model = get_model(opt,
                  head=IdentityModule(),
                  freeze_layers=True)

# Check that the backbone is frozen: NO backbone parameter may still require
# gradients. (`not all(...)` would already pass with a single frozen
# parameter and therefore would not catch a partially trainable backbone.)
# NOTE(review): assumes freeze_layers=True freezes the whole backbone -- confirm in get_model.
assert not any(p.requires_grad for p in model.backbone.parameters()), \
    "Expected every backbone parameter to be frozen (requires_grad=False)"

==>> loaded pretrained_models/yolox-nano.pth, epoch 294
--> Drop parameter head.stems.0.conv.weight.
--> Drop parameter head.stems.0.bn.weight.
--> Drop parameter head.stems.0.bn.bias.
--> Drop parameter head.stems.0.bn.running_mean.
--> Drop parameter head.stems.0.bn.running_var.
--> Drop parameter head.stems.0.bn.num_batches_tracked.
--> Drop parameter head.stems.1.conv.weight.
--> Drop parameter head.stems.1.bn.weight.
--> Drop parameter head.stems.1.bn.bias.
--> Drop parameter head.stems.1.bn.running_mean.
--> Drop parameter head.stems.1.bn.running_var.
--> Drop parameter head.stems.1.bn.num_batches_tracked.
--> Drop parameter head.stems.2.conv.weight.
--> Drop parameter head.stems.2.bn.weight.
--> Drop parameter head.stems.2.bn.bias.
--> Drop parameter head.stems.2.bn.running_mean.
--> Drop parameter head.stems.2.bn.running_var.
--> Drop parameter head.stems.2.bn.num_batches_tracked.
--> Drop parameter head.cls_convs.0.0.dconv.conv.weight.
--> Drop parameter head.cls_convs.0.0.dco

In [12]:
# Load Images
img_dir = 'imgs/'

# Sort the paths: glob order is filesystem dependent, and a stable order keeps
# the batch index of each image reproducible between runs.
img_paths = sorted(Path(img_dir).glob('*.jpg'))
if not img_paths:
    raise FileNotFoundError(f"No .jpg images found in '{img_dir}'")

images = []
for img_path in img_paths:
    image = cv2.imread(str(img_path))
    # cv2.imread signals failure by returning None instead of raising, which
    # would otherwise only surface later as an obscure error inside preproc.
    if image is None:
        raise IOError(f"Could not read image '{img_path}'")
    images.append(image)
print(f'There are {len(images)} images')

# Pre-allocate the batch as [N, 3, H, W] float32 and fill it one image at a time.
inp_imgs = np.zeros([len(images), 3, opt.test_size[0], opt.test_size[1]], dtype=np.float32)
for b_i, image in enumerate(images):
    # preproc returns the processed image plus a second value `r`, which is
    # not used in this cell.
    img, r = preproc(image, opt.test_size, opt.rgb_means, opt.std)
    inp_imgs[b_i] = img

# Move the batch to the configured device as a torch tensor.
inp_imgs = torch.from_numpy(inp_imgs).to(opt.device)
print(f'Input image batch of shape: {inp_imgs.shape}')

There are 2 images
Input image batch of shape: torch.Size([2, 3, 640, 640])


In [13]:
# Smoke test: push the image batch through the network once, without tracking
# gradients, and report how many outputs come back and the shape of each.
# NOTE(review): the model is not explicitly switched to eval mode here; if
# get_model does not already do so, consider calling model.eval() before
# inference -- confirm.
with torch.no_grad():
    yolo_outputs = model(inp_imgs)

print(len(yolo_outputs))
print([feature_map.shape for feature_map in yolo_outputs])



3
[torch.Size([2, 64, 80, 80]), torch.Size([2, 128, 40, 40]), torch.Size([2, 256, 20, 20])]
