# Baseline

Reference Link: https://www.analyticsvidhya.com/blog/2021/10/human-pose-estimation-using-machine-learning-in-python/

In [1]:
# Preparation
import mediapipe as mp
import cv2
import time
import numpy as np
import pandas as pd
import os
import warnings
warnings.filterwarnings('ignore')

mpPose = mp.solutions.pose
pose = mpPose.Pose()
mpDraw = mp.solutions.drawing_utils # For drawing keypoints
points = mpPose.PoseLandmark # Landmarks
path = "dataset/train/"
data = []
for p in points:
        x = str(p)[13:]
        data.append(x + "_x")
        data.append(x + "_y")
        data.append(x + "_z")
        data.append(x + "_vis")
data = pd.DataFrame(columns = data) # Empty dataset

In [2]:
# Creating Dataset
target = []
count = 0

for subdir, dirs, files in os.walk(path):
    for img in files:
        temp = []
        img = os.path.join(subdir, img)
        img = cv2.imread(img)

        imgRGB = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        blackie = np.zeros(img.shape) # Blank image
        results = pose.process(imgRGB)

        if results.pose_landmarks:
                mpDraw.draw_landmarks(blackie, results.pose_landmarks, mpPose.POSE_CONNECTIONS) # draw landmarks on blackie
                landmarks = results.pose_landmarks.landmark

                for i,j in zip(points,landmarks):
                        temp = temp + [j.x, j.y, j.z, j.visibility]
                data.loc[count] = temp
                target.append(subdir.replace(path, ''))
                count +=1

data['target'] = target

INFO: Created TensorFlow Lite XNNPACK delegate for CPU.
Premature end of JPEG file
Corrupt JPEG data: premature end of data segment


In [3]:
# Label Encoding for target
from sklearn.preprocessing import LabelEncoder

labelencoder = LabelEncoder()
data['target'] = labelencoder.fit_transform(data['target'])

In [4]:
# Displaying Dataset
data

Unnamed: 0,NOSE_x,NOSE_y,NOSE_z,NOSE_vis,LEFT_EYE_INNER_x,LEFT_EYE_INNER_y,LEFT_EYE_INNER_z,LEFT_EYE_INNER_vis,LEFT_EYE_x,LEFT_EYE_y,...,RIGHT_HEEL_vis,LEFT_FOOT_INDEX_x,LEFT_FOOT_INDEX_y,LEFT_FOOT_INDEX_z,LEFT_FOOT_INDEX_vis,RIGHT_FOOT_INDEX_x,RIGHT_FOOT_INDEX_y,RIGHT_FOOT_INDEX_z,RIGHT_FOOT_INDEX_vis,target
0,0.385088,0.702528,-0.004816,0.999651,0.364045,0.705285,-0.031445,0.999706,0.361666,0.700772,...,0.525770,0.781881,0.930616,-0.215838,0.980343,0.763475,0.904605,0.165073,0.610637,0
1,0.470336,0.691998,-0.604218,0.982091,0.445842,0.705398,-0.624718,0.987435,0.437711,0.699783,...,0.526152,0.778034,0.569747,0.541724,0.935673,0.764064,0.639336,0.467184,0.604327,0
2,0.453251,0.615995,-0.057232,0.983838,0.440873,0.630020,-0.067001,0.988656,0.440118,0.630212,...,0.552748,0.769751,0.797690,0.004431,0.937697,0.743715,0.761387,0.285678,0.625030,0
3,0.401504,0.383240,-0.309374,0.984927,0.436412,0.378602,-0.318324,0.989338,0.404689,0.379723,...,0.591222,0.622063,0.713210,-0.035523,0.916048,0.569005,0.763766,-0.045453,0.652138,0
4,0.450490,0.683425,-0.067524,0.986200,0.446467,0.690085,-0.090036,0.990196,0.426712,0.690337,...,0.598638,0.656271,0.870245,0.077734,0.911403,0.668724,0.867453,0.307724,0.658130,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
954,0.242243,0.467066,-0.179606,0.983067,0.225128,0.474538,-0.151332,0.981929,0.224699,0.476763,...,0.849715,0.841590,0.793305,0.051033,0.844406,0.837034,0.790180,-0.062608,0.853069,2
955,0.512464,0.721425,-0.543747,0.984726,0.510773,0.734221,-0.539054,0.983721,0.513721,0.735019,...,0.864014,0.547789,0.900536,0.296631,0.854198,0.317296,0.904507,-0.350878,0.866556,2
956,0.041084,0.480462,-0.290565,0.985741,-0.001317,0.440492,-0.271295,0.982165,0.000684,0.434142,...,0.848213,0.983486,0.812803,0.070410,0.865286,0.972812,0.841870,-0.066042,0.875791,2
957,0.276817,0.443246,-0.009695,0.987157,0.435037,0.453263,-0.042767,0.983942,0.434067,0.460111,...,0.859729,0.663250,0.575939,-0.073409,0.870568,0.707136,0.135086,-0.096244,0.876942,2


In [5]:
# Traning the baseline model with SVM
from sklearn.svm import SVC
X,Y = data.iloc[:,:132],data['target']
model = SVC(kernel = 'poly')
model.fit(X,Y)

SVC(kernel='poly')

In [6]:
# Predicting test 
test_path = "dataset/test/"
y_pred = []
y_test = []
for subdir, dirs, files in os.walk(path):
    for img in files:
        temp = []
        img = os.path.join(subdir, img)
        img = cv2.imread(img)

        imgRGB = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        blackie = np.zeros(img.shape) # Blank image
        results = pose.process(imgRGB)

        if results.pose_landmarks:
                mpDraw.draw_landmarks(blackie, results.pose_landmarks, mpPose.POSE_CONNECTIONS) # draw landmarks on blackie
                landmarks = results.pose_landmarks.landmark

                for i,j in zip(points,landmarks):
                        temp = temp + [j.x, j.y, j.z, j.visibility]
                y_pred.append(model.predict([temp]))
                y_test.append(labelencoder.transform([subdir.replace(path, '')])[0])

Premature end of JPEG file
Corrupt JPEG data: premature end of data segment


In [7]:
# Evaluating the baseline model (SVM)
from sklearn.metrics import classification_report

target_names = labelencoder.classes_
print(classification_report(y_test, y_pred, target_names=target_names))

              precision    recall  f1-score   support

     downdog       0.99      0.97      0.98       196
     goddess       0.93      0.84      0.88       164
       plank       0.94      0.98      0.96       225
        tree       0.97      0.85      0.91       136
    warrior2       0.86      0.96      0.91       238

    accuracy                           0.93       959
   macro avg       0.94      0.92      0.93       959
weighted avg       0.93      0.93      0.93       959



# Implementing YOLOX

In [21]:
import cv2
from pathlib import Path

import numpy as np
import torch
from torch import nn
from torch.nn import functional as F

from yolox.data_augment import preproc
from yolox.yolox import YOLOX, get_model, IdentityModule

In [2]:
# YOLOX Configuration
class dotdict(dict):
    """
    Dotdict is just a dictionary whose elements can be referenced with a dot operation.
    I.e. dotdict['x'] == dotdict.x

    This is useful because the original YOLOX used a custom class to hold a lot of extra configuration that
    we do not need.
    """
    def __getattr__(self, x):
        return self['x']


opt = dotdict()
# All images should be scaled to this input size before passing through YOLOX.
# Any image (of any size) can be scaled using the function `yolox.data_augment.preproc`
# I don't recommend changing this. This is just fine and loads pretty quickly, even on CPU.
opt.input_size = (640, 640)
opt.random_size = (10, 20)  # None; multi-size train: from 448(14*32) to 832(26*32), set None to disable it
opt.test_size = (640, 640)
opt.rgb_means = [0.485, 0.456, 0.406]
opt.std = [0.229, 0.224, 0.225]
opt.device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
opt.backbone = "CSPDarknet-nano"
opt.depth_wise = True
opt.use_amp = False  # True, Automatic mixed precision

In [25]:
from typing import List


# Load YOLOX (Including weights pretrained on COCO)

# The head (i.e. the connection between the YOLOX backbone and neck to the rest of the model) is by default just an IdentityModule.
# This head should be exchanged with some torch module that performs the rest of the function (in this case classification)
# The head module should be a torch module expecting an input that is a list of 3 tensors of sizes:
#        [torch.Size([BATCH_SIZE, 64, 80, 80]), torch.Size([BATCH_SIZE, 128, 40, 40]), torch.Size([BATCH_SIZE, 256, 20, 20])]
# Note: These sizes may change if the `opt.input_size` or `opt.test_size` are changed.
# Each of these inputs is a different output of the YOLOX neck and represents the features learned at various scales.

# The YOLOX model expects a single tensor input of size: [BATCH_SIZE, 3, opt.test_size[0], opt.test_size[1]]
# BATCHSIZE is the Batch size
# 3 is the number of color channels (the YOLOX is pretrained on 3 channels. Even if the image is grayscale, convert it to RGB
# opt.test_size[0] is the number of horizontal pixels in the input
# opt.test_size[1] is the number of vertical pixels in the input

class ClassificationHead(nn.Module):
    def __init__(self, input_sizes:List[int], input_channels:List[int], num_classes:int, hidden_features:int = 128):
        super(ClassificationHead, self).__init__()
        self.fc0a = nn.Linear(input_channels[0]*input_sizes[0]**2,hidden_features)
        self.fc0b = nn.Linear(input_channels[1]*input_sizes[1]**2,hidden_features)
        self.fc0c = nn.Linear(input_channels[2]*input_sizes[2]**2,hidden_features)
        # Concatenate the three outputs into one linear layer
        self.fc1 = nn.Linear(len(input_sizes) * hidden_features, hidden_features)
        self.fc2 = nn.Linear(hidden_features, hidden_features)
        self.fc3 = nn.Linear(hidden_features, num_classes)

    def forward(self,x):
        a = F.relu(self.fc0a(torch.flatten(x[0],1)))
        b = F.relu(self.fc0b(torch.flatten(x[1],1)))
        c = F.relu(self.fc0c(torch.flatten(x[2],1)))
        x = torch.cat([a,b,c], dim=1)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        # x = F.softmax(self.fc3(x), dim=1)
        x = self.fc3(x)
        return x

    def init_weights(self):
        for m in self.modules():
            if isinstance(m, nn.BatchNorm2d):
                m.eps = 1e-3
                m.momentum = 0.03


model = get_model(opt,
                  # head=IdentityModule(),
                  head=ClassificationHead([80,40,20], [64,128,256], 5),
                  freeze_layers=True)

# Check if frozen
assert not any(p.requires_grad for p in model.backbone.parameters())

==>> loaded pretrained_models/yolox-nano.pth, epoch 294
--> Drop parameter head.stems.0.conv.weight.
--> Drop parameter head.stems.0.bn.weight.
--> Drop parameter head.stems.0.bn.bias.
--> Drop parameter head.stems.0.bn.running_mean.
--> Drop parameter head.stems.0.bn.running_var.
--> Drop parameter head.stems.0.bn.num_batches_tracked.
--> Drop parameter head.stems.1.conv.weight.
--> Drop parameter head.stems.1.bn.weight.
--> Drop parameter head.stems.1.bn.bias.
--> Drop parameter head.stems.1.bn.running_mean.
--> Drop parameter head.stems.1.bn.running_var.
--> Drop parameter head.stems.1.bn.num_batches_tracked.
--> Drop parameter head.stems.2.conv.weight.
--> Drop parameter head.stems.2.bn.weight.
--> Drop parameter head.stems.2.bn.bias.
--> Drop parameter head.stems.2.bn.running_mean.
--> Drop parameter head.stems.2.bn.running_var.
--> Drop parameter head.stems.2.bn.num_batches_tracked.
--> Drop parameter head.cls_convs.0.0.dconv.conv.weight.
--> Drop parameter head.cls_convs.0.0.dco

In [5]:
# Load Images
img_dir = 'imgs/'
images = [cv2.imread(str(im)) for im in Path(img_dir).glob('*.jpg')]
print(f'There are {len(images)} images')
inp_imgs = np.zeros([len(images), 3, opt.test_size[0], opt.test_size[1]], dtype=np.float32)
for b_i, image in enumerate(images):
    img, r = preproc(image, opt.test_size, opt.rgb_means, opt.std)
    inp_imgs[b_i] = img

inp_imgs = torch.from_numpy(inp_imgs).to(opt.device)
print(f'Input image batch of shape: {inp_imgs.shape}')

There are 2 images
Input image batch of shape: torch.Size([2, 3, 640, 640])


In [24]:
# Run inference as a test to make sure network runs.
with torch.no_grad():
    yolo_outputs = model(inp_imgs)
    # print(yolo_outputs)
    print(yolo_outputs.shape)

torch.Size([2, 10])


## TODO: Implement Network Head for classification (probably some conv layers and a few fully connected layers)

In [None]:
from keras.models import Sequential
from keras.layers import Dense

## Creating model

model = Sequential()
model.add(Dense(12, input_dim=8, activation="relu"))
model.add(Dense(12, activation="relu"))
model.add(Dense(1, activation="sigmoid"))


model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
print(model.summary())

## TODO: Implement a custom training loop
As long as `get_model` is called with `freeze_layers=True`, the early layers (the YOLOX pretrained ones) will be frozen, so training should be fast--only the head needs to be trained.

In [None]:
#model.fit(x,y, epochs=150, batch_size=10)
    