# Baseline

Reference Link: https://www.analyticsvidhya.com/blog/2021/10/human-pose-estimation-using-machine-learning-in-python/

In [82]:
# Preparation
import mediapipe as mp
import cv2
import time
import numpy as np
import pandas as pd
import os
import warnings
warnings.filterwarnings('ignore')

mpPose = mp.solutions.pose
# static_image_mode=True runs person detection on every image. The default
# (False) treats successive inputs as frames of one video and tracks landmarks
# across them, which is wrong for a folder of unrelated still images.
pose = mpPose.Pose(static_image_mode=True)
mpDraw = mp.solutions.drawing_utils  # For drawing keypoints
points = mpPose.PoseLandmark  # Landmarks
path = "dataset/train/"

# Empty dataset with four columns (x, y, z, visibility) per landmark.
# `p.name` is used instead of slicing `str(p)`: on Python 3.11+ str() of an
# IntEnum member is its integer value, so the slice would yield empty names
# and every landmark would share the same four column labels.
data = pd.DataFrame(
    columns=[
        f"{p.name}_{suffix}"
        for p in points
        for suffix in ("x", "y", "z", "vis")
    ]
)

In [None]:
# Creating Dataset
# Build the training table: one row per image in which a pose is detected,
# holding (x, y, z, visibility) for every landmark, labelled with the name of
# the class sub-directory the image was found in.
target = []
rows = []
skipped = 0
# Exclude 'target' so this cell still works when re-run after it was added.
feature_columns = [c for c in data.columns if c != 'target']

for subdir, dirs, files in os.walk(path):
    for fname in files:
        img = cv2.imread(os.path.join(subdir, fname))
        if img is None:
            # cv2.imread signals unreadable / non-image files by returning
            # None (it does not raise); cvtColor would crash on it.
            skipped += 1
            continue

        results = pose.process(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
        if not results.pose_landmarks:
            continue  # no pose found in this image

        row = []
        for lm in results.pose_landmarks.landmark:
            row.extend([lm.x, lm.y, lm.z, lm.visibility])
        rows.append(row)
        target.append(subdir.replace(path, ''))

if skipped:
    print(f'Skipped {skipped} unreadable file(s)')

# Build the frame once instead of growing it row by row.
data = pd.DataFrame(rows, columns=feature_columns)
data['target'] = target
count = len(data)  # kept for cells that still read the row counter

Premature end of JPEG file


In [None]:
# Label Encoding for target
from sklearn.preprocessing import LabelEncoder

# Fit on the raw string labels collected in `target` rather than on
# data['target']: re-running this cell would otherwise re-fit the encoder on
# already-encoded integers and lose the original class names.
labelencoder = LabelEncoder()
data['target'] = labelencoder.fit_transform(target)

In [None]:
# Displaying Dataset
# Preview the first five rows of the assembled dataset.
data.head(5)

## Exploratory Data Analysis

In [None]:
# Summary statistics; include='all' asks pandas to summarise every column
# regardless of dtype rather than only the numeric ones.
data.describe(include = 'all')

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import altair as alt


# Decode the integer targets back to pose names with one vectorised call
# instead of invoking inverse_transform once per row.
label_df = pd.DataFrame({'label': labelencoder.inverse_transform(data['target'])})

# Bar chart of the number of samples per pose class.
bars = alt.Chart(label_df).mark_bar(size=50).encode(
    x=alt.X('label', axis=alt.Axis(title='Pose')),
    y=alt.Y("count()", axis=alt.Axis(title='Count')),
    tooltip=[alt.Tooltip('count()', title='Count'), 'label'],
    color='label'
)

# Last expression of the cell, so the chart is rendered as the cell output.
bars.interactive().properties(
    height=300,
    width=700,
    title="Number of data in each pose",
)


In [None]:
# Training the baseline model with SVM
from sklearn.svm import SVC
# Features are every column except the label. Selecting them by name avoids
# the hard-coded 132 (33 landmarks x 4 values), which would silently leak the
# target or drop features if the column layout ever changed.
X = data.drop(columns=['target'])
Y = data['target']

# Baseline classifier: support vector machine with a polynomial kernel.
model = SVC(kernel='poly')
model.fit(X, Y)

In [None]:
# Predicting test 
test_path = "dataset/test/"
y_pred = []
y_test = []
test_features = []
skipped = 0

# Walk the *test* directory (the previous version walked `path`, i.e. the
# training set, so the model was evaluated on the data it was fitted on).
for subdir, dirs, files in os.walk(test_path):
    for fname in files:
        img = cv2.imread(os.path.join(subdir, fname))
        if img is None:
            # cv2.imread returns None for unreadable / non-image files.
            skipped += 1
            continue

        results = pose.process(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
        if not results.pose_landmarks:
            continue  # no pose found in this image

        row = []
        for lm in results.pose_landmarks.landmark:
            row.extend([lm.x, lm.y, lm.z, lm.visibility])
        test_features.append(row)
        # The class name is the sub-directory relative to the test root.
        y_test.append(labelencoder.transform([subdir.replace(test_path, '')])[0])

if skipped:
    print(f'Skipped {skipped} unreadable file(s)')

# Predict all samples in one call; y_pred holds scalar class ids (the old
# per-image loop appended 1-element arrays).
if test_features:
    y_pred = list(model.predict(test_features))

In [None]:
# Evaluating the baseline model (SVM)
from sklearn.metrics import classification_report

target_names = labelencoder.classes_
# Pass `labels` explicitly so the report rows always line up with
# `target_names`; without it classification_report raises when a class is
# absent from the test split. np.ravel flattens y_pred in case it holds
# 1-element arrays rather than scalars.
print(classification_report(
    np.ravel(y_test),
    np.ravel(y_pred),
    labels=np.arange(len(target_names)),
    target_names=target_names,
))

# Implementing YOLOX

In [21]:
import cv2
from pathlib import Path

import numpy as np
import torch
from torch import nn
from torch.nn import functional as F

from yolox.data_augment import preproc
from yolox.yolox import YOLOX, get_model, IdentityModule

In [2]:
# YOLOX Configuration
class dotdict(dict):
    """
    Dotdict is just a dictionary whose elements can be referenced with a dot operation.
    I.e. dotdict['x'] == dotdict.x

    Attribute reads, writes and deletes are all routed to the underlying
    dictionary, so a value set as ``d.x = 1`` is visible as ``d['x']`` and
    vice versa.

    This is useful because the original YOLOX used a custom class to hold a lot of extra configuration that
    we do not need.
    """
    def __getattr__(self, x):
        # Only called when normal attribute lookup fails. Look the *named*
        # key up (not the literal 'x') and report a miss as AttributeError so
        # hasattr(), getattr(obj, name, default) and the copy module behave.
        try:
            return self[x]
        except KeyError:
            raise AttributeError(x) from None

    def __setattr__(self, x, value):
        # Store attributes as dictionary items so both access styles agree.
        self[x] = value

    def __delattr__(self, x):
        # Mirror __getattr__: a missing key is an AttributeError.
        try:
            del self[x]
        except KeyError:
            raise AttributeError(x) from None


# Configuration object handed to `get_model`; fields are set with dot notation.
opt = dotdict()
# All images should be scaled to this input size before passing through YOLOX.
# Any image (of any size) can be scaled using the function `yolox.data_augment.preproc`.
# Changing this is not recommended: 640x640 works fine and loads pretty quickly, even on CPU.
opt.input_size = (640, 640)
# Multi-size training range, expressed in multiples of 32 px; set to None to disable it.
# NOTE(review): the earlier comment quoted 448 (14*32) to 832 (26*32), which does not
# match (10, 20) -- by the same arithmetic this would be 320 to 640. Confirm intent.
opt.random_size = (10, 20)
# Size used for the inference batch built later in this notebook.
opt.test_size = (640, 640)
# Per-channel mean and std passed to `preproc` for input normalisation.
opt.rgb_means = [0.485, 0.456, 0.406]
opt.std = [0.229, 0.224, 0.225]
# Use the first CUDA device when one is available, otherwise fall back to the CPU.
opt.device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
opt.backbone = "CSPDarknet-nano"
opt.depth_wise = True
opt.use_amp = False  # set True to enable automatic mixed precision

In [25]:
from typing import List


# Load YOLOX (Including weights pretrained on COCO)

# The head (i.e. the connection between the YOLOX backbone and neck to the rest of the model) is by default just an IdentityModule.
# This head should be exchanged with some torch module that performs the rest of the function (in this case classification)
# The head module should be a torch module expecting an input that is a list of 3 tensors of sizes:
#        [torch.Size([BATCH_SIZE, 64, 80, 80]), torch.Size([BATCH_SIZE, 128, 40, 40]), torch.Size([BATCH_SIZE, 256, 20, 20])]
# Note: These sizes may change if the `opt.input_size` or `opt.test_size` are changed.
# Each of these inputs is a different output of the YOLOX neck and represents the features learned at various scales.

# The YOLOX model expects a single tensor input of size: [BATCH_SIZE, 3, opt.test_size[0], opt.test_size[1]]
# BATCH_SIZE is the batch size
# 3 is the number of color channels (YOLOX is pretrained on 3 channels, so even a grayscale image must be converted to RGB)
# opt.test_size[0] is the input height (number of pixel rows)
# opt.test_size[1] is the input width (number of pixel columns)

class ClassificationHead(nn.Module):
    """Fully connected classifier that sits on top of three feature maps.

    Each feature map is flattened and projected to ``hidden_features`` by its
    own linear layer. The three projections are concatenated and passed
    through two hidden layers and a final layer that emits one raw score
    (logit) per class; no softmax is applied.

    Args:
        input_sizes: side length of each (square) feature map.
        input_channels: channel count of each feature map.
        num_classes: number of output scores.
        hidden_features: width of every hidden layer.
    """

    def __init__(self, input_sizes:List[int], input_channels:List[int], num_classes:int, hidden_features:int = 128):
        super().__init__()
        # Flattened length of each feature map: channels * side * side.
        flat_dims = [
            channels * side ** 2
            for channels, side in zip(input_channels, input_sizes)
        ]
        # One projection per scale.
        self.fc0a = nn.Linear(flat_dims[0], hidden_features)
        self.fc0b = nn.Linear(flat_dims[1], hidden_features)
        self.fc0c = nn.Linear(flat_dims[2], hidden_features)
        # Fuses the concatenated per-scale projections.
        fused_width = len(input_sizes) * hidden_features
        self.fc1 = nn.Linear(fused_width, hidden_features)
        self.fc2 = nn.Linear(hidden_features, hidden_features)
        self.fc3 = nn.Linear(hidden_features, num_classes)

    def forward(self, x):
        """Map a sequence of three feature maps to per-class logits."""
        large, medium, small = x[0], x[1], x[2]
        projected = [
            F.relu(self.fc0a(large.flatten(1))),
            F.relu(self.fc0b(medium.flatten(1))),
            F.relu(self.fc0c(small.flatten(1))),
        ]
        fused = torch.cat(projected, dim=1)
        hidden = F.relu(self.fc2(F.relu(self.fc1(fused))))
        # Raw logits are returned; softmax is left to the caller / loss.
        return self.fc3(hidden)

    def init_weights(self):
        """Set eps and momentum on every BatchNorm2d submodule."""
        batch_norms = (m for m in self.modules() if isinstance(m, nn.BatchNorm2d))
        for bn in batch_norms:
            bn.eps = 1e-3
            bn.momentum = 0.03


# Build the network with a 5-class ClassificationHead in place of the default
# IdentityModule head, and with the pretrained layers frozen.
model = get_model(
    opt,
    head=ClassificationHead(
        input_sizes=[80, 40, 20],
        input_channels=[64, 128, 256],
        num_classes=5,
    ),
    freeze_layers=True,
)

# Sanity check: every backbone parameter must have gradients disabled.
assert all(not p.requires_grad for p in model.backbone.parameters())

==>> loaded pretrained_models/yolox-nano.pth, epoch 294
--> Drop parameter head.stems.0.conv.weight.
--> Drop parameter head.stems.0.bn.weight.
--> Drop parameter head.stems.0.bn.bias.
--> Drop parameter head.stems.0.bn.running_mean.
--> Drop parameter head.stems.0.bn.running_var.
--> Drop parameter head.stems.0.bn.num_batches_tracked.
--> Drop parameter head.stems.1.conv.weight.
--> Drop parameter head.stems.1.bn.weight.
--> Drop parameter head.stems.1.bn.bias.
--> Drop parameter head.stems.1.bn.running_mean.
--> Drop parameter head.stems.1.bn.running_var.
--> Drop parameter head.stems.1.bn.num_batches_tracked.
--> Drop parameter head.stems.2.conv.weight.
--> Drop parameter head.stems.2.bn.weight.
--> Drop parameter head.stems.2.bn.bias.
--> Drop parameter head.stems.2.bn.running_mean.
--> Drop parameter head.stems.2.bn.running_var.
--> Drop parameter head.stems.2.bn.num_batches_tracked.
--> Drop parameter head.cls_convs.0.0.dconv.conv.weight.
--> Drop parameter head.cls_convs.0.0.dco

In [5]:
# Load Images
img_dir = 'imgs/'
# Sort the paths so the batch order is reproducible (glob order depends on the
# filesystem), and drop files OpenCV cannot decode: cv2.imread returns None
# for those instead of raising, which would crash preproc.
image_paths = sorted(Path(img_dir).glob('*.jpg'))
images = [
    image
    for image in (cv2.imread(str(image_path)) for image_path in image_paths)
    if image is not None
]
if len(images) < len(image_paths):
    print(f'Skipped {len(image_paths) - len(images)} unreadable file(s)')
print(f'There are {len(images)} images')

# Pre-allocated batch: [num_images, 3, opt.test_size[0], opt.test_size[1]].
inp_imgs = np.zeros([len(images), 3, opt.test_size[0], opt.test_size[1]], dtype=np.float32)
for b_i, image in enumerate(images):
    # preproc returns the network-ready image plus a second value that is not
    # needed here (presumably the resize ratio -- confirm against yolox).
    img, r = preproc(image, opt.test_size, opt.rgb_means, opt.std)
    inp_imgs[b_i] = img

inp_imgs = torch.from_numpy(inp_imgs).to(opt.device)
print(f'Input image batch of shape: {inp_imgs.shape}')

There are 2 images
Input image batch of shape: torch.Size([2, 3, 640, 640])


In [24]:
# Smoke test: push the image batch through the network once, without
# gradient tracking, to confirm the model runs end to end.
# NOTE(review): model.eval() is never called in the cells shown here; confirm
# that get_model leaves the network in the intended mode.
with torch.no_grad():
    yolo_outputs = model(inp_imgs)

# Only the output shape is reported, not the values.
print(yolo_outputs.shape)

torch.Size([2, 10])


## TODO: Implement a custom training loop
As long as `get_model` is called with `freeze_layers=True`, the early layers (the YOLOX pretrained ones) will be frozen, so training should be fast--only the head needs to be trained.

In [None]:
# Training-loop skeleton (not implemented yet); the steps are listed as comments.
# NOTE(review): `stopping_condition` is not defined in any cell shown here, so
# this cell raises NameError as written -- define it before running.
while not stopping_condition:
    # Take input
    # Send it through the model
    # Calculate the loss
    # Run back propagation
    pass


