In [1]:
# Load the autoreload extension (reloading it if it is already loaded).
%reload_ext autoreload
# Autoreload mode 2: re-import all modules before executing each cell, so
# edits made to imported .py files are picked up without restarting the kernel.
%autoreload 2
# Render matplotlib figures inline, directly below the cell that produced them.
%matplotlib inline

In [2]:
from fastai import *
from fastai.vision import *
from fastai.vision import image as im
import torch.nn as nn
from torch.nn.functional import mse_loss
import json
import re

In [3]:
# Path to your dataset: replace this placeholder with your own folder.
# Many public datasets are available. Annotating your own dataset would be
# ideal, but it is very time-consuming; if, like me, you are mainly interested
# in learning the dynamics at the heart of computer vision, you can pick any
# of the datasets available out there -- it won't matter.
path= 'path_to_your_dataset'

In [4]:
# Augmentations to apply to the dataset: flipping disabled, max_zoom=1.1,
# a very small warp (max_warp=0.01) and max_rotate=45.
transforms = get_transforms(
    do_flip=False,
    max_rotate=45,
    max_zoom=1.1,
    max_warp=0.01,
)
def get_y_func(x):
    """Return the hand keypoints that label the image at path `x`.

    The annotations are read from a JSON file that sits next to the image and
    shares its base name (``foo.jpg`` -> ``foo.json``). The file must contain a
    ``'hand_keypoints'`` list whose entries start with the ``x`` and ``y``
    coordinates of one keypoint.

    Very important: fastai expects point targets as ``(y, x)``, so the two
    coordinates are swapped here.

    Args:
        x: path of the image file (``str`` or path-like).

    Returns:
        A float tensor with one ``[y, x]`` row per keypoint.

    Raises:
        FileNotFoundError: if the matching ``.json`` file does not exist.
        KeyError: if the JSON has no ``'hand_keypoints'`` entry.
    """
    pre, _ = os.path.splitext(x)
    # Only the annotation file is needed to build the label, so the image
    # itself is not opened here. The context manager closes the file promptly.
    with open(pre + '.json') as f:
        keypoints = json.load(f)['hand_keypoints']
    # Swap x and y for fastai; any extra per-point values are ignored.
    return torch.tensor([[k[1], k[0]] for k in keypoints], dtype=torch.float)

In [6]:
# Building the DataBunch with the data block API.
data = (
    PointsItemList
    .from_folder(path=path, extensions=['.jpg'])
    # Training and validation images are taken from the 'train' and 'test' sub-folders.
    .split_by_folder(train='train', valid='test')
    # Each image is labelled with the coordinates returned by get_y_func().
    .label_from_func(get_y_func)
    # Very important: remove_out=False prevents fastai from discarding
    # coordinates that may fall outside the image after data augmentation.
    .transform(
        transforms,
        size=224,
        tfm_y=True,
        remove_out=False,
        padding_mode='border',
        resize_method=ResizeMethod.PAD,
    )
    # Batch size.
    .databunch(bs=8)
    # Normalizing the data helps the model converge faster.
    .normalize(imagenet_stats)
)

In [7]:
class Reshape(nn.Module):
    """Layer that reshapes its input to a fixed target shape.

    In this notebook it is used as ``Reshape(-1, 21, 2)`` to turn the 42
    outputs of the last linear layer into a 21x2 tensor per sample, which
    corresponds to the 21 hand coordinates.

    Args:
        *args: the target shape, one integer per dimension (``-1`` lets
            PyTorch infer that dimension).
    """

    def __init__(self, *args):
        super().__init__()
        self.shape = args

    def forward(self, x):
        """Return `x` reshaped to the shape given at construction time."""
        # reshape() returns a view whenever view() would have worked, and
        # copies only for inputs whose memory layout view() would reject
        # (e.g. a transposed tensor), instead of raising.
        return x.reshape(self.shape)

In [9]:
# Custom regression head that replaces the classification head:
# a basic flattening followed by two linear layers that downsize the features
# to the 21 final coordinates.
head_reg = nn.Sequential(
    Flatten(),
    nn.ReLU(),
    nn.Dropout(p=0.5),
    # 512*7*7 is presumably the flattened resnet34 feature map for size=224
    # inputs -- adjust it if the backbone or the image size changes.
    nn.Linear(in_features=512 * 7 * 7, out_features=256),
    nn.ReLU(),
    nn.BatchNorm1d(num_features=256),
    nn.Dropout(p=0.5),
    # 42 outputs = 21 points x 2 coordinates, regrouped by Reshape below.
    nn.Linear(in_features=256, out_features=42),
    Reshape(-1, 21, 2),
    # Tanh keeps the output in the [-1, 1] range to help the model converge
    # faster. You can remove it; it won't prevent the model from converging.
    nn.Tanh(),
)

In [10]:
class MSELossFlat(nn.MSELoss):
    """MSE loss computed on flattened tensors.

    Both the prediction and the target are flattened to 1-D before being
    compared, so they only need to hold the same number of elements.
    """

    def forward(self, input:Tensor, target:Tensor) -> Tensor:
        """Return the MSE between the flattened `input` and `target`."""
        # reshape(-1) flattens like view(-1) but also accepts non-contiguous
        # tensors (e.g. after a transpose), where view(-1) would raise.
        return super().forward(input.reshape(-1), target.reshape(-1))

# Very important: the loss passed to the learner must be an initialized
# instance of the class, not the class itself.
mse_loss_flat = MSELossFlat()

# Learner built on resnet34, using the custom regression head and the flat MSE loss.
learn = cnn_learner(
    data,
    models.resnet34,
    loss_func=mse_loss_flat,
    custom_head=head_reg,
)