# Hand Gesture Detector with CNN

In [None]:
# Mount Google Drive inside the Colab runtime so files stored there are reachable under /gdrive.
from google.colab import drive
drive.mount('/gdrive')

# Project folder on Drive: the gesture CSV files are read from here and model checkpoints are saved here.
root = '/gdrive/My Drive/MotionDetection'

## Import

In [None]:
from __future__ import unicode_literals, print_function, division
import sys

import numpy
import pickle
import torch
import itertools
import time
import math
from torch.utils.data import Dataset, DataLoader

import csv
from pathlib import Path
import pandas as pd
import random
from sklearn.metrics import confusion_matrix

## Functions for Data Preprocessing

In [None]:
def normalize(tens):
  """Rescale joint coordinates in place so the hand fits a ~100 x 100 box.

  Both axes are divided by the same factor, the larger of the x and y
  extents, so the aspect ratio is kept and neither axis overflows the box.
  (Dividing by the y extent alone let x run past 100 for hands that are
  wider than tall, and divided by zero when all y values coincide.)

  The `+ 1` offset of the original formula is kept, so coordinates end up in
  the range [100 / extent, 100 + 100 / extent].

  Args:
    tens: 2-D tensor with one row per joint; column 0 is x and column 1 is y.
      Any further columns (e.g. a confidence score) are left untouched.

  Returns:
    The same tensor object, modified in place.
  """
  min_x, max_x = tens[:, 0].min(), tens[:, 0].max()
  min_y, max_y = tens[:, 1].min(), tens[:, 1].max()

  # One shared scale for both axes preserves the shape of the hand.
  extent = torch.max(max_x - min_x, max_y - min_y)
  if extent == 0:
    # All joints coincide: avoid 0-division and keep the output finite.
    extent = torch.ones_like(extent)

  tens[:, 0] = (tens[:, 0] - min_x + 1) / extent * 100
  tens[:, 1] = (tens[:, 1] - min_y + 1) / extent * 100

  return tens


# import matplotlib.pyplot as plt

# test1 = torch.rand(1, 2, 21, 2)
# test1[:, :, :, 0] = 12 * test1[:, :, :, 0]
# test1[:, :, :, 1] = 10 * test1[:, :, :, 1]

In [None]:
def randn_ordered(tens, n):
  """Randomly pick n entries along the first dimension, keeping their order.

  Args:
    tens: tensor to sample from (indexed along dimension 0).
    n: number of entries to keep; may equal the length of `tens`, in which
      case every entry is returned.

  Returns:
    A tensor holding the n selected entries in their original relative order.

  Raises:
    ValueError: if n is negative or larger than the first dimension.
  """
  total = tens.shape[0]
  if not 0 <= n <= total:
    raise ValueError(f'cannot sample {n} entries from a tensor of length {total}')

  # Sorting the sampled indices is what preserves the temporal order.
  picked = sorted(random.sample(range(total), n))
  return tens[picked]
  
# random.seed(a=123)
# test1 = torch.arange(3*4*5)
# test1 = test1.reshape(5, 4, 3)
# 
# # print(test1)
# # print(randn_ordered(test1, 3))

In [None]:
def cut_n(tens, n):
  """Take a window of n frames positioned around the 60% point of the sequence.

  The input is returned as-is when it already holds exactly n frames.
  None is returned when the window does not contain exactly n frames
  (for example when the sequence is too short).
  """
  total = tens.shape[0]
  if total == n:
    return tens

  # Anchor frame at 60% of the sequence; the window starts n // 2 frames before it.
  anchor = int(total * 0.6) - 1
  lo = anchor - n // 2
  hi = lo + n
  window = tens[lo:hi]
  return window if window.shape[0] == n else None

# test1 = torch.arange(10*5*3)
# test1 = test1.reshape(10, 5, 3)
# 
# print(test1)
# print(cut_n(test1, 2))
# test2= torch.tensor([[5.8782611e+02, 5.1130432e+02, 3.5325505e-02],  [5.8304352e+02, 4.3956522e+02, 2.4411943e-02],  [4.8260870e+02, 5.2804346e+02, 2.0107470e-02]])
# print(cut_n(test2,1))

In [None]:
def cut_max2(tens):
  """Return the most confident frame together with its more confident neighbour.

  A frame's confidence is the sum of the last channel over all joints
  (tens is indexed as [frame, joint, channel]). At either end of the
  sequence the only available neighbour is used; on a tie between the two
  neighbours the earlier one wins.
  """
  frame_conf = tens[:, :, -1].sum(dim=1)
  best = int(frame_conf.argmax(dim=0))
  last = tens.shape[0] - 1

  if best == 0:
    return tens[:2]
  if best == last:
    return tens[-2:]

  prev_conf = torch.sum(tens[best - 1, :, -1])
  next_conf = torch.sum(tens[best + 1, :, -1])
  if prev_conf >= next_conf:
    return tens[best - 1:best + 1]
  return tens[best:best + 2]

# test1 = torch.rand(5*21*3)
# test1 = test1.reshape(5, 21, 3)


# print(test1)
# print(cut_max2(test1))
# test2= torch.tensor([[5.8782611e+02, 5.1130432e+02, 3.5325505e-02],  [5.8304352e+02, 4.3956522e+02, 2.4411943e-02],  [4.8260870e+02, 5.2804346e+02, 2.0107470e-02]])
# print(cut_max2(test2,1))

In [None]:
def cut_max3(tens):
  """Return three consecutive frames around the most confident frame.

  A frame's confidence is the sum of the last channel over all joints
  (tens is indexed as [frame, joint, channel]). When the best frame is the
  first or last one, the first or last three frames are returned instead.
  """
  frame_conf = tens[:, :, -1].sum(dim=1)
  peak = int(frame_conf.argmax(dim=0))
  n_frames = tens.shape[0]

  if peak == 0:
    return tens[:3]
  if peak == n_frames - 1:
    return tens[-3:]
  return tens[peak - 1:peak + 2]
# test = torch.rand(5*21*3)
# test = test.reshape(5, 21, 3)


# print(test)
# print(cut_max2(test))
# test2= torch.tensor([[5.8782611e+02, 5.1130432e+02, 3.5325505e-02],  [5.8304352e+02, 4.3956522e+02, 2.4411943e-02],  [4.8260870e+02, 5.2804346e+02, 2.0107470e-02]])
# print(cut_max2(test2,1))

In [None]:
def resample_n(tens, n):
  """Stretch (or shrink) a sequence to exactly n frames by repeating frames.

  Every frame is repeated either n // len(tens) or one more time; the frames
  at both ends get the smaller count and the middle frames the larger one.
  Example) "1, 2, 3, 4, 5", 13 --> "1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5"
  When n is smaller than the sequence length, the count is 0 or 1, i.e. the
  n middle frames are kept.
  """
  origin_len = len(tens)
  base, extra = divmod(n, origin_len)

  # `short` frames receive only `base` copies: the first `head` frames and
  # every frame from index `tail_start` onwards.
  short = origin_len - extra
  head = short - short // 2
  tail_start = origin_len - short // 2

  frames = []
  for idx, frame in enumerate(tens):
    at_edge = idx < head or idx >= tail_start
    copies = base if at_edge else base + 1
    frames.extend([frame] * copies)
  return torch.stack(frames, dim=0)

# test1 = torch.arange(13*2*2)
# test1 = test1.reshape(13, 2, 2)
# print(test1)
# print(resample_n(test1, 30))


## Load Dataset

In [None]:
def _parse_hand(raw):
  """Convert one hand's bracketed joint list into a float tensor with 3 columns."""
  values = [float(tok)
            for joint in raw.split(',  ')
            for tok in joint.strip('[]').split(' ')
            if tok]  # repeated spaces produce empty tokens; skip them
  return torch.tensor(values).reshape((-1, 3))


def parser(line):
  """Parse one line of the gesture CSV and return the more confident hand.

  A line looks like `<video>_<image>,"[[[x y c],  [x y c],  ...]]]","[[[...]]]"`
  where the first bracketed list is the left hand and the second the right
  hand, each joint being stored as three numbers (x, y, confidence).

  Args:
    line: one raw text line of the CSV (header line excluded).

  Returns:
    (v_id, img_id, hand): the video id, the image id and the normalized joint
    tensor of whichever hand has the higher total confidence. The left hand
    wins ties and has its x axis flipped before normalization (presumably so
    that it matches the orientation of a right hand).
  """
  sample_id, left_raw, right_raw = line.split('[[[')
  sample_id = sample_id[:-2]  # drop the `,"` that precedes the left hand

  # [:-4] removes the `]","` separating the two hands, [:-2] the closing
  # quote and line break; any leftover brackets are stripped per joint.
  left = _parse_hand('[' + left_raw[:-4])
  right = _parse_hand('[' + right_raw[:-2])

  v_id, img_id = sample_id.split('_')

  # Comparing total confidence equals comparing the averages, because the
  # original code divided both sums by the same constant (21).
  if left[:, 2].sum() >= right[:, 2].sum():
    left[:, 0] = -left[:, 0]
    return v_id, img_id, normalize(left)
  return v_id, img_id, normalize(right)


# v_id, img_id, hand = parser('1_00001,"[[[5.8782611e+02 5.1130432e+02 3.5325505e-02],  [5.8304352e+02 4.3956522e+02 2.4411943e-02],  [4.8260870e+02 5.2804346e+02 2.0107470e-02],  [4.6108698e+02 5.3043475e+02 2.2252293e-02],  [4.7543481e+02 5.6391302e+02 2.0914197e-02],  [4.7543481e+02 5.2804346e+02 2.7845293e-02],  [4.6586960e+02 5.4000000e+02 2.5432626e-02],  [5.1847827e+02 5.4956519e+02 1.6933031e-02],  [4.8021741e+02 5.6391302e+02 1.2440545e-02],  [5.3282611e+02 4.5630432e+02 1.2662101e-02],  [5.0652176e+02 5.4000000e+02 2.5466656e-02],  [5.1130435e+02 5.2565216e+02 2.5591202e-02],  [5.0891306e+02 5.4956519e+02 1.0660242e-02],  [4.7065219e+02 5.2565216e+02 1.6639609e-02],  [5.2804352e+02 5.0173911e+02 2.2738149e-02],  [5.0413046e+02 5.3043475e+02 2.0388147e-02],  [4.8021741e+02 5.5434778e+02 1.0172251e-02],  [4.5152176e+02 4.3239130e+02 1.6042734e-02],  [4.8500003e+02 5.4956519e+02 1.4421927e-02],  [4.9217392e+02 5.6152173e+02 1.5104427e-02],  [4.8978262e+02 5.6152173e+02 1.0827912e-02]]]","[[[4.8782608e+02 4.5434781e+02 4.6337242e-03],  [4.8065216e+02 4.9021738e+02 8.7779146e-03],  [4.8782608e+02 5.2847821e+02 8.5121123e-03],  [4.7108694e+02 4.6869565e+02 8.6766174e-03],  [4.6869565e+02 4.6630432e+02 7.1836482e-03],  [5.4043475e+02 4.2565216e+02 8.7995632e-03],  [4.2804346e+02 4.8782608e+02 8.3582215e-03],  [3.4913043e+02 5.1173911e+02 1.0578417e-02],  [4.0891302e+02 4.8782608e+02 9.7930310e-03],  [5.2847821e+02 4.1130432e+02 5.6122812e-03],  [4.2804346e+02 4.8782608e+02 8.1787128e-03],  [4.3043475e+02 4.9499997e+02 1.0875644e-02],  [4.1608694e+02 4.9021738e+02 9.1631478e-03],  [4.1608694e+02 4.8782608e+02 8.3672330e-03],  [4.2804346e+02 4.8782608e+02 9.0226326e-03],  [4.2565216e+02 5.0217389e+02 9.3891509e-03],  [4.1847824e+02 5.0934781e+02 8.6548291e-03],  [5.3086957e+02 4.2565216e+02 1.0162307e-02],  [3.5391302e+02 5.2369562e+02 1.0465117e-02],  [3.5391302e+02 5.2369562e+02 1.5369176e-02],  [3.6347824e+02 5.2130432e+02 8.9091975e-03]]]"')

# print(v_id, img_id)
# print(hand)

In [None]:
import random
import datetime

def load_data():
  """Load the five gesture CSV files and build shuffled train/test lists.

  Every CSV file holds the frames of one gesture class; the class label is the
  position of the file in `csv_list`. Lines are grouped by video id in file
  order, each video is resampled to 100 frames, and the first 80% of the
  videos of each class go to the training list, the rest to the test list.

  Returns:
    (train, test): two lists of (sequence_tensor, label) tuples, each shuffled
    with `random.shuffle`.
  """
  csv_list = ['result_Gesture1.csv','result_Gesture2.csv','result_Gesture3.csv','result_Gesture4.csv','result_Gesture5.csv']
  train = []
  test = []
  for i, csv_name in enumerate(csv_list):
    print("reading start for", i, "th csv!", datetime.datetime.now().time())
    result_dir = Path(root) / csv_name
    # The context manager closes the file even if parsing fails later on.
    with open(str(result_dir), 'r') as data_csv:
      data_lines = data_csv.readlines()[1:]  # the first line is skipped (presumably a header)

    # video id -> list of per-frame joint tensors, in the order they appear in the file
    dataset = {}
    for line in data_lines:
      v_id, img_id, hand = parser(line)
      dataset.setdefault(v_id, []).append(hand)

    tot = list(dataset.values())
    print({len(v) for v in tot})

    split = int(len(tot) * 0.8)
    for bucket, videos in ((train, tot[:split]), (test, tot[split:])):
      for v in videos:
        x = resample_n(torch.stack(v), 100)
        # Kept so that a cutter which can return None (e.g. cut_n) can be swapped in.
        if x is None:
          continue
        bucket.append((x, i))

  random.shuffle(train)
  random.shuffle(test)

  return train, test

# Read and split the data once; both lists hold (sequence_tensor, label) tuples.
train, test = load_data()

In [None]:
class HandposeDataset(Dataset):
  """Dataset of hand-pose sequences built from (sequence, label) pairs.

  The last value of every joint (the confidence) is dropped and the remaining
  per-joint values are flattened, so `x_data` has three dimensions:
  sample, frame, flattened joint coordinates. `y_data` holds the labels.
  Shapes are printed during construction as a sanity check.
  """
  def __init__(self, dataset):
    self.len = len(dataset)
    print(dataset[0][0].shape)

    # Keep every channel except the last one (confidence).
    coords = [sample[0][:, :, :-1] for sample in dataset]
    print(coords[0].size())
    print(len(coords), "len of x")

    stacked = torch.stack(coords)
    n_samples, n_frames = stacked.shape[:2]
    self.x_data = stacked.view(n_samples, n_frames, -1)

    labels = [sample[1] for sample in dataset]
    print(len(labels), "len of y")
    self.y_data = torch.tensor(labels)

    print('x_data', self.x_data.shape)
    print('y_data', self.y_data.shape)

  def __getitem__(self, index):
    sequence = self.x_data[index]
    label = self.y_data[index]
    return sequence, label

  def __len__(self):
    return self.len

In [None]:
# Wrap the (sequence, label) lists as datasets; this drops the confidence channel and flattens the joints.
hand_train = HandposeDataset(dataset = train)
hand_test = HandposeDataset(dataset = test)

# Mini-batches of 32 sequences, loaded by 4 worker processes.
train_loader = DataLoader(dataset = hand_train,
                          batch_size=32,
                          shuffle = True,
                          num_workers = 4)
# NOTE(review): the test loader is shuffled as well; the evaluation loop aggregates
# over all batches, so the order should not affect the reported metrics.
test_loader = DataLoader(dataset = hand_test,
                          batch_size=32,
                          shuffle = True,
                          num_workers = 4)

## Model

In [None]:
# Number of gesture classes (one per result_Gesture*.csv file read by load_data).
n_classes = 5
# Frames per sequence; matches the length load_data resamples every video to.
duration = 100
# Input channels per frame: 21 joints x 2 coordinates (x, y) once the confidence is dropped.
n_channels = 42

In [None]:
class HandGestureNet(torch.nn.Module):
    r"""
    [Devineau et al., 2018] Deep Learning for Hand Gesture Recognition on Skeletal Data

    Summary
    -------
        Deep Learning Model for Hand Gesture classification using pose data only (no need for RGBD)
        The model computes a succession of [convolutions and pooling] over time independently on each of the
        n_channels sequence channels (42 by default here; the reference implementation used 66 = 22 * 3).
        Each of these computations are actually done at two different resolutions, that are later merged by concatenation
        with the (pooled) original sequence channel.
        Finally, a multi-layer perceptron merges all of the processed channels and outputs a classification.

    TL;DR:
    ------
        input ------------------------------------------------> split into n_channels channels [channel_i]
            channel_i ----------------------------------------> 3x [conv/pool/dropout] low_resolution_i
            channel_i ----------------------------------------> 3x [conv/pool/dropout] high_resolution_i
            channel_i ----------------------------------------> pooled_i
            low_resolution_i, high_resolution_i, pooled_i ----> output_channel_i
        MLP(n_channels x [output_channel_i]) -------------------------> classification

    Parameters
    ----------
        n_channels: number of input channels per time step.
        n_classes: number of output classes (size of the logit vector).
        dropout_probability: dropout applied after the last convolution of each branch.
        duration: number of time steps of the input sequences. It fixes the input size of the
                  first fully-connected layer; the default (100) reproduces the original hard-coded size.

    Article / PDF:
    --------------
        https://ieeexplore.ieee.org/document/8373818

    Please cite:
    ------------
        @inproceedings{devineau2018deep,
            title={Deep learning for hand gesture recognition on skeletal data},
            author={Devineau, Guillaume and Moutarde, Fabien and Xi, Wang and Yang, Jie},
            booktitle={2018 13th IEEE International Conference on Automatic Face \& Gesture Recognition (FG 2018)},
            pages={106--113},
            year={2018},
            organization={IEEE}
        }
    """

    def __init__(self, n_channels=42, n_classes=5, dropout_probability=0.2, duration=100):

        super(HandGestureNet, self).__init__()

        if duration < 8:
            raise ValueError('duration must be at least 8: the time axis is halved three times')

        self.n_channels = n_channels
        self.n_classes = n_classes
        self.dropout_probability = dropout_probability
        self.duration = duration

        # Every branch applies AvgPool1d(2) three times; each one floors the length (100 -> 50 -> 25 -> 12).
        self.pooled_duration = duration // 2 // 2 // 2
        # Per channel: 4 feature maps (high) + 4 feature maps (low) + 1 pooled copy of the raw signal.
        self.n_flat_features = 9 * n_channels * self.pooled_duration

        # Layers ----------------------------------------------
        # Construction order and layer positions are unchanged, so existing checkpoints still load.
        self.all_conv_high = torch.nn.ModuleList(
            [self._conv_branch(kernel_size=7) for _ in range(n_channels)])

        self.all_conv_low = torch.nn.ModuleList(
            [self._conv_branch(kernel_size=3) for _ in range(n_channels)])

        self.all_residual = torch.nn.ModuleList([torch.nn.Sequential(
            torch.nn.AvgPool1d(2),
            torch.nn.AvgPool1d(2),
            torch.nn.AvgPool1d(2)
        ) for _ in range(n_channels)])

        self.fc = torch.nn.Sequential(
            torch.nn.Linear(in_features=self.n_flat_features, out_features=1936),
            torch.nn.ReLU(),
            torch.nn.Linear(in_features=1936, out_features=n_classes)
        )

        # Initialization --------------------------------------
        # Xavier init. The 'relu' gain of the original code is kept as-is even though the
        # convolution branches use LeakyReLU.
        gain = torch.nn.init.calculate_gain('relu')
        for branch in itertools.chain(self.all_conv_high, self.all_conv_low):
            for layer in branch:
                if isinstance(layer, torch.nn.Conv1d):
                    torch.nn.init.xavier_uniform_(layer.weight, gain=gain)
                    torch.nn.init.constant_(layer.bias, 0.1)

        for layer in self.fc:
            if isinstance(layer, torch.nn.Linear):
                torch.nn.init.xavier_uniform_(layer.weight, gain=gain)
                torch.nn.init.constant_(layer.bias, 0.1)

    def _conv_branch(self, kernel_size):
        """Build one 3x [conv / LeakyReLU / pool] branch operating on a single input channel."""
        # "Same" padding for odd kernel sizes, so only the pooling layers shorten the sequence.
        padding = kernel_size // 2
        return torch.nn.Sequential(
            torch.nn.Conv1d(in_channels=1, out_channels=8, kernel_size=kernel_size, padding=padding),
            torch.nn.LeakyReLU(),
            torch.nn.AvgPool1d(2),

            torch.nn.Conv1d(in_channels=8, out_channels=4, kernel_size=kernel_size, padding=padding),
            torch.nn.LeakyReLU(),
            torch.nn.AvgPool1d(2),

            torch.nn.Conv1d(in_channels=4, out_channels=4, kernel_size=kernel_size, padding=padding),
            torch.nn.LeakyReLU(),
            torch.nn.Dropout(p=self.dropout_probability),
            torch.nn.AvgPool1d(2)
        )

    def forward(self, input):
        """
        This function performs the actual computations of the network for a forward pass.

        Arguments
        ---------
            input: a tensor of gestures of shape (batch_size, duration, n_channels)

        Returns
        -------
            a tensor of class logits of shape (batch_size, n_classes)
        """

        # Work on each channel separately
        all_features = []

        for channel in range(self.n_channels):
            # Add a dummy (spatial) dimension for the time convolutions
            # Conv1D format : (batch_size, n_feature_maps, duration)
            input_channel = input[:, :, channel].unsqueeze(1)

            high = self.all_conv_high[channel](input_channel)
            low = self.all_conv_low[channel](input_channel)
            ap_residual = self.all_residual[channel](input_channel)

            # Time convolutions are concatenated along the feature maps axis
            all_features.append(torch.cat([high, low, ap_residual], dim=1))

        # Concatenate along the feature maps axis
        all_features = torch.cat(all_features, dim=1)

        # Flatten for the Linear layers. Keeping the batch dimension explicit means a sequence of
        # the wrong length fails in the Linear layer instead of being silently reshaped into a
        # different batch size.
        all_features = all_features.flatten(start_dim=1)

        # Fully-Connected Layers
        return self.fc(all_features)

## Training

In [None]:
def train_model(model, criterion, optimizer, trainloader, testloader, num_epochs=5, model_name='hello',
                scheduler=None, device=None):
    """Train `model`, evaluate it after every epoch and checkpoint the best weights.

    Arguments
    ---------
        model: the network to train; it is expected to already live on `device`.
        criterion: loss function called as criterion(logits, labels).
        optimizer: optimizer updating the parameters of `model`.
        trainloader: iterable of (x, y) training batches.
        testloader: iterable of (x, y) evaluation batches.
        num_epochs: number of passes over `trainloader`.
        model_name: the best weights are saved to `<root>/<model_name>.pt`.
        scheduler: optional learning-rate scheduler stepped once per epoch. When omitted, a
                   module-level `scheduler` is used if one exists (the original behaviour).
        device: device the batches are moved to. When omitted, a module-level `device` is used
                if one exists, otherwise the device of the model parameters.

    Returns
    -------
        (best_confusion, best_accuracy): confusion matrix (rows = true class, columns = predicted
        class) and accuracy of the best epoch. If no epoch reaches an accuracy above 0, an
        all-zero matrix and 0 are returned.
    """
    # Backward compatibility: earlier callers relied on these two module-level names.
    module_scope = globals()
    if device is None:
        device = module_scope.get('device')
    if device is None:
        device = next(model.parameters()).device
    device = torch.device(device)
    if scheduler is None:
        scheduler = module_scope.get('scheduler')

    # HandGestureNet exposes n_classes; fall back to the previous hard-coded value otherwise.
    nb_classes = getattr(model, 'n_classes', 5)

    best_accuracy = 0
    # Defined up front so the final return cannot hit an unbound name.
    best_confusion = torch.zeros(nb_classes, nb_classes)
    ckpt_dir = Path(root)
    print('[INFO] Started to train the model.')
    print('Training the model on {}.'.format('GPU' if device.type == 'cuda' else 'CPU'))
    for ep in range(num_epochs):
        model.train()
        current_loss = 0.0
        for x, y in trainloader:
            x, y = x.to(device), y.to(device)

            optimizer.zero_grad()
            y_pred = model(x)

            loss = criterion(y_pred, y)
            loss.backward()
            optimizer.step()

            current_loss += loss.item()
        print('train loss:', current_loss)
        model.eval()

        # A fresh matrix per epoch, so `best_confusion` never aliases a later epoch.
        confusion = torch.zeros(nb_classes, nb_classes)
        with torch.no_grad():
            test_loss = 0.
            test_accuracy = 0.
            test_num_data = 0.
            for x, y in testloader:
                x = x.to(device=device)
                y = y.to(device=device)

                logit = model(x)
                pred = logit.argmax(dim = 1)
                accuracy = (pred == y).float().mean()
                loss = criterion(logit, y)

                # Weight by the batch size so a smaller last batch is averaged correctly.
                test_loss += loss.item()*x.shape[0]
                test_accuracy += accuracy.item()*x.shape[0]
                test_num_data += x.shape[0]

                # tolist() transfers each batch once instead of once per element.
                for t, p in zip(y.view(-1).tolist(), pred.view(-1).tolist()):
                    confusion[t, p] += 1

            test_loss /= test_num_data
            test_accuracy /= test_num_data
            print('epoch:', ep, 'test loss:', test_loss, 'test_accuracy:', test_accuracy)
            if test_accuracy > best_accuracy:
                best_accuracy = test_accuracy
                best_confusion = confusion
                torch.save(model.state_dict(), f'{ckpt_dir}/{model_name}.pt')
        if scheduler is not None:
            scheduler.step()
    return best_confusion, best_accuracy

In [None]:
# Network for 42 input channels (21 joints x 2 coordinates) and 5 gesture classes.
model = HandGestureNet(n_channels=42, n_classes=5)

criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=0.0002)
# Halve the learning rate every 10 scheduler steps; train_model steps it once per epoch.
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size= 10, gamma = 0.5)

num_epochs = 100

# Use the GPU when available. train_model picks up `device` and `scheduler` from module scope,
# so both must be defined before it is called.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model = model.to(device)

# The best-performing weights are written to <root>/cnn_mid12.pt during training.
confusion, accuracy = train_model(model=model, criterion=criterion, optimizer=optimizer, 
                                  trainloader=train_loader, testloader = test_loader, num_epochs=num_epochs, model_name='cnn_mid12')

print(f'Best test accuracy: {accuracy:.3f}')
print(confusion)