In [21]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torchvision
from torchvision import transforms, datasets, models
from torchvision.datasets import ImageFolder, DatasetFolder
from torch.utils.data import random_split, DataLoader
import matplotlib.pyplot as plt
from tqdm import tqdm
import mediapipe as mp
import numpy as np
import cv2 as cv2

In [22]:
mp_hands = mp.solutions.hands
mp_drawing = mp.solutions.drawing_utils
mp_drawing_styles = mp.solutions.drawing_styles

In [23]:
# import zipfile
# with zipfile.ZipFile("train.zip","r") as zip_ref:
#     zip_ref.extractall("train")

In [24]:
train_dir = 'train/asl_alphabet_train/asl_alphabet_train'
test_dir = 'test/asl-alphabet-test'

In [25]:
STANDARD_HEIGHT = 200
STANDARD_WIDTH = 200
MIN_CONFIDENCE_LEVEL = 0.7

class MediaPipe(object):
    def __call__(self, sample):
        image = np.array(sample)
        mp_hands = mp.solutions.hands
        with mp_hands.Hands(static_image_mode = True,max_num_hands = 2,
            min_detection_confidence = MIN_CONFIDENCE_LEVEL) as hands:

            #For training change this line, don't need to flip (since images appear to be from back-facing camera) 
            #Convert cv2 BGR image to RGB image and flip (since image coming from front-facing camera)  
            processed = hands.process(cv2.flip(image, 1))

            #No hand detected (Figure out how we want to handle, 126 vector with all 0s?): 
            if not processed.multi_hand_landmarks: 
                zeros = torch.tensor(np.array([0] * 126), dtype=torch.float32)
                return zeros

            feature_vector = [] 
            #Could have one or two hands: 
            for hand in processed.multi_hand_landmarks: 
                for curr_landmark in hand.landmark: 
                    x = curr_landmark.x 
                    feature_vector.append(x)

                    y = curr_landmark.y 
                    feature_vector.append(y)

                    z = curr_landmark.z
                    feature_vector.append(z)

            #If we have just one hand, zero out the remaining (to ensure constant vector size of 126)
            #Might cause problems in one-hand case if we care which hand is visible/showing sign language
            #Solution to this is to use processed.multi_handedness
            if (len(feature_vector) == 63):
                zero_vector = [0] * 63 
                feature_vector.extend(zero_vector)
            
            output = torch.tensor(np.array(feature_vector), dtype=torch.float32)

            return output

class ExtractHandFeatures: 
    def __call__(self, sample):
        image = np.array(sample)
        mp_hands = mp.solutions.hands
        with mp_hands.Hands(static_image_mode = True,max_num_hands = 2,
            min_detection_confidence = MIN_CONFIDENCE_LEVEL) as hands:
            
            #For training change this line, don't need to flip (since images appear to be from back-facing camera) 
            #Convert cv2 BGR image to RGB image and flip (since image coming from front-facing camera)  
            processed = hands.process(cv2.flip(cv2.cvtColor(image, cv2.COLOR_BGR2RGB), 1))
            # processed = hands.process(cv2.cvtColor(self.raw_image, cv2.COLOR_BGR2RGB)) 

            #No hand detected (Figure out how we want to handle, 126 vector with all 0s?): 
            if not processed.multi_hand_landmarks: 
                zeros = torch.tensor(np.array([0] * 126), dtype=torch.float32)
                return zeros
            
            feature_vector = []         
            hands = [] 

            for idx, hand_handedness in enumerate(processed.multi_handedness):
                hands.append(hand_handedness.classification[0].label)
                

            #Left hand is first 63, Right hand is last 63
            #LEFT HAND ONLY CASE: 
            if (len(hands) == 1 and hands[0] == "Left"):
                for hand in processed.multi_hand_landmarks: 
                    for curr_landmark in hand.landmark: 
                        x = curr_landmark.x 
                        feature_vector.append(x)

                        y = curr_landmark.y 
                        feature_vector.append(y)

                        z = curr_landmark.z
                        feature_vector.append(z)
                zero_vector = [0] * 63 
                feature_vector.extend(zero_vector)

            #RIGHT HAND ONLY CASE: 
            if (len(hands) == 1 and hands[0] == "Right"):
                # print("Detected only right hand")
                for hand in processed.multi_hand_landmarks: 
                    for curr_landmark in hand.landmark: 
                        x = curr_landmark.x 
                        feature_vector.append(x)

                        y = curr_landmark.y 
                        feature_vector.append(y)

                        z = curr_landmark.z
                        feature_vector.append(z)
                zero_vector = [0] * 63 
                feature_vector = zero_vector + feature_vector
            
            #BOTH HANDS CASE: 
            if (len(hands) == 2):
                # print("Detected both hands")
                zeros = torch.tensor(np.array([0] * 126), dtype=torch.float32)
                return zeros

            output = torch.tensor(np.array(feature_vector), dtype=torch.float32)
            #print(output)
            return output


In [26]:
transform = transforms.Compose([
    ExtractHandFeatures(),
])
test_transform = transforms.Compose([
    transforms.Resize((200,200)),
    ExtractHandFeatures(),
])
dataset = ImageFolder(train_dir, transform=transform)
test = ImageFolder(test_dir, transform=test_transform)
# dataset = DatasetFolder(train_dir)
# test = DatasetFolder(test_dir)

In [27]:
dataset[6]

INFO: Created TensorFlow Lite XNNPACK delegate for CPU.


(tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0.]),
 0)

In [28]:
dataset_len = len(dataset)
train_len_proportion = 0.9
train_len = int(train_len_proportion * dataset_len)
val_len = dataset_len - train_len
train_dataset, val_dataset = random_split(dataset, [train_len, val_len])

In [29]:
len(train_dataset), len(val_dataset)

(78300, 8700)

In [30]:
batch_size = 50
train_dl = DataLoader(train_dataset, batch_size, shuffle=True, num_workers=2, pin_memory=True)
val_dl = DataLoader(val_dataset, batch_size*2, num_workers=2, pin_memory=True)

In [31]:
classes = dataset.classes
print(len(classes))

29


In [32]:
# iter_data = iter(train_dl)
# fig, axes = plt.subplots(figsize=(12, 12), ncols=5)
# for i in range(5):
#     img, label = next(iter_data)
#     ax = axes[i]
#     ax.imshow(img[0].permute(1, 2, 0))
#     ax.title.set_text(''.join('%5s' % classes[label[0]]))
# plt.show()

In [33]:
def get_default_device():
    """Pick GPU if available, else CPU"""
    if torch.cuda.is_available():
        return torch.device('cuda')
    else:
        return torch.device('cpu')
    
def to_device(data, device):
    """Move tensor(s) to chosen device"""
    if isinstance(data, (list,tuple)):
        return [to_device(x, device) for x in data]
    return data.to(device, non_blocking=True)

class DeviceDataLoader():
    """Wrap a dataloader to move data to a device"""
    def __init__(self, dl, device):
        self.dl = dl
        self.device = device
        
    def __iter__(self):
        """Yield a batch of data after moving it to device"""
        for b in self.dl: 
            yield to_device(b, self.device)
            
    def __len__(self):
        """Number of batches"""
        return len(self.dl)

In [34]:
device = get_default_device()
device

device(type='cuda')

In [35]:
train_dl = DeviceDataLoader(train_dl, device)
valid_dl = DeviceDataLoader(val_dl, device)

In [36]:
def accuracy(outputs, labels):
    _, preds = torch.max(outputs, dim=1)
    return torch.tensor(torch.sum(preds == labels).item() / len(preds))

In [37]:
class ImageClassificationBase(nn.Module):
    def training_step(self, batch):
        images, labels = batch 
        out = self(images)                  # Generate predictions
        loss = F.cross_entropy(out, labels) # Calculate loss
        return loss
    
    def validation_step(self, batch):
        images, labels = batch 
        out = self(images)                    # Generate predictions
        loss = F.cross_entropy(out, labels)   # Calculate loss
        acc = accuracy(out, labels)           # Calculate accuracy
        return {'val_loss': loss.detach(), 'val_acc': acc}
        
    def validation_epoch_end(self, outputs):
        batch_losses = [x['val_loss'] for x in outputs]
        epoch_loss = torch.stack(batch_losses).mean()   # Combine losses
        batch_accs = [x['val_acc'] for x in outputs]
        epoch_acc = torch.stack(batch_accs).mean()      # Combine accuracies
        return {'val_loss': epoch_loss.item(), 'val_acc': epoch_acc.item()}
    
    def epoch_end(self, epoch, result):
        print("Epoch [{}], val_loss: {:.4f}, val_acc: {:.4f}".format(epoch, result['val_loss'], result['val_acc']))

In [38]:
class ASLResnet(ImageClassificationBase):
    def __init__(self):
        super().__init__()        
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(126, 256),
            nn.ReLU(),
            nn.BatchNorm1d(256),
            nn.Linear(256, 256),
            nn.ReLU(),
            nn.BatchNorm1d(256),
            nn.Linear(256, 256),
            nn.ReLU(),
            nn.BatchNorm1d(256),
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.BatchNorm1d(128),
            nn.Dropout(p=0.25),
            nn.Linear(128, 29),            
        )
        
        self.network = self.linear_relu_stack
    
    def forward(self, xb):
        return self.network(xb)
    
    def freeze(self):
        # To freeze the residual layers
        for param in self.network.parameters():
            param.require_grad = False
#         for param in self.network.fc.parameters():
#             param.require_grad = True
    
    def unfreeze(self):
        # Unfreeze all layers
        for param in self.network.parameters():
            param.require_grad = True

In [39]:
model = to_device(ASLResnet(), device)
model

ASLResnet(
  (linear_relu_stack): Sequential(
    (0): Linear(in_features=126, out_features=256, bias=True)
    (1): ReLU()
    (2): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (3): Linear(in_features=256, out_features=256, bias=True)
    (4): ReLU()
    (5): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (6): Linear(in_features=256, out_features=256, bias=True)
    (7): ReLU()
    (8): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (9): Linear(in_features=256, out_features=128, bias=True)
    (10): ReLU()
    (11): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (12): Dropout(p=0.25, inplace=False)
    (13): Linear(in_features=128, out_features=29, bias=True)
  )
  (network): Sequential(
    (0): Linear(in_features=126, out_features=256, bias=True)
    (1): ReLU()
    (2): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_

In [40]:
from tqdm import tqdm
@torch.no_grad()
def evaluate(model, val_loader):
    model.eval()
    outputs = [model.validation_step(batch) for batch in tqdm(val_loader)]
    return model.validation_epoch_end(outputs)

def get_lr(optimizer):
    for param_group in optimizer.param_groups:
        return param_group['lr']

def fit_one_cycle(epochs, max_lr, model, train_loader, val_loader, 
                  weight_decay=0, grad_clip=None, opt_func=torch.optim.SGD):
    torch.cuda.empty_cache()
    history = []
    
    # Set up custom optimizer with weight decay
    optimizer = opt_func(model.parameters(), max_lr, weight_decay=weight_decay)
    # Set up one-cycle learning rate scheduler
    sched = torch.optim.lr_scheduler.OneCycleLR(optimizer, max_lr, epochs=epochs, 
                                                steps_per_epoch=len(train_loader))
    
    for epoch in range(epochs):
        # Training Phase 
        model.train()
        train_losses = []
        lrs = []
        for batch in tqdm(train_loader):
            loss = model.training_step(batch)
            train_losses.append(loss)
            loss.backward()
            
            # Gradient clipping
            if grad_clip: 
                nn.utils.clip_grad_value_(model.parameters(), grad_clip)
            
            optimizer.step()
            optimizer.zero_grad()
            
            # Record & update learning rate
            lrs.append(get_lr(optimizer))
            sched.step()
        
        # Validation phase
        result = evaluate(model, val_loader)
        result['train_loss'] = torch.stack(train_losses).mean().item()
        result['lrs'] = lrs
        model.epoch_end(epoch, result)
        history.append(result)
    return history

In [41]:
history = [evaluate(model, valid_dl)]
history

100%|███████████████████████████████████████████| 87/87 [09:59<00:00,  6.89s/it]


[{'val_loss': 3.368922472000122, 'val_acc': 0.03379310294985771}]

In [42]:
# model.freeze()
epochs = 5
max_lr = 1e-4
grad_clip = 0.1
weight_decay = 1e-4
opt_func = torch.optim.Adam

In [None]:
%%time
history += fit_one_cycle(epochs, max_lr, model, train_dl, valid_dl, 
                             grad_clip=grad_clip, 
                             weight_decay=weight_decay, 
                             opt_func=opt_func)

100%|█████████████████████████████████████| 1566/1566 [1:29:47<00:00,  3.44s/it]
100%|███████████████████████████████████████████| 87/87 [09:49<00:00,  6.77s/it]


Epoch [0], val_loss: 3.3651, val_acc: 0.0990


100%|█████████████████████████████████████| 1566/1566 [1:27:41<00:00,  3.36s/it]
100%|███████████████████████████████████████████| 87/87 [09:57<00:00,  6.87s/it]


Epoch [1], val_loss: 3.2079, val_acc: 0.1244


100%|█████████████████████████████████████| 1566/1566 [1:28:20<00:00,  3.38s/it]
100%|███████████████████████████████████████████| 87/87 [10:06<00:00,  6.97s/it]


Epoch [2], val_loss: 3.0777, val_acc: 0.1330


100%|█████████████████████████████████████| 1566/1566 [1:27:16<00:00,  3.34s/it]
100%|███████████████████████████████████████████| 87/87 [09:54<00:00,  6.83s/it]


Epoch [3], val_loss: 3.0248, val_acc: 0.1402


 79%|█████████████████████████████▎       | 1239/1566 [1:09:19<16:46,  3.08s/it]

In [None]:
torch.save(model.state_dict(), 'asl-colored-extracthand-mvp3.pth')

In [None]:
### Test
def predict_image(img, model):
    # Convert to a batch of 1
    xb = to_device(img.unsqueeze(0), device)
    # Get predictions from model
    yb = model(xb)
    # Pick index with highest probability
    _, preds  = torch.max(yb, dim=1)
    # Retrieve the class label
    return dataset.classes[preds[0].item()]
len(test)
test_dl = DataLoader(test, batch_size, num_workers=4, pin_memory=True)
test_dl = DeviceDataLoader(test_dl, device)
evaluate(model, test_dl)

In [None]:
# Sanity Check
model2 = to_device(ASLResnet(), device)
model2.load_state_dict(torch.load('asl-colored-extracthand-mvp3.pth'))
evaluate(model2, valid_dl)
