In [1]:
import os

import json
from tqdm import tqdm
import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
import torch.nn as nn

In [2]:
max_length = 80
num_point  = 82

embed_dim  = 512
num_head   = 4
num_block  = 1

In [3]:
import numpy as np
import torch

num_class  = 250
num_landmark = 543

class HardSwish(nn.Module):
    def __init__(self,):
        super().__init__()
    def forward(self, x):
        return x * F.relu6(x+3) * 0.16666667

class FeedForward(nn.Module):
    def __init__(self, embed_dim, hidden_dim):
        super().__init__()
        self.mlp = nn.Sequential(
            nn.Linear(embed_dim, hidden_dim),
            nn.ReLU(inplace=True),
            nn.Linear(hidden_dim, embed_dim),
        )
    def forward(self, x):
        return self.mlp(x)

#https://pytorch.org/docs/stable/generated/torch.nn.MultiheadAttention.html
class MultiHeadAttention(nn.Module):
    def __init__(self,
            embed_dim,
            num_head,
            batch_first,
        ):
        super().__init__()
        self.mha = nn.MultiheadAttention(
            embed_dim,
            num_heads=num_head,
            bias=True,
            add_bias_kv=False,
            kdim=None,
            vdim=None,
            dropout=0.0,
            batch_first=batch_first,
        )

    def forward(self, x, x_mask):
        out, _ = self.mha(x,x,x, key_padding_mask=x_mask)
        return out

class TransformerBlock(nn.Module):
    def __init__(self,
        embed_dim,
        num_head,
        out_dim,
        batch_first=True,
    ):
        super().__init__()
        self.attn  = MultiHeadAttention(embed_dim, num_head,batch_first)
        self.ffn   = FeedForward(embed_dim, out_dim)
        self.norm1 = nn.LayerNorm(embed_dim)
        self.norm2 = nn.LayerNorm(out_dim)

    def forward(self, x, x_mask=None):
        x = x + self.attn((self.norm1(x)), x_mask)
        x = x + self.ffn((self.norm2(x)))
        return x


def positional_encoding(length, embed_dim):
    dim = embed_dim//2
    position = np.arange(length)[:, np.newaxis]     # (seq, 1)
    dim = np.arange(dim)[np.newaxis, :]/dim   # (1, dim)
    angle = 1 / (10000**dim)         # (1, dim)
    angle = position * angle    # (pos, dim)
    pos_embed = np.concatenate(
        [np.sin(angle), np.cos(angle)],
        axis=-1
    )
    pos_embed = torch.from_numpy(pos_embed).float()
    return pos_embed

def pack_seq(
    seq,
):
    length = [min(s.shape[0], max_length)  for s in seq]
    batch_size = len(seq)
    K = seq[0].shape[1]
    L = max(length)

    x = torch.zeros((batch_size, L, K, 3)).to(seq[0].device)
    x_mask = torch.zeros((batch_size, L)).to(seq[0].device)
    for b in range(batch_size):
        l = length[b]
        x[b, :l] = seq[b][:l]
        x_mask[b, l:] = 1
    x_mask = (x_mask>0.5)
    x = x.reshape(batch_size,-1,K*3)
    return x, x_mask

#########################################################################

class Net(nn.Module):

    def __init__(self, num_class=num_class):
        super().__init__()
        self.output_type = ['inference', 'loss']

        pos_embed = positional_encoding(max_length, embed_dim)
        # self.register_buffer('pos_embed', pos_embed)
        self.pos_embed = nn.Parameter(pos_embed)

        self.cls_embed = nn.Parameter(torch.zeros((1, embed_dim)))
        self.x_embed = nn.Sequential(
            nn.Linear(num_point * 3, embed_dim, bias=False),
        )

        self.encoder = nn.ModuleList([
            TransformerBlock(
                embed_dim,
                num_head,
                embed_dim,
            ) for i in range(num_block)
        ])
        self.logit = nn.Linear(embed_dim, num_class)

    def forward(self, batch):
        xyz = batch['xyz']
        x, x_mask = pack_seq(xyz)
        #print(x.shape, x_mask.shape)
        B,L,_ = x.shape
        x = self.x_embed(x)
        x = x + self.pos_embed[:L].unsqueeze(0)

        x = torch.cat([
            self.cls_embed.unsqueeze(0).repeat(B,1,1),
            x
        ],1)
        x_mask = torch.cat([
            torch.zeros(B,1).to(x_mask),
            x_mask
        ],1)


        #x = F.dropout(x,p=0.25,training=self.training)
        for block in self.encoder:
            x = block(x,x_mask)

        cls = x[:,0]
        cls = F.dropout(cls,p=0.4,training=self.training)
        logit = self.logit(cls)

        output = {}
        if 'loss' in self.output_type:
            output['label_loss'] = F.cross_entropy(logit, batch['label'])

        if 'inference' in self.output_type:
            output['sign'] = torch.softmax(logit,-1)

        return output





def run_check_net():

    length = [12,16,20,180]
    batch_size = len(length)
    xyz = [
        np.random.uniform(-1,1,(length[b],num_point,3)) for b in range(batch_size)
    ]
    #---
    batch = {
        'label' : torch.from_numpy( np.random.choice(250,(batch_size))).long(),
        'xyz' : [torch.from_numpy(x).float() for x in xyz]
    }

    net = Net()
    output = net(batch)


    #---

    print('batch')
    for k, v in batch.items():
        if k in ['label','x']:
            print(f'{k:>32} : {v.shape} ')
        if k=='xyz':
            print(f'{k:>32} : {v[0].shape} ')
            for i in range(1,len(v)):
                print(f'{" ":>32} : {v[i].shape} ')

    print('output')
    for k, v in output.items():
        if 'loss' not in k:
            print(f'{k:>32} : {v.shape} ')
    print('loss')
    for k, v in output.items():
        if 'loss' in k:
            print(f'{k:>32} : {v.item()} ')



# main #################################################################
if __name__ == '__main__':
    run_check_net()

batch
                           label : torch.Size([4]) 
                             xyz : torch.Size([12, 82, 3]) 
                                 : torch.Size([16, 82, 3]) 
                                 : torch.Size([20, 82, 3]) 
                                 : torch.Size([180, 82, 3]) 
output
                            sign : torch.Size([4, 250]) 
loss
                      label_loss : 5.658082962036133 


# Dataset

In [4]:
# additional helper functions
ROWS_PER_FRAME = 543
def load_relevant_data_subset(pq_path):
    data_columns = ['x', 'y', 'z']
    data = pd.read_parquet(pq_path, columns=data_columns)
    n_frames = int(len(data) / ROWS_PER_FRAME)
    data = data.values.reshape(n_frames, ROWS_PER_FRAME, len(data_columns))
    return data.astype(np.float32)

In [5]:
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import StratifiedGroupKFold
from torch.utils.data import SequentialSampler, RandomSampler

def read_kaggle_csv_by_random(fold=0):
    num_fold = 5

    kaggle_df = pd.read_csv('/kaggle/input/asl-demo/train_prepared.csv')
    train_df = kaggle_df[kaggle_df.fold!=fold].reset_index(drop=True)
    valid_df = kaggle_df[kaggle_df.fold==fold].reset_index(drop=True)
    return train_df, valid_df

def read_kaggle_csv_by_part(fold=0):
    num_fold = 5

    kaggle_df = pd.read_csv('/kaggle/input/asl-demo/train_prepared.csv')
    kaggle_df.loc[:, 'fold' ] = -1

    sgkf = StratifiedGroupKFold(n_splits=num_fold, random_state=123, shuffle=True)
    for i, (train_index, valid_index) in enumerate(sgkf.split(kaggle_df.path, kaggle_df.label, kaggle_df.participant_id)):
        kaggle_df.loc[valid_index,'fold'] = i

    #kaggle_df.loc[:, 'fold'] = np.arange(len(kaggle_df))%num_fold
    train_df = kaggle_df[kaggle_df.fold!=fold].reset_index(drop=True)
    valid_df = kaggle_df[kaggle_df.fold==fold].reset_index(drop=True)
    return train_df, valid_df

def read_christ_csv_by_part(fold=0):
    kaggle_df = pd.read_csv('/kaggle/input/asl-demo/train_prepared.csv')
    christ_df = kaggle_df
    
    christ_df = christ_df.merge(kaggle_df[['path']], on='path',validate='1:1') # also kaggle_df['num_frame'] was there removed it
    valid_df = christ_df[christ_df.fold == fold].reset_index(drop=True)
    train_df = christ_df[christ_df.fold != fold].reset_index(drop=True)
    return train_df, valid_df


def pre_process(xyz):
    #xyz = xyz - xyz[~torch.isnan(xyz)].mean(0,keepdims=True) #noramlisation to common maen
    #xyz = xyz / xyz[~torch.isnan(xyz)].std(0, keepdims=True)
    
    LIP = [
            61, 185, 40, 39, 37, 0, 267, 269, 270, 409,
            291, 146, 91, 181, 84, 17, 314, 405, 321, 375,
            78, 191, 80, 81, 82, 13, 312, 311, 310, 415,
            95, 88, 178, 87, 14, 317, 402, 318, 324, 308,
        ]
    
    lip   = xyz[:, LIP]
    lhand = xyz[:, 468:489]
    rhand = xyz[:, 522:543]
    xyz = torch.cat([ #(none, 82, 3)
        lip,
        lhand,
        rhand,
    ],1)
    xyz[torch.isnan(xyz)] = 0
    xyz = xyz[:max_length]
    return xyz


#-----------------------------------------------------
def train_augment(xyz):
    xyz = do_random_affine(
        xyz,
        scale  = (0.7,1.3),
        shift  = (-0.08,0.08),
        degree = (-20,20),
        p=0.8
    )
    return xyz


class SignDataset(Dataset):
    def __init__(self, df, augment=None):
        self.df = df
        self.augment = augment
        self.length = len(self.df)

    def __str__(self):
        num_participant_id = self.df.participant_id.nunique()
        string = ''
        string += f'\tlen = {len(self)}\n'
        string += f'\tnum_participant_id = {num_participant_id}\n'
        return string

    def __len__(self):
        return self.length

    def __getitem__(self, index):
        d = self.df.iloc[index]

        pq_file = f'/kaggle/input/asl-signs/{d.path}'
        xyz = load_relevant_data_subset(pq_file)
#         print(xyz)
        xyz = xyz - xyz[~np.isnan(xyz)].mean(0,keepdims=True) #noramlisation to common maen
        xyz = xyz / xyz[~np.isnan(xyz)].std(0, keepdims=True)

        #--
#         if self.augment is not None:
#             xyz = self.augment(xyz)
#         print(xyz)
        #--
        xyz = torch.from_numpy(xyz).float()
        xyz = pre_process(xyz)

        r = {}
        r['index'] = index
        r['d'    ] = d
        r['xyz'  ] = xyz
        r['label'] = d.label
        return r


tensor_key = ['xyz', 'label']
def null_collate(batch):
    batch_size = len(batch)
    d = {}
    key = batch[0].keys()
    for k in key:
        d[k] = [b[k] for b in batch]
    d['label'] = torch.LongTensor(d['label'])
    return d




#################################################################################

def run_check_dataset():

    train_df, valid_df = read_kaggle_csv_by_part(fold=0)
    dataset = SignDataset(valid_df)
    print(dataset)

    for i in range(12):
        r = dataset[i]
        print(r['index'], '--------------------')
        print(r["d"], '\n')
        for k in tensor_key:
            if k =='label': continue
            v = r[k]
            print(k)
            print('\t', 'dtype:', v.dtype)
            print('\t', 'shape:', v.shape)
            if len(v)!=0:
                print('\t', 'min/max:', v.min().item(),'/', v.max().item())
                print('\t', 'is_contiguous:', v.is_contiguous())
                print('\t', 'values:')
                print('\t\t', v.reshape(-1)[:5].data.numpy().tolist(), '...')
                print('\t\t', v.reshape(-1)[-5:].data.numpy().tolist())
        print('')
        if 0:
            #draw
            cv2.waitKey(1)



    loader = DataLoader(
        dataset,
        sampler=SequentialSampler(dataset),
        batch_size=8,
        drop_last=True,
        num_workers=0,
        pin_memory=False,
        worker_init_fn=lambda id: np.random.seed(torch.initial_seed() // 2 ** 32 + id),
        collate_fn=null_collate,
    )
    print(f'batch_size   : {loader.batch_size}')
    print(f'len(loader)  : {len(loader)}')
    print(f'len(dataset) : {len(dataset)}')
    print('')

    for t, batch in enumerate(loader):
        if t > 5: break
        print('batch ', t, '===================')
        print('index', batch['index'])

        for k in tensor_key:
            v = batch[k]

            if k =='label':
                print('label:')
                print('\t', v.data.numpy().tolist())

            if k =='x':
                print('x:')
                print('\t', v.data.shape)

            if k =='xyz':
                print('xyz:')
                for i in range(len(v)):
                    print('\t', v[i].shape)

        if 1:
            pass
        print('')


# main #################################################################
if __name__ == '__main__':
    run_check_dataset()

	len = 22959
	num_participant_id = 5

0 --------------------
path                      train_landmark_files/49445/1000397667.parquet
participant_id                                                    49445
sequence_id                                                  1000397667
sign                                                             vacuum
landmark_file_path    /home/user/Data/asl-signs/train_landmark_files...
npy_file_path         /home/user/Data/asl-signs/train_features_npy_f...
label                                                               231
fold                                                                  0
Name: 0, dtype: object 

xyz
	 dtype: torch.float32
	 shape: torch.Size([33, 82, 3])
	 min/max: -1.4137358665466309 / 1.5335360765457153
	 is_contiguous: True
	 values:
		 [0.4290144741535187, 0.43524935841560364, -0.8092206120491028, 0.4351654648780823, 0.4245232045650482] ...
		 [1.5183755159378052, -0.9306944608688354, 0.30079182982444763, 1.5102177858352661

# Train

In [6]:
# import os
# os.environ['CUDA_VISIBLE_DEVICES']='0'

In [7]:
# additional helper functions 2
#assum zero-mean one-std, input
def do_random_affine(xyz,
    scale  = (0.8,1.5),
    shift  = (-0.1,0.1),
    degree = (-15,15),
    p=0.5
):
    if np.random.rand()<p:
        if scale is not None:
            scale = np.random.uniform(*scale)
            xyz = scale*xyz

        if shift is not None:
            shift = np.random.uniform(*shift)
            xyz = xyz + shift

        if degree is not None:
            degree = np.random.uniform(*degree)
            radian = degree/180*np.pi
            c = np.cos(radian)
            s = np.sin(radian)
            rotate = np.array([
                [c,-s],
                [s, c],
            ]).T
            xyz[...,:2] = xyz[...,:2] @rotate
            
def get_learning_rate(optimizer):
    for param_group in optimizer.param_groups:
        return param_group['lr']
def time_to_str(t, mode='min'):
    if mode=='min':
        t  = int(t)/60
        hr = t//60
        min = t%60
        return '%2d hr %02d min'%(hr,min)

    elif mode=='sec':
        t   = int(t)
        min = t//60
        sec = t%60
        return '%2d min %02d sec'%(min,sec)

    else:
        raise NotImplementedError


### RAdam and Lookahead

In [8]:
import torch
from torch.optim import Optimizer
from collections import defaultdict
import math
from timeit import default_timer as timer

# # class RAdam(Optimizer):
# #     r"""Implements RAdam algorithm.
# #     It has been proposed in `ON THE VARIANCE OF THE ADAPTIVE LEARNING
# #     RATE AND BEYOND(https://arxiv.org/pdf/1908.03265.pdf)`_.
    
# #     Arguments:
# #         params (iterable):      iterable of parameters to optimize or dicts defining
# #                                 parameter groups
# #         lr (float, optional):   learning rate (default: 1e-3)
# #         betas (Tuple[float, float], optional):  coefficients used for computing
# #                                                 running averages of gradient and 
# #                                                 its square (default: (0.9, 0.999))
# #         eps (float, optional):  term added to the denominator to improve
# #                                 numerical stability (default: 1e-8)
# #         weight_decay (float, optional): weight decay (L2 penalty) (default: 0)
# #         amsgrad (boolean, optional):    whether to use the AMSGrad variant of this
# #                                         algorithm from the paper `On the Convergence 
# #                                         of Adam and Beyond`_(default: False)
        
# #         sma_thresh:             simple moving average threshold.
# #                                 Length till where the variance of adaptive lr is intracable.
# #                                 Default: 4 (as per paper)
# #     """
# #     def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8,
# #                  weight_decay=0, amsgrad=False, sma_thresh=4):
# #         if not 0.0 <= lr:
# #             raise ValueError("Invalid learning rate: {}".format(lr))
# #         if not 0.0 <= eps:
# #             raise ValueError("Invalid epsilon value: {}".format(eps))
# #         if not 0.0 <= betas[0] < 1.0:
# #             raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0]))
# #         if not 0.0 <= betas[1] < 1.0:
# #             raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1]))
# #         defaults = dict(lr=lr, betas=betas, eps=eps,
# #                         weight_decay=weight_decay, amsgrad=amsgrad)
# #         super(RAdam, self).__init__(params, defaults)

# #         self.radam_buffer = [[None, None, None] for ind in range(10)]
# #         self.sma_thresh = sma_thresh

# #     def __setstate__(self, state):
# #         super(RAdam, self).__setstate__(state)
# #         for group in self.param_groups:
# #             group.setdefault('amsgrad', False)

# #     def step(self, closure=None):
# #         loss = None
# #         if closure is not None:
# #             loss = closure()

# #         for group in self.param_groups:
# #             for p in group['params']:
# #                 if p.grad is None:
# #                     continue

# #                 # Perform optimization step
# #                 grad = p.grad.data
# #                 if grad.is_sparse:
# #                     raise RuntimeError('Adam does not support sparse gradients, please consider SparseAdam instead')
# #                 amsgrad = group['amsgrad']

# #                 state = self.state[p]

# #                 # State initialization
# #                 if len(state) == 0:
# #                     state['step'] = 0
# #                     # Exponential moving average of gradient values
# #                     state['exp_avg'] = torch.zeros_like(p.data)
# #                     # Exponential moving average of squared gradient values
# #                     state['exp_avg_sq'] = torch.zeros_like(p.data)
# #                     if amsgrad:
# #                         # Maintains max of all exp. moving avg. of sq. grad. values
# #                         state['max_exp_avg_sq'] = torch.zeros_like(p.data)

# #                 exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']
# #                 if amsgrad:
# #                     max_exp_avg_sq = state['max_exp_avg_sq']
# #                 beta1, beta2 = group['betas']

# #                 state['step'] += 1
# #                 old = p.data.float()

# #                 # Decay the first and second moment running average coefficient
# #                 exp_avg.mul_(beta1).add_(1 - beta1, grad)
# #                 exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad)

# #                 buffer = self.radam_buffer[int(state['step']%10)]

# #                 if buffer[0] == state['step']:
# #                     sma_t, step_size = buffer[1], buffer[2]
# #                 else:                 
# #                     sma_max_len = 2/(1-beta2) - 1  
# #                     beta2_t = beta2 ** state['step']
# #                     sma_t = sma_max_len - 2 * state['step'] * beta2_t /(1 - beta2_t)
# #                     buffer[0] = state['step']
# #                     buffer[1] = sma_t

# #                     if sma_t > self.sma_thresh :
# #                         rt = math.sqrt(((sma_t - 4) * (sma_t - 2) * sma_max_len)/((sma_max_len -4) * (sma_max_len - 2) * sma_t))
# #                         step_size = group['lr'] * rt * math.sqrt((1 - beta2_t)) / (1 - beta1 ** state['step'])                      
# #                     else:
# #                         step_size = group['lr'] / (1 - beta1 ** state['step'])                        
# #                     buffer[2] = step_size

# #                 if group['weight_decay'] != 0:
# #                     p.data.add_(-group['weight_decay'] * group['lr'], old)

# #                 if sma_t > self.sma_thresh :
# #                     denom = exp_avg_sq.sqrt().add_(group['eps'])
# #                     p.data.addcdiv_(-step_size, exp_avg, denom)
# #                 else:
# #                     p.data.add_(-step_size, exp_avg)

# #         return loss

# class Lookahead(Optimizer):
#     r'''Implements Lookahead optimizer.

#     It's been proposed in paper: Lookahead Optimizer: k steps forward, 1 step back
#     (https://arxiv.org/pdf/1907.08610.pdf)

#     Args:
#         optimizer: The optimizer object used in inner loop for fast weight updates.
#         alpha:     The learning rate for slow weight update.
#                    Default: 0.5
#         k:         Number of iterations of fast weights updates before updating slow
#                    weights.
#                    Default: 5

#     Example:
#         > optim = Lookahead(optimizer)
#         > optim = Lookahead(optimizer, alpha=0.6, k=10)
#     '''
#     def __init__(self, optimizer, alpha=0.5, k=5):
#         assert(0.0 <= alpha <= 1.0)
#         assert(k >= 1)
#         self.optimizer = optimizer
#         self.alpha = alpha
#         self.k = k
#         self.k_counter = 0
#         self.param_groups = self.optimizer.param_groups
#         self.state = defaultdict(dict)
#         self.slow_weights = [[param.clone().detach() for param in group['params']] for group in self.param_groups]
    
#     def step(self, closure=None):
#         loss = self.optimizer.step(closure)
#         self.k_counter += 1
#         if self.k_counter >= self.k:
#             for group, slow_weight in zip(self.param_groups, self.slow_weights):
#                 for param, weight in zip(group['params'], slow_weight):
#                     weight.data.add_(self.alpha, (param.data - weight.data))
#                     param.data.copy_(weight.data)
#             self.k_counter = 0
#         return loss

#     def __getstate__(self):
#         return {
#             'state': self.state,
#             'optimizer': self.optimizer,
#             'alpha': self.alpha,
#             'k': self.k,
#             'k_counter': self.k_counter
#         }

#     def state_dict(self):
#         return self.optimizer.state_dict()

#     def load_state_dict(self, state_dict):
#         self.optimizer.load_state_dict(state_dict)

In [9]:
def np_cross_entropy(probability, truth):
    p = np.clip(probability,1e-4,1-1e-4)
    logp = -np.log(p)
    loss = logp[np.arange(len(logp)),truth]
    loss = loss.mean()
    return loss

def do_valid(net, valid_loader, iteration):

    valid_num = 0
    valid_sign = []
    valid_loss = 0

    net = net.eval()
    start_timer = timer()
    for t, batch in enumerate(valid_loader):
    
        net.output_type = ['inference']
        with torch.no_grad():
            with torch.cuda.amp.autocast(enabled = True):

                batch_size = len(batch['index'])
                batch['xyz'] = [xyz.cuda() for xyz in batch['xyz']]
                output = net(batch) #data_parallel(net, batch) #

        valid_sign.append(output['sign'].cpu().numpy())
        valid_num += batch_size

        #---
        print('\r %8d / %d  %s'%(valid_num, len(valid_loader.dataset),time_to_str(timer() - start_timer,'sec')),end='',flush=True)
        #if valid_num==200*4: break

    #print('')
    assert(valid_num == len(valid_loader.dataset))
    #------
    truth = valid_loader.dataset.df.label.values
    sign = np.concatenate(valid_sign)
    predict = np.argsort(-sign, -1)
    correct = predict==truth.reshape(valid_num,1)
    topk = correct.cumsum(-1).mean(0)[:5]

    loss = np_cross_entropy(sign, truth)

    return [loss, topk[0], topk[1],  topk[4]]


In [10]:
from torch.nn.parallel.data_parallel import data_parallel
from torch.optim import RAdam, AdamW

fold_type = 'kaggle-part'
fold_type = 'christ-part'

fold     = 2
out_dir  = '/kaggle/input' + '/result/run10/transfomer-80-256-lip-hand-3a'
fold_dir = out_dir+ f'/fold-{fold}-{fold_type}'

#https://www.kaggle.com/competitions/asl-signs/discussion/391203
def run_train():
    start_lr   = 1e-4 #0.0001
    batch_size = 64    #6 #32
    skip_save_epoch = 0
    num_epoch = 200
    

# log = Logger()
# log.open(fold_dir+'/log.train.txt',mode='a')
# log.write(f'\n--- [START {log.timestamp()}] {"-"*64}\n\n')
# log.write(f'\t{set_environment()}\n')
# log.write(f'\t__file__ = {__file__}\n')
# log.write(f'\tfold_dir = {fold_dir}\n')
# log.write(f'\n')


## dataset ----------------------------------------
# log.write('** dataset setting **\n')

    if fold_type == 'kaggle-part':
        train_df, valid_df = read_kaggle_csv_by_part(fold)
    if fold_type == 'christ-part':
        train_df, valid_df = read_christ_csv_by_part(fold)

    #train_df, valid_df = read_kaggle_random_csv(fold)
    train_dataset = SignDataset(train_df,train_augment)
    valid_dataset = SignDataset(valid_df,)
    
    train_loader  = DataLoader(
        train_dataset,
        sampler = RandomSampler(train_dataset),
        #sampler = BalanceSampler(train_dataset),
        batch_size  = batch_size,
        drop_last   = True,
        num_workers = 16,
        pin_memory  = False,
        worker_init_fn = lambda id: np.random.seed(torch.initial_seed() // 2 ** 32 + id),
        collate_fn = null_collate,
    )
 
    valid_loader = DataLoader(
        valid_dataset,
        sampler = SequentialSampler(valid_dataset),
        batch_size  = 64,
        drop_last   = False,
        num_workers = 16,
        pin_memory  = False,
        collate_fn = null_collate,
    )

    scaler = torch.cuda.amp.GradScaler(enabled = True)
    net = Net()
    #net.load_pretrain()


    
    
    start_iteration = 0
    start_epoch = 0
    net.cuda()


    ## optimiser ----------------------------------
    if 0: ##freeze
        for p in net.encoder.parameters():   p.requires_grad = False
        #for p in net.decoder.parameters():   p.requires_grad = False
        pass

    def freeze_bn(net):
        for m in net.modules():
            if isinstance(m, nn.BatchNorm2d):
                m.eval()
                m.weight.requires_grad = False
                m.bias.requires_grad = False
    #freeze_bn(net)

    #-----------------------------------------------

    optimizer = torch.optim.AdamW(filter(lambda p: p.requires_grad, net.parameters()),lr=start_lr)
# optimizer = Lookahead(RAdam(filter(lambda p: p.requires_grad, net.parameters()),lr=start_lr), alpha=0.5, k=5)

    num_iteration = num_epoch*len(train_loader)
    iter_log   = len(train_loader) *3
    iter_valid = iter_log
    iter_save  = iter_log
 
    ## start training here! ##############################################
    
    def message(mode='print'):
        asterisk = ' '
        if mode==('print'):
            loss = batch_loss
        if mode==('log'):
            loss = train_loss
            if (iteration % iter_save == 0): asterisk = '*'
        
        text = \
            ('%0.2e   %08d%s %6.2f | '%(rate, iteration, asterisk, epoch,)).replace('e-0','e-').replace('e+0','e+') + \
            '%4.3f  %4.3f  %4.4f  %4.3f   | '%(*valid_loss,) + \
            '%4.3f  %4.3f  %4.3f  | '%(*loss,) + \
            '%s' % (time_to_str(timer() - start_timer,'min'))
        
        return text
    
    #----
    valid_loss = np.zeros(4,np.float32)
    train_loss = np.zeros(3,np.float32)
    batch_loss = np.zeros_like(train_loss)
    sum_train_loss = np.zeros_like(train_loss)
    sum_train = 0
    

    start_timer = timer()
    iteration = start_iteration
    epoch = start_epoch
    rate = 0
    while iteration < num_iteration:
        for t, batch in enumerate(train_loader):

            if iteration%iter_save==0:
                if iteration != start_iteration:
                    n = iteration if epoch > skip_save_epoch else 0
                    torch.save({
                        'state_dict': net.state_dict(),
                        'iteration': iteration,
                        'epoch': epoch,
                    }, f'{n:08d}.model.pth')
                    pass

                    
            if (iteration%iter_valid==0): # or (t==len(train_loader)-1):
                if iteration!=start_iteration:
                    valid_loss = do_valid(net, valid_loader, f'{iteration:08d}')  #
                pass
            
                
            # learning rate schduler ------------
            # adjust_learning_rate(optimizer, scheduler(epoch))
            rate = get_learning_rate(optimizer) #scheduler.get_last_lr()[0] #get_learning_rate(optimizer)
            
            # one iteration update  -------------
            batch_size = len(batch['index'])
            batch['xyz'] = [xyz.cuda() for xyz in batch['xyz']]

            net.train()
            net.output_type = ['loss', 'inference']
    #        with torch.autograd.set_detect_anomaly(True):
            if 1:
                with torch.cuda.amp.autocast(enabled = True):
                    output = data_parallel(net,batch)#net(batch) 
                    loss0  = output['label_loss'].mean()

                optimizer.zero_grad()
                scaler.scale(
                        loss0
                ).backward()
                
                #scaler.unscale_(optimizer)
                #torch.nn.utils.clip_grad_norm_(net.parameters(), 2)
                scaler.step(optimizer)
                scaler.update()
            
            
            # print statistics  --------
            batch_loss[:3] = [loss0.item(),0,0]
            sum_train_loss += batch_loss
            sum_train += 1
            if t % 100 == 0:
                train_loss = sum_train_loss / (sum_train + 1e-12)
                sum_train_loss[...] = 0
                sum_train = 0
            
            print('\r', end='', flush=True)
            print(message(mode='print'), end='', flush=True)
            epoch += 1 / len(train_loader)
            iteration += 1
            
            # debug  --------
            #if 1:
            # if t % 100 == 0:
            # 	show_result(batch, output, resize=0.50)
            # 	cv2.waitKey(1)
            
            
        torch.cuda.empty_cache()

# main #################################################################
# if __name__ == '__main__':
#     run_train()

 

In [11]:
# net = Net()
# net.load_state_dict(torch.load('/kaggle/input/gislr-saved-models/00025200.model.pth', map_location= torch.device('cpu'))['state_dict'])

In [12]:
dum_df = pd.read_csv('/kaggle/input/asl-demo/train_prepared.csv')
dum_df = pd.DataFrame(dum_df.loc[0]).T
dum_val_dataset = SignDataset(dum_df)

In [13]:
dum_val_dataset

<__main__.SignDataset at 0x7fb183161690>

In [14]:
val_loader  = DataLoader(
    dum_val_dataset,
    sampler = RandomSampler(dum_val_dataset),
    #sampler = BalanceSampler(train_dataset),
    batch_size  = 64,
    drop_last   = False,
    num_workers = 16,
    pin_memory  = False,
    collate_fn = null_collate,
)

for t, batch in enumerate(val_loader):
    batch['xyz'] = [xyz.cpu() for xyz in batch['xyz']]
#     output = net(batch)
#     print(torch.argmax(output['sign']))

  cpuset_checked))


In [15]:
batch['xyz'][0].shape

torch.Size([23, 82, 3])

# Note batch['xyz'] is a list

# converting the model to onnx

In [16]:
max_length= 80
embed_dim = 512

# Updated SingleNet

In [17]:
#pytorch model for tflite conversion

#simplfiy for one video input 
max_length = 80  #reduce this if gets out of memory error
num_class  = 250
num_landmark = 543
num_point  = 82 
class InputNet(nn.Module):
    def __init__(self, ):
        super().__init__()
        self.max_length = max_length 
  
    def forward(self, xyz):
        xyz = xyz - xyz[~torch.isnan(xyz)].mean(0,keepdim=True) #noramlisation to common maen
        xyz = xyz / xyz[~torch.isnan(xyz)].std(0, keepdim=True)

        LIP = [
            61, 185, 40, 39, 37, 0, 267, 269, 270, 409,
            291, 146, 91, 181, 84, 17, 314, 405, 321, 375,
            78, 191, 80, 81, 82, 13, 312, 311, 310, 415,
            95, 88, 178, 87, 14, 317, 402, 318, 324, 308,
        ]
        #LHAND = np.arange(468, 489).tolist()
        #RHAND = np.arange(522, 543).tolist()

        lip = xyz[:, LIP]
        lhand = xyz[:, 468:489]
        rhand = xyz[:, 522:543]
        xyz = torch.cat([  # (none, 82, 3)
            lip,
            lhand,
            rhand,
        ], 1)
        xyz[torch.isnan(xyz)] = 0
        x = xyz[:self.max_length]
        return x


#overwrite the model used in training ....

# use fix dimension
class MultiHeadAttention(nn.Module):
    def __init__(self,
            embed_dim,
            num_head,
            batch_first,
        ):
        super().__init__()
        self.mha = nn.MultiheadAttention(
            embed_dim,
            num_heads=num_head,
            bias=True,
            add_bias_kv=False,
            kdim=None,
            vdim=None,
            dropout=0.0,
            batch_first=batch_first,
        )

    def forward(self, x):
        q = F.linear(x[:1], self.mha.in_proj_weight[:512], self.mha.in_proj_bias[:512]) #since we need only cls
        k = F.linear(x, self.mha.in_proj_weight[512:1024], self.mha.in_proj_bias[512:1024])
        v = F.linear(x, self.mha.in_proj_weight[1024:], self.mha.in_proj_bias[1024:]) 
        q = q.reshape(-1, 8, 64).permute(1, 0, 2)
        k = k.reshape(-1, 8, 64).permute(1, 2, 0)
        v = v.reshape(-1, 8, 64).permute(1, 0, 2)
        dot  = torch.matmul(q, k) * (1/128**0.5) # H L L
        attn = F.softmax(dot, -1)  #   L L
        out  = torch.matmul(attn, v)  #   L H dim
        out  = out.permute(1, 0, 2).reshape(-1, 512)
        out  = F.linear(out, self.mha.out_proj.weight, self.mha.out_proj.bias)  
        return out  

class FeedForward(nn.Module):
    def __init__(self, embed_dim, hidden_dim):
        super().__init__()
        self.mlp = nn.Sequential(
            nn.Linear(embed_dim, hidden_dim),
            nn.ReLU(inplace=True),
            nn.Linear(hidden_dim, embed_dim),
        )
    def forward(self, x):
        return self.mlp(x)
    
# remove mask
class TransformerBlock(nn.Module):
    def __init__(self,
        embed_dim,
        num_head,
        out_dim,
        batch_first=True,
    ):
        super().__init__()
        self.attn  = MultiHeadAttention(embed_dim, num_head,batch_first)
        self.ffn   = FeedForward(embed_dim, out_dim)
        self.norm1 = nn.LayerNorm(embed_dim)
        self.norm2 = nn.LayerNorm(out_dim)

    def forward(self, x):
            
        x = x[:1] + self.attn((self.norm1(x)))
        x = x + self.ffn((self.norm2(x)))
        return x
    
def positional_encoding(length, embed_dim):
    dim = embed_dim//2

    position = np.arange(length)[:, np.newaxis]     # (seq, 1)
    dim = np.arange(dim)[np.newaxis, :]/dim   # (1, dim)

    angle = 1 / (10000**dim)         # (1, dim)
    angle = position * angle    # (pos, dim)

    pos_embed = np.concatenate(
        [np.sin(angle), np.cos(angle)],
        axis=-1
    )
    pos_embed = torch.from_numpy(pos_embed).float()
    return pos_embed
            
class SingleNet(nn.Module):

    def __init__(self, num_class=num_class):
        super().__init__()
        self.num_block = 1
        self.embed_dim = 512
        self.num_head  = 8
        self.max_length = max_length
        self.num_point = num_point

        pos_embed = positional_encoding(max_length, self.embed_dim)
        self.pos_embed = nn.Parameter(pos_embed)

        self.cls_embed = nn.Parameter(torch.zeros((1, self.embed_dim)))
        self.x_embed = nn.Sequential(
            nn.Linear(num_point * 3, self.embed_dim, bias=False),
        )

        self.encoder = nn.ModuleList([
            TransformerBlock(
                self.embed_dim,
                self.num_head,
                self.embed_dim,
                batch_first=False
            ) for i in range(self.num_block)
        ])
        self.logit = nn.Linear(self.embed_dim, num_class)

    def forward(self, xyz):
        L = xyz.shape[0]
        x_embed = self.x_embed(xyz.flatten(1)) 
        x = x_embed[:L] + self.pos_embed[:L]
        x = torch.cat([
            self.cls_embed,
            x
        ],0)
        #x = x.unsqueeze(1)

        #for block in self.encoder: x = block(x) #remove tflite loop
        x = self.encoder[0](x)
        cls = x[[0]]
        logit = self.logit(cls)
        return logit
    

In [18]:
inputnet = InputNet()
# inputnet.load_state_dict(torch.load('/kaggle/input/gislr-saved-models/00007200.model_512_80.pth', map_location= torch.device('cpu'))['state_dict'])
singlenet = SingleNet()
singlenet.load_state_dict(torch.load('/kaggle/input/gislr-saved-models/00007200.model_512_80.pth', map_location= torch.device('cpu'))['state_dict'])

<All keys matched successfully>

In [19]:
dummy_tensor = torch.zeros((23, 82, 3))
print(torch.argmax(singlenet(dummy_tensor)))

tensor(87)


In [20]:
dummy_tensor2 = batch['xyz'][0]
print(torch.argmax(singlenet(dummy_tensor2)))

tensor(25)


In [21]:
dum = torch.zeros((100,543,3))
print(inputnet(dum).shape)

torch.Size([80, 82, 3])


In [22]:
if 0:
    # converting to onnx
    dummy_inputnet = torch.zeros((100, 543, 3))
    dummy_singlenet = torch.zeros((80, 82, 3))

    # input_tensor = {'xyz': dummy_tensor,
    #     'label': 'blow'} # = batch
    singlenet.eval()

    torch.onnx.export(
        inputnet,
        dummy_inputnet,
        "GISLR_model.input.onnx",
        export_params=True,
        opset_version=12,
        do_constant_folding=True,
        input_names=['inputs'],
        output_names=['outputs'],
        dynamic_axes={
            'inputs' : {0 : 'length'}
        }
    )

    torch.onnx.export(
        singlenet,               
        dummy_singlenet,                   
        "GISLR_model.single.onnx",   
        export_params=True,  
        opset_version=12,    
        do_constant_folding=True, 
        input_names = ['inputs'],  
        output_names = ['outputs'],
        dynamic_axes={
            'inputs' : {0 : 'length'}
        }
    )

In [23]:
if 0:
    for f in ["GISLR_model.input.onnx", "GISLR_model.single.onnx"]:
        if f is None: continue
        model = onnx.load(f)
        onnx.checker.check_model(model)
        model_simple, check = onnxsim.simplify(model)
        onnx.save(model_simple, f)
    print('onnx simplify() passed !!')

# Checks and Submission

In [24]:
tflite_file = '/kaggle/input/gislr-saved-models/transformer_80.tflite'

In [25]:
import pandas as pd
import numpy as np
import os
import shutil
from datetime import datetime
from timeit import default_timer as timer

mode = 'submit'

if mode in ['debug']:  
    try:
        import tflite_runtime
    except:
        !pip install tflite-runtime

    import tflite_runtime.interpreter as tflite   
    import tflite_runtime
    print(tflite_runtime.__version__)
    #'2.11.0'
    
    import tensorflow as tf
    print(tf.__version__)
    # 2.11.0

print('import ok')

import ok


In [26]:
#helper functions
def time_to_str(t, mode='min'):
    if mode=='min':
        t  = int(t)/60
        hr = t//60
        min = t%60
        return '%2d hr %02d min'%(hr,min)

    elif mode=='sec':
        t   = int(t)
        min = t//60
        sec = t%60
        return '%2d min %02d sec'%(min,sec)

    else:
        raise NotImplementedError
        
ROWS_PER_FRAME = 543
def load_relevant_data_subset(pq_path):
    data_columns = ['x', 'y', 'z']
    data = pd.read_parquet(pq_path, columns=data_columns)
    n_frames = int(len(data) / ROWS_PER_FRAME)
    data = data.values.reshape(n_frames, ROWS_PER_FRAME, len(data_columns))
    return data.astype(np.float32)

if mode in ['debug']: 
 
    interpreter = tflite.Interpreter(tflite_file)
    prediction_fn = interpreter.get_signature_runner('serving_default')

    valid_df = pd.read_csv('/kaggle/input/asl-demo/train_prepared.csv') 
    valid_df = valid_df[valid_df.fold==2].reset_index(drop=True)
    valid_df = valid_df[:4_000]
    valid_num = len(valid_df)
    valid = {
        'sign':[],
    }
    
    start_timer = timer()
    for t, d in valid_df.iterrows():

        pq_file = f'/kaggle/input/asl-signs/{d.path}'
        #print(pq_file)
        xyz = load_relevant_data_subset(pq_file)

        output = prediction_fn(inputs=xyz)
        p = output['outputs'].reshape(-1)

        valid['sign'].append(p)

        #---
        if t%100==0:
            time_taken = timer() - start_timer
            print('\r %8d / %d  %s'%(t,valid_num,time_to_str(time_taken,'sec')),end='',flush=True)

    print('\n')
    
    truth = valid_df.label.values
    sign  = np.stack(valid['sign'])
    predict = np.argsort(-sign, -1)
    correct = predict==truth.reshape(valid_num,1)
    topk = correct.cumsum(-1).mean(0)[:5]


    print(f'time_taken = {time_to_str(time_taken,"sec")}')
    print(f'time_taken for LB = {time_taken*1000/valid_num:05f} msec\n')
    for i in range(5):
        print(f'topk[{i}] = {topk[i]}')  
    print('----- end -----\n')

shutil.copyfile(tflite_file, 'model.tflite') 
!zip submission.zip  'model.tflite'
!ls

print('tflite_file:', tflite_file)
print(f'submit ok')


  adding: model.tflite (deflated 8%)
__notebook__.ipynb  model.tflite  submission.zip
tflite_file: /kaggle/input/gislr-saved-models/transformer_80.tflite
submit ok


#### Skip the code below

In [27]:
# !pip install onnxsim
# !pip install onnxruntime
# !pip install onnx_tf

In [28]:
# import onnx
# import onnxruntime
# import onnxsim
# from onnx_tf.backend import prepare

# # model = onnx.load('/kaggle/working/GISLR_model.onnx') # if not already saved, uncomment this
# model = onnx.load('/kaggle/input/gislr-saved-models/GISLR_model.onnx')
# onnx.checker.check_model(model)
# model_simple, check = onnxsim.simplify(model)
# onnx.save(model_simple, "GISLR_model_simple.onnx")

In [29]:
# # converting onnx simplified to tflite
# tf_rep = prepare(onnx.load('/kaggle/input/gislr-saved-models/GISLR_model_simple.onnx'))
# tf_rep.export_graph('')
# # done these files created: variables/, assets/, saved_model.pb, fingerprint.pb

In [30]:
# import tensorflow as tf
# class TFModel(tf.Module):
#     def __init__(self):
#         super().__init__()
#         self.max_length = 80
#         self.model = tf.saved_model.load('')
#         self.model.trainable = False
    
#     @tf.function(input_signature=[
#         tf.TensorSpec(shape=[None, 543, 3], dtype=tf.float32, name='inputs')
#     ])
    
#     def call(self, xyz):
# #         xyz_mean = tf.math.reduce_mean(xyz[~tf.math.is_nan(xyz)], axis=(0, 1, 2), keepdims=True)
# #         xyz_std = tf.math.reduce_std(xyz[~tf.math.is_nan(xyz)], axis=(0, 1, 2), keepdims=True)
# #         xyz = (xyz - xyz_mean) / xyz_std  # normalize to common mean and std
#         nan_mask = tf.math.is_nan(xyz)
#         valid_mask = ~nan_mask
#         masked_xyz = tf.boolean_mask(xyz, valid_mask)
#         xyz_mean = tf.math.reduce_mean(masked_xyz, axis=0, keepdims=True)
#         xyz_std = tf.math.reduce_std(masked_xyz, axis=0, keepdims=True)
#         normalized_xyz = (xyz - xyz_mean) / xyz_std
#         xyz = tf.where(nan_mask, tf.zeros_like(xyz), normalized_xyz)

#         LIP = [
#             61, 185, 40, 39, 37, 0, 267, 269, 270, 409,
#             291, 146, 91, 181, 84, 17, 314, 405, 321, 375,
#             78, 191, 80, 81, 82, 13, 312, 311, 310, 415,
#             95, 88, 178, 87, 14, 317, 402, 318, 324, 308,
#         ]
        
#         lip_indices = tf.constant(LIP, dtype=tf.int32)
#         #LHAND = np.arange(468, 489).tolist()
#         #RHAND = np.arange(522, 543).tolist()

# #         lip = xyz[:, LIP]
#         lip = tf.gather(xyz, lip_indices, axis=1)
#         lhand = xyz[:, 468:489]
#         rhand = xyz[:, 522:543]
#         xyz = tf.concat([  # (none, 82, 3)
#             lip,
#             lhand,
#             rhand,
#         ], 1)
#         xyz = tf.where(tf.math.is_nan(xyz), tf.zeros_like(xyz), xyz)  # set NaN values to zero
#         x = xyz[:self.max_length]
#         print(x.shape)
        
#         y={}
#         y['outputs'] = self.model(**{'inputs':x})['outputs'][0]
#         return y


In [31]:
# tfmodel = TFModel()
# tf.saved_model.save(tfmodel, "", signatures={'serving_default':tfmodel.call})