In [15]:
import warnings
warnings.simplefilter(action='ignore')
from focal_loss import SparseCategoricalFocalLoss
import pandas as pd
import tensorflow as tf
from tensorflow.keras.layers import Input, Masking, ConvLSTM1D
import glob
import json
import numpy as np
import random
import os
from sklearn.model_selection import GroupShuffleSplit 

import tensorflow_io as tfio

In [4]:
def seed_it_all(seed=7):
    """Seed every random number generator this notebook relies on.

    Stores ``seed`` in the ``PYTHONHASHSEED`` environment variable and seeds
    Python's ``random`` module, NumPy's global generator and TensorFlow's
    global generator with the same value.

    Args:
        seed: Integer seed shared by all generators (default 7).

    Note:
        Python reads ``PYTHONHASHSEED`` when the interpreter starts, so the
        assignment made here does not change hashing in the already-running
        kernel; it is only inherited by processes started afterwards.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed, tf.random.set_seed):
        seeder(seed)

# Seed all generators once, with the default seed, before any data is split or any model is built.
seed_it_all()

In [5]:
# Number of landmark rows stored for each frame of a landmark parquet file.
ROWS_PER_FRAME = 543


def load_relevant_data_subset(pq_path):
    """Read the x/y/z columns of a landmark parquet file as a 3-D array.

    Args:
        pq_path: Path of the parquet file to read.

    Returns:
        ``np.float32`` array of shape ``(n_frames, ROWS_PER_FRAME, 3)`` where
        ``n_frames`` is the row count divided by ``ROWS_PER_FRAME``.

    Raises:
        ValueError: Raised by the reshape when the row count is not a
            multiple of ``ROWS_PER_FRAME``.
    """
    coord_cols = ['x', 'y', 'z']
    coords = pd.read_parquet(pq_path, columns=coord_cols).values
    frame_count = len(coords) // ROWS_PER_FRAME
    per_frame = coords.reshape(frame_count, ROWS_PER_FRAME, len(coord_cols))
    return per_frame.astype(np.float32)

#test_data  = load_relevant_data_subset('train_landmark_files/16069/100015657.parquet')

In [6]:
# Landmark indices selected from each frame's face landmarks to form the lip
# input (used as ``face[:, LIP, :]``). Flat list of 40 indices, ten per row.
LIP = [
    *(61, 185, 40, 39, 37, 0, 267, 269, 270, 409),
    *(291, 146, 91, 181, 84, 17, 314, 405, 321, 375),
    *(78, 191, 80, 81, 82, 13, 312, 311, 310, 415),
    *(95, 88, 178, 87, 14, 317, 402, 318, 324, 308),
]


In [7]:
class CFG:
    """Static settings shared by the data loaders and the model."""

    # Batching / sequence settings.
    batch_size = 32
    sequence_length = 20

    # Landmark rows per frame for each body part.
    left_ROWS_per_frame = 21  # the loaders reuse this value for the right hand
    face_ROWS_per_frame = 468
    lip_ROWS_per_frame = 40

# Mapping used to turn a row's `sign` value into its class index.
# Opened in a `with` block so the file handle is closed deterministically.
with open('sign_to_prediction_index_map.json', 'r') as labels_file:
    labels = json.load(labels_file)

complete_df = pd.read_csv('train.csv')
from sklearn.model_selection import train_test_split
y = complete_df['sign']
# Stratified 80/20 split. `random_state` pins the split so that re-running this
# cell (or running cells out of order) cannot silently move rows between the
# train and validation sets; 7 is the same value seed_it_all uses by default.
train_df, test_df = train_test_split(
    complete_df, test_size=0.2, stratify=y, random_state=7
)

In [8]:
# Preview the first rows of the training index (path, participant_id, sequence_id, sign).
complete_df.head()

Unnamed: 0,path,participant_id,sequence_id,sign
0,train_landmark_files/26734/1000035562.parquet,26734,1000035562,blow
1,train_landmark_files/28656/1000106739.parquet,28656,1000106739,wait
2,train_landmark_files/16069/100015657.parquet,16069,100015657,cloud
3,train_landmark_files/25571/1000210073.parquet,25571,1000210073,bird
4,train_landmark_files/62590/1000240708.parquet,62590,1000240708,owie


In [41]:
def build_loader(with_labels=True):
    """Return a function that loads one landmark file as fixed-length hand arrays.

    Args:
        with_labels: When True (default) the returned function takes
            ``(path, label)`` and returns ``(arrays, labels[label])``;
            otherwise it takes only ``path`` and returns the arrays.

    Returns:
        ``load_video_with_labels`` or ``load_video``. In both cases the arrays
        are ``[left_hand, right_hand]``, each resized to
        ``(CFG.sequence_length, CFG.left_ROWS_per_frame, 3)``.
    """
    def resample_part(video_df, part):
        """Select one landmark type and resize it to CFG.sequence_length frames."""
        coords = video_df.loc[video_df['type'] == part, ['x', 'y', 'z']].values
        coords = coords.reshape(-1, CFG.left_ROWS_per_frame, 3)
        # The (frames, landmarks, xyz) array is handed to tf.image.resize as if it
        # were a (height, width, channels) image, so only the frame axis changes length.
        return tf.image.resize(coords, (CFG.sequence_length, CFG.left_ROWS_per_frame))

    def load_video(video_path):
        """Load ``video_path`` and return ``[left_hand_array, right_hand_array]``."""
        # Read with pandas: tfio.IODataset.from_parquet yields a dataset object,
        # which supports neither `.type` column access nor boolean-mask indexing,
        # so the filtering below could never run on it.
        video_df = pd.read_parquet(
            video_path, engine='pyarrow', columns=['type', 'x', 'y', 'z']
        )
        # Missing coordinates become 0, as in CustomData.load_video.
        video_df = video_df.fillna(0)
        return [
            resample_part(video_df, 'left_hand'),
            resample_part(video_df, 'right_hand'),
        ]

    def load_video_with_labels(path, label):
        """Load ``path`` and pair the arrays with the class index of ``label``."""
        return load_video(path), labels[label]

    return load_video_with_labels if with_labels else load_video

In [42]:
class CustomData(tf.keras.utils.Sequence):
    """Keras ``Sequence`` that yields padded left-hand, right-hand and lip batches.

    Each item is ``([left, right, lip], labels)``:

    * ``left`` / ``right``: float32 tensors of shape
      ``(batch, max_frames, CFG.left_ROWS_per_frame * 2)``
    * ``lip``: float32 tensor of shape ``(batch, max_frames, len(LIP) * 2)``
    * ``labels``: NumPy array with one entry of the labels mapping per sample

    ``max_frames`` is the longest video in the batch; shorter videos are
    zero-padded at the front.
    """

    def __init__(self,df,num_frames=20,batch_size=8,shuffle=True,\
                 labels_path='sign_to_prediction_index_map.json'):
        """Store the settings, load the label mapping and build the first index order.

        Args:
            df: DataFrame with at least the ``path`` and ``sign`` columns.
            num_frames: Stored on the instance; not used by this class itself.
            batch_size: Number of rows per batch.
            shuffle: Whether to reshuffle the row order in ``on_epoch_end``.
            labels_path: JSON file mapping each sign to its class index.
        """
        self.df = df
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.num_frames = num_frames
        # Honour the caller-supplied path (previously ignored in favour of a
        # hard-coded file name) and close the file once it has been read.
        with open(labels_path, 'r') as labels_file:
            self.labels = json.load(labels_file)
        self.on_epoch_end()

    def on_epoch_end(self):
        """Rebuild the row order, shuffling it when ``self.shuffle`` is set."""
        self.indexes = np.arange(len(self.df))
        if self.shuffle:
            np.random.shuffle(self.indexes)

    def __getitem__(self,index):
        """Load, pad and flatten batch number ``index``.

        Returns:
            ``([left_padded, right_padded, lip_padded], labels)`` as described
            in the class docstring.
        """
        batch_rows = self.indexes[index*self.batch_size:(index+1)*self.batch_size]
        left_hand_input = []
        right_hand_input = []
        lip_input = []
        labels = []
        for row_idx in batch_rows:
            row = self.df.iloc[row_idx]
            left_hand, right_hand, lip = self.load_video(row['path'])
            left_hand_input.append(left_hand)
            right_hand_input.append(right_hand)
            lip_input.append(lip)
            labels.append(self.labels[row['sign']])

        # The frame count is taken from the left-hand arrays and used as the
        # common padded length for all three inputs.
        max_length = max(left_hand.shape[0] for left_hand in left_hand_input)
        hand_width = CFG.left_ROWS_per_frame * 2
        left_padded = self._pad_and_flatten(left_hand_input, max_length, hand_width)
        right_padded = self._pad_and_flatten(right_hand_input, max_length, hand_width)
        lip_padded = self._pad_and_flatten(lip_input, max_length, len(LIP) * 2)
        return [left_padded, right_padded, lip_padded], np.asarray(labels)

    @staticmethod
    def _pad_and_flatten(sequences, max_length, n_features):
        """Front-pad ``sequences`` with zeros and merge the landmark and x/y axes.

        Each tensor is reshaped using its own leading dimensions, so no input
        depends on another input's shape.
        """
        padded = tf.keras.preprocessing.sequence.pad_sequences(
            sequences,
            maxlen=max_length,
            padding='pre',
            value=0,
            dtype='float32',
        )
        return tf.reshape(padded, shape=(padded.shape[0], padded.shape[1], n_features))

    def load_video(self,video_path):
        """Read one landmark parquet file and split it by landmark type.

        Missing values are replaced by 0. No normalisation is applied; the x/y
        coordinates are returned as stored.

        Returns:
            ``(left_values, right_values, lip_values)`` with shapes
            ``(frames, CFG.left_ROWS_per_frame, 2)`` for the hands and
            ``(frames, len(LIP), 2)`` for the lips.
        """
        # Only the columns used below are read, which keeps per-sample I/O small.
        video_df = pd.read_parquet(
            video_path, engine='pyarrow', columns=['type', 'x', 'y']
        )
        video_df = video_df.fillna(0)

        def part_xy(part, rows_per_frame):
            """Return the x/y values of one landmark type as (frames, rows, 2)."""
            xy = video_df.loc[video_df['type'] == part, ['x', 'y']].values
            return xy.reshape(-1, rows_per_frame, 2)

        left_values = part_xy('left_hand', CFG.left_ROWS_per_frame)
        right_values = part_xy('right_hand', CFG.left_ROWS_per_frame)
        face_values = part_xy('face', CFG.face_ROWS_per_frame)
        lip_values = face_values[:, LIP, :]
        return left_values, right_values, lip_values

    def __len__(self):
        """Number of full batches; a trailing partial batch is dropped."""
        return len(self.df)//self.batch_size

In [43]:
# Training batches are reshuffled by on_epoch_end (CustomData's default).
train_datagen = CustomData(train_df,num_frames=CFG.sequence_length,batch_size=CFG.batch_size)
# Validation batches keep a fixed order: CustomData drops the trailing partial
# batch, so reshuffling could score a slightly different subset each time
# on_epoch_end runs and add noise to the monitored val_accuracy.
test_datagen = CustomData(test_df,num_frames=CFG.sequence_length,batch_size=CFG.batch_size,shuffle=False)

In [44]:
def conv1d_lstm_block(inputs, filters):
    """Encode a sequence tensor with a single 64-unit LSTM layer.

    Despite the name, the current body applies no convolution.

    Args:
        inputs: Sequence tensor passed to the LSTM.
        filters: Accepted so existing call sites keep working; not used here.

    Returns:
        The output of ``tf.keras.layers.LSTM(units=64)`` applied to ``inputs``.
    """
    recurrent_layer = tf.keras.layers.LSTM(units=64)
    return recurrent_layer(inputs)

def get_model():
    """Build and compile the three-input (left hand, right hand, lips) classifier.

    Each input is a variable-length sequence of flattened x/y landmarks. All
    three pass through one shared ``Masking(mask_value=0.0)`` layer and then
    through their own ``conv1d_lstm_block``. The three encodings are
    concatenated and fed to a 512-unit ReLU layer, dropout (0.3) and a
    250-way softmax.

    Returns:
        A ``tf.keras.Model`` compiled with Adam (learning rate 0.000333),
        sparse categorical cross-entropy and the accuracy metric.
    """
    hand_width = CFG.left_ROWS_per_frame * 2
    lip_width = CFG.lip_ROWS_per_frame * 2

    # Input order is [left hand, right hand, lips].
    stream_inputs = [
        tf.keras.Input((None, width), dtype=tf.float32)
        for width in (hand_width, hand_width, lip_width)
    ]

    # A single Masking layer instance is applied to every input.
    skip_padding = Masking(mask_value=0.0)
    masked_streams = [skip_padding(stream) for stream in stream_inputs]
    encoded_streams = [conv1d_lstm_block(stream, [64]) for stream in masked_streams]

    features = tf.keras.layers.Concatenate(axis=1)(encoded_streams)
    features = tf.keras.layers.Flatten()(features)
    features = tf.keras.layers.Dense(512, activation="relu")(features)
    features = tf.keras.layers.Dropout(0.3)(features)
    class_probs = tf.keras.layers.Dense(250, activation="softmax")(features)

    model = tf.keras.Model(inputs=stream_inputs, outputs=class_probs)
    model.compile(
        tf.keras.optimizers.Adam(learning_rate=0.000333),
        loss=tf.keras.losses.SparseCategoricalCrossentropy(),
        metrics="accuracy",
    )
    return model

In [45]:
# Build and compile the model defined by get_model.
model = get_model()

In [46]:
# Print the layer-by-layer summary (output shapes and parameter counts).
model.summary()

Model: "model_3"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_31 (InputLayer)          [(None, None, 42)]   0           []                               
                                                                                                  
 input_32 (InputLayer)          [(None, None, 42)]   0           []                               
                                                                                                  
 input_33 (InputLayer)          [(None, None, 80)]   0           []                               
                                                                                                  
 masking_9 (Masking)            multiple             0           ['input_31[0][0]',               
                                                                  'input_32[0][0]',         

In [None]:
file_name = "models/040123_20_21.h5"

# Save the model to `file_name` whenever validation accuracy reaches a new maximum.
best_checkpoint = tf.keras.callbacks.ModelCheckpoint(
    file_name,
    monitor="val_accuracy",
    mode="max",
    save_best_only=True,
    verbose=1,
)

# Multiply the learning rate by 0.1 after 3 epochs without a validation-accuracy
# improvement, never going below 1e-6.
lr_on_plateau = tf.keras.callbacks.ReduceLROnPlateau(
    monitor='val_accuracy',
    mode='max',
    factor=0.1,
    patience=3,
    min_lr=0.000001,
    verbose=1,
)

callbacks = [best_checkpoint, lr_on_plateau]
model.fit(
    train_datagen,
    validation_data=test_datagen,
    epochs=30,
    callbacks=callbacks,
)

Epoch 1/30
Epoch 1: val_accuracy improved from -inf to 0.64597, saving model to models/040123_20_21.h5
Epoch 2/30
Epoch 2: val_accuracy improved from 0.64597 to 0.64661, saving model to models/040123_20_21.h5
Epoch 3/30
Epoch 3: val_accuracy did not improve from 0.64661
Epoch 4/30
Epoch 4: val_accuracy did not improve from 0.64661
Epoch 5/30
Epoch 5: val_accuracy did not improve from 0.64661

Epoch 5: ReduceLROnPlateau reducing learning rate to 3.3299998904112727e-06.
Epoch 6/30
Epoch 6: val_accuracy improved from 0.64661 to 0.64672, saving model to models/040123_20_21.h5
Epoch 7/30
Epoch 7: val_accuracy did not improve from 0.64672
Epoch 8/30
Epoch 8: val_accuracy did not improve from 0.64672
Epoch 9/30
Epoch 9: val_accuracy did not improve from 0.64672

Epoch 9: ReduceLROnPlateau reducing learning rate to 1e-06.
Epoch 10/30