In [1]:
import warnings
warnings.simplefilter(action='ignore')
from focal_loss import SparseCategoricalFocalLoss
import pandas as pd
import tensorflow as tf
import glob
import json
import numpy as np
import random
import os
import tensorflow_io as tfio

In [2]:
def seed_it_all(seed=7):
    """Seed the Python, NumPy and TensorFlow random generators with one value.

    Best-effort reproducibility helper: the same ``seed`` is written to the
    ``PYTHONHASHSEED`` environment variable and handed to ``random``,
    ``numpy.random`` and ``tf.random``.

    Args:
        seed: Integer seed shared by every generator (default 7).
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    # Seed the stdlib generator first, then NumPy, then TensorFlow.
    for apply_seed in (random.seed, np.random.seed, tf.random.set_seed):
        apply_seed(seed)

# Seed every RNG once (default seed 7) before the data split and model construction below.
seed_it_all()

In [3]:
# Number of landmark rows stored for every frame of a parquet file.
ROWS_PER_FRAME = 543
def load_relevant_data_subset(pq_path):
    """Load the x/y/z landmark coordinates of one parquet file as a 3-D array.

    Args:
        pq_path: Path of a parquet file holding 'x', 'y' and 'z' columns with
            exactly ``ROWS_PER_FRAME`` consecutive rows per frame.

    Returns:
        ``np.float32`` array of shape ``(n_frames, ROWS_PER_FRAME, 3)``.

    Raises:
        ValueError: If the number of rows is not a multiple of
            ``ROWS_PER_FRAME`` (the file cannot be split into whole frames).
    """
    data_columns = ['x', 'y', 'z']
    data = pd.read_parquet(pq_path, columns=data_columns)
    # Exact integer arithmetic; a remainder means a truncated or malformed file.
    n_frames, leftover_rows = divmod(len(data), ROWS_PER_FRAME)
    if leftover_rows:
        raise ValueError(
            f"{pq_path}: {len(data)} rows is not a multiple of "
            f"ROWS_PER_FRAME={ROWS_PER_FRAME}"
        )
    data = data.values.reshape(n_frames, ROWS_PER_FRAME, len(data_columns))
    return data.astype(np.float32)

#test_data  = load_relevant_data_subset('train_landmark_files/16069/100015657.parquet')

In [4]:
# Column indices of the lip landmarks inside each frame's block of face
# landmarks; CustomData.load_video applies them as face[:, LIP, :].
# The list has 40 entries, matching CFG.lip_ROWS_per_frame.
# NOTE(review): presumably MediaPipe face-mesh lip-contour indices — confirm
# against the landmark specification.
LIP = [
    61, 185, 40, 39, 37,
    0, 267, 269, 270, 409,
    291, 146, 91, 181, 84,
    17, 314, 405, 321, 375,
    78, 191, 80, 81, 82,
    13, 312, 311, 310, 415,
    95, 88, 178, 87, 14,
    317, 402, 318, 324, 308,
]


In [5]:
class CFG:
    """Shared constants for the data pipeline and the model input shapes."""

    # Batch size handed to the CustomData generators.
    batch_size = 32
    # Number of frames every clip is resized to.
    sequence_length = 20

    # Landmarks per frame for one hand (used for both the left and right hand).
    left_ROWS_per_frame = 21
    # Landmarks per frame for the face block.
    face_ROWS_per_frame = 468
    # Landmarks per frame kept for the lips (the length of LIP).
    lip_ROWS_per_frame = 40

from sklearn.model_selection import train_test_split

# Mapping from sign name to integer class index; the context manager closes
# the file handle instead of leaving it to the garbage collector.
with open('sign_to_prediction_index_map.json', 'r') as labels_file:
    labels = json.load(labels_file)

complete_df = pd.read_csv('train.csv')
y = complete_df['sign']
# Stratified 80/20 split on the sign label. random_state pins the split so it
# no longer depends on how much of the global NumPy RNG was consumed before
# this cell ran (7 is the default seed of seed_it_all).
train_df, test_df = train_test_split(
    complete_df, test_size=0.2, stratify=y, random_state=7
)

In [6]:
def build_loader(with_labels=True):
    """Build a function that loads one landmark parquet file as hand tensors.

    The parquet file is read with pandas: the hand extraction below needs
    DataFrame boolean-mask indexing, which a ``tfio`` dataset object does not
    provide.

    Args:
        with_labels: When True the returned loader takes ``(path, label)`` and
            also returns the class index looked up in the module-level
            ``labels`` mapping; otherwise it takes only ``path``.

    Returns:
        ``load_video_with_labels`` if ``with_labels`` else ``load_video``.
        ``load_video`` returns ``[left_hand, right_hand]``, each a float32
        tensor of shape ``(CFG.sequence_length, CFG.left_ROWS_per_frame, 3)``.
    """
    def _hand_tensor(video_df, hand_type):
        # x/y/z rows of one hand, grouped into frames of 21 landmarks.
        coords = video_df.loc[video_df['type'] == hand_type, ['x', 'y', 'z']].values
        coords = coords.reshape(-1, CFG.left_ROWS_per_frame, 3)
        if len(coords) == 0:
            # No rows for this hand: return zeros instead of resizing an empty image.
            return tf.zeros(
                shape=(CFG.sequence_length, CFG.left_ROWS_per_frame, 3),
                dtype=tf.float32,
            )
        # Treat (frames, landmarks, xyz) as an image and resample the frame
        # axis to a fixed length; the landmark axis keeps its size.
        return tf.image.resize(coords, (CFG.sequence_length, CFG.left_ROWS_per_frame))

    def load_video(video_path):
        # NaN coordinates are passed through unchanged (no fill or drop here).
        video_df = pd.read_parquet(video_path, engine='pyarrow')
        left_hand_array = _hand_tensor(video_df, 'left_hand')
        right_hand_array = _hand_tensor(video_df, 'right_hand')
        return [left_hand_array, right_hand_array]

    def load_video_with_labels(path, label):
        return load_video(path), labels[label]

    return load_video_with_labels if with_labels else load_video

In [7]:
class CustomData(tf.keras.utils.Sequence):
    """Keras ``Sequence`` yielding batches of left-hand, right-hand and lip clips.

    Every sample is read from the parquet file named in the ``path`` column of
    ``df``. The x/y coordinates of each landmark group are min-max scaled over
    the whole clip and resampled to ``num_frames`` frames. The target is the
    integer class index of the row's ``sign``.

    Args:
        df: DataFrame with at least the columns ``path`` and ``sign``.
        num_frames: Number of frames every clip is resized to.
        batch_size: Samples per batch. A trailing partial batch is dropped
            (see ``__len__``).
        shuffle: Reshuffle the sample order at construction and after every epoch.
        labels_path: JSON file mapping sign name to class index.
    """

    def __init__(self,df,num_frames=20,batch_size=8,shuffle=True,
                 labels_path='sign_to_prediction_index_map.json'):
        super().__init__()
        self.df = df
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.num_frames = num_frames
        # Honour the labels_path argument and close the file once parsed.
        with open(labels_path, 'r') as labels_file:
            self.labels = json.load(labels_file)
        self.on_epoch_end()

    def on_epoch_end(self):
        """Reset the sample order, shuffling it when ``shuffle`` is set."""
        self.indexes = np.arange(len(self.df))
        if self.shuffle:
            np.random.shuffle(self.indexes)

    def __len__(self):
        """Number of full batches per epoch; a trailing partial batch is dropped."""
        return len(self.df)//self.batch_size

    def __getitem__(self,index):
        """Return batch ``index`` as ``([left, right, lip], labels)``.

        Each input is a float32 array of shape
        ``(batch, num_frames, landmarks, 2)``; ``labels`` is a 1-D array of
        class indices.
        """
        batches = self.indexes[index*self.batch_size:(index+1)*self.batch_size]
        n_samples = len(batches)
        # The clips are float32 tensors, so float32 buffers store them losslessly.
        left_hand_input = np.zeros(
            shape=(n_samples,self.num_frames,CFG.left_ROWS_per_frame,2), dtype=np.float32)
        right_hand_input = np.zeros(
            shape=(n_samples,self.num_frames,CFG.left_ROWS_per_frame,2), dtype=np.float32)
        lip_input = np.zeros(
            shape=(n_samples,self.num_frames,CFG.lip_ROWS_per_frame,2), dtype=np.float32)
        labels = []
        for i,row_val in enumerate(batches):
            row = self.df.iloc[row_val]
            left_hand,right_hand,lip = self.load_video(row['path'])
            left_hand_input[i,:] = left_hand
            right_hand_input[i,:] = right_hand
            lip_input[i,:] = lip
            labels.append(self.labels[row['sign']])
        return [left_hand_input,right_hand_input,lip_input],np.asarray(labels)

    def load_video(self,video_path):
        """Load one parquet file and return ``(left_hand, right_hand, lip)`` clips.

        Rows containing NaN are dropped first. Each clip is a float32 tensor of
        shape ``(num_frames, landmarks, 2)``; a landmark group with no
        remaining rows yields an all-zero clip.
        """
        video_df = pd.read_parquet(video_path, engine='pyarrow').dropna()

        left_values = self._xy(video_df, 'left_hand').reshape(-1,CFG.left_ROWS_per_frame,2)
        right_values = self._xy(video_df, 'right_hand').reshape(-1,CFG.left_ROWS_per_frame,2)
        face_values = self._xy(video_df, 'face').reshape(-1,CFG.face_ROWS_per_frame,2)
        # Keep only the lip landmarks of the face block.
        lip_values = face_values[:,LIP,:]

        return (
            self._to_clip(left_values),
            self._to_clip(right_values),
            self._to_clip(lip_values),
        )

    @staticmethod
    def _xy(video_df, landmark_type):
        """Return the x/y values of all rows of one landmark type as an (n, 2) array."""
        return video_df.loc[video_df['type'] == landmark_type, ['x','y']].values

    def _to_clip(self, coords):
        """Min-max scale ``(frames, landmarks, 2)`` coords and resample to ``num_frames``."""
        n_landmarks = coords.shape[1]
        if len(coords) == 0:
            return tf.zeros(shape=(self.num_frames, n_landmarks, 2), dtype=tf.float32)
        # Scale x and y independently, using the min and max over the whole clip.
        lowest = coords.min(axis=(0, 1), keepdims=True)
        span = coords.max(axis=(0, 1), keepdims=True) - lowest
        # A constant channel has zero range: map it to 0 instead of dividing by zero.
        span[span == 0] = 1
        scaled = (coords - lowest) / span
        # Treat (frames, landmarks, xy) as an image; only the frame axis changes size.
        return tf.image.resize(scaled, (self.num_frames, n_landmarks))

In [8]:
# Batch generator over the training split.
train_datagen = CustomData(
    train_df,
    num_frames=CFG.sequence_length,
    batch_size=CFG.batch_size,
)
# Batch generator over the held-out split, passed to fit() as validation data.
test_datagen = CustomData(
    test_df,
    num_frames=CFG.sequence_length,
    batch_size=CFG.batch_size,
)

In [22]:
def conv1d_lstm_block(inputs, filters):
    """Encode one landmark clip into a flat feature vector with a small 2-D CNN.

    Despite its name the block contains no LSTM. It is three
    BatchNormalization -> Conv2D(3x3) stages with 32, 64 and 64 filters, with
    max pooling after the first two, followed by Flatten. An earlier ConvLSTM1D
    variant was dropped.

    Args:
        inputs: Keras tensor to encode.
        filters: Unused; the filter counts are fixed inside the block. Kept so
            existing callers keep working.

    Returns:
        The flattened output of the last convolution.
    """
    layers = tf.keras.layers
    # (conv filters, whether a max-pool follows) for each stage, in order.
    stages = ((32, True), (64, True), (64, False))

    features = inputs
    for n_filters, pool_after in stages:
        features = layers.BatchNormalization()(features)
        features = layers.Conv2D(filters=n_filters, kernel_size=3)(features)
        if pool_after:
            features = layers.MaxPool2D()(features)
    return layers.Flatten()(features)

def get_model(num_classes=250, learning_rate=0.000333):
    """Build and compile the three-branch sign classifier.

    The left-hand, right-hand and lip clips each pass through their own
    ``conv1d_lstm_block`` encoder. The three flat feature vectors are
    concatenated and classified by Dense(512, relu) -> Dropout(0.3) ->
    Dense(num_classes, softmax).

    Args:
        num_classes: Size of the softmax output (default 250).
        learning_rate: Adam learning rate (default 0.000333).

    Returns:
        A compiled ``tf.keras.Model`` taking ``[left_hand, right_hand, lip]``
        inputs of shape ``(CFG.sequence_length, landmarks, 2)``, trained with
        sparse categorical focal loss (gamma=2) and tracking accuracy.
    """
    input1 = tf.keras.Input((CFG.sequence_length, CFG.left_ROWS_per_frame, 2), dtype=tf.float32)
    input2 = tf.keras.Input((CFG.sequence_length, CFG.left_ROWS_per_frame, 2), dtype=tf.float32)
    input3 = tf.keras.Input((CFG.sequence_length, CFG.lip_ROWS_per_frame, 2), dtype=tf.float32)
    # The second argument is ignored by conv1d_lstm_block; kept as originally passed.
    left_hand_vector = conv1d_lstm_block(input1, [64])
    right_hand_vector = conv1d_lstm_block(input2, [64])
    lip_vector = conv1d_lstm_block(input3, [64])
    # Each branch is already flat, so axis 1 is the feature axis.
    vector = tf.keras.layers.Concatenate(axis=1)([left_hand_vector, right_hand_vector,lip_vector])
    # No-op on the already flat tensor; kept so the layer stack is unchanged.
    vector = tf.keras.layers.Flatten()(vector)
    vector = tf.keras.layers.Dense(512, activation="relu")(vector)
    vector = tf.keras.layers.Dropout(0.3)(vector)
    output = tf.keras.layers.Dense(num_classes, activation="softmax")(vector)
    model = tf.keras.Model(inputs=[input1,input2,input3], outputs=output)
    model.compile(
        tf.keras.optimizers.Adam(learning_rate),
        loss=SparseCategoricalFocalLoss(gamma=2),
        # A list is the documented form; newer Keras releases reject a bare string.
        metrics=["accuracy"],
    )
    return model

In [23]:
# Build and compile the three-input classifier with get_model's defaults.
model = get_model()

In [24]:
# Print the layer-by-layer output shapes and parameter counts.
model.summary()

Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_16 (InputLayer)          [(None, 20, 21, 2)]  0           []                               
                                                                                                  
 input_17 (InputLayer)          [(None, 20, 21, 2)]  0           []                               
                                                                                                  
 input_18 (InputLayer)          [(None, 20, 40, 2)]  0           []                               
                                                                                                  
 batch_normalization_11 (BatchN  (None, 20, 21, 2)   8           ['input_16[0][0]']               
 ormalization)                                                                              

                                                                                                  
Total params: 560,178
Trainable params: 559,590
Non-trainable params: 588
__________________________________________________________________________________________________


In [None]:
# Where the best weights (by validation accuracy) are written.
file_name = "models/032623_15_16.h5"

# Save the model only when val_accuracy reaches a new maximum.
best_checkpoint = tf.keras.callbacks.ModelCheckpoint(
    file_name,
    monitor="val_accuracy",
    mode="max",
    save_best_only=True,
    verbose=1,
)
# Cut the learning rate tenfold after 3 epochs without a val_accuracy gain,
# never going below 1e-6.
lr_on_plateau = tf.keras.callbacks.ReduceLROnPlateau(
    monitor='val_accuracy',
    mode='max',
    factor=0.1,
    patience=3,
    min_lr=0.000001,
    verbose=1,
)
callbacks = [best_checkpoint, lr_on_plateau]

model.fit(
    train_datagen,
    validation_data=test_datagen,
    epochs=30,
    callbacks=callbacks,
)

Epoch 1/30
Epoch 1: val_accuracy improved from -inf to 0.39518, saving model to models/032623_15_16.h5
Epoch 2/30
Epoch 2: val_accuracy improved from 0.39518 to 0.47850, saving model to models/032623_15_16.h5
Epoch 3/30
Epoch 3: val_accuracy improved from 0.47850 to 0.51849, saving model to models/032623_15_16.h5
Epoch 4/30
Epoch 4: val_accuracy improved from 0.51849 to 0.54894, saving model to models/032623_15_16.h5
Epoch 5/30
Epoch 5: val_accuracy improved from 0.54894 to 0.57246, saving model to models/032623_15_16.h5
Epoch 6/30
Epoch 6: val_accuracy improved from 0.57246 to 0.57564, saving model to models/032623_15_16.h5
Epoch 7/30
Epoch 7: val_accuracy improved from 0.57564 to 0.59560, saving model to models/032623_15_16.h5
Epoch 8/30
Epoch 8: val_accuracy improved from 0.59560 to 0.59952, saving model to models/032623_15_16.h5
Epoch 9/30
Epoch 9: val_accuracy improved from 0.59952 to 0.60805, saving model to models/032623_15_16.h5
Epoch 10/30
Epoch 10: val_accuracy did not improv