In [None]:
import cv2
import numpy as np
import os
from matplotlib import pyplot as plt
import time
import mediapipe as mp
from glob import glob
import shutil
import math
import json
import subprocess
from tensorflow.keras.utils import to_categorical

In [3]:
mp_drawing = mp.solutions.drawing_utils #Drawing utilities
mp_holistic = mp.solutions.holistic   #Holistic model

In [4]:
frame_width, frame_height = (256,256)

In [37]:
num_frame_per_video = 25

In [6]:
with open("WLASL/start_kit/WLASL_v0.3.json", "r") as file:
    content = json.load(file)

In [7]:
def mediapipe_detection(image, model):
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) #color conversion
    image.flags.writeable = False
    results = model.process(image)
    image.flags.writeable = True
    image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)
    return image, results

In [8]:
def draw_landmark(image, results):
    mp_drawing.draw_landmarks(image, results.face_landmarks, mp_holistic.FACEMESH_CONTOURS)
    mp_drawing.draw_landmarks(image, results.pose_landmarks, mp_holistic.POSE_CONNECTIONS)
    mp_drawing.draw_landmarks(image, results.left_hand_landmarks, mp_holistic.HAND_CONNECTIONS)
    mp_drawing.draw_landmarks(image, results.right_hand_landmarks, mp_holistic.HAND_CONNECTIONS)

In [9]:
def draw_styled_landmark(image, results):
    mp_drawing.draw_landmarks(image, results.face_landmarks, mp_holistic.FACEMESH_CONTOURS,
                                mp_drawing.DrawingSpec(color=(80,110,10), thickness=1, circle_radius=1),
                                mp_drawing.DrawingSpec(color=(80,256,121), thickness=1, circle_radius=1))
    
    mp_drawing.draw_landmarks(image, results.pose_landmarks, mp_holistic.POSE_CONNECTIONS,
                                 mp_drawing.DrawingSpec(color=(245,117,66), thickness=2, circle_radius=4),
                                 mp_drawing.DrawingSpec(color=(245,66,230), thickness=2, circle_radius=2))
    
    mp_drawing.draw_landmarks(image, results.left_hand_landmarks, mp_holistic.HAND_CONNECTIONS,
                                 mp_drawing.DrawingSpec(color=(121,22,76), thickness=2, circle_radius=4),
                                 mp_drawing.DrawingSpec(color=(121,44,250), thickness=2, circle_radius=2))
    
    mp_drawing.draw_landmarks(image, results.right_hand_landmarks, mp_holistic.HAND_CONNECTIONS,
                                 mp_drawing.DrawingSpec(color=(80,22,10), thickness=2, circle_radius=4),
                                 mp_drawing.DrawingSpec(color=(80,44,121), thickness=2, circle_radius=2))

In [10]:
def extract_keypoints(results):
    pose = np.array([[res.x, res.y, res.z, res.visibility] for res in results.pose_landmarks.landmark]).flatten() if results.pose_landmarks else np.zeros(132)
    face = np.array([[res.x, res.y, res.z] for res in results.face_landmarks.landmark]).flatten() if results.face_landmarks else np.zeros(1404)
    lh = np.array([[res.x, res.y, res.z] for res in results.left_hand_landmarks.landmark]).flatten() if results.left_hand_landmarks else np.zeros(21*3)
    rh = np.array([[res.x, res.y, res.z] for res in results.right_hand_landmarks.landmark]).flatten() if results.right_hand_landmarks else np.zeros(21*3)
    return np.concatenate([pose, face, lh, rh])

In [11]:
def blank_keypoints():
    pose = np.zeros(132)
    face = np.zeros(1404)
    lh = np.zeros(21*3)
    rh = np.zeros(21*3)
    return np.concatenate([pose, face, lh, rh])

In [12]:
def createFolder(directory):
    try:
        if not os.path.exists(directory):
            os.makedirs(directory)
    except OSError:
        print ('Error: Creating directory. ' +  directory)

In [13]:
def makedirbyname(path):
    try:
        os.mkdir(path)
    except:
        shutil.rmtree(path)
        os.mkdir(path)

In [14]:
def get_data(train_percent=0.90):
    train_sequences, train_labels = [], []
    test_sequences, test_labels = [], []
    words = glob("Data_Numpy/*")
    for word in words:
        actual_label = word.split("/")[-1]
        videos = glob(word + "/*")
        #print(f"-------------{actual_label}-------")
        train_proportion = math.ceil(round(train_percent * len(videos)))
        for index,video_ in enumerate(videos):
            #print(f"********---> Video   {video_} ----- ")
            window = []
            for frame_num in range(1,num_frame_per_video+1):
                np_file = f"{video_}/{frame_num}.npy"
                #print(np_file)
                res = np.load(np_file)
                window.append(res)
            if index <= train_proportion:
                train_sequences.append(window)
                train_labels.append(label_map[actual_label])
            else:
                test_sequences.append(window)
                test_labels.append(label_map[actual_label])
                
    train_sequences = np.array(train_sequences)
    train_labels = np.array(train_labels)
    test_sequences = np.array(test_sequences)
    test_labels = np.array(test_labels)
                
    return train_sequences, train_labels, test_sequences, test_labels


In [15]:
def get_num_frame(filename):
    video_frame_count = 0
    try:
        cap = cv2.VideoCapture(filename)
        video_frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    except:
        pass
    finally:
        cap.release()
    return video_frame_count

In [16]:
def get_duration(filename):
    result = subprocess.run(["ffprobe", "-v", "error", "-show_entries",
                             "format=duration", "-of",
                             "default=noprint_wrappers=1:nokey=1", filename],
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT)
    return float(result.stdout)

In [17]:
def create_folders_video(content):
    for i in content:
        folder_path = f"Data/{i['gloss']}"
        createFolder(folder_path)
        for j in i['instances']:
            if os.path.exists(f"WLASL/start_kit/videos/{j['video_id']}.mp4"):
                shutil.copy(f"WLASL/start_kit/videos/{j['video_id']}.mp4", f"Data/{i['gloss']}/{j['video_id']}.mp4")

In [18]:
def get_actions(content):
    actions = []
    for i in content:
        actions.append(i['gloss'])
    return actions

In [25]:
keypoints_blank = blank_keypoints()

In [26]:
keypoints_blank.shape

(1662,)

In [31]:
actions = get_actions(content)

In [33]:
actions = np.array(actions)

In [34]:
actions.shape

(100,)

In [35]:
label_map = {label:num for num,label in enumerate(actions)}

## Generate numpy data for videos

In [None]:
words = glob("Data/*")
makedirbyname("Data_Numpy")
for word in words:
    videos = glob(word + "/*")
    for video in videos:
        video_path = "/".join("".join(video.split(".")[0:-1]).split("/")[1:])
        createFolder(f"Data_Numpy/{video_path}")
        print(f"Processing collection for... Data_Numpy/{video_path}")
        actual_video_frame = get_num_frame(video)
        if actual_video_frame >= 75:
            remind = actual_video_frame - 75
            to_start = int(remind / 3)
            j = 0
            index = 0
            try:
                cap = cv2.VideoCapture(video)
                with mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic:
                    while True:
                        ret, frame = cap.read()
                        j+=1
                        if j < to_start or j%3 != to_start%3:
                            continue
                        
                        index+=1
                        if index > 25:
                            break
                            
                        if ret == False:
                            cap.release()
                            break
                            
                            
                        #Resize the frame
                        resize_frame = cv2.resize(frame, (frame_width, frame_height))

                        # Make detections
                        image, results = mediapipe_detection(resize_frame, holistic)

                        # Drawing landmarks
                        draw_styled_landmark(image, results)

                        #Extracting keypoints
                        keypoints = extract_keypoints(results)
                        
                        if not (keypoints==keypoints_blank).all():
                            last_keypoints = keypoints
                        else:
                            index-=1
                            continue
                            
                        #Saving data
                        np.save(f"Data_Numpy/{video_path}/{index}", keypoints)
                            
                        cv2.imshow(video, image)
                        if cv2.waitKey(10) and 0xFF == ord("q"):
                            break
                            
                    cap.release()
                    cv2.destroyAllWindows()
            finally:
                while index <=25:
                    np.save(f"Data_Numpy/{video_path}/{index}", last_keypoints)
                    index+=1
                cap.release()
                cv2.destroyAllWindows()
        elif actual_video_frame >=50:
            remind = actual_video_frame - 50
            to_start = int(remind / 3)
            j = 0
            index = 0
            try:
                cap = cv2.VideoCapture(video)
                with mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic:
                    while True:
                        ret, frame = cap.read()
                        j+=1

                        if j < to_start or j%2 != to_start%2:
                            continue

                        index+=1
                        if index > 25:
                            break

                        if ret == False:
                            cap.release()
                            break
                        
                        #Resize the frame
                        resize_frame = cv2.resize(frame, (frame_width, frame_height))

                        # Make detections
                        image, results = mediapipe_detection(resize_frame, holistic)

                        # Drawing landmarks
                        draw_styled_landmark(image, results)

                        #Extracting keypoints
                        keypoints = extract_keypoints(results)
                        
                        if not (keypoints==keypoints_blank).all():
                            last_keypoints = keypoints
                        else:
                            index-=1
                            continue
                            
                        #Saving data
                        np.save(f"Data_Numpy/{video_path}/{index}", keypoints)

                        cv2.imshow(video, image)
                        if cv2.waitKey(10) and 0xFF == ord("q"):
                            break

                    cap.release()
                    cv2.destroyAllWindows()
            finally:
                while index <=25:
                    np.save(f"Data_Numpy/{video_path}/{index}", last_keypoints)
                    index+=1
                cap.release()
                cv2.destroyAllWindows()
        elif actual_video_frame >=25:
            remind = actual_video_frame - 25
            to_start = int(remind / 3)
            j = 0
            index = 0
            try:
                cap = cv2.VideoCapture(video)
                with mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic:
                    while True:
                        ret, frame = cap.read()
                        j+=1

                        if j < to_start:
                            continue

                        index+=1
                        if index > 25:
                            break

                        if ret == False:
                            cap.release()
                            break
                            
                        #Resize the frame
                        resize_frame = cv2.resize(frame, (frame_width, frame_height))

                        # Make detections
                        image, results = mediapipe_detection(resize_frame, holistic)

                        # Drawing landmarks
                        draw_styled_landmark(image, results)

                        #Extracting keypoints
                        keypoints = extract_keypoints(results)
                        
                        if not (keypoints==keypoints_blank).all():
                            last_keypoints = keypoints
                        else:
                            index-=1
                            continue
                            
                        #Saving data
                        np.save(f"Data_Numpy/{video_path}/{index}", keypoints)

                        cv2.imshow(video, image)
                        if cv2.waitKey(10) and 0xFF == ord("q"):
                            break

                    cap.release()
                    cv2.destroyAllWindows()
            finally:
                while index <=25:
                    np.save(f"Data_Numpy/{video_path}/{index}", last_keypoints)
                    index+=1
                cap.release()
                cv2.destroyAllWindows()
        else:
            j = 0
            index = 0
            try:
                cap = cv2.VideoCapture(video)
                with mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic:
                    while True:
                        ret, frame = cap.read()
                        j+=1

                        if j < to_start:
                            continue

                        index+=1
                        if index > 25:
                            break

                        if ret == False:
                            cap.release()
                            break
                            
                        #Resize the frame
                        resize_frame = cv2.resize(frame, (frame_width, frame_height))

                        # Make detections
                        image, results = mediapipe_detection(resize_frame, holistic)

                        # Drawing landmarks
                        draw_styled_landmark(image, results)

                        #Extracting keypoints
                        keypoints = extract_keypoints(results)
                        
                        if not (keypoints==keypoints_blank).all():
                            last_keypoints = keypoints
                        else:
                            index-=1
                            continue
                            
                        #Saving data
                        np.save(f"Data_Numpy/{video_path}/{index}", keypoints)

                        cv2.imshow(video, image)
                        last_frame = frame
                        if cv2.waitKey(10) and 0xFF == ord("q"):
                            break

                    cap.release()
                    cv2.destroyAllWindows()
            finally:
                while index <=25:
                    np.save(f"Data_Numpy/{video_path}/{index}", last_keypoints)
                    cv2.imshow(video, last_frame)
                    index+=1
                cap.release()
                cv2.destroyAllWindows() 

### Setup data for training

In [38]:
train_sequences, train_labels, test_sequences, test_labels = get_data(0.8)

In [39]:
train_sequences.shape

(1263, 25, 1662)

In [40]:
train_labels.shape

(1263,)

In [41]:
test_sequences.shape

(192, 25, 1662)

In [42]:
test_labels.shape

(192,)

In [43]:
X = train_sequences

In [44]:
X.shape

(1263, 25, 1662)

In [47]:
y = to_categorical(train_labels).astype(int)

In [49]:
y[0]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

### Train the network

In [63]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras.callbacks import TensorBoard
from tensorflow.keras.optimizers import Adam

In [68]:
log_dir = os.path.join("Logs")
tb_callback = TensorBoard(log_dir=log_dir)

In [69]:
model = Sequential()
model.add(LSTM(64, return_sequences=True, activation='relu', input_shape=(num_frame_per_video,1662)))
#model.add(LSTM(128, return_sequences=True, activation='relu'))
#model.add(LSTM(256, return_sequences=True, activation='relu'))
model.add(LSTM(128, return_sequences=True, activation='relu'))
model.add(LSTM(64, return_sequences=False, activation='relu'))
model.add(Dense(64, activation='relu'))
model.add(Dense(32, activation='relu'))
model.add(Dense(actions.shape[0], activation='softmax'))

In [70]:
model.compile(optimizer=Adam(learning_rate=0.0001), loss='categorical_crossentropy', metrics=['categorical_accuracy'])

In [None]:
model.fit(X, y, epochs=500, batch_size=128,callbacks=[tb_callback])

Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500
Epoch 13/500
Epoch 14/500
Epoch 15/500
Epoch 16/500
Epoch 17/500
Epoch 18/500
Epoch 19/500
Epoch 20/500
Epoch 21/500
Epoch 22/500
Epoch 23/500
Epoch 24/500
Epoch 25/500
Epoch 26/500
Epoch 27/500
Epoch 28/500
Epoch 29/500
Epoch 30/500
Epoch 31/500
Epoch 32/500
Epoch 33/500
Epoch 34/500
Epoch 35/500
