In [1]:
import cv2
import numpy as np 
import mediapipe as mp
import tensorflow as tf

In [2]:
# MediaPipe entry points: the Holistic solution module and the drawing helpers.
mp_holistic = mp.solutions.holistic
mp_drawing = mp.solutions.drawing_utils

# Holistic detector; detection and tracking confidence thresholds are both 0.5.
holistic_model = mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5)

# Capture state used by the webcam loop: number of frames processed so far
# and the list of per-frame landmark lists.
frame_counter, all_landmarks_list = 0, []

In [3]:
# OPENCV
# Open the default camera (device index 0) exactly once. Previously the device
# was opened twice; the extra handle was never released, and a second open of
# the same device can fail or keep the camera locked on some platforms.
capture = cv2.VideoCapture(0)

# `cap` is kept as an alias of the same handle so that any code still
# referring to the old name keeps working without holding a second device.
cap = capture

In [4]:
# Assumed landmark counts per MediaPipe Holistic component (33 + 468 + 21 + 21
# = 543 rows per frame). The face count assumes the detector was created
# without refined face landmarks -- TODO confirm if that option is changed.
NUM_POSE_LANDMARKS = 33
NUM_FACE_LANDMARKS = 468
NUM_HAND_LANDMARKS = 21

# Upper bound on the number of frames recorded by one run of this cell.
MAX_CAPTURE_FRAMES = 100


def _landmark_rows(landmark_list, expected_count):
    """Convert one Holistic component into a list of ``[x, y, z]`` rows.

    Args:
        landmark_list: A component from the Holistic results (e.g.
            ``results.pose_landmarks``), or a falsy value when the component
            was not detected in the frame.
        expected_count: Number of NaN rows to emit when the component is
            missing.

    Returns:
        One ``[x, y, z]`` row per landmark when the component is present,
        otherwise ``expected_count`` rows of NaN. Emitting a NaN block keeps
        every later component at the same row offset in every frame.
    """
    if not landmark_list:
        return [[np.nan, np.nan, np.nan] for _ in range(expected_count)]
    return [[lm.x, lm.y, lm.z] for lm in landmark_list.landmark]


# Start every run of this cell from a clean state so that re-running it does
# not silently skip the loop (counter already at the limit) or append new
# frames after stale ones.
frame_counter = 0
all_landmarks_list = []

try:
    while capture.isOpened() and frame_counter < MAX_CAPTURE_FRAMES:
        ret, frame = capture.read()

        if not ret:
            break

        frame = cv2.resize(frame, (800, 600))
        image = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

        # Use holistic model to detect landmarks; the frame is marked
        # read-only while it is being processed.
        image.flags.writeable = False
        results = holistic_model.process(image)
        image.flags.writeable = True

        # Convert back to BGR for rendering
        image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)

        # Draw landmarks
        mp_drawing.draw_landmarks(
            image,
            results.face_landmarks,
            mp_holistic.FACEMESH_CONTOURS,
            landmark_drawing_spec=mp_drawing.DrawingSpec(color=(0,255,255), thickness=1, circle_radius=1),
            connection_drawing_spec=mp_drawing.DrawingSpec(color=(255,0,255), thickness=1, circle_radius=1)
        )

        mp_drawing.draw_landmarks(
            image,
            results.right_hand_landmarks,
            mp_holistic.HAND_CONNECTIONS
        )

        mp_drawing.draw_landmarks(
            image,
            results.left_hand_landmarks,
            mp_holistic.HAND_CONNECTIONS
        )

        # Display the resulting image with landmarks
        cv2.imshow('Holistic Model Landmarks', image)

        # Collect this frame's landmarks in a fixed order: pose, face, left
        # hand, right hand. Missing components become NaN blocks instead of
        # being skipped, so row indices mean the same thing in every frame.
        # NOTE(review): confirm this order matches the layout expected by the
        # downstream model.
        all_landmarks = []
        for component, count in (
            (results.pose_landmarks, NUM_POSE_LANDMARKS),
            (results.face_landmarks, NUM_FACE_LANDMARKS),
            (results.left_hand_landmarks, NUM_HAND_LANDMARKS),
            (results.right_hand_landmarks, NUM_HAND_LANDMARKS),
        ):
            all_landmarks.extend(_landmark_rows(component, count))

        # Append the landmarks of this frame to the list
        all_landmarks_list.append(all_landmarks)

        frame_counter += 1

        # Pressing 'q' in the preview window stops the capture early.
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
finally:
    # Always free the camera and close the preview window, even when the loop
    # is interrupted or a frame fails to process.
    capture.release()
    cv2.destroyAllWindows()

In [5]:
# Assumed number of landmarks per MediaPipe Holistic component.
NUM_FACE_LANDMARKS = 468
NUM_HAND_LANDMARKS = 21
NUM_POSE_LANDMARKS = 33
TOTAL_LANDMARKS = NUM_FACE_LANDMARKS + 2 * NUM_HAND_LANDMARKS + NUM_POSE_LANDMARKS  # 543

# Row indices 0..542 of one frame. This is only an index list; the actual
# component order of the rows is whatever the capture loop appended (pose,
# face, left hand, right hand).
expected_landmark_order = list(range(TOTAL_LANDMARKS))

# Largest number of landmarks seen in any frame (0 when nothing was captured).
max_landmarks = max((len(landmarks) for landmarks in all_landmarks_list), default=0)

# Pad every frame with NaN rows up to TOTAL_LANDMARKS so all frames share the
# shape (543, 3).
# NOTE(review): padding is appended at the END of a frame, so a short frame
# (one where a component contributed no rows) keeps its later components at
# shifted indices -- padding alone does not restore their positions.
padded_landmarks = []
for landmarks in all_landmarks_list:
    missing = TOTAL_LANDMARKS - len(landmarks)
    if missing < 0:
        raise ValueError(
            f"Frame has {len(landmarks)} landmarks, expected at most {TOTAL_LANDMARKS}"
        )
    padded_landmarks.append(landmarks + [[np.nan, np.nan, np.nan]] * missing)

# Going through NumPy keeps the (frames, 543, 3) shape well defined even when
# no frame was captured.
landmarks_array = np.asarray(padded_landmarks, dtype=np.float32).reshape(-1, TOTAL_LANDMARKS, 3)

# Convert the landmarks to a TensorFlow tensor
all_landmarks_tensor = tf.convert_to_tensor(landmarks_array, dtype=tf.float32)

print("Shape of all landmarks tensor before reshaping:", all_landmarks_tensor.shape)

# Flatten each frame to 543 * 3 = 1629 values. The frame dimension is inferred
# instead of hard-coded to 100, so a capture that stopped early still reshapes.
all_landmarks_tensor_reshaped = tf.reshape(all_landmarks_tensor, (-1, TOTAL_LANDMARKS * 3))

print("Shape of all landmarks tensor after reshaping:", all_landmarks_tensor_reshaped.shape)

Shape of all landmarks tensor before reshaping: (100, 543, 3)
Shape of all landmarks tensor after reshaping: (100, 1629)


In [None]:
import tensorflow as tf
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedGroupKFold, GroupKFold, KFold
import gc
# !pip install ipywidgets
from tqdm.auto import tqdm
import Levenshtein
import time
import os

In [None]:
def get_model(max_len=MAX_LEN, target_len=64, dim=192, dtype='float32'):
    """Build the joint attention-decoder / CTC model.

    Three sub-models are created and wired together: an encoder made of
    Conv1DBlock / TransformerBlock stages, a GRU-based CTC decoder, and an
    attention decoder that consumes target tokens plus the encoder output.

    Args:
        max_len: Length of the input sequence axis.
        target_len: Passed to ``PosEmbedding`` as its ``max_len``.
        dim: Width used by every block and by the token embedding.
        dtype: dtype of the floating-point inputs.

    Returns:
        A ``tf.keras.Model`` taking ``[frames, tokens]`` and returning
        ``[attention_decoder_output, ctc_decoder_output]``.
    """
    kernel_size = 17
    dropout = 0.2

    def conv_block(tensor, drop=dropout, **extra):
        # One Conv1DBlock with the shared width, kernel size and expand ratio.
        return Conv1DBlock(dim, kernel_size, expand_ratio=4, drop_rate=drop, **extra)(tensor)

    def attention_block(tensor):
        # One TransformerBlock with the shared hyperparameters.
        return TransformerBlock(
            dim, expand=2, num_heads=4, drop_rate=dropout, attn_dropout=0.2
        )(tensor)

    def conv_stage(tensor):
        # Three conv blocks followed by a single transformer block.
        for _ in range(3):
            tensor = conv_block(tensor)
        return attention_block(tensor)

    # ----------------------------- encoder -----------------------------
    encoder_in = tf.keras.Input((max_len, CHANNELS), dtype=dtype)
    feats = tf.keras.layers.Dense(dim, use_bias=False, name='stem_conv')(encoder_in)
    feats = conv_stage(feats)
    feats = conv_stage(feats)
    # Strided block; drop_rate is 0 here so the whole output is never dropped.
    feats = conv_block(feats, drop=0, strides=2)
    feats = conv_stage(feats)
    feats = conv_stage(feats)
    feats = tf.keras.layers.BatchNormalization(momentum=0.95)(feats)
    encoder = tf.keras.Model(encoder_in, feats, name='encoder')

    # --------------------------- CTC decoder ---------------------------
    ctc_in = tf.keras.Input((feats.shape[1], dim), name='ctc_decoder_inp2', dtype=dtype)
    ctc = tf.keras.layers.RNN(tf.keras.layers.GRUCell(dim), return_sequences=True)(ctc_in)
    ctc = tf.keras.layers.Dense(dim * 2)(ctc)
    ctc = tf.keras.layers.Dropout(0.5)(ctc)
    # Classifier over NUM_CLASSES, which includes the sos and eos tokens.
    ctc = tf.keras.layers.Dense(NUM_CLASSES, name='ctc_classifier')(ctc)
    ctc_decoder = tf.keras.Model(ctc_in, ctc, name='ctc_decoder')

    # ------------------------ attention decoder ------------------------
    token_in = tf.keras.Input((None,), name='att_decoder_inp1', dtype='int32')
    # The sequence length is read from the CTC output, as in the original code.
    memory_in = tf.keras.Input((ctc.shape[1], dim), name='att_decoder_inp2', dtype=dtype)

    # Token embedding over NUM_CLASSES, which includes the sos token.
    tokens = tf.keras.layers.Embedding(NUM_CLASSES, dim, name='att_decoder_token_emb')(token_in)
    tokens = PosEmbedding(dim, max_len=target_len, name='att_decoder_pos_emb')(tokens)
    tokens = TransformerDecoderBlock(
        dim, expand=2, num_heads=4, attn_dropout=0.2, name='att_decoder_block1'
    )(tokens, memory_in, memory_in)
    tokens = tf.keras.layers.Dropout(0.5)(tokens)
    tokens = tf.keras.layers.Dense(NUM_CLASSES, name='att_decoder_classifier')(tokens)
    decoder = tf.keras.Model([token_in, memory_in], tokens, name='att_decoder')

    # ---------------------------- full model ---------------------------
    frames_in = tf.keras.Input((max_len, CHANNELS), dtype=dtype)
    target_in = tf.keras.Input((None,), dtype='int32')

    enc_out = encoder(frames_in)
    dec_out = decoder([target_in, enc_out])
    ctc_out = ctc_decoder(enc_out)

    return tf.keras.Model([frames_in, target_in], [dec_out, ctc_out])

# Build the model with get_model's default arguments and print its layer summary.
model = get_model()
model.summary()