In [2]:
import numpy as np
import cv2
import mediapipe as mp
from keras.src.backend.jax.image import resize
from matplotlib import pyplot as plt
import time,os

from tensorflow.python.keras.callbacks import EarlyStopping

In [3]:
def mp_detection(image,model):
    image=cv2.cvtColor(image,cv2.COLOR_BGR2RGB)
    image.flags.writeable=False
    results=model.process(image)
    image.flags.writeable=True
    image=cv2.cvtColor(image,cv2.COLOR_RGB2BGR)
    return image,results

In [4]:
# Draw landmarks onto camera feed
def draw_landmarks(image, results):
    # Face
    if results.face_landmarks:
        mp_draw.draw_landmarks(
            image,
            results.face_landmarks,
            mp.solutions.face_mesh.FACEMESH_TESSELATION,
            landmark_drawing_spec=mp_draw.DrawingSpec(color=(80, 110, 10), thickness=1, circle_radius=1),
            connection_drawing_spec=mp_draw.DrawingSpec(color=(80, 256, 121), thickness=1)
        )

    # Pose
    if results.pose_landmarks:
        mp_draw.draw_landmarks(
            image,
            results.pose_landmarks,
            mp_holistic.POSE_CONNECTIONS,
            landmark_drawing_spec=mp_draw.DrawingSpec(color=(245, 117, 66), thickness=2, circle_radius=3),
            connection_drawing_spec=mp_draw.DrawingSpec(color=(245, 66, 230), thickness=2)
        )

    # Left Hand
    if results.left_hand_landmarks:
        mp_draw.draw_landmarks(
            image,
            results.left_hand_landmarks,
            mp_holistic.HAND_CONNECTIONS,
            landmark_drawing_spec=mp_draw.DrawingSpec(color=(121, 22, 76), thickness=2, circle_radius=4),
            connection_drawing_spec=mp_draw.DrawingSpec(color=(121, 44, 250), thickness=2)
        )

    # Right Hand
    if results.right_hand_landmarks:
        mp_draw.draw_landmarks(
            image,
            results.right_hand_landmarks,
            mp_holistic.HAND_CONNECTIONS,
            landmark_drawing_spec=mp_draw.DrawingSpec(color=(245, 117, 66), thickness=2, circle_radius=4),
            connection_drawing_spec=mp_draw.DrawingSpec(color=(245, 66, 230), thickness=2)
        )

In [5]:
mp_holistic=mp.solutions.holistic # Holistic model to detect body
mp_draw=mp.solutions.drawing_utils # draw to canvas

In [26]:
"""
Right/left hand has 21 keypoints. Each keypoint is (x,y,z)
Face has 468 keypoints. Each keypoint is (x,y,z)
Pose has 33 keypoints. Each keypoint is (x,y,z,visibility)
"""
# Converts all keypoints to a combined numpy array
def extract_keypoints(results):
    rh_lk=results.right_hand_landmarks
    lh_lk=results.right_hand_landmarks
    pose_lk=results.pose_landmarks
    face_lk=results.face_landmarks

    rh=np.array([[res.x,res.y,res.z] for res in rh_lk.landmark]).flatten() if rh_lk else np.zeros(21*3)
    lh=np.array([[res.x,res.y,res.z] for res in lh_lk.landmark]).flatten() if lh_lk else np.zeros(21*3)
    face=np.array([[res.x,res.y,res.z] for res in face_lk.landmark]).flatten() if face_lk else np.zeros(468*3)
    pose=np.array([[res.x,res.y,res.z,res.visibility] for res in pose_lk.landmark]).flatten() if pose_lk else np.zeros(33*3)

    return np.concatenate([pose,face,lh,rh]) # 1-D array finally

In [33]:
cap = cv2.VideoCapture(0)
with mp_holistic.Holistic() as holistic:
    while cap.isOpened():
        ret, frame = cap.read()
        frame, results = mp_detection(frame,holistic)
        print(results)
        draw_landmarks(frame,results)
        frame=cv2.flip(frame,1) # flip the image horizontally
        cv2.imshow('frame', frame)
        # Break gracefully
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
    cap.release()
    cv2.destroyAllWindows()
    cv2.waitKey(1)  # Extra key wait to fix GUI hang on macO

I0000 00:00:1750787910.614066 1199211 gl_context.cc:369] GL version: 2.1 (2.1 Metal - 89.4), renderer: Apple M1
W0000 00:00:1750787910.677827 1202492 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1750787910.694036 1202493 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1750787910.695458 1202492 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1750787910.695458 1202487 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1750787910.695739 1202489 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support 

<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.soluti

In [34]:
results.right_hand_landmarks.landmark

[x: 0.171591043
y: 0.751055777
z: 2.43705045e-07
, x: 0.209013179
y: 0.710142672
z: -0.0144655583
, x: 0.232538804
y: 0.653808177
z: -0.0187075529
, x: 0.244895741
y: 0.602964103
z: -0.0225725453
, x: 0.241635025
y: 0.563748777
z: -0.0264175367
, x: 0.202384725
y: 0.562774241
z: -0.00276577
, x: 0.208331198
y: 0.502260327
z: -0.0107973525
, x: 0.210232943
y: 0.467582613
z: -0.0207202472
, x: 0.21099484
y: 0.434019148
z: -0.0288678277
, x: 0.177159518
y: 0.559094429
z: -0.00304999831
, x: 0.176383138
y: 0.496382028
z: -0.009471667
, x: 0.174428105
y: 0.459092021
z: -0.019669259
, x: 0.172390744
y: 0.426325202
z: -0.0282513425
, x: 0.153002053
y: 0.572413862
z: -0.00591360033
, x: 0.147092521
y: 0.51311785
z: -0.0141351828
, x: 0.14265275
y: 0.475417227
z: -0.0238859039
, x: 0.139249474
y: 0.440944254
z: -0.0315615088
, x: 0.130048886
y: 0.598787844
z: -0.0102993529
, x: 0.121963099
y: 0.55858773
z: -0.0197165962
, x: 0.11800082
y: 0.535279691
z: -0.0256499741
, x: 0.115376793
y: 0.51291

In [35]:
extract_keypoints(results).shape

(1662,)

In [6]:
DATA_PATH=os.path.join('MP_data')
actions=np.array(['Namaste','Hello','Jumping'])
no_seq=30 # 30 videos for each action
seq_len=30 # each video has 30 frames(30 numpy arrays of checkpoints)

In [7]:
for action in actions:
    for seq in range(no_seq):
        try:
            os.makedirs(os.path.join(DATA_PATH,action,str(seq)))
        except:
            pass

In [38]:
cap = cv2.VideoCapture(0)
with mp_holistic.Holistic() as holistic:
    for action in actions:
        for seq in range(no_seq):
            for frame_no in range(seq_len):
                ret, frame = cap.read()
                frame, results = mp_detection(frame,holistic)
                print(results)
                draw_landmarks(frame,results)
                frame=cv2.flip(frame,1) # flip the image horizontally
               # Add collection logic
                if frame_no==0:
                    cv2.putText(frame,'STARTING_COLLECTION',(120,200),cv2.FONT_HERSHEY_SIMPLEX,1,(0,255,0),2,cv2.LINE_AA)
                    cv2.putText(frame,f'Collecting frames for {action} Video no.{seq}',(15,12),cv2.FONT_HERSHEY_SIMPLEX,0.5,(0,0,255),1,cv2.LINE_AA)
                    cv2.imshow('Collection Feed', frame)
                    cv2.waitKey(2000)
                else:
                    cv2.putText(frame,f'Collecting frames for {action} Video no.{seq}',(15,12),cv2.FONT_HERSHEY_SIMPLEX,0.5,(0,0,255),1,cv2.LINE_AA)
                    cv2.imshow('Collection Feed', frame)

                # Save the checkpoint for the current frame
                keypoints=extract_keypoints(results)
                npy_path=os.path.join(DATA_PATH,action,str(seq),str(frame_no))
                np.save(npy_path,keypoints)

                # Break gracefully
                if cv2.waitKey(1) & 0xFF == ord('q'):
                    break
    cap.release()
    cv2.destroyAllWindows()
    cv2.waitKey(1)  # Extra key wait to fix GUI hang on macO

I0000 00:00:1750787912.750039 1199211 gl_context.cc:369] GL version: 2.1 (2.1 Metal - 89.4), renderer: Apple M1
W0000 00:00:1750787912.807783 1202566 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1750787912.821847 1202566 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1750787912.823225 1202568 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1750787912.823230 1202564 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1750787912.824144 1202563 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support 

<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.soluti

In [8]:
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical

In [9]:
label_map={label:num for num,label in enumerate(actions)}

In [10]:
label_map

{'Namaste': 0, 'Hello': 1, 'Jumping': 2}

In [11]:
X,y=[],[]
for action in actions:
    for seq in range(no_seq):
        window=[]
        for frame_no in range(seq_len):
            res=np.load(os.path.join(DATA_PATH,action,str(seq),f'{frame_no}.npy'))
            window.append(res)
        X.append(np.array(window))
        y.append(label_map[action])
X=np.array(X)
y=np.array(y)

In [12]:
X.shape

(90, 30, 1662)

In [13]:
y.shape

(90,)

In [14]:
X

array([[[ 4.66792375e-01,  4.78796840e-01, -4.94076729e-01, ...,
          1.10878818e-01,  5.36759555e-01, -1.84719291e-04],
        [ 4.66652691e-01,  4.71853137e-01, -4.93285656e-01, ...,
          0.00000000e+00,  0.00000000e+00,  0.00000000e+00],
        [ 4.66481954e-01,  4.63945717e-01, -4.98444080e-01, ...,
          0.00000000e+00,  0.00000000e+00,  0.00000000e+00],
        ...,
        [ 4.65054393e-01,  4.58856165e-01, -3.49245399e-01, ...,
          4.51278120e-01,  5.92499375e-01, -5.18550463e-02],
        [ 4.65878308e-01,  4.59723979e-01, -3.52681249e-01, ...,
          4.51549023e-01,  5.95205307e-01, -5.05430438e-02],
        [ 4.66371477e-01,  4.60248619e-01, -3.34542125e-01, ...,
          4.52878505e-01,  5.93183756e-01, -5.09524941e-02]],

       [[ 4.66879457e-01,  4.60253894e-01, -3.26299399e-01, ...,
          4.52492863e-01,  5.94906330e-01, -5.26897311e-02],
        [ 4.67589498e-01,  4.55346197e-01, -4.35248047e-01, ...,
          0.00000000e+00,  0.00000000e

In [15]:
y

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2])

In [16]:
y=to_categorical(y).astype(int)

In [17]:
y

array([[1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [0, 1, 0],
       [0, 1, 0],
       [0, 1, 0],
       [0, 1, 0],
       [0, 1, 0],
       [0, 1, 0],
       [0, 1, 0],
       [0, 1, 0],
       [0, 1, 0],
       [0, 1, 0],
       [0, 1, 0],
       [0, 1, 0],
       [0, 1, 0],
       [0, 1, 0],
       [0, 1, 0],
       [0, 1, 0],
       [0, 1, 0],
       [0, 1, 0],
       [0, 1, 0],
       [0, 1, 0],
       [0, 1, 0],
       [0, 1, 0],
       [0, 1, 0],
       [0, 1, 0],
       [0, 1, 0],
       [0,

In [18]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

In [19]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM,Dense,Input
from tensorflow.keras.callbacks import TensorBoard
log_dir = os.path.join('Logs')
tb_callback = TensorBoard(log_dir=log_dir)

In [20]:
model=Sequential()
model.add(Input(shape=(X.shape[1],X.shape[2])))
model.add(LSTM(64,return_sequences=True,activation='relu'))
model.add(LSTM(128,return_sequences=True,activation='relu'))
model.add(LSTM(64,return_sequences=False,activation='relu'))
model.add(Dense(64,activation='relu'))
model.add(Dense(32,activation='relu'))
model.add(Dense(actions.shape[0],activation='softmax'))

In [21]:
model.compile(loss='categorical_crossentropy',optimizer='adam',metrics=['categorical_accuracy'])

In [22]:
model.summary()

In [25]:
model.fit(X_train,y_train,epochs=2000,validation_data=(X_test,y_test),callbacks=[tb_callback,EarlyStopping(patience=200,restore_best_weights=True)])

Epoch 1/2000
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 81ms/step - categorical_accuracy: 0.6888 - loss: 0.8302 - val_categorical_accuracy: 0.6667 - val_loss: 0.7900
Epoch 2/2000
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 70ms/step - categorical_accuracy: 0.7476 - loss: 0.8041 - val_categorical_accuracy: 0.5556 - val_loss: 0.6883
Epoch 3/2000
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 93ms/step - categorical_accuracy: 0.6351 - loss: 0.8760 - val_categorical_accuracy: 0.5556 - val_loss: 1.7431
Epoch 4/2000
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 70ms/step - categorical_accuracy: 0.6787 - loss: 1.0528 - val_categorical_accuracy: 0.5556 - val_loss: 0.6574
Epoch 5/2000
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 63ms/step - categorical_accuracy: 0.6061 - loss: 0.9133 - val_categorical_accuracy: 0.6667 - val_loss: 0.7588
Epoch 6/2000
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 

<keras.src.callbacks.history.History at 0x30052aef0>

In [53]:
y_pred=model.predict(X_test)
print(y_pred[0])

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
[7.8447156e-02 9.2148560e-01 6.7272304e-05]


In [54]:
actions[np.argmax(y_pred[3])]

'Jumping'

In [55]:
actions[np.argmax(y_test[3])]

'Jumping'

In [34]:
model.save('actions.h5')



In [35]:
del model

In [23]:
model.load_weights('actions.h5')

In [24]:
from sklearn.metrics import accuracy_score,multilabel_confusion_matrix
y_true=np.argmax(y_test,axis=1).tolist()
y_hat=np.argmax(y_pred,axis=1).tolist()
accuracy_score(y_true,y_hat)

NameError: name 'y_pred' is not defined

In [59]:
multilabel_confusion_matrix(y_true,y_hat)

array([[[5, 0],
        [0, 4]],

       [[6, 0],
        [0, 3]],

       [[7, 0],
        [0, 2]]])

In [None]:
sequence=[]
threshold=0.5

cap = cv2.VideoCapture(0)
with mp_holistic.Holistic() as holistic:
    while cap.isOpened():
        ret, frame = cap.read()
        frame, results = mp_detection(frame,holistic)
        print(results)
        draw_landmarks(frame,results)
        frame=cv2.flip(frame,1) # flip the image horizontally

        # Prediction
        keypoints=extract_keypoints(results)
        if keypoints.shape != (1662,):  # or whatever your expected shape is
            continue  # skip this frame
        sequence.append(keypoints)
        sequence=sequence[-seq_len:] # keep last 30 frames only

        if len(sequence)==seq_len:
            res=model.predict(np.expand_dims(sequence,axis=0))[0]
            toPrint=actions[np.argmax(res)]
            print(toPrint)

            if np.max(res)<threshold:
                    toPrint='Nothing detected'
            cv2.putText(frame, toPrint, (15, 50), cv2.FONT_HERSHEY_SIMPLEX, 1.2, (0, 0, 255), 3, cv2.LINE_AA)
        cv2.imshow('frame', frame)
        # Break gracefully
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
    cap.release()
    cv2.destroyAllWindows()
    cv2.waitKey(1)  # Extra key wait to fix GUI hang on macOs

I0000 00:00:1750792930.355900 1379009 gl_context.cc:369] GL version: 2.1 (2.1 Metal - 89.4), renderer: Apple M1
W0000 00:00:1750792930.424532 1382086 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1750792930.443509 1382088 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1750792930.448278 1382089 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1750792930.448609 1382083 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1750792930.448695 1382085 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support 