In [56]:
import cv2
import numpy as np
import os
from matplotlib import pyplot as plt
import time
import mediapipe as mp

In [57]:
mp_holistic = mp.solutions.holistic # Holistic model
mp_drawing = mp.solutions.drawing_utils # Drawing utilities

In [58]:
def mediapipe_detection(image, model):
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) # COLOR CONVERSION BGR 2 RGB
    image.flags.writeable = False                  # Image is no longer writeable
    results = model.process(image)                 # Make prediction
    image.flags.writeable = True                   # Image is now writeable
    image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR) # COLOR COVERSION RGB 2 BGR
    return image, results

In [59]:
def draw_styled_landmarks(image, results):
    # # Draw face connections
    # mp_drawing.draw_landmarks(image, results.face_landmarks, mp_holistic.FACE_CONNECTIONS,
    #                          mp_drawing.DrawingSpec(color=(80,110,10), thickness=1, circle_radius=1),
    #                          mp_drawing.DrawingSpec(color=(80,256,121), thickness=1, circle_radius=1)
    #                          )
    # Draw pose connections
    mp_drawing.draw_landmarks(image, results.pose_landmarks, mp_holistic.POSE_CONNECTIONS,
                             mp_drawing.DrawingSpec(color=(80,22,10), thickness=2, circle_radius=4),
                             mp_drawing.DrawingSpec(color=(80,44,121), thickness=2, circle_radius=2)
                             )
    # Draw left hand connections
    mp_drawing.draw_landmarks(image, results.left_hand_landmarks, mp_holistic.HAND_CONNECTIONS,
                             mp_drawing.DrawingSpec(color=(121,22,76), thickness=2, circle_radius=4),
                             mp_drawing.DrawingSpec(color=(121,44,250), thickness=2, circle_radius=2)
                             )
    # Draw right hand connections
    mp_drawing.draw_landmarks(image, results.right_hand_landmarks, mp_holistic.HAND_CONNECTIONS,
                             mp_drawing.DrawingSpec(color=(245,117,66), thickness=2, circle_radius=4),
                             mp_drawing.DrawingSpec(color=(245,66,230), thickness=2, circle_radius=2)
                             )

In [60]:
def extract_keypoints(results):
    pose = np.array([[res.x, res.y, res.z, res.visibility] for res in results.pose_landmarks.landmark]).flatten() if results.pose_landmarks else np.zeros(33*4)
    # face = np.array([[res.x, res.y, res.z] for res in results.face_landmarks.landmark]).flatten() if results.face_landmarks else np.zeros(468*3)
    lh = np.array([[res.x, res.y, res.z] for res in results.left_hand_landmarks.landmark]).flatten() if results.left_hand_landmarks else np.zeros(21*3)
    rh = np.array([[res.x, res.y, res.z] for res in results.right_hand_landmarks.landmark]).flatten() if results.right_hand_landmarks else np.zeros(21*3)
    return np.concatenate([pose, lh, rh])

In [120]:
# Path for exported data, numpy arrays
DATA_PATH = os.path.join('Point_Dataset_5') 

# Actions that we try to detect
actions = np.array(['forward', 'backward', 'pause-play','full-screen','normal-screen'])

# Thirty videos worth of data
no_sequences = 30

# Videos are going to be 30 frames in length
sequence_length = 30

In [121]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras.callbacks import TensorBoard

In [None]:
model = Sequential()
model.add(LSTM(128, return_sequences=True, activation='relu', input_shape=(30,258)))
model.add(LSTM(64, return_sequences=False, activation='relu'))
model.add(Dense(64, activation='relu'))
model.add(Dense(32, activation='relu'))
model.add(Dense(16, activation='relu'))
model.add(Dense(actions.shape[0], activation='softmax'))

In [124]:
model.compile(optimizer='Adam', loss='categorical_crossentropy', metrics=['categorical_accuracy'])

In [125]:
model.load_weights('action.h5')

In [126]:
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical

In [127]:
label_map = {label:num for num, label in enumerate(actions)}
label_map

{np.str_('forward'): 0,
 np.str_('backward'): 1,
 np.str_('pause-play'): 2,
 np.str_('full-screen'): 3,
 np.str_('normal-screen'): 4}

In [128]:
sequences, labels = [], []
for action in actions:
    for sequence in range(no_sequences):
        window = []
        print(action,sequence)
        for frame_num in range(sequence_length):
            res = np.load(os.path.join(DATA_PATH, action, str(sequence), "{}.npy".format(frame_num)))
            window.append(res)
        sequences.append(window)
        labels.append(label_map[action])

forward 0
forward 1
forward 2
forward 3
forward 4
forward 5
forward 6
forward 7
forward 8
forward 9
forward 10
forward 11
forward 12
forward 13
forward 14
forward 15
forward 16
forward 17
forward 18
forward 19
forward 20
forward 21
forward 22
forward 23
forward 24
forward 25
forward 26
forward 27
forward 28
forward 29
backward 0
backward 1
backward 2
backward 3
backward 4
backward 5
backward 6
backward 7
backward 8
backward 9
backward 10
backward 11
backward 12
backward 13
backward 14
backward 15
backward 16
backward 17
backward 18
backward 19
backward 20
backward 21
backward 22
backward 23
backward 24
backward 25
backward 26
backward 27
backward 28
backward 29
pause-play 0
pause-play 1
pause-play 2
pause-play 3
pause-play 4
pause-play 5
pause-play 6
pause-play 7
pause-play 8
pause-play 9
pause-play 10
pause-play 11
pause-play 12
pause-play 13
pause-play 14
pause-play 15
pause-play 16
pause-play 17
pause-play 18
pause-play 19
pause-play 20
pause-play 21
pause-play 22
pause-play 23
paus

In [129]:
X = np.array(sequences)
y = to_categorical(labels).astype(int)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.05)
y_test.shape

(8, 5)

In [130]:
res = model.predict(X_test)
for i in range(8):
  print('\nPredicted : ',actions[np.argmax(res[i])])
  print('Actual : ',actions[np.argmax(y_test[i])])

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 319ms/step

Predicted :  pause-play
Actual :  pause-play

Predicted :  forward
Actual :  forward

Predicted :  full-screen
Actual :  full-screen

Predicted :  forward
Actual :  forward

Predicted :  backward
Actual :  backward

Predicted :  pause-play
Actual :  pause-play

Predicted :  forward
Actual :  forward

Predicted :  forward
Actual :  forward


In [131]:
from sklearn.metrics import multilabel_confusion_matrix

In [132]:
inp=X
out=y

yhat = model.predict(inp)
ytrue = np.argmax(out, axis=1).tolist()
yhat = np.argmax(yhat, axis=1).tolist()
multilabel_confusion_matrix(ytrue, yhat)

[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step  


array([[[120,   0],
        [  0,  30]],

       [[119,   1],
        [  0,  30]],

       [[120,   0],
        [  1,  29]],

       [[120,   0],
        [  1,  29]],

       [[119,   1],
        [  0,  30]]])

In [133]:
from pynput.keyboard import Key, Controller
kb=Controller()

commands={
    'forward':Key.right,
    'backward':Key.left,
    'full-screen':'f',
    'normal-screen':Key.esc,
    'pause-play':Key.space
}

In [134]:
kb.press('a')
kb.release('a')

In [None]:
# 1. New detection variables
sequence = []
threshold = 0.97

cap = cv2.VideoCapture(0, cv2.CAP_AVFOUNDATION)
# Set mediapipe model 
with mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic:
    while cap.isOpened():

        # Read feed
        ret, frame = cap.read()

        # Make detections
        image, results = mediapipe_detection(frame, holistic)
        # print(results)
        
        # Draw landmarks
        draw_styled_landmarks(image, results)
        
        # 2. Prediction logic
        keypoints = extract_keypoints(results)
        sequence.append(keypoints)
        sequence = sequence[-30:]

        temp_seq=sequence
        
        if len(temp_seq) == 30:
            res = model.predict(np.expand_dims(temp_seq, axis=0))[0]
            result=actions[np.argmax(res)]
            if np.max(res) > threshold:
                print(np.max(res),result)
                kb.press(commands[result])
                kb.release(commands[result])
            sequence=[]
            
            
        # Show to screen
        cv2.imshow('OpenCV Feed', image)

        # Break gracefully
        if cv2.waitKey(10) & 0xFF == ord('q'):
            break
    cap.release()
    cv2.destroyAllWindows()
    cv2.waitKey(1)  # Small delay to ensure windows close properly

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step
0.99988174 backward
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step
0.9994098 pause-play
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step
1.0 backward
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step
0.999355 forward
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step
0.99994636 backward
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step
0.9999889 backward
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step
0.99999297 backward
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step
[1m1/