In [None]:
!pip install tensorflow==2.8 tensorflow-gpu==2.8 opencv-python mediapipe scikit-learn matplotlib  protobuf==3.20.1

In [2]:
import cv2
import numpy as np
import os
from matplotlib import pyplot as plt
import time
import mediapipe as mp
import tensorflow as tf

In [None]:
#Importing the Mediapipe library and its holistic model for holistic human pose estimation, along with drawing utilities for visualization.

In [3]:
mp_holistic = mp.solutions.holistic # Holistic model
mp_drawing = mp.solutions.drawing_utils # Drawing utilities

In [None]:
#A function to perform Mediapipe detection on an image using a specified model, converting the image from BGR to RGB for processing, 
#making predictions, and then converting it back from RGB to BGR before returning the result alongside the detection results.

In [4]:
def mediapipe_detection(image, model):
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) # COLOR CONVERSION BGR 2 RGB
    image.flags.writeable = False                  # Image is no longer writeable
    results = model.process(image)                 # Make prediction
    image.flags.writeable = True                   # Image is now writeable 
    image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR) # COLOR COVERSION RGB 2 BGR
    return image, results

In [None]:
#A function to draw landmarks on the image using results from Mediapipe detection, including facial, pose, left hand, and right hand 
#landmarks with corresponding connections.

In [5]:
def draw_landmarks(image, results):
    mp_drawing.draw_landmarks(image, results.face_landmarks, mp_holistic.FACEMESH_CONTOURS) # Draw face connections
    mp_drawing.draw_landmarks(image, results.pose_landmarks, mp_holistic.POSE_CONNECTIONS) # Draw pose connections
    mp_drawing.draw_landmarks(image, results.left_hand_landmarks, mp_holistic.HAND_CONNECTIONS) # Draw left hand connections
    mp_drawing.draw_landmarks(image, results.right_hand_landmarks, mp_holistic.HAND_CONNECTIONS) # Draw right hand connections

In [None]:
#A function to draw landmarks on the image with customized styles for different body parts, including facial, pose, left hand, and right hand 
#landmarks, each with specified color, thickness, and circle radius.

In [6]:
#Adding customization options for the style of the drawn landmarks
def draw_styled_landmarks(image, results):
    # Draw face connections
    mp_drawing.draw_landmarks(image, results.face_landmarks, mp_holistic.FACEMESH_CONTOURS, 
                             mp_drawing.DrawingSpec(color=(80,110,10), thickness=1, circle_radius=1), 
                             mp_drawing.DrawingSpec(color=(80,256,121), thickness=1, circle_radius=1)
                             ) 
    # Draw pose connections
    mp_drawing.draw_landmarks(image, results.pose_landmarks, mp_holistic.POSE_CONNECTIONS,
                             mp_drawing.DrawingSpec(color=(80,22,10), thickness=2, circle_radius=4), 
                             mp_drawing.DrawingSpec(color=(80,44,121), thickness=2, circle_radius=2)
                             ) 
    # Draw left hand connections
    mp_drawing.draw_landmarks(image, results.left_hand_landmarks, mp_holistic.HAND_CONNECTIONS, 
                             mp_drawing.DrawingSpec(color=(121,22,76), thickness=2, circle_radius=4), 
                             mp_drawing.DrawingSpec(color=(121,44,250), thickness=2, circle_radius=2)
                             ) 
    # Draw right hand connections  
    mp_drawing.draw_landmarks(image, results.right_hand_landmarks, mp_holistic.HAND_CONNECTIONS, 
                             mp_drawing.DrawingSpec(color=(245,117,66), thickness=2, circle_radius=4), 
                             mp_drawing.DrawingSpec(color=(245,66,230), thickness=2, circle_radius=2)
                             ) 

In [None]:
#A script to capture video from a webcam, process it using the Mediapipe Holistic model for human pose estimation, draw styled landmarks on 
#the video feed, and display the processed video in real-time. Press 'q' to exit the program gracefully.

In [7]:
cap = cv2.VideoCapture(0)
# Set mediapipe model 
with mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic:
    while cap.isOpened():

        # Read feed
        ret, frame = cap.read()

        # Make detections
        image, results = mediapipe_detection(frame, holistic)
        print(results)
        
        # Draw landmarks
        draw_styled_landmarks(image, results)

        # Show to screen
        cv2.imshow('OpenCV Feed', image)

        # Break gracefully
        if cv2.waitKey(10) & 0xFF == ord('q'):
            break
    cap.release()
    cv2.destroyAllWindows()

<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.soluti

In [7]:
draw_landmarks(frame, results)

In [None]:
len(results.left_hand_landmarks.landmark)

In [None]:
#A function to extract keypoints from the Mediapipe detection results, including pose, face, left hand, and right hand landmarks. It returns
#a flattened array of keypoints, with each landmark represented by its x, y, z coordinates and visibility. If no landmarks are detected, it 
#returns arrays of zeros with predefined lengths for each body part.

In [9]:
def extract_keypoints(results):
    pose = np.array([[res.x, res.y, res.z, res.visibility] for res in results.pose_landmarks.landmark]).flatten() if results.pose_landmarks else np.zeros(33*4)
    face = np.array([[res.x, res.y, res.z] for res in results.face_landmarks.landmark]).flatten() if results.face_landmarks else np.zeros(468*3)
    lh = np.array([[res.x, res.y, res.z] for res in results.left_hand_landmarks.landmark]).flatten() if results.left_hand_landmarks else np.zeros(21*3)
    rh = np.array([[res.x, res.y, res.z] for res in results.right_hand_landmarks.landmark]).flatten() if results.right_hand_landmarks else np.zeros(21*3)
    return np.concatenate([pose, face, lh, rh])

In [None]:
#A variable named `result_test` containing the extracted keypoints from the `results` obtained from the Mediapipe detection process.

In [10]:
result_test = extract_keypoints(results)

In [11]:
result_test

array([ 0.45078254,  0.23680118, -1.35879302, ...,  0.        ,
        0.        ,  0.        ])

In [None]:
#Saving the extracted keypoints from the `results` as a NumPy array with filename '0.npy' using `np.save()`, and then loading the saved array 
#back into memory using `np.load()`.

In [12]:
np.save('0', result_test)

In [13]:
np.load('0.npy')

array([ 0.45078254,  0.23680118, -1.35879302, ...,  0.        ,
        0.        ,  0.        ])

In [None]:
#Defining paths and parameters for the ASL (American Sign Language) dataset, including the directory path for exported data (numpy arrays), 
#the actions to be detected, the number of video sequences, the length of each sequence in frames, and the starting folder index.

In [68]:
# Path for exported data, numpy arrays
DATA_PATH = os.path.join('ASL_V2_Data') 

# Actions that we try to detect
actions = np.array(['Hello', 'Yes', 'Help', 'ThankYou', 'Food', 'Please', 'Finish'])

# Thirty videos worth of data
no_sequences = 120

# Videos are going to be 30 frames in length
sequence_length = 30

# Folder start
#start_folder = 60



In [None]:
#A script to create directories for storing data corresponding to different actions and video sequences within the ASL dataset. 
#If the directories don't exist, it creates them based on the defined actions and sequence numbers. If directories already exist, it skips 
#the creation process.

In [69]:
#Run when you want to append new actions to the directory
if not os.path.exists(DATA_PATH):
    os.makedirs(DATA_PATH)

for action in actions: 
    for sequence in range(no_sequences):
        try: 
            os.makedirs(os.path.join(DATA_PATH, action, str(sequence)))
        except FileExistsError:
            pass

In [None]:
#A script to create directories for storing data corresponding to different actions and video sequences within the ASL dataset. 
#This script should be run only when initially setting up the folders. It attempts to create directories based on the defined actions and 
#sequence numbers, and it skips the creation process if the directories already exist.

In [None]:
#Only run when creating the folders initially
for action in actions: 
    for sequence in range(no_sequences):
        try: 
            os.makedirs(os.path.join(DATA_PATH, action, str(sequence)))
        except:
            pass

In [None]:
#A script to collect additional videos for a specific action within the ASL dataset. It finds the maximum directory number for the given action,
#increments it, and creates new directories for additional video sequences within that action. If the directories already exist, it skips the 
#creation process.

In [None]:
#Run when you want to collect additional videos for an action
for action in actions: 
    dirmax = np.max(np.array(os.listdir(os.path.join(DATA_PATH, action))).astype(int))
    for sequence in range(1,no_sequences+1):
        try: 
            os.makedirs(os.path.join(DATA_PATH, action, str(dirmax+sequence)))
        except:
            pass

In [18]:
#print(actions[8:])

['Friend']


In [None]:
#Collecting keypoints for training and testing
#Captures video from a webcam, performs Mediapipe holistic detection, and collects data for actions defined in the ASL dataset. It iterates 
#through each action and sequence, displaying real-time video with landmarks drawn. It also exports keypoints data for each frame of each 
#sequence into corresponding directories. Press 'q' to exit the program.

In [70]:
cap = cv2.VideoCapture(0)
# Set mediapipe model 
with mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic:
    
    # NEW LOOP
    # Loop through actions
    #for action in actions:
    for action in actions[8:]:
        # Loop through sequences aka videos
        #for sequence in range(start_folder, start_folder+no_sequences):
        for sequence in range(no_sequences):
            # Loop through video length aka sequence length
            for frame_num in range(sequence_length):

                # Read feed
                ret, frame = cap.read()

                # Make detections
                image, results = mediapipe_detection(frame, holistic)
#                 print(results)

                # Draw landmarks
                draw_styled_landmarks(image, results)
                
                # NEW Apply wait logic
                if frame_num == 0: 
                    cv2.putText(image, 'STARTING COLLECTION', (120,200), 
                               cv2.FONT_HERSHEY_SIMPLEX, 1, (0,255, 0), 4, cv2.LINE_AA)
                    cv2.putText(image, 'Collecting frames for {} Video Number {}'.format(action, sequence), (15,12), 
                               cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 255), 1, cv2.LINE_AA)
                    # Show to screen
                    cv2.imshow('OpenCV Feed', image)
                    cv2.waitKey(2000)
                else: 
                    cv2.putText(image, 'Collecting frames for {} Video Number {}'.format(action, sequence), (15,12), 
                               cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 255), 1, cv2.LINE_AA)
                    # Show to screen
                    cv2.imshow('OpenCV Feed', image)
                
                # NEW Export keypoints
                keypoints = extract_keypoints(results)
                npy_path = os.path.join(DATA_PATH, action, str(sequence), str(frame_num))
                np.save(npy_path, keypoints)

                # Break gracefully
                if cv2.waitKey(10) & 0xFF == ord('q'):
                    break
                    
    cap.release()
    cv2.destroyAllWindows()

In [None]:
cap.release()
cv2.destroyAllWindows()

In [None]:
#Preprocessing the data, creating labels, and features

In [71]:
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical

In [None]:
#Creating a python dictionary `label_map` that maps each action label to a numerical value, with the numerical values assigned based on the order of 
#actions in the `actions` array.

In [72]:
label_map = {label:num for num, label in enumerate(actions)}

In [73]:
label_map

{'Hello': 0,
 'Yes': 1,
 'Help': 2,
 'ThankYou': 3,
 'Food': 4,
 'Please': 5,
 'Finish': 6,
 'Time': 7,
 'ILoveYou': 8}

In [None]:
#This code iterates through each action in the ASL dataset, then for each sequence within that action, it loads the keypoints data for each 
#frame in the sequence. It constructs a window of frames for each sequence and appends it to the `sequences` list, while appending the 
#corresponding label to the `labels` list using the `label_map` dictionary.

In [74]:
sequences, labels = [], []
for action in actions:
    for sequence in np.array(os.listdir(os.path.join(DATA_PATH, action))).astype(int):
        window = []
        for frame_num in range(sequence_length):
            res = np.load(os.path.join(DATA_PATH, action, str(sequence), "{}.npy".format(frame_num)))
            window.append(res)
        sequences.append(window)
        labels.append(label_map[action])

In [75]:
np.array(sequences).shape

(1080, 30, 1662)

In [76]:
np.array(labels).shape

(1080,)

In [None]:
#Converting the list of sequences into NumPy arrays 'X'

In [77]:
X = np.array(sequences)

In [78]:
X.shape

(1080, 30, 1662)

In [None]:
#This line of code converts the labels array into one-hot encoded format using the `to_categorical` function and then casts the result to an 
#integer type.

In [80]:
y = to_categorical(labels).astype(int)

In [None]:
y

In [None]:
#Splits the data into training and testing sets using a 80-20 ratio

In [81]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [82]:
y_test.shape

(216, 9)

In [33]:
actions.shape[0]

9

In [None]:
#Build and train LSTM neural network

In [34]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, LeakyReLU
from tensorflow.keras.callbacks import TensorBoard

In [None]:
#This code creates a directory named "Logs" using `os.path.join`, and then sets up a TensorBoard callback named `tb_callback` to log data to 
#this directory during model training.

In [35]:
log_dir = os.path.join('Logs')
tb_callback = TensorBoard(log_dir=log_dir)

In [None]:
#This code defines a sequential model using Keras. It consists of three LSTM layers with increasing units, followed by Leaky ReLU activation 
#functions and dropout regularization. Afterward, it includes two dense layers with ReLU activation and a final dense layer with softmax 
#activation for multi-class classification. The model is compiled using the Adam optimizer and categorical cross-entropy loss. 
#It is then trained on the training data (`X_train` and `y_train`) for 120 epochs, with validation data provided (`X_test` and `y_test`), 
#and TensorBoard callback (`tb_callback`) for logging.

In [83]:
model = Sequential()
model.add(LSTM(64, return_sequences=True, input_shape=(30, 1662)))
model.add(LeakyReLU(alpha=0.1))  # Leaky ReLU activation
model.add(Dropout(0.5))
model.add(LSTM(128, return_sequences=True))
model.add(LeakyReLU(alpha=0.1))  # Leaky ReLU activation
model.add(Dropout(0.5))
model.add(LSTM(64, return_sequences=False))
model.add(LeakyReLU(alpha=0.1))  # Leaky ReLU activation
model.add(Dense(64, activation='relu'))
model.add(Dense(32, activation='relu'))
model.add(Dense(actions.shape[0], activation='softmax'))

# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train, epochs=120, validation_data=(X_test, y_test),callbacks=[tb_callback])


Epoch 1/120
Epoch 2/120
Epoch 3/120
Epoch 4/120
Epoch 5/120
Epoch 6/120
Epoch 7/120
Epoch 8/120
Epoch 9/120
Epoch 10/120
Epoch 11/120
Epoch 12/120
Epoch 13/120
Epoch 14/120
Epoch 15/120
Epoch 16/120
Epoch 17/120
Epoch 18/120
Epoch 19/120
Epoch 20/120
Epoch 21/120
Epoch 22/120
Epoch 23/120
Epoch 24/120
Epoch 25/120
Epoch 26/120
Epoch 27/120
Epoch 28/120
Epoch 29/120
Epoch 30/120
Epoch 31/120
Epoch 32/120
Epoch 33/120
Epoch 34/120
Epoch 35/120
Epoch 36/120
Epoch 37/120
Epoch 38/120
Epoch 39/120
Epoch 40/120
Epoch 41/120
Epoch 42/120
Epoch 43/120
Epoch 44/120
Epoch 45/120
Epoch 46/120
Epoch 47/120
Epoch 48/120
Epoch 49/120
Epoch 50/120
Epoch 51/120
Epoch 52/120
Epoch 53/120
Epoch 54/120
Epoch 55/120
Epoch 56/120
Epoch 57/120
Epoch 58/120
Epoch 59/120
Epoch 60/120
Epoch 61/120
Epoch 62/120
Epoch 63/120
Epoch 64/120
Epoch 65/120
Epoch 66/120
Epoch 67/120
Epoch 68/120
Epoch 69/120
Epoch 70/120
Epoch 71/120
Epoch 72/120
Epoch 73/120
Epoch 74/120
Epoch 75/120
Epoch 76/120
Epoch 77/120
Epoch 78

<keras.callbacks.History at 0x22fb662b0a0>

In [37]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm (LSTM)                 (None, 30, 64)            442112    
                                                                 
 leaky_re_lu (LeakyReLU)     (None, 30, 64)            0         
                                                                 
 dropout (Dropout)           (None, 30, 64)            0         
                                                                 
 lstm_1 (LSTM)               (None, 30, 128)           98816     
                                                                 
 leaky_re_lu_1 (LeakyReLU)   (None, 30, 128)           0         
                                                                 
 dropout_1 (Dropout)         (None, 30, 128)           0         
                                                                 
 lstm_2 (LSTM)               (None, 64)                4

In [None]:
#Model Predictions

In [None]:
#Uses the trained model to predict the outputs for the test data `X_test`, storing the predictions in the variable `res`.

In [84]:
res = model.predict(X_test)

In [None]:
#Retrieves the action label corresponding to the highest predicted probability from the model's output for the corresponding sample in the test data.

In [85]:
actions[np.argmax(res[7])]

'Please'

In [None]:
#Retrieves the action label corresponding to the highest value in the ground truth labels for the corresponding sample in the test data.

In [86]:
actions[np.argmax(y_test[7])]

'Please'

In [None]:
#Save Weights

In [44]:
model.save('ASLV7.h5')

In [None]:
#accuracy_score(ytrue, yhat)

In [None]:
model = tf.keras.models.load_model('ASLV6.h5')

In [None]:
#Evaluation using Confusion Matrix and Accuracy

In [59]:
from sklearn.metrics import multilabel_confusion_matrix, accuracy_score

In [None]:
#Uses the trained model to predict the outputs for the test data `X_test`, storing the predictions in the variable `yhat`.

In [87]:
yhat = model.predict(X_test)

In [None]:
#This code converts the one-hot encoded ground truth labels `y_test` into their corresponding class indices and stores them in a list 
#named `ytrue`. Similarly, it converts the model predictions `yhat` into their corresponding class indices and stores them in a list
#named `yhat`.

In [88]:
ytrue = np.argmax(y_test, axis=1).tolist()
yhat = np.argmax(yhat, axis=1).tolist()

In [89]:
multilabel_confusion_matrix(ytrue, yhat)

array([[[193,   2],
        [  0,  21]],

       [[195,   0],
        [  0,  21]],

       [[189,   0],
        [  0,  27]],

       [[193,   2],
        [  0,  21]],

       [[198,   0],
        [  1,  17]],

       [[179,   0],
        [  2,  35]],

       [[196,   0],
        [  0,  20]],

       [[188,   0],
        [  0,  28]],

       [[193,   0],
        [  1,  22]]], dtype=int64)

In [90]:
accuracy_score(ytrue, yhat)

0.9814814814814815

In [None]:
#Test in Real time

In [46]:
from scipy import stats

In [None]:
#This script captures video from a webcam and performs real-time detection and prediction of American Sign Language (ASL) gestures using the 
#Mediapipe holistic model and a trained LSTM model. It continuously records 30-frame sequences of keypoint data from the detected poses and 
#feeds them into the LSTM model to predict the corresponding gesture. If the predicted gesture remains consistent for the last 10 predictions 
#and surpasses a certain threshold, it updates the displayed sentence accordingly. Press 'q' to exit the program.

In [95]:
# 1. New detection variables
sequence = []
sentence = []
predictions = []
threshold = 0.8

cap = cv2.VideoCapture(0)
# Set mediapipe model 
with mp_holistic.Holistic(min_detection_confidence=0.8, min_tracking_confidence=0.8) as holistic:
    while cap.isOpened():

        # Read feed
        ret, frame = cap.read()

        # Make detections
        image, results = mediapipe_detection(frame, holistic)
        print(results)
        
        # Draw landmarks
        draw_styled_landmarks(image, results)
        
        # 2. Prediction logic
        keypoints = extract_keypoints(results)
        sequence.append(keypoints)
        sequence = sequence[-30:]
        
        if len(sequence) == 30:
            res = model.predict(np.expand_dims(sequence, axis=0))[0]
            print(actions[np.argmax(res)])
            predictions.append(np.argmax(res))
            
            
        #3. Viz logic
            if np.unique(predictions[-10:])[0]==np.argmax(res): 
                if res[np.argmax(res)] > threshold: 
                    
                    if len(sentence) > 0: 
                        if actions[np.argmax(res)] != sentence[-1]:
                            sentence.append(actions[np.argmax(res)])
                    else:
                        sentence.append(actions[np.argmax(res)])

            if len(sentence) > 5: 
                sentence = sentence[-5:]

            # Viz probabilities
            #image = prob_viz(res, actions, image, colors)
            
        cv2.rectangle(image, (0,0), (640, 40), (245, 117, 16), -1)
        cv2.putText(image, ' '.join(sentence), (3,30), 
                       cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2, cv2.LINE_AA)
        
        # Show to screen
        cv2.imshow('OpenCV Feed', image)

        # Break gracefully
        if cv2.waitKey(10) & 0xFF == ord('q'):
            break
    cap.release()
    cv2.destroyAllWindows()

<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.soluti