In [197]:
import os
import re
import time

import cv2 as cv
import google.generativeai as genai
import numpy as np
import pandas as pd
import speech_recognition as sr
import tensorflow as tf
from dotenv import load_dotenv
from tensorflow.keras import layers, models

In [199]:
print("Current Working Directory:", os.getcwd())

load_dotenv()
IMAGE_SIZE = (64, 64)  
CSV_FILE = 'hand_gesture_data.csv'
MODEL_FILE = 'hand_gesture_model.h5'

In [201]:
def speechrec():
    genai.configure(api_key=os.environ["API_KEY"])
    model=genai.GenerativeModel("gemini-1.5-flash",
                generation_config=genai.GenerationConfig(
                    max_output_tokens=300,
                    temperature=0.6
                ))
    drone_instructions=model.start_chat(history=[])
    drone_instructions.send_message(
        """
        ###CONTEXT###
        You are to responsible for recognizing individual voice instructions to control 
        2 drones,
        which may be referred to as left drone (drone 1) or right drone (drone 2),
        which you will be fed as the output text of a speech recognizer.
        Keep output short, in the prescribed format:
        DESIRED_FORMAT: INSTRUCTIONS FOR DRONE 1: INSTRUCTIONS \n 
        INSTRUCTIONS FOR DRONE 2: INSTRUCTIONS
        Furthermore, you may be asked a SIMPLE YES OR NO QUESTION, which does
        not necessarily have anything todo with drones.
        IN THIS CASE:
        DESIRED_FORMAT: YES (OR) NO
        """
    )
    while True:
        r = sr.Recognizer()
        r.energy_threshold+=2000
        cont=int(input("Press 1 to continue or 0 to break:\n "))
        if cont==1:
            with sr.Microphone() as source:
                r.adjust_for_ambient_noise(source)
                print("Give instructions")
                audio = r.listen(source)
            try:
                rec_out = r.recognize_google(audio)
                error={'drawn':'Drone','up':'UP','down':'DOWN','left':'LEFT',
                       'right':'RIGHT','forward':'FORWARD','backward':'BACKWARD',
                       'and':'AND'}
                for i, j in error.items():
                    rec_out=rec_out.replace(i,j)
                print(drone_instructions.send_message(
                    """###INSTRUCTION###
                    These are the instructions for the drones or the yes/no question
                    you must answer: 
                    """+
                    rec_out
                    ).text)
            except sr.UnknownValueError:
                print("Couldn't understand audio,sorry")
        elif cont==0:
            break
        else:
            print("please try again")


In [203]:
def capture_images(gesture_name, num_images=200):
    cap = cv.VideoCapture(0)
    images = []
    print(f"Press 's' to start capturing {num_images} images for gesture: {gesture_name}. Press 'q' to stop early.")
    
    
    while True:
        ret, frame = cap.read()
        if not ret:
            break
        
        cv.imshow('Capture Hand Gesture', cv.flip(frame,1))
        key = cv.waitKey(1) & 0xFF
        if key == ord('s'):  
            break
        elif key == ord('q'):  
            cap.release()
            cv.destroyAllWindows()
            return

    
    while len(images) < num_images:
        ret, frame = cap.read()
        if not ret:
            break
        
 
        gray = cv.cvtColor(frame, cv.COLOR_BGR2GRAY)
        resized = cv.resize(gray, IMAGE_SIZE)
        
 
        cv.imshow('Capture Hand Gesture', cv.flip(frame,1))
        
        images.append(resized.flatten()) 
        
        
        time.sleep(0.25)
        
        
        if cv.waitKey(1) & 0xFF == ord('q'):
            break
    
    cap.release()
    cv.destroyAllWindows()
    

    df = pd.DataFrame(images)
    df['label'] = gesture_name
    df.to_csv(CSV_FILE, mode='a', header=not pd.io.common.file_exists(CSV_FILE), index=False)
    print(f"Saved {len(images)} images to {CSV_FILE}")

In [205]:
def load_data():
    df = pd.read_csv(CSV_FILE)
    X = df.iloc[:, :-1].values  
    y = df.iloc[:, -1].values   
    

    X = X.reshape(-1, IMAGE_SIZE[0], IMAGE_SIZE[1], 1)
    
    
    X = X / 255.0
    
    
    y = pd.get_dummies(y).values  
    
    return X, y

In [None]:
def train_model():
    X, y = load_data()
    
    model = models.Sequential([
        layers.Conv2D(32, (3, 3), activation='relu', input_shape=(IMAGE_SIZE[0], IMAGE_SIZE[1], 1)),
        layers.MaxPooling2D((2, 2)),
        layers.Conv2D(64, (3, 3), activation='relu'),
        layers.Conv2D(64,(5,5),activation='relu'),
        layers.MaxPooling2D((2,2))
        layers.Conv2D(32,(3,3),activation='relu'),
        layers.Conv2D(32,(3,3),activation='relu'),
        layers.MaxPooling2D((2, 2)),
        layers.Flatten(),
        layers.Dense(64, activation='relu'),
        layers.Dense(16,activation='relu'),
        layers.Dropout(0.1),
        layers.Dense(y.shape[1], activation='softmax')  
    ])
    
    
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    
    
    model.fit(X, y, epochs=20, batch_size=16, validation_split=0.2)
    
    model.save(MODEL_FILE)
    print(f"Model saved to {MODEL_FILE}")

In [209]:
def recognize_gesture():
    
    model = tf.keras.models.load_model(MODEL_FILE)
    df = pd.read_csv(CSV_FILE)
    label_names = df['label'].unique()
    cap = cv.VideoCapture(0)
    print("Recognizing gestures. Press 'q' to quit.")
    print(label_names)
    while True:
        ret, frame = cap.read()
        if not ret:
            break
        
    
        gray = cv.cvtColor(cv.flip(frame,1), cv.COLOR_BGR2GRAY)
        resized = cv.resize(gray, IMAGE_SIZE)
        normalized = resized / 255.0
        input_data = normalized.reshape(1, IMAGE_SIZE[0], IMAGE_SIZE[1], 1)
        
    
        predictions = model.predict(input_data)
        print(predictions)
        max=0.
        if np.argmax(predictions)>=max:
            predicted_label = label_names[np.argmax(predictions)]
        
    
        text= cv.putText(cv.flip(frame,1), f"Gesture: {predicted_label}", (10, 30), cv.FONT_HERSHEY_SIMPLEX, 1, (255, 0, 0), 2)
        cv.imshow('Recognize Hand Gesture', text)
        
    
        if cv.waitKey(1) & 0xFF == ord('q'):
            break
    
    cap.release()
    cv.destroyAllWindows()

In [None]:
while True:
    print("CHOOSE: 1 for capturing images and training the model, 2 for recognizing gestures, 3 to recognize speech, 4 to exit....")
    choice=int(input())
    if choice==1:
        
        gesture_name = input("Enter the name of the gesture: ")
        capture_images(gesture_name)
        train_model()
    if choice==2:
       
        recognize_gesture()
    if choice==3:
        speechrec()
    if choice==4:
        break
    