In [90]:
!pip install torch transformers sentence-transformers scikit-learn pandas opencv-python moviepy mediapipe



In [91]:
import os
import tempfile

import cv2
import mediapipe as mp
import torch
from moviepy.editor import VideoFileClip
from transformers import pipeline

This section initializes and loads the three core AI models that form the backbone of our multimodal system. Each model is responsible for a different modality: speech, vision, and language.
1.  **Whisper**: A state-of-the-art speech-to-text model from OpenAI for transcribing spoken words.
2.  **MediaPipe Hands**: A computer vision model from Google for detecting hand landmarks in real-time.
3.  **Zero-Shot Classifier**: A powerful NLP model (BART) that can classify text into predefined categories (intents) without being explicitly trained on them.
Running the Whisper and zero-shot pipelines on a GPU (`device=0`) significantly speeds up model inference.

In [92]:
# Device for the Hugging Face pipelines: the first CUDA GPU (0) when one is
# available, otherwise CPU (-1). A GPU is highly recommended for Whisper, but
# falling back keeps the notebook loadable on machines without CUDA.
PIPELINE_DEVICE = 0 if torch.cuda.is_available() else -1

# 1. Speech-to-Text Model (Whisper)
stt_pipeline = pipeline("automatic-speech-recognition", model="openai/whisper-base.en", device=PIPELINE_DEVICE)
print("--> Whisper Speech-to-Text model loaded.")

# 2. Hand Gesture Model (MediaPipe)
# static_image_mode=False treats the input as a video stream; at most one hand
# is reported, and detections below 0.7 confidence are discarded.
mp_hands = mp.solutions.hands
hands = mp_hands.Hands(static_image_mode=False, max_num_hands=1, min_detection_confidence=0.7)
mp_drawing = mp.solutions.drawing_utils
print("--> MediaPipe Hand Gesture model loaded.")

# 3. ZERO-SHOT TEXT-TO-INTENT NLP Model
# A pre-trained NLI model is used instead of a custom classifier;
# facebook/bart-large-mnli is a popular choice for this task.
zero_shot_classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli", device=PIPELINE_DEVICE)
# The possible intents, used as the candidate labels for classification.
CANDIDATE_INTENTS = ["forward", "left", "right", "stop"]
print("--> Zero-Shot Intent NLP model loaded.")
print("\n" + "="*50 + "\nAll models are ready.\n" + "="*50)

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Device set to use cuda:0


--> Whisper Speech-to-Text model loaded.
--> MediaPipe Hand Gesture model loaded.


Device set to use cuda:0


--> Zero-Shot Intent NLP model loaded.

All models are ready.


`get_intent_from_text_zero_shot` takes a string of text (the transcript from the audio) and uses the pre-trained zero-shot classification model to determine which of the `CANDIDATE_INTENTS` it most closely matches. It works "zero-shot," meaning the model was not specifically trained on our "forward," "left," "right," or "stop" commands but can generalize to understand them. The function only returns an intent if the model's confidence score exceeds a specified threshold, preventing uncertain classifications.

In [93]:
# Gesture thresholds, compared against differences between landmark x/y values.
# NOTE(review): these assume MediaPipe's normalized image coordinates (0..1) —
# confirm before re-tuning.
_SIDEWAYS_THUMB_MAX_DY = 0.05  # thumb tip and IP joint roughly level -> thumb points sideways
_THUMBS_UP_MIN_DY = 0.03       # thumb tip must sit this far above its IP joint for a thumbs up
_LEFT_RIGHT_MIN_DX = 0.04      # thumb tip must be this far to one side of the wrist


def _classify_hand_gesture(hand_landmarks):
    """Classify one detected hand as "left", "right", "forward", "stop", or None."""
    points = hand_landmarks.landmark
    joint = mp_hands.HandLandmark

    thumb_tip = points[joint.THUMB_TIP]
    thumb_ip = points[joint.THUMB_IP]
    wrist = points[joint.WRIST]

    # (tip, PIP joint) for the four non-thumb fingers.
    finger_joints = [
        (points[joint.INDEX_FINGER_TIP], points[joint.INDEX_FINGER_PIP]),
        (points[joint.MIDDLE_FINGER_TIP], points[joint.MIDDLE_FINGER_PIP]),
        (points[joint.RING_FINGER_TIP], points[joint.RING_FINGER_PIP]),
        (points[joint.PINKY_TIP], points[joint.PINKY_PIP]),
    ]

    # A finger counts as folded when its tip has a larger y than its PIP joint,
    # and as open when the tip has a smaller y.
    all_folded = all(tip.y > pip.y for tip, pip in finger_joints)
    all_open = all(tip.y < pip.y for tip, pip in finger_joints)

    is_fist_with_thumb = all_folded and abs(thumb_tip.y - thumb_ip.y) < _SIDEWAYS_THUMB_MAX_DY
    fingers_open = all_open and thumb_tip.y < thumb_ip.y
    is_thumbs_up = thumb_tip.y < thumb_ip.y - _THUMBS_UP_MIN_DY and all_folded

    # PRIORITY 1: fist with a sideways thumb -> left/right by thumb position
    # relative to the wrist; a centred thumb can still qualify as thumbs up.
    if is_fist_with_thumb:
        if thumb_tip.x < wrist.x - _LEFT_RIGHT_MIN_DX:
            return "left"
        if thumb_tip.x > wrist.x + _LEFT_RIGHT_MIN_DX:
            return "right"
        return "forward" if is_thumbs_up else None

    # PRIORITY 2: open palm.
    if fingers_open:
        return "stop"

    # PRIORITY 3: thumbs up.
    if is_thumbs_up:
        return "forward"

    return None


def _evaluate_gesture_sequence(gesture_sequence):
    """Print the verdict for a gesture sequence and return its status string (or None)."""
    required_gestures = {"left", "right", "forward"}
    detected_gestures = set(gesture_sequence)

    # Lenient check: order is ignored; all three directional gestures plus a
    # stop anywhere in the sequence counts as complete.
    has_all_directional = required_gestures.issubset(detected_gestures)
    has_stop = "stop" in detected_gestures

    if has_all_directional and has_stop:
        print("[Video] SUCCESS: Valid gesture sequence detected!")
        return "sequence_complete"
    if has_all_directional:
        print("[Video] PARTIAL: All directional gestures detected, need STOP...")
        return "sequence_partial"

    missing = required_gestures - detected_gestures
    print(f"[Video] INCOMPLETE: Missing gestures: {missing}")
    if not has_stop:
        print("[Video] Also missing: STOP gesture")
    return None


def get_intent_from_video(video_path, frame_stride=5, min_hold_frames=2):
    """
    Analyze a video for a hand-gesture sequence.

    Every `frame_stride`-th frame is passed to the module-level MediaPipe
    `hands` tracker and classified as "left", "right", "forward", "stop", or
    no gesture. A gesture is appended to the sequence once it has been seen on
    `min_hold_frames` further consecutive sampled frames after its first
    sighting, and only if it differs from the last gesture already recorded.

    The sequence is complete when it contains left, right, forward, and stop.
    Order is NOT enforced: stop may appear anywhere in the sequence.

    Args:
        video_path: Path of the video file to open with OpenCV.
        frame_stride: Analyze one frame out of this many (values < 1 become 1).
        min_hold_frames: Consecutive repeat sightings needed before a gesture
            is recorded.

    Returns:
        "sequence_complete" if all three directional gestures and stop were seen,
        "sequence_partial" if all three directional gestures but no stop were seen,
        None otherwise, including when the video cannot be opened.
    """
    print("\n[Video] Analyzing video for gesture sequence...")
    frame_stride = max(1, int(frame_stride))

    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        cap.release()
        print(f"[Video] Error: could not open video at {video_path}")
        return None

    gesture_counts = {"left": 0, "right": 0, "forward": 0, "stop": 0, "unknown": 0}
    gesture_sequence = []  # Order in which stable gestures were detected
    frame_count = 0
    last_gesture = None
    gesture_hold_frames = 0

    try:
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break

            if frame_count % frame_stride == 0:
                frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                results = hands.process(frame_rgb)

                current_gesture = None
                if results.multi_hand_landmarks:
                    for hand_landmarks in results.multi_hand_landmarks:
                        detected = _classify_hand_gesture(hand_landmarks)
                        if detected is not None:
                            current_gesture = detected

                # Stability tracking: count how long the same result has repeated.
                if current_gesture == last_gesture:
                    gesture_hold_frames += 1
                else:
                    gesture_hold_frames = 0
                    last_gesture = current_gesture

                # Record a stable gesture once, skipping immediate repeats.
                if current_gesture and gesture_hold_frames >= min_hold_frames:
                    if not gesture_sequence or gesture_sequence[-1] != current_gesture:
                        gesture_sequence.append(current_gesture)
                        gesture_counts[current_gesture] += 1
                        print(f"[Video] Detected: {current_gesture} (Sequence: {gesture_sequence})")

            frame_count += 1
    finally:
        # Release the capture even if frame processing raises.
        cap.release()

    print(f"[Video] Final Gesture Sequence: {gesture_sequence}")
    print(f"[Video] Gesture Counts: {gesture_counts}")

    return _evaluate_gesture_sequence(gesture_sequence)


def process_multimodal_command(video_path):
    """
    Run the audio and gesture pipelines on one video and print the fused decision.

    Steps:
      1. Extract the audio track to a temporary WAV file and pass it to
         `get_intent_from_audio`. Audio analysis is best-effort: a missing
         audio track or any failure is reported and leaves the audio intent
         as None instead of aborting.
      2. Run `get_intent_from_video` on the same file.
      3. Print the decision: complete sequence, partial sequence, audio-only
         command, or incomplete.

    Args:
        video_path: Path of the video file to process.

    Returns:
        None. All results are printed.
    """
    print(f"\n{'='*20} PROCESSING SEQUENCE COMMAND: {video_path} {'='*20}")
    if not os.path.exists(video_path):
        print(f"Error: Video file not found at {video_path}")
        return

    # --- Step 1: Extract Audio & Get Intents ---
    audio_intent = None
    try:
        # A private temporary directory avoids clobbering a "temp_audio.wav"
        # in the working directory and is removed automatically on exit.
        with tempfile.TemporaryDirectory() as temp_dir:
            temp_audio_path = os.path.join(temp_dir, "temp_audio.wav")
            has_audio = False

            with VideoFileClip(video_path) as video_clip:
                if video_clip.audio is None:
                    print("[Audio] No audio track found in video; skipping speech analysis.")
                else:
                    video_clip.audio.write_audiofile(temp_audio_path, logger=None)
                    has_audio = True

            if has_audio:
                # TODO: get_intent_from_audio could return every detected command;
                # for now it is assumed to return the single most dominant one.
                audio_intent = get_intent_from_audio(temp_audio_path)
    except Exception as exc:
        # Best-effort: continue with video only, but make the failure visible.
        print(f"[Audio] Audio analysis failed ({type(exc).__name__}): {exc}")
        audio_intent = None

    video_intent = get_intent_from_video(video_path)

    # --- Step 2: SEQUENCE DECISION LOGIC ---
    print("\n[Fusion] Analyzing sequence completion...")
    print(f"[Fusion] Audio Intent: {audio_intent} | Video Intent: {video_intent}")

    # Case 1: Complete sequence detected
    if video_intent == "sequence_complete":
        if audio_intent:
            print("\nSEQUENCE COMPLETE: Video sequence finished with audio confirmation!")
        else:
            print("\nSEQUENCE COMPLETE: Video sequence finished (no audio detected)")
        print("Executing full command sequence...")
        # Execute your robot sequence here

    # Case 2: Partial sequence
    elif video_intent == "sequence_partial":
        print("\nSEQUENCE PARTIAL: Directional gestures complete, perform STOP gesture to finish")

    # Case 3: Audio only (fallback to original logic)
    elif audio_intent and not video_intent:
        print(f"\nAUDIO ONLY: Single command detected: {audio_intent.upper()}")

    # Case 4: No clear sequence
    else:
        print("\nINCOMPLETE: Sequence not detected. Please perform all gestures (left, right, forward) then stop.")

`get_intent_from_audio` takes the path to an audio file, uses the Whisper model to transcribe the speech into text, and then passes this text to our `get_intent_from_text_zero_shot` function to determine the final command intent. It includes error handling in case the audio processing fails.

For a finger to count as curled, its tip must be lower on the screen than its middle joint (the PIP joint). Because image y-coordinates grow downward, "lower on the screen" means a larger `y` value, so the check is `tip.y > pip.y` for each of the index, middle, ring, and pinky fingers.

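`process_multimodal_command` is the core function that combines the entire multimodal analysis. It takes a video file path as input and performs the following steps:
1.  Extracts the audio from the video into a temporary file.
2.  Runs the audio processing pipeline to get an `audio_intent`.
3.  Runs the video gesture recognition pipeline to get a `video_intent`.
4.  Applies the decision (fusion) logic: a complete gesture sequence triggers execution of the command sequence, a partial sequence prompts for the STOP gesture, an audio intent without a gesture sequence is reported as a single audio-only command, and anything else is reported as incomplete.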

In [94]:
if __name__ == "__main__":
    # Clip to push through the combined audio + gesture pipeline.
    test_videos = ["/content/forward.mov"]

    for video_file in test_videos:
        process_multimodal_command(video_file)




[Audio] Transcribing speech to text...
[Audio] Raw Transcription Result: {'text': ' Right. Left.'}
[NLP] Classifying text: 'right. left.'
[NLP] Top classification: 'forward' with confidence: 0.37
[NLP] Confidence is below threshold. Intent is uncertain.

[Video] Analyzing video for gesture sequence...





[Video] Detected: stop (Sequence: ['stop'])
[Video] Detected: forward (Sequence: ['stop', 'forward'])
[Video] Detected: stop (Sequence: ['stop', 'forward', 'stop'])
[Video] Detected: right (Sequence: ['stop', 'forward', 'stop', 'right'])
[Video] Final Gesture Sequence: ['stop', 'forward', 'stop', 'right']
[Video] Gesture Counts: {'left': 0, 'right': 1, 'forward': 1, 'stop': 2, 'unknown': 0}
[Video] INCOMPLETE: Missing gestures: {'left'}

[Fusion] Analyzing sequence completion...
[Fusion] Audio Intent: None | Video Intent: None

INCOMPLETE: Sequence not detected. Please perform all gestures (left, right, forward) then stop.


In [95]:
if __name__ == "__main__":
    # Clip to push through the combined audio + gesture pipeline.
    test_videos = ["/content/stop.mov"]

    for video_file in test_videos:
        process_multimodal_command(video_file)



[Audio] Transcribing speech to text...
[Audio] Raw Transcription Result: {'text': " Let's don't."}
[NLP] Classifying text: 'let's don't.'
[NLP] Top classification: 'stop' with confidence: 0.62
[NLP] Confidence is above threshold. Intent is 'stop'.

[Video] Analyzing video for gesture sequence...





[Video] Detected: stop (Sequence: ['stop'])
[Video] Final Gesture Sequence: ['stop']
[Video] Gesture Counts: {'left': 0, 'right': 0, 'forward': 0, 'stop': 1, 'unknown': 0}
[Video] INCOMPLETE: Missing gestures: {'left', 'forward', 'right'}

[Fusion] Analyzing sequence completion...
[Fusion] Audio Intent: stop | Video Intent: None

AUDIO ONLY: Single command detected: STOP


In [None]:
if __name__ == "__main__":
    # Runs the pipeline on the forward clip.
    test_videos = ["/content/forward.mov"]

    for video_file in test_videos:
        process_multimodal_command(video_file)



[Audio] Transcribing speech to text...
[Audio] Raw Transcription Result: {'text': ' Right. Left.'}
[NLP] Classifying text: 'right. left.'
[NLP] Top classification: 'forward' with confidence: 0.37
[NLP] Confidence is below threshold. Intent is uncertain.

[Video] Analyzing video for gesture sequence...





[Video] Detected: stop (Sequence: ['stop'])
[Video] Detected: forward (Sequence: ['stop', 'forward'])
[Video] Detected: stop (Sequence: ['stop', 'forward', 'stop'])
[Video] Detected: right (Sequence: ['stop', 'forward', 'stop', 'right'])
[Video] Final Gesture Sequence: ['stop', 'forward', 'stop', 'right']
[Video] Gesture Counts: {'left': 0, 'right': 1, 'forward': 1, 'stop': 2, 'unknown': 0}
[Video] INCOMPLETE: Missing gestures: {'left'}

[Fusion] Analyzing sequence completion...
[Fusion] Audio Intent: None | Video Intent: None

INCOMPLETE: Sequence not detected. Please perform all gestures (left, right, forward) then stop.
