# Describe My Environment - Interactive Demo

This notebook demonstrates the **Describe My Environment** system.

**Features:**
1. **Reflex Loop**: Low-latency object tracking and hazard detection (YOLO11).
2. **Cognitive Loop**: Scene understanding (BLIP) and narration (Llama 3.2).
3. **Audio Feedback**: Text-to-speech narration and hazard warnings.

You can choose to run a **Static Test** (single image) or a **Live Camera Demo**.

In [9]:
# Setup and Imports
import os
import sys
import cv2
import time
import threading
import matplotlib.pyplot as plt
import numpy as np
from PIL import Image

# Make modules under the current working directory (treated as the project
# root) importable, without adding a duplicate entry on re-runs of this cell.
project_root = os.path.abspath(os.curdir)
if sys.path.count(project_root) == 0:
    sys.path.append(project_root)

# Import project modules
from src.reflex_loop.tracker import YOLOTracker
from src.reflex_loop.safety import SafetyMonitor
from src.cognitive_loop.scene_composer import SceneComposer
from src.cognitive_loop.narrator import LLMNarrator
from src.cognitive_loop.history import HistoryBuffer
from src.hardware.audio import AudioHandler
from src.config import GLOBAL_WARNING_COOLDOWN

print("‚úÖ Imports complete")

‚úÖ Imports complete


In [10]:
def run_image_test():
    """Run the pipeline once on the bundled test image and narrate the result.

    Steps, in order: detect objects with the YOLO tracker, display the
    annotated image with matplotlib, caption the scene, compose a prompt
    from the caption and detections, generate a narration and speak it.

    The image is validated *before* any pipeline component is constructed,
    so a missing or unreadable file returns immediately without loading
    models or starting the audio handler. Once the audio handler exists it
    is always stopped, even if a later step raises.

    Returns:
        None. Results are printed, plotted and spoken.
    """
    print("\n--- Running Static Image Test ---")

    # Load Image (fail fast, before any component is created)
    image_path = os.path.join("test_images", "test_image_0.jpg")
    if not os.path.exists(image_path):
        print(f"‚ùå Image not found: {image_path}")
        return

    frame = cv2.imread(image_path)
    if frame is None:
        # cv2.imread reports an unreadable/corrupt file by returning None
        # instead of raising; cvtColor would otherwise fail on it.
        print(f"‚ùå Could not read image: {image_path}")
        return
    frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

    # Initialize Components
    print("‚è≥ Initializing components...")
    tracker = YOLOTracker(model_path="yolo11n.pt")
    scene_composer = SceneComposer()
    narrator = LLMNarrator()
    audio_handler = AudioHandler()

    try:
        # 1. Tracking
        detections = tracker.detect(frame)
        print(f"‚úÖ Detected {len(detections)} objects")

        # Visualize on the RGB copy so matplotlib shows correct colours
        annotated_frame = frame_rgb.copy()
        for det in detections:
            x1, y1, x2, y2 = det.box
            label = f"{det.class_name} {det.confidence:.2f}"
            # Keep the label inside the image when the box touches the top edge
            label_y = max(y1 - 10, 15)
            cv2.rectangle(annotated_frame, (x1, y1), (x2, y2), (0, 255, 0), 2)
            cv2.putText(annotated_frame, label, (x1, label_y),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 2)

        plt.figure(figsize=(10, 6))
        plt.imshow(annotated_frame)
        plt.axis('off')
        plt.title("Detections")
        plt.show()

        # 2. Scene Description
        caption = scene_composer.generate_scene_description(frame)
        print(f"\nüìù Scene: {caption}")

        # 3. Narration
        # Prepare strings for compose_prompt
        object_descriptions = [
            f"{d.class_name} (conf: {d.confidence:.2f})" for d in detections
        ]

        print("\nü§ñ Generating Narration...")

        # Use compose_prompt to create the string prompt for generate_narration
        prompt = narrator.compose_prompt(caption, object_descriptions)
        narration = narrator.generate_narration(prompt)

        if narration:
            print(f"\nüó£Ô∏è FINAL NARRATION: {narration}")
            audio_handler.speak_text(narration)
            # Allow time for audio to play before cleanup.
            # NOTE(review): assumes speak_text returns before playback
            # finishes - confirm against AudioHandler.
            time.sleep(5)
        else:
            print("‚ö†Ô∏è Could not generate narration (check Ollama)")
    finally:
        # Always release the audio handler, including on errors above
        audio_handler.stop()

In [11]:
def run_camera_demo():
    """Run the live camera demo until the user quits.

    Each frame from camera 0 is passed to the tracker, detections are added
    to the history buffer, and the safety monitor is asked for hazards.
    Hazard warnings are spoken at most once per GLOBAL_WARNING_COOLDOWN
    seconds, while the on-screen "HAZARD DETECTED" overlay is drawn on every
    frame for which a warning is due.

    Keys (read from the OpenCV window):
        Space - caption the current frame and speak a narration. This runs
                synchronously inside the display loop, so the video pauses
                while it is generated.
        q     - quit.

    If the narrator connection check fails at start-up, Space prints a
    notice instead of attempting a narration. The camera, the OpenCV
    windows and the audio handler are released on every exit path.

    Returns:
        None.
    """
    print("\n--- Running Live Camera Demo ---")
    print("‚è≥ Initializing pipeline components...")

    tracker = YOLOTracker(model_path="yolo11n.pt")
    scene_composer = SceneComposer()
    narrator = LLMNarrator()
    safety_monitor = SafetyMonitor()
    history_buffer = HistoryBuffer()
    audio_handler = AudioHandler()

    # Remember the result so the "disabled" message below is actually honoured
    narration_enabled = bool(narrator.check_connection())
    if not narration_enabled:
        print("‚ö†Ô∏è Ollama is not running. Narration will be disabled.")

    cap = cv2.VideoCapture(0)
    if not cap.isOpened():
        print("‚ùå Could not open camera.")
        # Release what was already acquired before bailing out
        cap.release()
        audio_handler.stop()
        return

    print("‚úÖ Camera started.")
    print("Commands:")
    print(" [Space] - Generate scene narration")
    print(" [q]     - Quit")

    frame_count = 0
    last_hazard_time = 0

    try:
        while True:
            ret, frame = cap.read()
            if not ret:
                # Camera stopped delivering frames
                break

            timestamp = time.time()
            frame_count += 1

            # 1. Reflex Loop: Tracking
            detections, annotated_frame = tracker.track(
                frame, frame_count, timestamp, return_annotated=True
            )

            # Update History
            for det in detections:
                # Detections without a track id all share the key -1.
                # NOTE(review): confirm HistoryBuffer tolerates several
                # distinct objects being stored under the same id.
                track_id = det.track_id if det.track_id is not None else -1
                history_buffer.add_detection(track_id, det)

            history_buffer.cleanup_stale_objects(frame_count)

            # 2. Hazard Detection
            hazards = safety_monitor.check_hazards(detections, history_buffer)
            if safety_monitor.should_warn(hazards):
                current_time = time.time()
                # Add cooldown to prevent spamming the audio queue
                if current_time - last_hazard_time > GLOBAL_WARNING_COOLDOWN:
                    warning = safety_monitor.get_warning_message(hazards)
                    print(f"‚ö†Ô∏è HAZARD: {warning}")
                    audio_handler.play_beep()
                    audio_handler.speak_text(warning, priority=True)
                    last_hazard_time = current_time

                # Visual warning always shows
                cv2.putText(annotated_frame, "HAZARD DETECTED", (50, 50),
                            cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 3)

            # Show Frame
            cv2.imshow("Describe My Environment - Demo", annotated_frame)

            # Input Handling
            key = cv2.waitKey(1) & 0xFF
            if key == ord('q'):
                break
            elif key == 32:  # Spacebar for narration
                if not narration_enabled:
                    # Skip captioning and the LLM call when the start-up
                    # connection check failed.
                    print("‚ö†Ô∏è Narration is disabled (Ollama is not running).")
                    continue

                print("\nüì∏ Creating narration...")

                # Run narration logic
                caption = scene_composer.generate_scene_description(frame)
                print(f"   üìù BLIP Caption: {caption}")

                # Format objects for the prompt
                object_descriptions = [
                    f"{d.class_name} at box {d.box}" for d in detections
                ]
                if hazards:
                    hazard_descriptions = [str(h.reason) for h in hazards]
                    object_descriptions.append(f"Hazards: {hazard_descriptions}")

                # Compose the single string prompt
                prompt = narrator.compose_prompt(caption, object_descriptions)
                print(f"   üì§ Sending Prompt to Llama: \n{'-'*20}\n{prompt}\n{'-'*20}")

                # Generate narration
                print("ü§î Thinking...")
                narration = narrator.generate_narration(prompt)

                if narration:
                    print(f"üó£Ô∏è {narration}")
                    audio_handler.speak_text(narration)
                else:
                    # Same feedback as the static test instead of failing silently
                    print("‚ö†Ô∏è Could not generate narration (check Ollama)")

    except KeyboardInterrupt:
        # Interrupting the notebook cell is treated as a normal way to quit
        pass
    finally:
        cap.release()
        cv2.destroyAllWindows()
        audio_handler.stop()
        print("üõë Stopped.")

In [12]:
# Main Execution
# Ask which demo to run: '2' starts the live camera demo; '1', an empty
# answer, or anything else runs the static image test.
print("Select Mode:")
print("1. Static Test Image")
print("2. Live Camera Demo")

# Strip surrounding whitespace so an answer such as "2 " still selects mode 2
choice = input("Enter choice (1 or 2): ").strip()

if choice == '2':
    run_camera_demo()
else:
    if choice not in ('', '1'):
        # Keep the original fallback, but tell the user it happened
        print(f"Unrecognized choice {choice!r}; running the static image test.")
    run_image_test()

Select Mode:
1. Static Test Image
2. Live Camera Demo

--- Running Live Camera Demo ---
‚è≥ Initializing pipeline components...
‚úÖ Camera started.
Commands:
 [Space] - Generate scene narration
 [q]     - Quit

üì∏ Creating narration...
   üìù BLIP Caption: a man in a black shirt and headphones
   üì§ Sending Prompt to Llama: 
--------------------
SYSTEM: You are a helpful assistant for a blind user. Be concise and direct. Only describe what is certainly present. Do not ask questions.
If the context mentions a "mirror" or "reflection" and it seems to be describing the user themselves (e.g., "standing in front of a mirror"), assume it is a hallucination caused by the camera feed and describe it as the person being present or facing the camera.

USER:
Context: "a man in a black shirt and headphones"
Entities (detected movement):
- person at box (627, 308, 1575, 1076)

TASK: Synthesize the context and entities into one natural sentence.
IMPORTANT RULES:
1. The entities likely correspon