In [None]:
import sys, os
import subprocess
import cv2
import yt_dlp
from ffpyplayer.player import MediaPlayer
from google.cloud import speech
from queue import Queue
import threading
import time
import mediapipe as mp
from mediapipe.tasks import python
from mediapipe.tasks.python import vision

# Point the Google Cloud client libraries at a service-account key file.
# setdefault is used so that a GOOGLE_APPLICATION_CREDENTIALS value already
# present in the environment is respected; the path below is only a fallback.
os.environ.setdefault(
    "GOOGLE_APPLICATION_CREDENTIALS",
    "/Users/indrajababu/Downloads/seismic-rarity-427422-p7-ab3b4a8726ef.json",
)

# Shared state for face detection and speaker tracking. These names are
# rebound or mutated by the functions below via `global`.
detection_results = []  # Latest detections stored by the print_result callback
current_speaker_tag = None  # Speaker tag most recently emitted by transcription
speaker_faces = {}  # Maps speaker tag -> (top-left, bottom-right) box corners
face_detector = None  # Detector instance once initialize_face_detector succeeds

def initialize_face_detector(model_path="detector.tflite"):
    """Create the MediaPipe face detector and store it in ``face_detector``.

    The detector is configured with the LIVE_STREAM running mode and
    ``print_result`` as its result callback.

    Args:
        model_path: Path to the face-detection ``.tflite`` model file.
            Defaults to ``"detector.tflite"``, which is resolved relative
            to the current working directory.

    Returns:
        The created detector, or ``None`` when the model file does not
        exist or creating the detector raised. On failure the global
        ``face_detector`` is left unchanged.
    """
    global face_detector
    try:
        # Check for the file ourselves so a missing model gets a clear
        # message naming the path that was tried.
        if not os.path.exists(model_path):
            print(f"Error: Model file not found at {model_path}")
            return None

        options = mp.tasks.vision.FaceDetectorOptions(
            base_options=mp.tasks.BaseOptions(model_asset_path=model_path),
            running_mode=mp.tasks.vision.RunningMode.LIVE_STREAM,
            result_callback=print_result,
        )

        face_detector = vision.FaceDetector.create_from_options(options)
        return face_detector
    except Exception as e:
        # Best-effort: callers treat None as "face detection disabled".
        print(f"Error initializing face detector: {e}")
        return None

def print_result(result, output_image, timestamp_ms):
    """Store the detections of a face-detection result in ``detection_results``.

    Used as the detector's result callback. ``output_image`` and
    ``timestamp_ms`` are accepted but not used. If ``result.detections`` is
    falsy, or reading it raises, the global is set to an empty list.
    """
    global detection_results
    latest = []
    try:
        latest = result.detections or []
    except Exception as e:
        print(f"Error processing face detection results: {e}")
    detection_results = latest

def visualize(image, detections, speaker_tag=None):
    """Return a copy of *image* with a rectangle drawn around every detection.

    Faces are drawn in green (thickness 2). When *speaker_tag* is given, the
    first detection is recorded in the global ``speaker_faces`` under that
    tag, and is drawn in red (thickness 3) if the tag equals the global
    ``current_speaker_tag``. Only the first face is associated with the
    speaker; there is no real audio-to-face matching here.

    Args:
        image: Frame to annotate; it must provide ``copy()``. The input is
            not modified.
        detections: Iterable of objects exposing ``bounding_box`` with
            ``origin_x``, ``origin_y``, ``width`` and ``height``.
        speaker_tag: Optional speaker tag to associate with the first face.

    Returns:
        The annotated copy, or the original *image* unchanged if anything
        raised while annotating.
    """
    try:
        annotated_image = image.copy()

        for index, detection in enumerate(detections):
            bbox = detection.bounding_box
            start_point = (bbox.origin_x, bbox.origin_y)
            end_point = (bbox.origin_x + bbox.width, bbox.origin_y + bbox.height)

            # Default color (green) for non-speaking faces
            color = (0, 255, 0)
            thickness = 2

            # Only the first face is tied to the speaker. Applying this to
            # every face would overwrite the stored position on each
            # iteration and highlight all faces at once.
            if index == 0 and speaker_tag is not None:
                speaker_faces[speaker_tag] = (start_point, end_point)

                # Red, thicker box for the current speaker
                if speaker_tag == current_speaker_tag:
                    color = (0, 0, 255)
                    thickness = 3

            cv2.rectangle(annotated_image, start_point, end_point, color, thickness)

        return annotated_image
    except Exception as e:
        # Best-effort: a drawing failure must not interrupt playback.
        print(f"Error in visualization: {e}")
        return image

def _group_words_by_speaker(words):
    """Merge consecutive words with the same speaker tag into (tag, text) pairs."""
    segments = []
    for word_info in words:
        tag = word_info.speaker_tag
        if segments and segments[-1][0] == tag:
            segments[-1][1].append(word_info.word)
        else:
            segments.append((tag, [word_info.word]))
    return [(tag, " ".join(tokens)) for tag, tokens in segments]


def transcribe_audio_stream(audio_url, transcription_queue):
    """Stream audio for transcription using Google Cloud Speech-to-Text.

    ffmpeg decodes *audio_url* to 16 kHz mono signed 16-bit PCM on its
    stdout, which is forwarded in 4096-byte chunks to a streaming
    recognition request with speaker diarization enabled. Only the last
    result of each response is used:

    * If its first alternative carries word info, consecutive words with the
      same speaker tag are joined and queued as
      ``(time.time(), "Speaker <tag>: <text>")``. The global
      ``current_speaker_tag`` is set to each emitted tag in turn.
    * Otherwise, if the result is final and has a transcript, the plain
      transcript is queued as ``(time.time(), transcript)``.

    Args:
        audio_url: Input passed to ffmpeg's ``-i`` option.
        transcription_queue: Queue receiving ``(timestamp, text)`` tuples.

    Errors raised while streaming are printed, not propagated. The ffmpeg
    process is always terminated and reaped before returning.
    """
    global current_speaker_tag

    client = speech.SpeechClient()

    # Use ffmpeg to convert audio stream to raw PCM data
    ffmpeg_command = [
        "ffmpeg", "-i", audio_url, "-f", "s16le", "-ac", "1", "-ar", "16000",
        "-loglevel", "quiet", "pipe:1"
    ]

    diarization_config = speech.SpeakerDiarizationConfig(
        enable_speaker_diarization=True,
        min_speaker_count=2,
        max_speaker_count=10,
    )
    streaming_config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=16000,
        language_code="en-US",
        diarization_config=diarization_config
    )
    streaming_request = speech.StreamingRecognitionConfig(config=streaming_config, interim_results=True)

    # stderr is discarded rather than piped: nothing ever reads it, and an
    # unread pipe can fill up and stall the child process.
    process = subprocess.Popen(ffmpeg_command, stdout=subprocess.PIPE, stderr=subprocess.DEVNULL)

    try:
        # iter() with a b"" sentinel stops at end of the ffmpeg output.
        audio_chunks = iter(lambda: process.stdout.read(4096), b"")
        requests = (speech.StreamingRecognizeRequest(audio_content=chunk) for chunk in audio_chunks)

        responses = client.streaming_recognize(config=streaming_request, requests=requests)

        for response in responses:
            if not response.results:
                continue

            result = response.results[-1]  # Get the latest result

            if not result.alternatives:
                continue

            best = result.alternatives[0]

            # Only process if we have word-level info with speaker tags
            if hasattr(best, 'words') and best.words:
                for speaker_tag, transcription in _group_words_by_speaker(best.words):
                    timestamp = time.time()
                    transcription_queue.put((timestamp, f"Speaker {speaker_tag}: {transcription}"))
                    current_speaker_tag = speaker_tag  # Update global speaker tag
            elif result.is_final and best.transcript:
                # Fallback for when speaker diarization isn't available
                timestamp = time.time()
                transcription_queue.put((timestamp, best.transcript))

    except Exception as e:
        exc_type, exc_obj, exc_tb = sys.exc_info()
        fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
        print(f"Error in transcription: {exc_type}, {fname}, line {exc_tb.tb_lineno}")
        print(f"Exception details: {e}")
    finally:
        # Stop ffmpeg, reap it so no zombie is left, then release the pipe.
        process.terminate()
        try:
            process.wait(timeout=5)
        except subprocess.TimeoutExpired:
            process.kill()
            process.wait()
        if process.stdout is not None:
            process.stdout.close()

def _pick_stream_url(info, codec_key):
    """Return a stream URL from a yt-dlp info dict, or None if none is found.

    A top-level ``'url'`` is preferred. Otherwise the first entry of
    ``'requested_formats'`` whose *codec_key* value is not ``'none'`` is used.
    """
    if 'url' in info:
        return info['url']
    for fmt in info.get('requested_formats') or ():
        if fmt.get(codec_key, 'none') != 'none':
            return fmt['url']
    return None


def _print_pending_transcriptions(transcription_queue):
    """Print and remove every transcription currently waiting in the queue."""
    # Entries are stamped with time.time() when they are enqueued, so they
    # are always due by the time they are seen here. This function is the
    # only consumer, so empty() followed by get() cannot block.
    while not transcription_queue.empty():
        _, transcription = transcription_queue.get()
        print(f"Transcript: {transcription}")


def _release_quietly(label, release):
    """Run one cleanup callable, reporting instead of raising on failure."""
    try:
        release()
    except Exception as e:
        print(f"Error releasing {label}: {e}")


def play_video_with_audio_and_transcription(video_url):
    """Play video with synchronized audio and perform real-time transcription.

    Steps, in order:

    1. Ask yt-dlp to list the available formats (debug output).
    2. Resolve a video stream URL (at most 720p) and an audio stream URL,
       falling back to the video URL when no audio URL is found.
    3. Open the video with OpenCV and an ffpyplayer ``MediaPlayer``.
    4. Start ``transcribe_audio_stream`` in a daemon thread.
    5. Initialize the face detector; playback continues without it on failure.
    6. Loop over frames: submit each to the detector, draw the latest
       detections, show the frame and print queued transcriptions, until
       the stream ends or 'q' is pressed.

    Args:
        video_url: URL handed to yt-dlp for extraction.

    Returns:
        None. Exceptions are printed rather than propagated, and the capture,
        player, windows and detector are released on every exit path.
    """
    global detection_results, current_speaker_tag, face_detector

    cap = None
    player = None
    detector = None
    try:
        # First, list available formats to debug
        print("Listing available formats:")
        with yt_dlp.YoutubeDL({"listformats": True}) as ydl:
            ydl.extract_info(video_url, download=False)

        # yt-dlp options with more flexible format selection
        ydl_opts = {
            "format": "bestvideo[height<=720]+bestaudio/best[height<=720]",
            "quiet": True,
            "no_warnings": True
        }
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            info = ydl.extract_info(video_url, download=False)

        video_stream_url = _pick_stream_url(info, 'vcodec')
        if video_stream_url is None:
            if 'requested_formats' in info:
                raise KeyError("No video URL found in requested formats")
            raise KeyError("Could not find URL in the extracted information")

        # Fetch audio URL with similar approach
        audio_opts = {
            "format": "bestaudio/best",
            "quiet": True,
            "no_warnings": True
        }
        with yt_dlp.YoutubeDL(audio_opts) as ydl:
            audio_info = ydl.extract_info(video_url, download=False)

        audio_stream_url = _pick_stream_url(audio_info, 'acodec') or video_stream_url

        print(f"Video URL: {video_stream_url[:50]}...")
        print(f"Audio URL: {audio_stream_url[:50]}...")

        # Initialize OpenCV video capture
        cap = cv2.VideoCapture(video_stream_url)
        if not cap.isOpened():
            print("Error: Cannot open video stream.")
            return

        fps = cap.get(cv2.CAP_PROP_FPS)
        if not fps > 0:  # also catches NaN, which `fps <= 0` would miss
            fps = 30
        # waitKey(0) waits indefinitely for a key, so never let this reach 0.
        frame_delay = max(1, int(1000 / fps))

        # Initialize ffpyplayer for audio
        # NOTE(review): the player is opened on the video stream URL. When
        # yt-dlp returns separate video and audio formats, this URL may
        # carry no audio track -- confirm audible playback is intended here.
        player = MediaPlayer(video_stream_url)

        # Queue for synchronized transcription
        transcription_queue = Queue()

        # Start transcription in a background thread. It is a daemon so it
        # cannot keep the process alive; it is not stopped explicitly.
        transcription_thread = threading.Thread(
            target=transcribe_audio_stream,
            args=(audio_stream_url, transcription_queue),
            daemon=True,
        )
        transcription_thread.start()

        # Initialize face detector
        face_detector = detector = initialize_face_detector()
        if detector is None:
            print("Warning: Face detection disabled due to initialization error")

        print("Press 'q' to quit the video stream.")
        last_timestamp_ms = -1
        while True:
            ret, frame = cap.read()
            if not ret:
                print("End of stream or cannot fetch frame.")
                break

            # Detect faces if detector is available
            if detector is not None:
                try:
                    # MediaPipe expects RGB; OpenCV delivers BGR.
                    frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                    mp_image = mp.Image(image_format=mp.ImageFormat.SRGB, data=frame_rgb)
                    # Live-stream timestamps must keep increasing, so bump
                    # by 1 ms if the wall clock has not advanced.
                    timestamp_ms = max(last_timestamp_ms + 1, int(time.time() * 1000))
                    detector.detect_async(mp_image, timestamp_ms)
                    last_timestamp_ms = timestamp_ms
                except Exception as e:
                    print(f"Error in face detection: {e}")

            # visualize() handles its own errors and returns the input frame
            # unchanged on failure, so no extra guard is needed here.
            annotated_frame = visualize(frame, detection_results, current_speaker_tag)

            # Display video frame
            cv2.imshow('YouTube Video Stream', annotated_frame)

            # Transcriptions are only flushed while the player yields frames.
            audio_frame, val = player.get_frame()
            if val != 'eof' and audio_frame:
                _print_pending_transcriptions(transcription_queue)

            # Exit on pressing 'q'
            if cv2.waitKey(frame_delay) & 0xFF == ord('q'):
                break

    except Exception as e:
        exc_type, exc_obj, exc_tb = sys.exc_info()
        fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
        print(f"Error in video playback: {exc_type}, {fname}, line {exc_tb.tb_lineno}")
        print(f"Exception details: {e}")
    finally:
        # Release resources on every exit path, including errors.
        if cap is not None:
            _release_quietly("video capture", cap.release)
        if player is not None:
            _release_quietly("media player", player.close_player)
        _release_quietly("windows", cv2.destroyAllWindows)
        if detector is not None:
            _release_quietly("face detector", detector.close)
            # Do not leave a closed detector reachable through the global.
            face_detector = None

def terminate_script():
    """Terminate the current Python process immediately with exit status 0.

    ``os._exit`` ends the process without flushing stdio buffers, so stdout
    and stderr are flushed first; otherwise output printed just before this
    call can be lost when the streams are buffered.
    """
    for stream in (sys.stdout, sys.stderr):
        if stream is None:
            continue
        try:
            stream.flush()
        except (OSError, ValueError):
            # A closed or broken stream must not prevent the exit.
            pass
    os._exit(0)

# Script entry point: plays one hard-coded YouTube URL. Nothing here runs
# when the file is imported as a module.
if __name__ == "__main__":
    # Replace with your YouTube video URL
    youtube_url = "https://www.youtube.com/watch?v=96Y6mc3C1Bg"  # Example video
    
    try:
        play_video_with_audio_and_transcription(youtube_url)
    except KeyboardInterrupt:
        # Ctrl+C: report it, then end the process via terminate_script().
        print("\nScript terminated by user.")
        terminate_script()


In [None]:

# Install dependencies
# NOTE(review): the line below is a conda shell command, not Python. It
# presumably comes from a notebook cell and would be a syntax error if this
# file were run as a plain Python script -- confirm how it is meant to be run.
# NOTE(review): the package list does not name yt_dlp, ffpyplayer, mediapipe
# or google-cloud-speech, which the code in this file imports -- verify
# those are installed some other way.
conda install -c conda-forge wget opencv pytorch torchvision cudatoolkit