In [5]:
import os
import subprocess
import cv2
import yt_dlp
from ffpyplayer.player import MediaPlayer
from google.cloud import speech
from queue import Queue
import threading
import time

# Set Google Cloud credentials
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "/Users/rgopalam/Desktop/seismic-rarity-427422-p7-ab3b4a8726ef.json"

def transcribe_audio_stream(audio_url, transcription_queue):
    """Stream audio for transcription using Google Cloud Speech-to-Text."""
    client = speech.SpeechClient()

    # Use ffmpeg to convert audio stream to raw PCM data
    ffmpeg_command = [
        "ffmpeg", "-i", audio_url, "-f", "s16le", "-ac", "1", "-ar", "16000",
        "-loglevel", "quiet", "pipe:1"
    ]
    process = subprocess.Popen(ffmpeg_command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)

    streaming_config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=16000,
        language_code="en-US",
    )
    streaming_request = speech.StreamingRecognitionConfig(config=streaming_config, interim_results=True)

    def audio_generator():
        while True:
            data = process.stdout.read(4096)
            if not data:
                break
            yield data

    requests = (speech.StreamingRecognizeRequest(audio_content=chunk) for chunk in audio_generator())
    responses = client.streaming_recognize(config=streaming_request, requests=requests)

    try:
        for response in responses:
            for result in response.results:
                if result.is_final:
                    transcription = result.alternatives[0].transcript
                    timestamp = time.time()  # Record when the transcription was generated
                    transcription_queue.put((timestamp, transcription))
    except Exception as e:
        print(f"Transcription error: {e}")
    finally:
        process.terminate()

def play_video_with_audio_and_transcription(video_url):
    """Play video with synchronized audio and perform real-time transcription."""
    # yt-dlp options to fetch the best video URL
    ydl_opts = {
        "format": "best",            # Fetch the best video + audio format
        "quiet": True,               # Suppress output
        "no_warnings": True          # Suppress warnings
    }

    # Fetch video info and get the stream URL
    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        info = ydl.extract_info(video_url, download=False)
        video_stream_url = info["url"]

    # Fetch audio URL
    audio_opts = {"format": "bestaudio/best", "quiet": True, "no_warnings": True}
    with yt_dlp.YoutubeDL(audio_opts) as ydl:
        audio_info = ydl.extract_info(video_url, download=False)
        audio_stream_url = audio_info["url"]

    # Initialize OpenCV video capture
    cap = cv2.VideoCapture(video_stream_url)
    if not cap.isOpened():
        print("Error: Cannot open video stream.")
        return

    fps = cap.get(cv2.CAP_PROP_FPS)  # Get frame rate
    frame_delay = int(1000 / fps)  # Delay between frames in milliseconds

    # Initialize ffpyplayer for audio
    player = MediaPlayer(video_stream_url)

    # Queue for synchronized transcription
    transcription_queue = Queue()

    # Start transcription in a background thread
    transcription_thread = threading.Thread(
        target=transcribe_audio_stream, args=(audio_stream_url, transcription_queue)
    )
    transcription_thread.start()

    print("Press 'q' to quit the video stream.")
    while True:
        ret, frame = cap.read()
        if not ret:
            print("End of stream or cannot fetch frame.")
            break

        # Display video frame
        cv2.imshow('YouTube Video Stream', frame)

        # Play audio synchronously
        audio_frame, val = player.get_frame()
        if val != 'eof' and audio_frame:
            _, timestamp = audio_frame

            # Synchronize and display transcription
            while not transcription_queue.empty():
                transcription_time, transcription = transcription_queue.queue[0]
                if transcription_time <= time.time():  # Check if it's time to display the transcription
                    print("[Transcript]:", transcription)
                    transcription_queue.get()
                else:
                    break

        # Exit on pressing 'q'
        if cv2.waitKey(frame_delay) & 0xFF == ord('q'):
            break

    # Release resources
    cap.release()
    player.close_player()
    cv2.destroyAllWindows()

    # Wait for transcription thread to complete
    transcription_thread.join()

# Replace with your YouTube video URL
youtube_url = "https://www.youtube.com/watch?v=96Y6mc3C1Bg"  # Example video
play_video_with_audio_and_transcription(youtube_url)


Exception in thread Thread-23:
Traceback (most recent call last):
  File "/opt/anaconda3/envs/mediapipe_env/lib/python3.9/threading.py", line 980, in _bootstrap_inner
    self.run()
  File "/opt/anaconda3/envs/mediapipe_env/lib/python3.9/site-packages/ipykernel/ipkernel.py", line 766, in run_closure
    _threading_Thread_run(self)
  File "/opt/anaconda3/envs/mediapipe_env/lib/python3.9/threading.py", line 917, in run
    self._target(*self._args, **self._kwargs)
  File "/var/folders/lj/b8t255bd50s9w1w_tyqwcx140000gn/T/ipykernel_45922/3476782450.py", line 16, in transcribe_audio_stream
  File "/opt/anaconda3/envs/mediapipe_env/lib/python3.9/site-packages/google/cloud/speech_v1/services/speech/client.py", line 672, in __init__
    self._transport = transport_init(
  File "/opt/anaconda3/envs/mediapipe_env/lib/python3.9/site-packages/google/cloud/speech_v1/services/speech/transports/grpc.py", line 235, in __init__
    super().__init__(
  File "/opt/anaconda3/envs/mediapipe_env/lib/python3

Press 'q' to quit the video stream.


2025-01-22 16:38:39.143 python[45922:6064922] +[IMKClient subclass]: chose IMKClient_Legacy
2025-01-22 16:38:39.143 python[45922:6064922] +[IMKInputSession subclass]: chose IMKInputSession_Legacy
[swscaler @ 0x108620000] [swscaler @ 0x108630000] No accelerated colorspace conversion found from yuv420p to rgb24.
[swscaler @ 0x108620000] [swscaler @ 0x108640000] No accelerated colorspace conversion found from yuv420p to rgb24.
[swscaler @ 0x108620000] [swscaler @ 0x108650000] No accelerated colorspace conversion found from yuv420p to rgb24.
[swscaler @ 0x108620000] [swscaler @ 0x108660000] No accelerated colorspace conversion found from yuv420p to rgb24.
[swscaler @ 0x108620000] [swscaler @ 0x108670000] No accelerated colorspace conversion found from yuv420p to rgb24.
[swscaler @ 0x108620000] [swscaler @ 0x108680000] No accelerated colorspace conversion found from yuv420p to rgb24.
[swscaler @ 0x108620000] [swscaler @ 0x108690000] No accelerated colorspace conversion found from yuv420p to

In [4]:
%pip install ffpyplayer

Collecting ffpyplayer
  Downloading ffpyplayer-4.5.2-cp39-cp39-macosx_11_0_arm64.whl.metadata (4.2 kB)
Downloading ffpyplayer-4.5.2-cp39-cp39-macosx_11_0_arm64.whl (18.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m18.1/18.1 MB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: ffpyplayer
Successfully installed ffpyplayer-4.5.2
Note: you may need to restart the kernel to use updated packages.
