In [1]:
import os
import subprocess
import cv2
import yt_dlp
from ffpyplayer.player import MediaPlayer
from google.cloud import speech
from queue import Queue
import threading
import time

# Set Google Cloud credentials
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "/Users/anaishadas/Desktop/theoffice/seismic-rarity-427422-p7-ab3b4a8726ef.json"

def transcribe_audio_stream(audio_url, transcription_queue):
    """Stream audio for transcription using Google Cloud Speech-to-Text."""
    client = speech.SpeechClient()

    # Use ffmpeg to convert audio stream to raw PCM data
    ffmpeg_command = [
        "ffmpeg", "-i", audio_url, "-f", "s16le", "-ac", "1", "-ar", "16000",
        "-loglevel", "quiet", "pipe:1"
    ]
    process = subprocess.Popen(ffmpeg_command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)

    speaker_diarization_config = speech.SpeakerDiarizationConfig(
        enable_speaker_diarization=True,
        min_speaker_count=2,  # Set minimum number of speakers
        max_speaker_count=2,  # Adjust max speakers based on expected number of speakers
    )

    streaming_config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=16000,
        language_code="en-US",
        diarization_config=speaker_diarization_config,
    )
    streaming_request = speech.StreamingRecognitionConfig(config=streaming_config, interim_results=True)

    def audio_generator():
        while True:
            data = process.stdout.read(4096)
            if not data:
                break
            yield data

    requests = (speech.StreamingRecognizeRequest(audio_content=chunk) for chunk in audio_generator())
    responses = client.streaming_recognize(config=streaming_request, requests=requests)

    # The transcript within each result is separate and sequential per result.
    # However, the words list within an alternative includes all the words
    # from all the results thus far. Thus, to get all the words with speaker
    # tags, you only have to take the words list from the last result
    try:
        current_speaker_tag = None  # Variable to track the current speaker
        final_transcription = []  # List to accumulate words in the final transcription

        for response in responses:
            result = response.results[-1]  # Get the latest result
            words_info = result.alternatives[0].words  # Extract words info

            # Process each word and track speaker change
            for word_info in words_info:
                # If speaker changes, append the previous speaker's transcription
                if current_speaker_tag != word_info.speaker_tag:
                    if current_speaker_tag is not None:
                        final_transcription.append(f"[Speaker {current_speaker_tag}]")  # Mark the change
                    current_speaker_tag = word_info.speaker_tag  # Update the current speaker

                final_transcription.append(f"'{word_info.word}'")  # Add the word to the transcription
            

            # Handle final transcriptions
            if result.is_final:
                transcription = " ".join(final_transcription)
                timestamp = time.time()  # Record when the transcription was generated
                transcription_queue.put((timestamp, transcription))
                final_transcription = []  # Reset for the next chunk of final transcription

    except Exception as e:
        print(f"Transcription error: {e}")
    finally:
        process.terminate()  # Ensure process is terminated after handling exceptions

def play_video_with_audio_and_transcription(video_url):
    """Play video with synchronized audio and perform real-time transcription."""
    # yt-dlp options to fetch the best video URL
    ydl_opts = {
        "format": "best",            # Fetch the best video + audio format
        "quiet": True,               # Suppress output
        "no_warnings": True          # Suppress warnings
    }

    # Fetch video info and get the stream URL
    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        info = ydl.extract_info(video_url, download=False)
        video_stream_url = info["url"]

    # Fetch audio URL
    audio_opts = {"format": "bestaudio/best", "quiet": True, "no_warnings": True}
    with yt_dlp.YoutubeDL(audio_opts) as ydl:
        audio_info = ydl.extract_info(video_url, download=False)
        audio_stream_url = audio_info["url"]

    # Initialize OpenCV video capture
    cap = cv2.VideoCapture(video_stream_url)
    if not cap.isOpened():
        print("Error: Cannot open video stream.")
        return

    fps = cap.get(cv2.CAP_PROP_FPS)  # Get frame rate
    frame_delay = int(1000 / fps)  # Delay between frames in milliseconds

    # Initialize ffpyplayer for audio
    player = MediaPlayer(video_stream_url)

    # Queue for synchronized transcription
    transcription_queue = Queue()

    # Start transcription in a background thread
    transcription_thread = threading.Thread(
        target=transcribe_audio_stream, args=(audio_stream_url, transcription_queue)
    )
    transcription_thread.start()

    print("Press 'q' to quit the video stream.")
    while True:
        ret, frame = cap.read()
        if not ret:
            print("End of stream or cannot fetch frame.")
            break

        # Display video frame
        cv2.imshow('YouTube Video Stream', frame)

        # Play audio synchronously
        audio_frame, val = player.get_frame()
        if val != 'eof' and audio_frame:
            _, timestamp = audio_frame

            # Synchronize and display transcription
            while not transcription_queue.empty():
                transcription_time, transcription = transcription_queue.queue[0]
                if transcription_time <= time.time():  # Check if it's time to display the transcription
                    print("[Transcript]:", transcription)
                    transcription_queue.get()
                else:
                    break

        # Exit on pressing 'q'
        if cv2.waitKey(frame_delay) & 0xFF == ord('q'):
            break

    # Release resources
    cap.release()
    player.close_player()
    cv2.destroyAllWindows()

    # Wait for transcription thread to completeq
    transcription_thread.join()

# Replace with your YouTube video URL
youtube_url = "https://www.youtube.com/watch?v=96Y6mc3C1Bg"  # Example video
play_video_with_audio_and_transcription(youtube_url)


Press 'q' to quit the video stream.


[swscaler @ 0x111220000] [swscaler @ 0x111230000] No accelerated colorspace conversion found from yuv420p to rgb24.
[swscaler @ 0x111220000] [swscaler @ 0x111250000] No accelerated colorspace conversion found from yuv420p to rgb24.
[swscaler @ 0x111220000] [swscaler @ 0x111260000] No accelerated colorspace conversion found from yuv420p to rgb24.
[swscaler @ 0x111220000] [swscaler @ 0x111270000] No accelerated colorspace conversion found from yuv420p to rgb24.
[swscaler @ 0x111220000] [swscaler @ 0x111280000] No accelerated colorspace conversion found from yuv420p to rgb24.
[swscaler @ 0x111220000] [swscaler @ 0x111290000] No accelerated colorspace conversion found from yuv420p to rgb24.
[swscaler @ 0x111220000] [swscaler @ 0x1112a0000] No accelerated colorspace conversion found from yuv420p to rgb24.
[swscaler @ 0x111220000] [swscaler @ 0x1112b0000] No accelerated colorspace conversion found from yuv420p to rgb24.
[swscaler @ 0x111220000] [swscaler @ 0x1112c0000] No accelerated colorsp

[Transcript]: 'you' 'and' 'I' 'love' 'crime' 'junkies' 'love' 'love' 'that's' 'like' 'our' 'go-to' 'podcast' 'when' 'we're' 'doing' 'our' 'road' 'trips'
[Transcript]: 'you' 'and' 'I' 'love' 'crime' 'junkies' 'love' 'love' 'that's' 'like' 'our' 'go-to' 'podcast' 'when' 'we're' 'doing' 'our' 'road' 'trips' 'we're' 'obsessed'
[Transcript]: 'you' 'and' 'I' 'love' 'crime' 'junkies' 'love' 'love' 'that's' 'like' 'our' 'go-to' 'podcast' 'when' 'we're' 'doing' 'our' 'road' 'trips' 'we're' 'obsessed' 'yes'
[Transcript]: 'you' 'and' 'I' 'love' 'crime' 'junkies' [Speaker 1] 'love' [Speaker 2] 'love' 'that's' 'like' 'our' 'go-to' 'podcast' 'when' 'we're' 'doing' 'our' 'road' 'trips' 'we're' 'obsessed' [Speaker 1] 'yes' [Speaker 2] 'so' 'true' 'crime' 'is' 'a' 'huge' 'Niche' 'I' 'feel' 'like'
[Transcript]: 'you' 'and' 'I' 'love' 'crime' 'junkies' [Speaker 1] 'love' [Speaker 2] 'love' 'that's' 'like' 'our' 'go-to' 'podcast' 'when' 'we're' 'doing' 'our' 'road' 'trips' 'we're' 'obsessed' [Speaker 1] '

: 

: 

In [11]:
player.close_player()

NameError: name 'player' is not defined

In [3]:
ydl_opts = {
        "format": "best",            # Fetch the best video + audio format
        "quiet": True,               # Suppress output
        "no_warnings": True          # Suppress warnings
    }

video_url = "https://www.youtube.com/watch?v=96Y6mc3C1Bg"


with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        info = ydl.extract_info(video_url, download=False)
        video_stream_url = info["url"]

In [5]:
audio_opts = {"format": "bestaudio/best", "quiet": True, "no_warnings": True}
with yt_dlp.YoutubeDL(audio_opts) as ydl:
    audio_info = ydl.extract_info(video_url, download=False)
    audio_stream_url = audio_info["url"]

In [6]:
print(audio_stream_url)

https://rr5---sn-o097znzr.googlevideo.com/videoplayback?expire=1741531518&ei=HlXNZ42PG7qQsfIPtsCYMQ&ip=2601%3A644%3A9081%3A6680%3A41c8%3A9ff5%3A7880%3A2892&id=o-AHNyhy0DGkwnek4nhUUjwUGg8ao6Q27MtkHGlQBxteXg&itag=251&source=youtube&requiressl=yes&xpc=EgVo2aDSNQ%3D%3D&met=1741509918%2C&mh=KN&mm=31%2C26&mn=sn-o097znzr%2Csn-a5mekndz&ms=au%2Conr&mv=m&mvi=5&pl=46&rms=au%2Cau&initcwndbps=4897500&bui=AUWDL3wFd5WMGGfh4ZTjHsR50Tf6-fLND7dijFX11tILrfktZ9Tv04J_l9REGuOjUAWeSY-A41Mi5PIj&vprv=1&svpuc=1&mime=audio%2Fwebm&ns=FeHJZbs1tv_HL3haTRxZLuoQ&rqh=1&gir=yes&clen=75946061&dur=5104.141&lmt=1729252675319032&mt=1741509436&fvip=2&keepalive=yes&lmw=1&fexp=51326932%2C51410171%2C51411872&c=TVHTML5&sefc=1&txp=5532434&n=156yqHv0n90FWw&sparams=expire%2Cei%2Cip%2Cid%2Citag%2Csource%2Crequiressl%2Cxpc%2Cbui%2Cvprv%2Csvpuc%2Cmime%2Cns%2Crqh%2Cgir%2Cclen%2Cdur%2Clmt&sig=AJfQdSswRgIhAP1Xo5v1wV1qzjFCPH71iaPey6zfyKbVF7wsr3d7_rJTAiEAsg7BJ9RO-eyr0UR0FldBGzmDAnrPUvO1_P_-a9X9VeM%3D&lsparams=met%2Cmh%2Cmm%2Cmn%2Cms%2Cmv%

In [4]:
%pip install ffpyplayer

Collecting ffpyplayer
  Downloading ffpyplayer-4.5.2-cp39-cp39-macosx_11_0_arm64.whl.metadata (4.2 kB)
Downloading ffpyplayer-4.5.2-cp39-cp39-macosx_11_0_arm64.whl (18.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m18.1/18.1 MB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: ffpyplayer
Successfully installed ffpyplayer-4.5.2
Note: you may need to restart the kernel to use updated packages.


In [None]:

from google.cloud import speech


