In [None]:
import sys, os
import subprocess
import cv2
import yt_dlp
from ffpyplayer.player import MediaPlayer
from google.cloud import speech
from queue import Queue
import threading
import time

# Point the Google Cloud client libraries at a service-account key file.
# setdefault() keeps any GOOGLE_APPLICATION_CREDENTIALS value that is already
# present in the environment, so this machine-specific path is only a fallback
# and does not silently override credentials configured elsewhere.
os.environ.setdefault(
    "GOOGLE_APPLICATION_CREDENTIALS",
    "/Users/aditya/Downloads/seismic-rarity-427422-p7-ab3b4a8726ef.json",
)

def _speaker_segments(words_info):
    """Group consecutive words by speaker into display strings.

    Args:
        words_info: Iterable of objects exposing ``speaker_tag`` and ``word``
            (the ``words`` of a recognition alternative).

    Returns:
        A list of strings such as ``"Speaker: 1 'hello' 'there'"``, one per
        run of consecutive words that share a speaker tag, in input order.
        An empty input yields an empty list.
    """
    segments = []
    current_tag = None
    for word_info in words_info:
        # Open a new segment on the first word and on every speaker change.
        if not segments or word_info.speaker_tag != current_tag:
            current_tag = word_info.speaker_tag
            segments.append([f"Speaker: {current_tag}"])
        segments[-1].append(f"'{word_info.word}'")
    return [" ".join(parts) for parts in segments]


def transcribe_audio_stream(audio_url, transcription_queue):
    """Stream audio for transcription using Google Cloud Speech-to-Text.

    ffmpeg decodes ``audio_url`` to 16 kHz mono signed 16-bit PCM, which is
    forwarded to the streaming recognizer with speaker diarization enabled.
    Each run of newly recognized words spoken by one speaker is put on
    ``transcription_queue`` as a ``(timestamp, text)`` tuple, where
    ``timestamp`` is ``time.time()`` at the moment of queuing and ``text``
    looks like ``"Speaker: 1 'hello' 'there'"``.

    Args:
        audio_url: Any input ffmpeg accepts after ``-i`` (URL or file path).
        transcription_queue: Queue-like object with a ``put`` method.

    Errors raised while streaming are reported on stdout and end the
    transcription; they are not re-raised, because this function is meant to
    run as a background thread target. The ffmpeg process is always stopped.
    """
    client = speech.SpeechClient()

    diarization_config = speech.SpeakerDiarizationConfig(
        enable_speaker_diarization=True,
        min_speaker_count=2,
        max_speaker_count=10,
    )
    recognition_config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=16000,
        language_code="en-US",
        diarization_config=diarization_config,
    )
    streaming_config = speech.StreamingRecognitionConfig(
        config=recognition_config, interim_results=True
    )

    # Use ffmpeg to convert the audio stream to raw PCM data on stdout.
    ffmpeg_command = [
        "ffmpeg", "-i", audio_url, "-f", "s16le", "-ac", "1", "-ar", "16000",
        "-loglevel", "quiet", "pipe:1"
    ]
    # stderr is discarded rather than piped: nothing reads it, and an unread
    # pipe can fill up and stall the child process.
    process = subprocess.Popen(
        ffmpeg_command, stdout=subprocess.PIPE, stderr=subprocess.DEVNULL
    )

    # Lazily wrap 4 KiB PCM chunks in requests until ffmpeg reaches EOF.
    requests = (
        speech.StreamingRecognizeRequest(audio_content=chunk)
        for chunk in iter(lambda: process.stdout.read(4096), b"")
    )

    # Number of words of the cumulative word list already put on the queue.
    emitted_words = 0
    try:
        responses = client.streaming_recognize(
            config=streaming_config, requests=requests
        )
        for response in responses:
            # A response may carry no results at all, and a settled result is
            # not necessarily the last entry, so inspect every result.
            for result in response.results:
                # Interim hypotheses are still being revised; only settled
                # results are emitted to avoid printing text that changes.
                if not result.is_final or not result.alternatives:
                    continue

                words_info = result.alternatives[0].words
                # With diarization the service resends the words recognized
                # so far, so only the tail beyond what was already emitted is
                # new. A shorter list means it was restarted: start over.
                if len(words_info) < emitted_words:
                    emitted_words = 0
                new_words = words_info[emitted_words:]
                emitted_words = len(words_info)

                for transcription in _speaker_segments(new_words):
                    # Record when the transcription was generated.
                    transcription_queue.put((time.time(), transcription))
    except Exception as exc:
        # Thread boundary: report what failed and where instead of crashing.
        tb = exc.__traceback__
        fname = os.path.split(tb.tb_frame.f_code.co_filename)[1]
        print(
            f"Transcription stopped: {type(exc).__name__}: {exc} "
            f"({fname}:{tb.tb_lineno})"
        )
    finally:
        # Stop ffmpeg and reap it so no zombie process is left behind.
        process.terminate()
        try:
            process.wait(timeout=5)
        except subprocess.TimeoutExpired:
            process.kill()
            process.wait()

def _frame_delay_ms(fps, fallback_fps=30.0):
    """Return the wait between frames in whole milliseconds (at least 1).

    Falls back to ``fallback_fps`` when ``fps`` is zero, negative or NaN,
    and never returns 0 because ``cv2.waitKey(0)`` waits indefinitely.
    """
    if not fps > 0:
        fps = fallback_fps
    return max(1, int(1000 / fps))


def _print_pending_transcriptions(transcription_queue):
    """Print and remove every transcription currently waiting in the queue."""
    # The caller is the queue's only consumer, so empty() followed by
    # get_nowait() cannot race with another reader.
    while not transcription_queue.empty():
        _, transcription = transcription_queue.get_nowait()
        print(f"transcript: {transcription}")


def play_video_with_audio_and_transcription(video_url):
    """Play video with synchronized audio and perform real-time transcription.

    Resolves direct stream URLs for ``video_url`` with yt-dlp, shows the video
    frames in an OpenCV window, opens the same stream in an ffpyplayer
    ``MediaPlayer``, and runs ``transcribe_audio_stream`` in a background
    thread. Transcriptions are printed as ``transcript: <text>`` as soon as
    they are available. Playback ends at the end of the stream or when 'q'
    is pressed in the video window.

    Args:
        video_url: URL of the video page to play (anything yt-dlp can resolve).
    """
    # yt-dlp options to fetch the best video URL
    ydl_opts = {
        "format": "best",            # Fetch the best video + audio format
        "quiet": True,               # Suppress output
        "no_warnings": True          # Suppress warnings
    }

    # Fetch video info and get the stream URL
    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        info = ydl.extract_info(video_url, download=False)
        video_stream_url = info["url"]

    # Fetch audio URL
    audio_opts = {"format": "bestaudio/best", "quiet": True, "no_warnings": True}
    with yt_dlp.YoutubeDL(audio_opts) as ydl:
        audio_info = ydl.extract_info(video_url, download=False)
        audio_stream_url = audio_info["url"]

    # Initialize OpenCV video capture
    cap = cv2.VideoCapture(video_stream_url)
    if not cap.isOpened():
        print("Error: Cannot open video stream.")
        cap.release()
        return

    player = None
    transcription_thread = None
    transcription_queue = Queue()
    try:
        # Streams may report 0 FPS; the helper guards the division and
        # guarantees a non-zero delay.
        frame_delay = _frame_delay_ms(cap.get(cv2.CAP_PROP_FPS))

        # Initialize ffpyplayer for audio
        player = MediaPlayer(video_stream_url)

        # Start transcription in a background thread. It is a daemon so that
        # quitting early cannot keep the interpreter alive until the whole
        # audio stream has been transcribed.
        transcription_thread = threading.Thread(
            target=transcribe_audio_stream,
            args=(audio_stream_url, transcription_queue),
            daemon=True,
        )
        transcription_thread.start()

        print("Press 'q' to quit the video stream.")
        while True:
            ret, frame = cap.read()
            if not ret:
                print("End of stream or cannot fetch frame.")
                break

            # Display video frame
            cv2.imshow('YouTube Video Stream', frame)

            # Pull the player's next frame every iteration, as before; the
            # returned value itself is not needed here.
            # NOTE(review): presumably this keeps the player's queue
            # advancing; confirm against the ffpyplayer documentation.
            player.get_frame()

            # Queued timestamps are always in the past, so everything that
            # is waiting can be shown right away.
            _print_pending_transcriptions(transcription_queue)

            # Exit on pressing 'q'
            if cv2.waitKey(frame_delay) & 0xFF == ord('q'):
                break
    finally:
        # Release resources even if playback was interrupted by an error
        # or a KeyboardInterrupt.
        cap.release()
        if player is not None:
            player.close_player()
        cv2.destroyAllWindows()

        if transcription_thread is not None:
            # Give the transcriber a moment to finish, but never block
            # indefinitely: it only stops by itself when the audio ends.
            transcription_thread.join(timeout=2.0)
            _print_pending_transcriptions(transcription_queue)

# Replace with your YouTube video URL
youtube_url = "https://www.youtube.com/watch?v=96Y6mc3C1Bg"  # Example video

# Start playback only when executed as a script or notebook cell, so that
# importing this module does not open a window and start streaming.
if __name__ == "__main__":
    play_video_with_audio_and_transcription(youtube_url)

def terminate_script():
    """Exit the current Python process immediately with status 0.

    Uses ``os._exit``, which ends the process without running cleanup
    handlers or flushing stdio buffers.
    """
    exit_status = 0
    os._exit(exit_status)

Press 'q' to quit the video stream.


[swscaler @ 0x128ac8000] [swscaler @ 0x149100000] No accelerated colorspace conversion found from yuv420p to rgb24.
[swscaler @ 0x128ac8000] [swscaler @ 0x118928000] No accelerated colorspace conversion found from yuv420p to rgb24.
[swscaler @ 0x128ac8000] [swscaler @ 0x118938000] No accelerated colorspace conversion found from yuv420p to rgb24.
[swscaler @ 0x128ac8000] [swscaler @ 0x118948000] No accelerated colorspace conversion found from yuv420p to rgb24.
[swscaler @ 0x128ac8000] [swscaler @ 0x118958000] No accelerated colorspace conversion found from yuv420p to rgb24.
[swscaler @ 0x128ac8000] [swscaler @ 0x118968000] No accelerated colorspace conversion found from yuv420p to rgb24.
[swscaler @ 0x128ac8000] [swscaler @ 0x118978000] No accelerated colorspace conversion found from yuv420p to rgb24.
[swscaler @ 0x128ac8000] [swscaler @ 0x118988000] No accelerated colorspace conversion found from yuv420p to rgb24.
[swscaler @ 0x128ac8000] [swscaler @ 0x118998000] No accelerated colorsp

transcript: 
transcript: 
transcript: 
transcript: 
transcript: Speaker: 1 'you' 'and' 'I' 'love' 'crime' 'junkies'
transcript: Speaker: 2 'love'
transcript: Speaker: 1 'love' 'that's' 'like' 'our' 'go-to' 'podcast' 'when' 'we're' 'doing' 'our' 'road' 'trips' 'we're' 'obsessed'
transcript: Speaker: 2 'yes'
transcript: 
transcript: Speaker: 1 'you' 'and' 'I' 'love' 'crime' 'junkies'
transcript: Speaker: 2 'love'
transcript: Speaker: 1 'love' 'that's' 'like' 'our' 'go-to' 'podcast' 'when' 'we're' 'doing' 'our' 'road' 'trips' 'we're' 'obsessed'
transcript: Speaker: 2 'yes'
transcript: 
transcript: Speaker: 1 'you' 'and' 'I' 'love' 'crime' 'junkies'
transcript: Speaker: 2 'love'
transcript: Speaker: 1 'love' 'that's' 'like' 'our' 'go-to' 'podcast' 'when' 'we're' 'doing' 'our' 'road' 'trips' 'we're' 'obsessed'
transcript: Speaker: 2 'yes'
transcript: 
transcript: Speaker: 1 'you' 'and' 'I' 'love' 'crime' 'junkies'
transcript: Speaker: 2 'love'
transcript: Speaker: 1 'love' 'that's' 'like' '

KeyboardInterrupt: 

: 

In [None]:
%pip install ffpyplayer