In [2]:
import os
import subprocess
import cv2
import yt_dlp
from google.cloud import speech
from queue import Queue

# Set Google Cloud credentials
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "/Users/aditya/Downloads/seismic-rarity-427422-p7-ab3b4a8726ef.json"

def format_time(time):
    ms = int((time % 1) * 60)
    seconds = int(time // 1)
    return f'{seconds}:{ms}'

def transcribe_audio_stream(audio_url):
    """Stream audio for transcription using Google Cloud Speech-to-Text."""
    client = speech.SpeechClient()

    # Use ffmpeg to convert audio stream to raw PCM data
    ffmpeg_command = [
        "ffmpeg", "-i", audio_url, "-f", "s16le", "-ac", "1", "-ar", "16000",
        "-loglevel", "quiet", "pipe:1"
    ]
    process = subprocess.Popen(ffmpeg_command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    diarization_config = speech.SpeakerDiarizationConfig(enable_speaker_diarization = True)

    streaming_config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=16000,
        language_code="en-US",
        diarization_config = diarization_config
    )
    
    streaming_request = speech.StreamingRecognitionConfig(config=streaming_config, interim_results=True)

    def audio_generator():
        while True:
            data = process.stdout.read(4096)
            if not data:
                break
            yield data

    requests = (speech.StreamingRecognizeRequest(audio_content=chunk) for chunk in audio_generator())
    responses = client.streaming_recognize(config=streaming_request, requests=requests)

    try:
        for response in responses:
            result = response.results[-1]
            words_info = result.alternatives[0].words
            
            current_speaker = None
            current_sentence = []
            current_start_time = None
            transcription = []
            
            for word_info in words_info:
                if current_speaker is None:
                    current_speaker = word_info.speaker_tag
                    current_start_time = word_info.start_time.total_seconds()
                if word_info.speaker_tag != current_speaker:
                    # Speaker changed, create a new sentence
                    sentence = " ".join([word.word for word in current_sentence])
                    start_time = format_time(current_start_time)
                    transcription.append(f"speaker {current_speaker} @ {start_time} {sentence}")
                    current_speaker = word_info.speaker_tag
                    current_sentence = [word_info]
                    current_start_time = word_info.start_time.total_seconds()
                else:
            # Same speaker, add to current sentence
                    current_sentence.append(word_info)
    
    # Add the last sentence if there's anything
        if current_sentence:
            sentence = " ".join([word.word for word in current_sentence])
            start_time = format_time(current_start_time)
            transcription.append(f"speaker {current_speaker} @ {start_time} {sentence}")
            print(transcription)


        
    except Exception as e:
        print(f"Transcription error: {e}")
    finally:
        process.terminate()

def play_video_with_transcription(video_url, audio_url):
    """Play video using OpenCV while transcribing audio."""
    cap = cv2.VideoCapture(video_url)
    if not cap.isOpened():
        print("Error: Could not open video stream.")
        return

    print("Press 'q' to quit the video stream.")

    # Queue for synchronized transcription
    
    # Start transcription in a background process
    import threading
    transcription_thread = threading.Thread(target=transcribe_audio_stream, args=(audio_url,))
    transcription_thread.start()

    while True:
        ret, frame = cap.read()
        if not ret:
            print("End of video stream.")
            break

        # Display video
        cv2.imshow("YouTube Video Stream", frame)

        # Exit on 'q'
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break

    cap.release()
    cv2.destroyAllWindows()

    # Wait for transcription to complete
    transcription_thread.join()

if __name__ == "__main__":
    youtube_url = "https://www.youtube.com/watch?v=96Y6mc3C1Bg"

    # Fetch video and audio URLs
    ydl_opts = {"format": "best", "quiet": True, "no_warnings": True}
    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        info = ydl.extract_info(youtube_url, download=False)
        video_url = info["url"]

    audio_opts = {"format": "bestaudio/best", "quiet": True, "no_warnings": True}
    with yt_dlp.YoutubeDL(audio_opts) as ydl:
        audio_info = ydl.extract_info(youtube_url, download=False)
        audio_url = audio_info["url"]

    # Play video with transcription
    play_video_with_transcription(video_url, audio_url)


Press 'q' to quit the video stream.


KeyboardInterrupt: 

['speaker 1 @ 0:0 you and I love crime junkies', 'speaker 2 @ 3:0 love', "speaker 1 @ 3:42 love that's like our go-to podcast when we're doing our road trips we're obsessed", 'speaker 2 @ 10:24 yes', "speaker 1 @ 12:5 so true crime is a huge Niche I feel like everyone listens to it and if you don't like you hear about it I don't know so with it finally being October and things get a little spookier I figured I'm going to test my limits as a podcast"]
