In [7]:
# Download the audio track of a YouTube video and convert it to MP3.
# yt_dlp is imported in this cell so it also works on a fresh kernel
# (Restart Kernel -> Run All); re-importing an already loaded module is a no-op.
import yt_dlp

source_url = 'https://www.youtube.com/watch?v=IyQiaSS0x0A'
filename = 'mark-yang-basic'  # NOTE(review): not referenced anywhere in this cell
ydl_opts = {
    # Format selector: try 'bestaudio' first, fall back to 'best'.
    'format': 'bestaudio/best',
    # Fixed base name, so the converted file is always 'audio.mp3'.
    'outtmpl': 'audio.%(ext)s',
    # Re-encode the downloaded stream to MP3 with ffmpeg.
    'postprocessors': [{
        'key': 'FFmpegExtractAudio',
        'preferredcodec': 'mp3',
        'preferredquality': '192',
    }],
}

with yt_dlp.YoutubeDL(ydl_opts) as ydl:
    ydl.download([source_url])

[youtube] Extracting URL: https://www.youtube.com/watch?v=IyQiaSS0x0A
[youtube] IyQiaSS0x0A: Downloading webpage
[youtube] IyQiaSS0x0A: Downloading ios player API JSON
[youtube] IyQiaSS0x0A: Downloading m3u8 information
[info] IyQiaSS0x0A: Downloading 1 format(s): 251
[download] Destination: audio.webm
[download] 100% of   88.90MiB in 00:00:01 at 52.02MiB/s    
[ExtractAudio] Destination: audio.mp3
Deleting original file audio.webm (pass -k to keep)


In [1]:
import openai
import os
from pydub import AudioSegment
import math

openai.api_key = os.environ.get('OPENAI_API_KEY')
audio_file_path = 'audio.mp3'

# Decode the MP3 and work out how many fixed-length pieces are needed to cover it.
audio = AudioSegment.from_mp3(audio_file_path)
chunk_length_ms = 60000  # 1 minute chunks
chunks = math.ceil(len(audio) / chunk_length_ms)
print('chunks:', chunks)
chunk_files = []

for index in range(chunks):
    window_start = index * chunk_length_ms
    segment = audio[window_start:window_start + chunk_length_ms]
    target_path = f'audio_chunk_{index}.mp3'
    print('Processing chunk', target_path)
    if os.path.exists(target_path):
        # A file with this name is already on disk: reuse it instead of re-exporting.
        print('Found', target_path)
    else:
        print('Exporting', target_path)
        segment.export(target_path, format='mp3')
    chunk_files.append(target_path)

# Step 3: Transcribe each chunk using OpenAI Whisper-1
def transcribe_audio(file_path):
    """Transcribe a single audio file with OpenAI's ``whisper-1`` model.

    Args:
        file_path: Path of the audio file; it is opened in binary mode and
            handed to ``openai.Audio.transcribe``.

    Returns:
        Whatever ``openai.Audio.transcribe`` returns when called with
        ``response_format="text"``.
    """
    request_options = {"model": "whisper-1", "response_format": "text"}
    with open(file_path, 'rb') as handle:
        return openai.Audio.transcribe(file=handle, **request_options)

transcripts = []
for chunk_path in chunk_files:
    chunk_text = transcribe_audio(chunk_path)
    print(chunk_text)
    transcripts.append(chunk_text)
    # The chunk file is no longer needed once its text has been captured.
    os.remove(chunk_path)

# Combine all transcriptions
full_transcript = '\n'.join(transcripts)

print(full_transcript)

# Cleanup: remove any chunk files that are still present on disk
for leftover_path in filter(os.path.exists, chunk_files):
    os.remove(leftover_path)

chunks: 126
Processing chunk audio_chunk_0.mp3
Found audio_chunk_0.mp3
Processing chunk audio_chunk_1.mp3
Found audio_chunk_1.mp3
Processing chunk audio_chunk_2.mp3
Found audio_chunk_2.mp3
Processing chunk audio_chunk_3.mp3
Found audio_chunk_3.mp3
Processing chunk audio_chunk_4.mp3
Found audio_chunk_4.mp3
Processing chunk audio_chunk_5.mp3
Found audio_chunk_5.mp3
Processing chunk audio_chunk_6.mp3
Found audio_chunk_6.mp3
Processing chunk audio_chunk_7.mp3
Found audio_chunk_7.mp3
Processing chunk audio_chunk_8.mp3
Found audio_chunk_8.mp3
Processing chunk audio_chunk_9.mp3
Found audio_chunk_9.mp3
Processing chunk audio_chunk_10.mp3
Found audio_chunk_10.mp3
Processing chunk audio_chunk_11.mp3
Found audio_chunk_11.mp3
Processing chunk audio_chunk_12.mp3
Found audio_chunk_12.mp3
Processing chunk audio_chunk_13.mp3
Found audio_chunk_13.mp3
Processing chunk audio_chunk_14.mp3
Found audio_chunk_14.mp3
Processing chunk audio_chunk_15.mp3
Found audio_chunk_15.mp3
Processing chunk audio_chunk_16.

In [None]:
# ChatGPT version:

import yt_dlp
import openai
import os
from pydub import AudioSegment
import math
import json

class YouTubeTranscriber:
    """Download a YouTube video's audio and transcribe it with OpenAI Whisper.

    ``run`` drives the pipeline: download the audio as MP3 with yt-dlp, split
    it into fixed-length chunks with pydub, transcribe every chunk with the
    ``whisper-1`` model, write the joined transcript to ``output_file`` and
    delete the intermediate files.

    Progress is checkpointed to ``state_file`` after every transcribed chunk,
    so calling ``run`` again for the same URL after an interruption continues
    with the first chunk that has not been transcribed yet.
    """

    def __init__(self, source_url, output_file=None):
        """Set up a transcriber for one video.

        Args:
            source_url: URL of the video to transcribe.
            output_file: Path of the transcript file. When omitted it is
                derived from the downloaded file's name in ``download_audio``.
        """
        self.source_url = source_url
        self.output_file = output_file
        self.audio_file_path = None
        self.chunk_files = []
        self.transcripts = []
        self.chunk_length_ms = 60000  # 1 minute chunks
        self.state_file = 'transcription_state.json'
        # Note: this configures the API key on the openai module globally.
        openai.api_key = os.getenv('OPENAI_API_KEY')

    def download_audio(self):
        """Download the video's audio as MP3 and record the resulting paths.

        Sets ``audio_file_path`` and, if no output file was given,
        ``output_file`` (same base name with a ``.txt`` extension).
        """
        ydl_opts = {
            'format': 'bestaudio/best',
            'outtmpl': '%(title)s.%(ext)s',
            'postprocessors': [{
                'key': 'FFmpegExtractAudio',
                'preferredcodec': 'mp3',
                'preferredquality': '192',
            }],
        }
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            info_dict = ydl.extract_info(self.source_url, download=True)
            # Ask yt-dlp for the file name it actually expanded from 'outtmpl'.
            # The raw title may contain characters that yt-dlp replaces when
            # building the name, so f"{title}.mp3" is not reliable.
            downloaded_path = ydl.prepare_filename(info_dict)
        # The post-processor keeps the base name and swaps the extension to mp3.
        base_path = os.path.splitext(downloaded_path)[0]
        self.audio_file_path = f"{base_path}.mp3"
        if not self.output_file:
            self.output_file = f"{base_path}.txt"

    def create_audio_chunks(self):
        """Split the downloaded audio into ``chunk_length_ms`` long MP3 files.

        Replaces ``chunk_files`` with the ordered list of chunk paths.

        Raises:
            ValueError: If ``audio_file_path`` has not been set yet.
        """
        if not self.audio_file_path:
            raise ValueError("Audio file path not set. Please download the audio first.")

        audio = AudioSegment.from_mp3(self.audio_file_path)
        chunks = math.ceil(len(audio) / self.chunk_length_ms)
        print('chunks:', chunks)

        # Build the list locally so calling this method twice cannot leave
        # duplicate entries in self.chunk_files.
        chunk_files = []
        for i in range(chunks):
            start_time = i * self.chunk_length_ms
            end_time = start_time + self.chunk_length_ms
            chunk = audio[start_time:end_time]
            chunk_file_path = f'audio_chunk_{i}.mp3'
            print('Processing chunk', chunk_file_path)
            # NOTE(review): an existing file with this name is reused as-is,
            # even if it was produced from a different audio file.
            if not os.path.exists(chunk_file_path):
                print('Exporting', chunk_file_path)
                chunk.export(chunk_file_path, format='mp3')
            else:
                print('Found', chunk_file_path)
            chunk_files.append(chunk_file_path)
        self.chunk_files = chunk_files

    def transcribe_audio(self, file_path):
        """Transcribe one audio file with ``whisper-1`` and return the result.

        Args:
            file_path: Path of the audio file to send to the API.

        Returns:
            The value returned by ``openai.Audio.transcribe`` for
            ``response_format="text"``.
        """
        with open(file_path, 'rb') as audio_file:
            transcript = openai.Audio.transcribe(
                model="whisper-1",
                file=audio_file,
                response_format="text"
            )
        return transcript

    def transcribe_chunks(self):
        """Transcribe every chunk that does not have a transcript yet.

        ``transcripts`` is filled in the same order as ``chunk_files``, so the
        number of stored transcripts is the index of the next chunk to
        process. Chunks before that index were already handled (and their
        files deleted) by an earlier, interrupted run.
        """
        already_done = len(self.transcripts)
        for chunk_file in self.chunk_files[already_done:]:
            transcript = self.transcribe_audio(chunk_file)
            print(transcript)
            self.transcripts.append(transcript)
            # Checkpoint before deleting the chunk: if the process dies in
            # between, the saved state never refers to a transcript that is
            # missing while its chunk file is already gone.
            self.save_state()
            os.remove(chunk_file)

    def save_state(self):
        """Write the current progress to ``state_file`` as JSON."""
        state = {
            'source_url': self.source_url,
            'audio_file_path': self.audio_file_path,
            'chunk_files': self.chunk_files,
            'transcripts': self.transcripts,
            'output_file': self.output_file
        }
        with open(self.state_file, 'w', encoding='utf-8') as f:
            json.dump(state, f)

    def load_state(self):
        """Restore progress from ``state_file`` if it belongs to this video.

        A checkpoint saved for a different ``source_url`` is ignored, so a
        stale state file cannot redirect this run to another video. An
        ``output_file`` passed to the constructor wins over the saved one.
        """
        if not os.path.exists(self.state_file):
            return
        with open(self.state_file, 'r', encoding='utf-8') as f:
            state = json.load(f)
        if state.get('source_url') != self.source_url:
            print('Ignoring saved state for a different source URL')
            return
        self.audio_file_path = state['audio_file_path']
        self.chunk_files = state['chunk_files']
        self.transcripts = state['transcripts']
        if not self.output_file:
            self.output_file = state['output_file']

    def combine_transcripts(self):
        """Join all chunk transcripts, print them and write ``output_file``.

        Returns:
            The full transcript, chunks separated by newlines.
        """
        full_transcript = '\n'.join(self.transcripts)
        print(full_transcript)
        # Explicit UTF-8 so non-ASCII transcripts do not depend on the
        # platform's default encoding.
        with open(self.output_file, 'w', encoding='utf-8') as f:
            f.write(full_transcript)
        return full_transcript

    def run(self):
        """Run (or resume) the pipeline: download, chunk, transcribe, combine, clean up."""
        self.load_state()
        if not self.audio_file_path:
            self.download_audio()
        if not self.chunk_files:
            self.create_audio_chunks()
        self.transcribe_chunks()
        self.combine_transcripts()
        self.cleanup()

    def cleanup(self):
        """Delete the state file, the downloaded audio and any remaining chunks."""
        if os.path.exists(self.state_file):
            os.remove(self.state_file)
        # audio_file_path is None until a download (or state restore) happened.
        if self.audio_file_path and os.path.exists(self.audio_file_path):
            os.remove(self.audio_file_path)
        for chunk_file in self.chunk_files:
            if os.path.exists(chunk_file):
                os.remove(chunk_file)

# Usage: build a transcriber for one video and run the whole pipeline.
VIDEO_URL = 'https://www.youtube.com/watch?v=IyQiaSS0x0A'
transcriber = YouTubeTranscriber(source_url=VIDEO_URL)
transcriber.run()
