# In [8]:
from pathlib import Path
from pydub import AudioSegment
import whisper
import os
from fpdf import FPDF

# Folder holding the MP3 recordings; transcripts are written next to them.
audio_folder = r"G:\My Drive\Gardening_2025\Strategy_MIT\Module_3\audio"
# The extension check is case-insensitive, so names such as "TALK.MP3" are
# picked up as well.
mp3_files = [f for f in os.listdir(audio_folder) if f.lower().endswith('.mp3')]

model = whisper.load_model("base")

# Transcribe each mp3 and save the transcript as "<name>_transcript.txt".
for mp3_file in mp3_files:
    audio_path = os.path.join(audio_folder, mp3_file)

    # Derive sibling paths from the extension-less stem. A case-sensitive
    # str.replace('.mp3', ...) leaves "TALK.MP3" unchanged, which would make
    # the WAV export, the transcript write and the final os.remove all
    # target the source recording itself; it would also rewrite any ".mp3"
    # that happens to appear earlier in the path.
    base_path = os.path.splitext(audio_path)[0]
    wav_path = base_path + '.wav'
    transcript_file = base_path + '_transcript.txt'

    try:
        # Convert MP3 to an intermediate WAV file
        audio = AudioSegment.from_mp3(audio_path)
        audio.export(wav_path, format="wav")

        # Transcribe
        result = model.transcribe(wav_path)
        transcript = result["text"]
    finally:
        # Remove the intermediate WAV even when conversion or transcription
        # fails; it may not exist if the export itself raised.
        if os.path.exists(wav_path):
            os.remove(wav_path)

    # Save transcript
    with open(transcript_file, "w", encoding="utf-8") as file:
        file.write(transcript)

# Collate all transcripts into a single PDF
def collate_transcripts_to_pdf(audio_folder, output_pdf):
    """Collate every ``*_transcript.txt`` file in a folder into one PDF.

    Each transcript starts on a new page, headed by the file name with the
    ``_transcript.txt`` suffix removed. Files are added in alphabetical
    order of their names.

    Args:
        audio_folder: Directory scanned for transcript files.
        output_pdf: Path the resulting PDF is written to.
    """
    suffix = '_transcript.txt'

    def to_latin1(value):
        # Everything handed to the PDF is coerced to Latin-1; characters
        # outside that range become "?" instead of raising on output.
        return value.encode('latin-1', 'replace').decode('latin-1')

    pdf = FPDF()
    pdf.set_auto_page_break(auto=True, margin=15)
    pdf.set_font("Arial", size=12)

    # Find all transcript files, in alphabetical order
    transcript_files = sorted(
        f for f in os.listdir(audio_folder) if f.endswith(suffix)
    )

    for transcript_file in transcript_files:
        # Strip only the trailing suffix; str.replace would also remove the
        # marker if it occurred in the middle of the file name.
        title = transcript_file[:-len(suffix)]

        with open(os.path.join(audio_folder, transcript_file), "r", encoding="utf-8") as file:
            text = file.read()

        pdf.add_page()
        pdf.set_font("Arial", 'B', 16)
        # The title comes from a file name and needs the same Latin-1
        # treatment as the body text.
        pdf.cell(0, 10, to_latin1(title), ln=True)
        pdf.set_font("Arial", size=12)
        pdf.multi_cell(0, 10, to_latin1(text))

    pdf.output(output_pdf)

# Example usage: write the combined PDF into the same folder as the transcripts.
combined_pdf_path = os.path.join(audio_folder, "all_transcripts.pdf")
collate_transcripts_to_pdf(audio_folder, combined_pdf_path)