# Transcribe Audio or Video (Multilingual)
Author: Jiawen Zhu

In [2]:
from openai import OpenAI
import os
import subprocess
from pydub import AudioSegment

In [None]:
# Shared OpenAI API client; the transcription cells below call
# client.audio.transcriptions.create(...) on it.
# NOTE(review): presumably the key reaches OpenAI() through the environment
# (e.g. OPENAI_API_KEY); nothing visible in this notebook loads a .env file,
# so confirm the key is actually exported before running.
client = OpenAI() # provide OpenAI API key in .env

In [3]:
# Participants are numbered P1..Pn: the extraction and transcription loops
# below iterate range(1, n + 1) and look for ../recordings/P<id>/ for each id.
n = 12 # number of interviews

**Input Directory Structure**

```
/root
    /transcriptions
        whisper-transcribe.ipynb
    /recordings
        /P1
            P1.mp4
        /P2
            P2.mp3
        /P3
            P3.mkv
        ...
```

**Output Directory Structure**

```
/root
    /transcriptions
        whisper-transcribe.ipynb
        P1.txt
        P2.txt
        P3.txt
        ...
    /recordings
        /P1
            P1.mp4
            P1.mp3
        /P2
            P2.mp3
        /P3
            P3.mkv
            P3.mp3
        ...
```

# \[Optional\] Extract Audio (MP3) from Video Files (MP4/MKV)

In [None]:
# extract MP3 files
# For every participant P1..Pn, look in ../recordings/P<id>/. Participants
# that already have any .mp3 there are skipped; otherwise the first .mp4
# (preferred) or .mkv found is converted to P<id>.mp3 with ffmpeg.
# Raises FileNotFoundError when a participant directory has no .mp4/.mkv
# (or when the ffmpeg executable itself cannot be found).
for pid in range(1, n + 1):
    print(f"Extracting audio for P{pid} ...")
    fileName = f"P{pid}"
    recordings_dir = f"../recordings/{fileName}"

    # list the directory once and reuse the listing for every extension check
    dir_entries = os.listdir(recordings_dir)

    if any(entry.endswith(".mp3") for entry in dir_entries):
        print(f"MP3 file already exists for {fileName}")
        continue

    # find video file (.mp4 takes precedence over .mkv)
    mp4_files = [entry for entry in dir_entries if entry.endswith(".mp4")]
    mkv_files = [entry for entry in dir_entries if entry.endswith(".mkv")]
    video_files = mp4_files or mkv_files
    if not video_files:
        raise FileNotFoundError("No .mp4 or .mkv files found in the directory!")
    input_file = video_files[0]

    # Construct the input and output paths
    input_path = os.path.join(recordings_dir, input_file)
    output_path = os.path.join(recordings_dir, f"{fileName}.mp3")

    # extract audio; an argument list (no shell) keeps file names containing
    # quotes, spaces, or shell metacharacters from being misparsed
    result = subprocess.run(["ffmpeg", "-i", input_path, output_path])
    if result.returncode != 0:
        # surface the failure instead of silently continuing; a partial
        # output file may need to be deleted before re-running this cell
        print(f"ffmpeg failed for {fileName} (exit code {result.returncode})")

# Transcribe Audio Files (MP3)

In [52]:
# transcribe long recordings (> 20 min) in chunks to stay within OpenAI API request limits
# NOTE(review): the original comment cited a "token limit"; confirm which
# API limit (e.g. upload size) the 20-minute chunking is meant to satisfy.
def transcribeLongAudio(file_path, output_file, chunk_duration=20 * 60):
    """Transcribe an audio file chunk by chunk and return the combined text.

    The audio is split into consecutive chunks of at most ``chunk_duration``
    seconds. Each chunk is exported to a temporary MP3 and sent to the
    ``whisper-1`` transcription endpoint through the module-level ``client``.
    The per-chunk texts are joined with newlines.

    Args:
        file_path: Path of the audio file to load with ``AudioSegment``.
        output_file: Path to write the full transcription to (UTF-8). If
            falsy, nothing is written.
        chunk_duration: Maximum chunk length in seconds.

    Returns:
        The full transcription as a single string.

    Raises:
        ValueError: If ``chunk_duration`` is not positive.
    """
    import tempfile  # local import: only this function needs it

    audio = AudioSegment.from_file(file_path)
    total_ms = len(audio)  # pydub reports length in milliseconds
    chunk_ms = int(chunk_duration * 1000)
    if chunk_ms <= 0:
        raise ValueError("chunk_duration must be positive")

    transcription = []
    # A private temporary directory avoids name clashes with files in the
    # working directory and guarantees cleanup even if a request fails.
    with tempfile.TemporaryDirectory() as tmp_dir:
        # Step in milliseconds so a sub-second tail (or a clip shorter than
        # one second) still gets its own chunk instead of being dropped.
        for idx, start in enumerate(range(0, total_ms, chunk_ms)):
            end = min(start + chunk_ms, total_ms)
            temp_chunk_path = os.path.join(tmp_dir, f"chunk_{idx}.mp3")
            audio[start:end].export(temp_chunk_path, format="mp3")

            with open(temp_chunk_path, "rb") as audio_file:
                chunk_transcription = client.audio.transcriptions.create(
                    model="whisper-1",
                    file=audio_file,
                )
            transcription.append(chunk_transcription.text)

    # concatenate all chunks
    full_transcription = "\n".join(transcription)

    # save to file; explicit UTF-8 so multilingual text does not depend on
    # the platform's default encoding
    if output_file:
        with open(output_file, "w", encoding="utf-8") as f:
            f.write(full_transcription)

    return full_transcription

In [None]:
# transcribe MP3 files
# For every participant P1..Pn, transcribe ../recordings/P<id>/P<id>.mp3 and
# write the text to P<id>.txt in the current directory. Recordings of up to
# 20 minutes are sent in one request; longer ones go through
# transcribeLongAudio, which splits them into chunks.
for pid in range(1, n + 1):
    print(f"Transcribing P{pid} ...")

    # file (matches the documented layout: /recordings/P<id>/P<id>.mp3)
    fileName = f"P{pid}"
    filePath = f"../recordings/{fileName}/{fileName}.mp3"

    # load the audio only to measure its duration
    audio = AudioSegment.from_file(filePath)
    audio_duration = len(audio) / 1000  # Convert to seconds

    # transcribe
    if audio_duration <= 20 * 60:
        # open the upload handle only on the path that uses it, and close it
        # as soon as the request returns
        with open(filePath, "rb") as audio_file:
            transcription = client.audio.transcriptions.create(
                model="whisper-1",
                file=audio_file,
            )

        # explicit UTF-8 so multilingual transcripts do not depend on the
        # platform's default encoding
        with open(f"{fileName}.txt", "w", encoding="utf-8") as file:
            file.write(transcription.text)
    else: # transcribe recordings >20 min in chunks
        transcribeLongAudio(filePath, f"{fileName}.txt")

    print(f"{fileName} transcribed.\n")