Using WhisperX to transcribe the video

In [1]:
import subprocess
import whisperx
import os
import gc
import torch
import csv
import json

# Disable TF32 for CUDA matmul and cuDNN kernels.
# NOTE(review): presumably done for numerical accuracy/reproducibility of the
# speech models — confirm the intent; other imported libraries may toggle
# these flags themselves.
torch.backends.cuda.matmul.allow_tf32= False
torch.backends.cudnn.allow_tf32= False

# python-dotenv: load key/value pairs from a local .env file (if one exists)
# into the process environment so secrets stay out of the notebook.
from dotenv import load_dotenv
load_dotenv()

# Hugging Face access token read from the environment; None when HF_TOKEN is unset.
HF_TOKEN = os.getenv("HF_TOKEN")

INFO:speechbrain.utils.quirks:Applied quirks (see `speechbrain.utils.quirks`): [disable_jit_profiling, allow_tf32]
INFO:speechbrain.utils.quirks:Excluded quirks specified by the `SB_DISABLE_QUIRKS` environment (comma-separated list): []


In [2]:
# Report the PyTorch build and whether a CUDA device is usable from this kernel.
for label, value in (
    ("PyTorch version:", torch.__version__),
    ("CUDA version used to build PyTorch:", torch.version.cuda),
    ("CUDA Available:", torch.cuda.is_available()),
):
    print(label, value)

PyTorch version: 2.6.0+cu118
CUDA version used to build PyTorch: 11.8
CUDA Available: True


In [3]:
# Function definitions

# Step 1: Extract Audio from Video
def extract_audio_from_video(video_file: str, audio_file: str):
    """Extract the audio track of a video into a WAV file using ffmpeg.

    The output is written as 16-bit PCM, 16 kHz, mono. An existing
    ``audio_file`` is overwritten.

    Args:
        video_file: Path of the input video.
        audio_file: Path of the WAV file to create.

    Raises:
        subprocess.CalledProcessError: If ffmpeg exits with a non-zero status.
        FileNotFoundError: If the ``ffmpeg`` executable cannot be found.
    """
    print(f"Starting audio extraction from video: {video_file}")
    command = [
        "ffmpeg",
        # Never read from stdin: inside a notebook kernel there is no
        # terminal to answer ffmpeg's interactive prompts.
        "-nostdin",
        # Overwrite the output without asking, so re-running the cell works
        # when the WAV file already exists.
        "-y",
        "-i", video_file,
        "-vn",  # no video, only audio
        "-acodec", "pcm_s16le",  # audio codec (WAV format)
        "-ar", "16000",  # sample rate
        "-ac", "1",  # number of audio channels
        audio_file,
    ]
    subprocess.run(command, check=True)
    print(f"Audio extraction completed. Audio saved to: {audio_file}")

# Step 2: Transcribe Audio with WhisperX
def transcribe_audio_with_whisperx(audio_file: str):
    """Transcribe an audio file with WhisperX and align the result.

    Loads the ``large-v3`` model (English, Silero VAD), transcribes the
    audio in batches, then runs the WhisperX alignment model on the
    transcribed segments.

    Args:
        audio_file: Path of the audio file to transcribe.

    Returns:
        The value returned by ``whisperx.align`` for the transcribed segments.
    """
    print(f"Starting transcription of audio file: {audio_file}")

    # Use the GPU when one is available; otherwise fall back to CPU, where
    # float16 is not a usable compute type, so int8 is selected instead.
    use_cuda = torch.cuda.is_available()
    device = "cuda" if use_cuda else "cpu"
    compute_type = "float16" if use_cuda else "int8"
    batch_size = 6

    model = whisperx.load_model(
        "large-v3",
        device,
        vad_method="silero",
        compute_type=compute_type,
        language='en',
    )
    audio = whisperx.load_audio(audio_file)
    transcription_result = model.transcribe(audio, batch_size=batch_size)
    print(f"Transcription complete. Number of segments: {len(transcription_result['segments'])}")

    # The ASR model is no longer needed; release it before the alignment
    # model is loaded to keep peak GPU memory down.
    del model
    gc.collect()
    if use_cuda:
        torch.cuda.empty_cache()

    model_a, metadata = whisperx.load_align_model(
        language_code=transcription_result["language"],
        device=device,
    )
    aligned_result = whisperx.align(
        transcription_result["segments"],
        model_a,
        metadata,
        audio,
        device,
        return_char_alignments=False,
    )
    print("Alignment complete.")

    # Speaker diarization is currently disabled. If it is re-enabled, pass
    # the token from the environment (use_auth_token=HF_TOKEN).
    # NOTE: never paste a literal access token into the notebook; a token
    # that has been saved in a notebook must be treated as leaked and revoked.

    # Release the audio buffer and the alignment model as well.
    del audio, model_a
    gc.collect()
    if use_cuda:
        torch.cuda.empty_cache()
    return aligned_result

# Step 3: Generate SRT File
def _format_srt_timestamp(seconds) -> str:
    """Format a time offset in seconds as an SRT timestamp ``HH:MM:SS,mmm``."""
    # Convert to whole milliseconds first. Extracting the fraction with
    # ``(t % 1) * 1000`` and truncating loses a millisecond to float error
    # (e.g. 1.001 s would be rendered as ",000").
    total_ms = max(0, int(round(float(seconds) * 1000)))
    hours, remainder = divmod(total_ms, 3_600_000)
    minutes, remainder = divmod(remainder, 60_000)
    secs, millis = divmod(remainder, 1000)
    return f"{hours:02}:{minutes:02}:{secs:02},{millis:03}"


def generate_srt_file(transcription_result, translations, srt_file_path: str):
    """Write subtitle cues to an SRT file.

    Args:
        transcription_result: Mapping with a ``"segments"`` list; each segment
            provides ``"start"`` and ``"end"`` times in seconds.
        translations: Cue texts, paired with the segments by position.
        srt_file_path: Path of the SRT file to create (UTF-8).

    Segments and translations are paired with ``zip``, so surplus entries on
    either side are not written; a warning is printed when the counts differ.
    After writing, a warning is printed for every segment whose end time is
    not later than its start time.
    """
    segments = transcription_result["segments"]
    translations = list(translations)

    print(f"Generating SRT file: {srt_file_path}")
    if len(translations) != len(segments):
        print(
            f"Warning: {len(segments)} segments but {len(translations)} texts; "
            "unpaired entries are not written."
        )

    with open(srt_file_path, "w", encoding="utf-8") as srt_file:
        for cue_number, (segment, text) in enumerate(zip(segments, translations), start=1):
            start_str = _format_srt_timestamp(segment["start"])
            end_str = _format_srt_timestamp(segment["end"])
            # Strip surrounding whitespace: a leading space is common in
            # Whisper segment text, and a trailing newline would end the cue early.
            cue_text = str(text).strip()

            srt_file.write(f"{cue_number}\n")
            srt_file.write(f"{start_str} --> {end_str}\n")
            srt_file.write(f"{cue_text}\n\n")
    print(f"SRT file generation complete: {srt_file_path}")

    # Check for potential issues in segment timestamps
    for cue_number, segment in enumerate(segments, start=1):
        if segment["end"] <= segment["start"]:
            print(f"Warning: Misaligned timestamps in segment {cue_number}. Start time: {segment['start']}, End time: {segment['end']}.")

In [4]:
# Input/output paths used by the pipeline steps below.
# The video path can be overridden with the WHISPERX_VIDEO_FILE environment
# variable (e.g. from the .env file); the literal path is only the default.
video_file = os.getenv(
    "WHISPERX_VIDEO_FILE",
    r"D:\SOKM\11 Identity 2 SoKM 2024 - 2025\11 Identity 2 SoKM 2024 - 2025.mp4",
)
# All derived files are placed next to the video and share its name without
# the extension. (Indexing instead of unpacking into `_` avoids shadowing
# IPython's last-output variable.)
base_filename = os.path.splitext(video_file)[0]
audio_file = f"{base_filename}_audio.wav"
english_srt_file_path = f"{base_filename}_transcript_english.srt"
spanish_srt_file_path = f"{base_filename}_transcript_spanish.srt"


In [7]:
# Step 1: pull the audio track out of the source video (runs ffmpeg)
print("Step 1: Extracting audio from video.")
extract_audio_from_video(video_file=video_file, audio_file=audio_file)

Step 1: Extracting audio from video.
Starting audio extraction from video: D:\SOKM\11 Identity 2 SoKM 2024 - 2025\11 Identity 2 SoKM 2024 - 2025.mp4
Audio extraction completed. Audio saved to: D:\SOKM\11 Identity 2 SoKM 2024 - 2025\11 Identity 2 SoKM 2024 - 2025_audio.wav


In [5]:
# Step 2: run WhisperX transcription and alignment on the extracted audio
print("Step 2: Transcribing audio with WhisperX.")
transcription_result = transcribe_audio_with_whisperx(audio_file=audio_file)

Step 2: Transcribing audio with WhisperX.
Starting transcription of audio file: D:\SOKM\11 Identity 2 SoKM 2024 - 2025\11 Identity 2 SoKM 2024 - 2025_audio.wav
>>Performing voice activity detection using Silero...


Using cache found in C:\Users\robin/.cache\torch\hub\snakers4_silero-vad_master


Transcription complete. Number of segments: 89
Alignment complete.


In [6]:
# Step 2b: store the segments as a temporary CSV file
# The CSV is written next to the audio file, reusing its name without the extension.
csv_filename = os.path.splitext(audio_file)[0] + '_english_v3s.csv'

# No 'Speaker' column is written: speaker diarization is currently disabled
# in the transcription step, so segments carry no speaker labels.
fieldnames = ['Segment Start', 'Segment End', 'Segment Text']

# newline='' is what the csv module expects for files it writes to.
# The encoding is explicit so the export does not depend on the platform
# default (e.g. cp1252 on Windows), which cannot represent every character
# and would raise UnicodeEncodeError; it also matches the JSON and SRT outputs.
# (Use 'utf-8-sig' instead if the file must open cleanly in Excel.)
with open(csv_filename, 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()

    for segment in transcription_result["segments"]:
        writer.writerow({
            'Segment Start': segment['start'],
            'Segment End': segment['end'],
            'Segment Text': segment['text'],
        })

print(f"Data successfully written to {csv_filename}")

Data successfully written to D:\SOKM\11 Identity 2 SoKM 2024 - 2025\11 Identity 2 SoKM 2024 - 2025_audio_english_v3s.csv


In [7]:
# Step 2c: keep a JSON copy of the full transcription result as a temporary artifact
# (written next to the audio file; non-ASCII characters are stored as-is)
json_filename = "".join([audio_file.rsplit('.', 1)[0], '_english_v3s.json'])
with open(json_filename, mode='w', encoding='utf-8') as json_file:
    json.dump(
        transcription_result,
        json_file,
        indent=4,
        ensure_ascii=False,
    )
    
    

In [8]:
# Step 3: write the English subtitles; the cue texts are the segment texts themselves
print("Step 3: Generating English SRT file.")
english_translations = [entry["text"] for entry in transcription_result["segments"]]
generate_srt_file(
    transcription_result=transcription_result,
    translations=english_translations,
    srt_file_path=english_srt_file_path,
)

Step 3: Generating English SRT file.
Generating SRT file: D:\SOKM\11 Identity 2 SoKM 2024 - 2025\11 Identity 2 SoKM 2024 - 2025_transcript_english.srt
SRT file generation complete: D:\SOKM\11 Identity 2 SoKM 2024 - 2025\11 Identity 2 SoKM 2024 - 2025_transcript_english.srt
