In [1]:
import subprocess
import whisperx
import os
import gc
import torch
import csv

# Turn off TensorFloat-32 for CUDA matmuls and for cuDNN.
# NOTE(review): the import log printed below reports speechbrain applying an
# `allow_tf32` quirk; presumably these two lines are meant to override it — confirm.
torch.backends.cuda.matmul.allow_tf32= False
torch.backends.cudnn.allow_tf32= False

  from .autonotebook import tqdm as notebook_tqdm
INFO:speechbrain.utils.quirks:Applied quirks (see `speechbrain.utils.quirks`): [disable_jit_profiling, allow_tf32]
INFO:speechbrain.utils.quirks:Excluded quirks specified by the `SB_DISABLE_QUIRKS` environment (comma-separated list): []


In [2]:
from dotenv import load_dotenv
# Load variables from a local .env file into the process environment.
load_dotenv()

# os.getenv returns None when HF_TOKEN is not set.
HF_TOKEN = os.getenv("HF_TOKEN")

In [3]:
# Report the PyTorch build, the CUDA version it was built against, and whether CUDA is usable.
print("PyTorch version:", torch.__version__)
print("CUDA version used to build PyTorch:", torch.version.cuda)
print("CUDA Available:", torch.cuda.is_available())

PyTorch version: 2.6.0+cu118
CUDA version used to build PyTorch: 11.8
CUDA Available: True


In [4]:
# Step 1: Extract Audio from Video
def extract_audio_from_video(video_file: str, audio_file: str, overwrite: bool = False):
    """Extract the audio track of a video into a 16 kHz mono 16-bit PCM WAV file.

    Runs ``ffmpeg`` as a subprocess with interaction on stdin disabled, so the
    call cannot block on an "overwrite?" prompt when executed from a notebook
    kernel.

    Args:
        video_file: Path of the input video.
        audio_file: Path of the WAV file to create.
        overwrite: When True, replace an existing ``audio_file`` (``-y``).
            When False (default), ffmpeg is told not to overwrite (``-n``).

    Raises:
        subprocess.CalledProcessError: If ffmpeg exits with a non-zero status.
    """
    print(f"Starting audio extraction from video: {video_file}")
    command = [
        "ffmpeg",
        "-nostdin",  # never wait for keyboard input
        "-y" if overwrite else "-n",  # explicit overwrite policy
        "-i", video_file,
        "-vn",  # no video, only audio
        "-acodec", "pcm_s16le",  # audio codec (WAV format)
        "-ar", "16000",  # sample rate
        "-ac", "1",  # number of audio channels
        audio_file,
    ]
    subprocess.run(command, check=True)
    print(f"Audio extraction completed. Audio saved to: {audio_file}")

# Step 2: Transcribe Audio with WhisperX
def _release_gpu_memory():
    """Run the garbage collector and, when CUDA is available, empty PyTorch's CUDA cache."""
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()


def transcribe_audio_with_whisperx(audio_file: str, hf_token: str = None):
    """Transcribe, word-align and speaker-diarize an audio file with WhisperX.

    The pipeline runs on CUDA with the ``large-v3`` model in float16, fixed to
    English, using Silero VAD. Each model is released as soon as it is no longer
    needed.

    Args:
        audio_file: Path of the audio file to process.
        hf_token: Access token passed to the diarization pipeline as
            ``use_auth_token``. When None, the ``HF_TOKEN`` environment variable
            is used. The token must never be embedded in the notebook.

    Returns:
        The value returned by ``whisperx.assign_word_speakers`` for the aligned
        transcription and the diarization segments.

    Raises:
        ValueError: If no token is supplied and ``HF_TOKEN`` is not set. Raised
            before any model is loaded so the failure is not delayed until
            after transcription.
    """
    if hf_token is None:
        hf_token = os.getenv("HF_TOKEN")
    if not hf_token:
        raise ValueError(
            "No Hugging Face token available: pass hf_token or set the HF_TOKEN environment variable."
        )

    print(f"Starting transcription of audio file: {audio_file}")
    device = "cuda"
    batch_size = 6
    compute_type = "float16"

    model = whisperx.load_model("large-v3", device, vad_method="silero", compute_type=compute_type, language='en')
    audio = whisperx.load_audio(audio_file)
    transcription_result = model.transcribe(audio, batch_size=batch_size)
    print(f"Transcription complete. Number of segments: {len(transcription_result['segments'])}")
    # The ASR model is not used again; free it before loading the alignment model.
    del model
    _release_gpu_memory()

    model_a, metadata = whisperx.load_align_model(language_code=transcription_result["language"], device=device)
    aligned_result = whisperx.align(transcription_result["segments"], model_a, metadata, audio, device, return_char_alignments=False)
    print("Alignment complete.")
    del model_a
    _release_gpu_memory()

    diarize_model = whisperx.DiarizationPipeline(model_name="pyannote/speaker-diarization-3.1", use_auth_token=hf_token, device=device)
    # add min/max number of speakers if known
    diarize_segments = diarize_model(audio)
    # diarize_model(audio, num_speakers= total_speakers, min_speakers=min_speakers, max_speakers=max_speakers)
    diarize_result = whisperx.assign_word_speakers(diarize_segments, aligned_result)
    print("Diarization complete.")

    del diarize_model, audio
    _release_gpu_memory()
    return diarize_result

# Step 3: Translate Transcription to Spanish
def translate_transcription_to_spanish(transcription_result, translator=None):
    """Translate every segment's text from English to Spanish.

    Args:
        transcription_result: Mapping with a ``"segments"`` list whose items
            each carry a ``"text"`` string.
        translator: Optional object exposing
            ``translate(text, src=..., dest=...)`` and returning an object with
            a ``.text`` attribute. When None, ``Translator()`` is instantiated.

    Returns:
        List of translated strings, one per segment, in segment order.

    Raises:
        NameError: If no translator is passed and ``Translator`` is not defined.
    """
    print("Starting translation of transcription to Spanish.")
    if translator is None:
        # NOTE(review): `Translator` is not imported anywhere in the visible
        # notebook; the keyword names used below suggest a googletrans-style
        # client — confirm and import it, or pass `translator` explicitly.
        try:
            translator = Translator()
        except NameError as exc:
            raise NameError(
                "Translator is not defined; import a translation client or pass translator=..."
            ) from exc

    english_texts = [segment["text"] for segment in transcription_result["segments"]]
    total = len(english_texts)
    spanish_translations = []

    for idx, text in enumerate(english_texts, start=1):
        print(f"Translating segment {idx}/{total}")
        translated = translator.translate(text, src='en', dest='es')
        spanish_translations.append(translated.text)

    print("Translation to Spanish complete.")
    return spanish_translations

# Step 4: Generate SRT File
def _format_srt_timestamp(seconds) -> str:
    """Render a time offset in seconds as an SRT ``HH:MM:SS,mmm`` timestamp."""
    # Round once to whole milliseconds. Truncating the fractional part on its
    # own loses a millisecond to float error (1.001 would render as ",000").
    total_ms = max(0, int(round(float(seconds) * 1000)))
    hours, remainder = divmod(total_ms, 3600 * 1000)
    minutes, remainder = divmod(remainder, 60 * 1000)
    whole_seconds, millis = divmod(remainder, 1000)
    return f"{hours:02}:{minutes:02}:{whole_seconds:02},{millis:03}"


def generate_srt_file(transcription_result, translations, srt_file_path: str):
    """Write an SRT subtitle file pairing segment timings with the given texts.

    Args:
        transcription_result: Mapping with a ``"segments"`` list; every segment
            provides numeric ``"start"`` and ``"end"`` offsets in seconds.
        translations: Iterable of cue texts, matched to segments by position.
        srt_file_path: Destination path; the file is written as UTF-8.

    Cues are numbered from 1. If the number of texts differs from the number of
    segments a warning is printed and only the common prefix is written. After
    writing, segments whose end is not after their start are reported.
    """
    segments = transcription_result["segments"]
    texts = list(translations)
    if len(texts) != len(segments):
        print(f"Warning: {len(segments)} segments but {len(texts)} texts; extra items are ignored.")

    print(f"Generating SRT file: {srt_file_path}")
    with open(srt_file_path, "w", encoding="utf-8") as srt_file:
        for idx, (segment, text) in enumerate(zip(segments, texts), start=1):
            start_str = _format_srt_timestamp(segment["start"])
            end_str = _format_srt_timestamp(segment["end"])

            srt_file.write(f"{idx}\n")
            srt_file.write(f"{start_str} --> {end_str}\n")
            srt_file.write(f"{text}\n\n")
    print(f"SRT file generation complete: {srt_file_path}")

    # Check for potential issues in segment timestamps
    for idx, segment in enumerate(segments, start=1):
        if segment["end"] <= segment["start"]:
            print(f"Warning: Misaligned timestamps in segment {idx}. Start time: {segment['start']}, End time: {segment['end']}.")

In [5]:
# Input video; every derived artefact is written next to it.
video_file = r"D:\SOKM\01 Introduction SoKM 2024 - 2025 4k.mp4"
# video_file = r"E:\HoloOrbits\Videos\112923_wg1_red\EXPORT\112923_wg1_red_sync.mp4"

# Strip the extension once and build the output names from the remaining stem.
base_filename, _ = os.path.splitext(video_file)
audio_file = base_filename + "_audio.wav"
english_srt_file_path = base_filename + "_transcript_english.srt"
spanish_srt_file_path = base_filename + "_transcript_spanish.srt"

In [5]:
# Step 1: Extract audio from video
# Writes `audio_file` from `video_file` using the helper defined above.
print("Step 1: Extracting audio from video.")
extract_audio_from_video(video_file, audio_file)


Step 1: Extracting audio from video.
Starting audio extraction from video: E:\HoloOrbits\Videos\112923_wg1_red\EXPORT\112923_wg1_red_sync.mp4
Audio extraction completed. Audio saved to: E:\HoloOrbits\Videos\112923_wg1_red\EXPORT\112923_wg1_red_sync_audio.wav


In [7]:
# Step 2: Transcribe audio with WhisperX
# The returned value is kept in `transcription_result` and reused by the later cells.
print("Step 2: Transcribing audio with WhisperX.")
transcription_result = transcribe_audio_with_whisperx(audio_file)

Step 2: Transcribing audio with WhisperX.
Starting transcription of audio file: D:\SOKM\01 Introduction SoKM 2024 - 2025 4k_audio.wav
>>Performing voice activity detection using Silero...


Using cache found in C:\Users\robin/.cache\torch\hub\snakers4_silero-vad_master


Transcription complete. Number of segments: 93
Alignment complete.


  if ismodule(module) and hasattr(module, '__file__'):
  std = sequences.std(dim=-1, correction=1)


Diarization complete.


In [19]:
# Speaker label stored on the first word of the first segment.
print(transcription_result['segments'][0]['words'][0]['speaker'])

SPEAKER_00


In [20]:
# First segment's text, its length in characters, and the number of segments.
print(transcription_result['segments'][0]['text'])
print(len(transcription_result['segments'][0]['text']))
print(len(transcription_result['segments']))

 there is something inside all of us that calls out for more.
61
684


In [21]:
# Total number of characters across all segment texts.
a = sum(len(segment['text']) for segment in transcription_result['segments'])
print(a)


37467


In [14]:
#Step 2b: storing temporary csv output
# The CSV is written next to the audio file, with the extension replaced.
csv_filename = audio_file.rsplit('.',1)[0] +'_english.csv'

# An explicit encoding keeps the output independent of the platform default
# code page; utf-8-sig matches the other CSV exports in this notebook.
with open(csv_filename, 'w', newline='', encoding='utf-8-sig') as csvfile:
    fieldnames = ['Segment Start', 'Segment End', 'Segment Text','Speaker']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

    writer.writeheader()

    for segment in transcription_result["segments"]:
        # Treat a missing or None 'words' entry as an empty list.
        words = segment.get('words') or []

        if words and 'speaker' in words[0]:
            # Use the first word's speaker when it is labelled.
            segment_speaker = words[0]['speaker']
        else:
            # Otherwise fall back to the most frequent speaker among the words.
            speaker_counts = {}
            for word in words:
                speaker = word.get('speaker')
                if speaker:  # Only count non-empty speaker values
                    speaker_counts[speaker] = speaker_counts.get(speaker, 0) + 1

            if speaker_counts:
                segment_speaker = max(speaker_counts, key=speaker_counts.get)
            else:
                segment_speaker = 'Unknown'  # Default if no speakers exist

        writer.writerow({
            'Segment Start': segment['start'],
            'Segment End': segment['end'],
            'Segment Text': segment['text'],
            'Speaker': segment_speaker,
        })

print(f"Data successfully written to {csv_filename}")

Data successfully written to D:\SOKM\01 Introduction SoKM 2024 - 2025 4k_audio_english.csv


In [15]:
 # Step 3: Generate English SRT file
print("Step 3: Generating English SRT file.")
# The cue texts are the segments' own (untranslated) texts.
english_translations = [segment["text"] for segment in transcription_result["segments"]]
generate_srt_file(transcription_result, english_translations, english_srt_file_path)


Step 3: Generating English SRT file.
Generating SRT file: D:\SOKM\01 Introduction SoKM 2024 - 2025 4k_transcript_english.srt
SRT file generation complete: D:\SOKM\01 Introduction SoKM 2024 - 2025 4k_transcript_english.srt


In [30]:
#Create .srt using whisperx utils
import copy

from whisperx.SubtitlesProcessor import SubtitlesProcessor

# Work on a deep copy: stripping speaker labels from the shared object would
# also erase them from `transcription_result`, which other cells still read.
srt_result = copy.deepcopy(transcription_result)
srt_result["language"] = 'en'
# Remove speaker info from the copied transcription result
for segment in srt_result["segments"]:
    segment.pop("speaker", None)
    for word in segment.get("words", []):
        word.pop("speaker", None)

srt_output = f"{base_filename}_subprocessor.srt"
subtitles_proccessor = SubtitlesProcessor(
    srt_result["segments"],
    'en', # str, two letter code to identify the language
    max_line_length=21, # int, maximum line length used for this export
    min_char_length_splitter=21, # int, minimum length before a line may be split
    is_vtt=False, # bool, true for vtt, false for srt format
)
subtitles_proccessor.save(srt_output, advanced_splitting=True) # output_path is a str with your desired filename


2018

In [None]:

# Step 4: Translate transcription to Spanish
# NOTE(review): this cell has no execution count, so it appears not to have been run.
print("Step 4: Translating transcription to Spanish.")
spanish_translations = translate_transcription_to_spanish(transcription_result)


In [None]:

# Step 5: Generate Spanish SRT file
# Needs `spanish_translations` from the Step 4 cell.
print("Step 5: Generating Spanish SRT file.")
generate_srt_file(transcription_result, spanish_translations, spanish_srt_file_path)

Translate using DeepL
Cost: $7 (monthly cost) + ~$1 (50,000 characters)


In [17]:
#testing deepl
import deepl 
from dotenv import load_dotenv
import os
# Load .env so DEEPL_AUTH_KEY is available through the environment.
load_dotenv()

DEEPL_AUTH_KEY = os.getenv("DEEPL_AUTH_KEY")

translator = deepl.Translator(DEEPL_AUTH_KEY)

# Translate only the first segment as a connectivity check.
text = transcription_result['segments'][0]['text']
translated = translator.translate_text(text, target_lang="ES")

print(translated.text)
# Show how many characters of the account quota have been used.
usage = translator.get_usage()
print(f"Character usage: {usage.character.count}/{usage.character.limit}")

INFO:deepl:Request to DeepL API method=POST url=https://api-free.deepl.com/v2/translate
INFO:deepl:DeepL API response status_code=200 url=https://api-free.deepl.com/v2/translate
INFO:deepl:Request to DeepL API method=GET url=https://api-free.deepl.com/v2/usage
INFO:deepl:DeepL API response status_code=200 url=https://api-free.deepl.com/v2/usage


 hay algo dentro de todos nosotros que pide más.
Character usage: 37467/500000


In [34]:
import pandas as pd

# Load the "_fixed" transcript CSV used as input for the DeepL translation.
# NOTE(review): no visible cell writes this file; presumably it is a hand-corrected copy of the exported CSV — confirm.
# df = pd.read_csv(r'D:\SOKM\01 Introduction SoKM 2024 - 2025 4k_audio_english_fixed_test.csv')
df = pd.read_csv(r'D:\SOKM\01 Introduction SoKM 2024 - 2025 4k_audio_english_fixed.csv')

In [35]:
from dotenv import load_dotenv
import os
import deepl 
load_dotenv()

# The DeepL key is read from the environment (populated from .env above).
DEEPL_AUTH_KEY = os.getenv("DEEPL_AUTH_KEY")

translator = deepl.Translator(DEEPL_AUTH_KEY)

# Quota is printed before and after so the cost of this run is visible.
usage = translator.get_usage()
print(f"Initial Character usage: {usage.character.count}/{usage.character.limit}")

if 'Segment Text' in df.columns:
    # Extract the 'Segment Text' column as a list
    texts = df['Segment Text'].tolist()

    # translate_text returns one result object per input text; store the
    # translated strings rather than the result objects themselves.
    results = translator.translate_text(texts, target_lang="ES")
    df['Translated Text'] = [result.text for result in results]

    # Drop the original 'Segment Text' column
    df = df.drop(columns=['Segment Text'])

    df.to_csv(r'D:\SOKM\01 Introduction SoKM 2024 - 2025 4k_audio_english_fixed_translated_deepl.csv', index=False, encoding='utf-8-sig')

    print("Translation completed and saved to '01 Introduction SoKM 2024 - 2025 4k_audio_english_fixed_translated_deepl.csv'")

else:
    # The message names the column that is actually checked.
    print("The 'Segment Text' column is not present in the CSV file.")

usage = translator.get_usage()
print(f"Final Character usage: {usage.character.count}/{usage.character.limit}")


INFO:deepl:Request to DeepL API method=GET url=https://api-free.deepl.com/v2/usage
INFO:deepl:DeepL API response status_code=200 url=https://api-free.deepl.com/v2/usage
INFO:deepl:Request to DeepL API method=POST url=https://api-free.deepl.com/v2/translate


Initial Character usage: 0/500000


INFO:deepl:DeepL API response status_code=200 url=https://api-free.deepl.com/v2/translate
INFO:deepl:Request to DeepL API method=GET url=https://api-free.deepl.com/v2/usage
INFO:deepl:DeepL API response status_code=200 url=https://api-free.deepl.com/v2/usage


Translation completed and saved to '01 Introduction SoKM 2024 - 2025 4k_audio_english_fixed_translated_deepl.csv'
Final Character usage: 37467/500000


In [36]:
# Define file paths
csv_file_path = r'D:\SOKM\01 Introduction SoKM 2024 - 2025 4k_audio_english_fixed_translated_deepl.csv'
srt_file_path = r'D:\SOKM\01 Introduction SoKM 2024 - 2025 4k_audio_english_fixed_translated_deepl.srt'


def _srt_timestamp(seconds) -> str:
    """Format a time offset in seconds as an SRT ``HH:MM:SS,mmm`` timestamp."""
    # Round once to whole milliseconds. Truncating the fractional part on its
    # own loses a millisecond to float error (1.001 would render as ",000").
    total_ms = max(0, int(round(float(seconds) * 1000)))
    hours, remainder = divmod(total_ms, 3600 * 1000)
    minutes, remainder = divmod(remainder, 60 * 1000)
    whole_seconds, millis = divmod(remainder, 1000)
    return f"{hours:02}:{minutes:02}:{whole_seconds:02},{millis:03}"


# Load the CSV file
df2 = pd.read_csv(csv_file_path)

# Check if required columns exist
required_columns = ["Segment Start", "Segment End", 'Translated Text']
if not all(col in df2.columns for col in required_columns):
    raise ValueError(f"CSV file must contain the following columns: {required_columns}")

# Generate the SRT file
print(f"Generating SRT file: {srt_file_path}")
with open(srt_file_path, "w", encoding="utf-8") as srt_file:
    rows = zip(df2["Segment Start"], df2["Segment End"], df2["Translated Text"])
    for idx, (start_time, end_time, text) in enumerate(rows, start=1):
        # An empty CSV cell is read back as NaN; write an empty cue, not "nan".
        cue_text = "" if pd.isna(text) else str(text)

        srt_file.write(f"{idx}\n")
        srt_file.write(f"{_srt_timestamp(start_time)} --> {_srt_timestamp(end_time)}\n")
        srt_file.write(f"{cue_text}\n\n")

print(f"SRT file generation complete: {srt_file_path}")


Generating SRT file: D:\SOKM\01 Introduction SoKM 2024 - 2025 4k_audio_english_fixed_translated_deepl.srt
SRT file generation complete: D:\SOKM\01 Introduction SoKM 2024 - 2025 4k_audio_english_fixed_translated_deepl.srt


Translate using Local Helsinki model

In [28]:
import pandas as pd

# Reload the "_fixed" transcript CSV so the Helsinki translation starts from the untranslated frame.
# df = pd.read_csv(r'D:\SOKM\01 Introduction SoKM 2024 - 2025 4k_audio_english_fixed_test.csv')
df = pd.read_csv(r'D:\SOKM\01 Introduction SoKM 2024 - 2025 4k_audio_english_fixed.csv')

In [25]:
from transformers import MarianMTModel, MarianTokenizer

# Load the Marian tokenizer and model for the checkpoint named below.
# NOTE(review): the model is never moved to a device in this cell, so it presumably runs on CPU — confirm.
model_name = 'Helsinki-NLP/opus-mt-tc-big-en-es'
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)
# model.eval()



In [29]:
from tqdm import tqdm
 
def translate_batch(texts, batch_size=8):
    """Translate a list of strings with the module-level Marian ``tokenizer`` and ``model``.

    Texts are processed in slices of ``batch_size``; each slice is tokenized
    with padding and truncation to 512 tokens, and decoded with 8-beam search.

    Args:
        texts: List of source strings.
        batch_size: Number of texts per generation call.

    Returns:
        List of translated strings in input order (empty for empty input).

    NOTE(review): a later cell defines another ``translate_batch`` with a
    different signature; once that cell runs it shadows this function.
    """
    translations = []
    for i in tqdm(range(0, len(texts), batch_size), desc="Translating", unit="batch"):
        batch_texts = texts[i:i + batch_size]
        inputs = tokenizer(batch_texts, return_tensors='pt', padding=True, truncation=True, max_length=512)
        # Keep the inputs on the model's device so generation still works if
        # the model has been moved (e.g. model.to("cuda")).
        inputs = inputs.to(model.device)

        # Generate the translated text
        with torch.no_grad():
            outputs = model.generate(**inputs, max_length=512, num_beams=8, early_stopping=True)

        # Decode the output and store the translated texts
        translations.extend(tokenizer.batch_decode(outputs, skip_special_tokens=True))
    return translations


In [30]:
if 'Segment Text' in df.columns:
    # Extract the 'Segment Text' column as a list
    texts = df['Segment Text'].tolist()

    # Translate the texts in batches
    df['Translated Text'] = translate_batch(texts, batch_size=8)

    # Drop the original 'Segment Text' column
    df = df.drop(columns=['Segment Text'])

    df.to_csv(r'D:\SOKM\01 Introduction SoKM 2024 - 2025 4k_audio_english_fixed_translated_helsinki.csv', index=False, encoding='utf-8-sig')

    print("Translation completed and saved to '01 Introduction SoKM 2024 - 2025 4k_audio_english_fixed_translated_helsinki.csv'")
else:
    # The message names the column that is actually checked.
    print("The 'Segment Text' column is not present in the CSV file.")


Translating: 100%|██████████| 81/81 [13:53<00:00, 10.28s/batch]

Translation completed and saved to '01 Introduction SoKM 2024 - 2025 4k_audio_english_fixed_translated_helsinki.csv'





In [31]:
# Define file paths
csv_file_path = r'D:\SOKM\01 Introduction SoKM 2024 - 2025 4k_audio_english_fixed_translated_helsinki.csv'
srt_file_path = r'D:\SOKM\01 Introduction SoKM 2024 - 2025 4k_audio_english_fixed_translated_helsinki.srt'


def _srt_timestamp(seconds) -> str:
    """Format a time offset in seconds as an SRT ``HH:MM:SS,mmm`` timestamp."""
    # Round once to whole milliseconds. Truncating the fractional part on its
    # own loses a millisecond to float error (1.001 would render as ",000").
    total_ms = max(0, int(round(float(seconds) * 1000)))
    hours, remainder = divmod(total_ms, 3600 * 1000)
    minutes, remainder = divmod(remainder, 60 * 1000)
    whole_seconds, millis = divmod(remainder, 1000)
    return f"{hours:02}:{minutes:02}:{whole_seconds:02},{millis:03}"


# Load the CSV file
df2 = pd.read_csv(csv_file_path)

# Check if required columns exist
required_columns = ["Segment Start", "Segment End", 'Translated Text']
if not all(col in df2.columns for col in required_columns):
    raise ValueError(f"CSV file must contain the following columns: {required_columns}")

# Generate the SRT file
print(f"Generating SRT file: {srt_file_path}")
with open(srt_file_path, "w", encoding="utf-8") as srt_file:
    rows = zip(df2["Segment Start"], df2["Segment End"], df2["Translated Text"])
    for idx, (start_time, end_time, text) in enumerate(rows, start=1):
        # An empty CSV cell is read back as NaN; write an empty cue, not "nan".
        cue_text = "" if pd.isna(text) else str(text)

        srt_file.write(f"{idx}\n")
        srt_file.write(f"{_srt_timestamp(start_time)} --> {_srt_timestamp(end_time)}\n")
        srt_file.write(f"{cue_text}\n\n")

print(f"SRT file generation complete: {srt_file_path}")


Generating SRT file: D:\SOKM\01 Introduction SoKM 2024 - 2025 4k_audio_english_fixed_translated_helsinki.srt
SRT file generation complete: D:\SOKM\01 Introduction SoKM 2024 - 2025 4k_audio_english_fixed_translated_helsinki.srt


Translate using ChatGPT


In [19]:
#Testing ChatGPT API

from openai import OpenAI
from dotenv import load_dotenv
import os

# Load environment variables from a .env file
load_dotenv()

# Build the client with the key read from the environment.
client = OpenAI(
    api_key = os.getenv("OPENAI_API_KEY")
)

# Send one fixed English sentence with a translation system prompt.
chat_completion = client.chat.completions.create(
    messages=[
        {
            "role": "system",
            "content": "You are an expert translator. Translate the following text from English to Spanish.",
        },
        {
            "role": "user",
            "content": "There is something inside all of us that calls out for more.",
        }
    ],
    model="gpt-4o-mini",
    temperature = 1,
    max_tokens = 100,
)

# Extract and print the translation
translation = chat_completion.choices[0].message.content
print(f"{translation}")

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Hay algo dentro de todos nosotros que clama por más.


In [1]:
import pandas as pd
from openai import OpenAI
from dotenv import load_dotenv
import os

# Load environment variables from a .env file
load_dotenv()

# Module-level client used by translate_batch below; the key comes from the environment.
client = OpenAI(
    api_key=os.getenv("OPENAI_API_KEY")
)

# Function to process text in batches
def translate_batch(batch, model="chatgpt-4o-latest", temperature=1, max_tokens=500):
    """Translate each text in ``batch`` from English to Spanish, one chat request per text.

    Uses the module-level ``client``. A text whose request fails is reported on
    stdout and yields ``None`` at its position, so the output always has the
    same length and order as ``batch``.
    """
    # Alternative model used during testing: "gpt-4o-mini".
    system_message = {
        "role": "system",
        "content": "You are an expert translator. Translate the following text from English to Spanish.",
    }

    results = []
    for text in batch:
        try:
            response = client.chat.completions.create(
                messages=[system_message, {"role": "user", "content": text}],
                model=model,
                temperature=temperature,
                max_tokens=max_tokens,
            )
            results.append(response.choices[0].message.content)
        except Exception as e:
            print(f"Error translating text: {text}. Error: {e}")
            results.append(None)  # placeholder keeps positions aligned
    return results

# Main processing function
def process_csv(input_file, output_file, batch_size=50):
    """Translate the 'Segment Text' column of a CSV and save the result.

    Reads ``input_file``, translates the column in slices of ``batch_size``
    with ``translate_batch``, stores the translations as 'Translated Text',
    drops 'Segment Text', and writes ``output_file`` as UTF-8 with BOM.
    When the column is absent a message is printed and nothing is written.
    """
    df = pd.read_csv(input_file)

    # Guard clause: nothing to do without the source column.
    if 'Segment Text' not in df.columns:
        print("The 'Segment Text' column is not present in the CSV file.")
        return

    texts = df['Segment Text'].tolist()
    total_batches = (len(texts) + batch_size - 1) // batch_size
    translated_texts = []

    for batch_number, offset in enumerate(range(0, len(texts), batch_size), start=1):
        print(f"Processing batch {batch_number}/{total_batches}...")
        translated_texts.extend(translate_batch(texts[offset:offset + batch_size]))

    df['Translated Text'] = translated_texts
    df = df.drop(columns=['Segment Text'])

    df.to_csv(output_file, index=False, encoding='utf-8-sig')
    print(f"Translation completed and saved to '{output_file}'")

# Define file paths
# Input is the "_fixed" transcript CSV; output gets the "_translated_gpt4o" suffix.
input_csv = r'D:\SOKM\01 Introduction SoKM 2024 - 2025 4k_audio_english_fixed.csv'
output_csv = r'D:\SOKM\01 Introduction SoKM 2024 - 2025 4k_audio_english_fixed_translated_gpt4o.csv'

# Run the process
process_csv(input_csv, output_csv, batch_size=50)


Processing batch 1/13...
Processing batch 2/13...
Processing batch 3/13...
Processing batch 4/13...
Processing batch 5/13...
Processing batch 6/13...
Processing batch 7/13...
Processing batch 8/13...
Processing batch 9/13...
Processing batch 10/13...
Processing batch 11/13...
Processing batch 12/13...
Processing batch 13/13...
Translation completed and saved to 'D:\SOKM\01 Introduction SoKM 2024 - 2025 4k_audio_english_fixed_translated_gpt4o.csv'


In [None]:
# Define file paths
csv_file_path = r'D:\SOKM\01 Introduction SoKM 2024 - 2025 4k_audio_english_fixed_translated_gpt4o.csv'
srt_file_path = r'D:\SOKM\01 Introduction SoKM 2024 - 2025 4k_audio_english_fixed_translated_gpt4o.srt'


def _srt_timestamp(seconds) -> str:
    """Format a time offset in seconds as an SRT ``HH:MM:SS,mmm`` timestamp."""
    # Round once to whole milliseconds. Truncating the fractional part on its
    # own loses a millisecond to float error (1.001 would render as ",000").
    total_ms = max(0, int(round(float(seconds) * 1000)))
    hours, remainder = divmod(total_ms, 3600 * 1000)
    minutes, remainder = divmod(remainder, 60 * 1000)
    whole_seconds, millis = divmod(remainder, 1000)
    return f"{hours:02}:{minutes:02}:{whole_seconds:02},{millis:03}"


# Load the CSV file
df2 = pd.read_csv(csv_file_path)

# Check if required columns exist
required_columns = ["Segment Start", "Segment End", 'Translated Text']
if not all(col in df2.columns for col in required_columns):
    raise ValueError(f"CSV file must contain the following columns: {required_columns}")

# Generate the SRT file
print(f"Generating SRT file: {srt_file_path}")
with open(srt_file_path, "w", encoding="utf-8") as srt_file:
    rows = zip(df2["Segment Start"], df2["Segment End"], df2["Translated Text"])
    for idx, (start_time, end_time, text) in enumerate(rows, start=1):
        # Failed translations are stored as None and read back as NaN;
        # write an empty cue instead of the literal "nan".
        cue_text = "" if pd.isna(text) else str(text)

        srt_file.write(f"{idx}\n")
        srt_file.write(f"{_srt_timestamp(start_time)} --> {_srt_timestamp(end_time)}\n")
        srt_file.write(f"{cue_text}\n\n")

print(f"SRT file generation complete: {srt_file_path}")

Generating SRT file: D:\SOKM\01 Introduction SoKM 2024 - 2025 4k_audio_english_fixed.srt
SRT file generation complete: D:\SOKM\01 Introduction SoKM 2024 - 2025 4k_audio_english_fixed.srt


In [None]:
# Experiment: run WhisperX with task="translate" and language forced to "es",
# then word-align the result with the alignment model for the reported language.
# NOTE(review): Whisper's translate task is commonly documented as producing English
# output; confirm that this configuration yields Spanish text as intended.
print(f"Starting translation of audio file: {audio_file}")
device = "cuda"
batch_size = 6
compute_type = "float16"
model = whisperx.load_model("large-v3", device, vad_method="silero", compute_type=compute_type,language='es')
audio = whisperx.load_audio(audio_file)
translation_result = model.transcribe(audio, batch_size=batch_size, task="translate", language="es")
print(f"Transcription complete. Number of segments: {len(translation_result['segments'])}")
model_a, metadata = whisperx.load_align_model(language_code=translation_result["language"], device=device)
aligned_result = whisperx.align(translation_result["segments"], model_a, metadata, audio, device, return_char_alignments=False)
print("Alignment complete.")

# Drop the audio and the ASR model, then collect garbage and empty the CUDA cache.
# `model_a` is not deleted here.
del audio
gc.collect()
if 'torch' in globals():
    torch.cuda.empty_cache()
del model
gc.collect()
if 'torch' in globals():
    torch.cuda.empty_cache()


Transcription complete. Number of segments: 93


Downloading: "https://download.pytorch.org/torchaudio/models/wav2vec2_voxpopuli_base_10k_asr_es.pt" to C:\Users\robin/.cache\torch\hub\checkpoints\wav2vec2_voxpopuli_base_10k_asr_es.pt
100%|██████████| 360M/360M [00:03<00:00, 106MB/s]  


Alignment complete.


In [None]:
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline

# Experiment: run Whisper large-v3 through the transformers ASR pipeline in float16 on CUDA.
device = "cuda"
# NOTE(review): `batch_size` is assigned but the pipeline below is given batch_size=16.
batch_size = 6
torch_dtype = torch.float16

model_id = "openai/whisper-large-v3"

model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
)
model.to(device)

processor = AutoProcessor.from_pretrained(model_id)

# 30-second chunks with timestamps returned for each chunk.
pipe = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    max_new_tokens=128,
    chunk_length_s=30,
    batch_size=16,
    return_timestamps=True,
    torch_dtype=torch_dtype,
    device=device,
)

# The language is forced to Spanish through generate_kwargs.
result = pipe(audio_file, generate_kwargs={"language": "spanish"})
print(result["text"])
print(result["chunks"])