# Extract audio as an mp3 file from a video file


In [1]:
import moviepy.editor as mp

In [3]:
# Function to extract audio from video and save as mp3 in a folder named after the source file inside the ./source_files/ folder
import os
import re


def extract_audio_from_video(video_path):
    """Extract the audio track of a video file and save it as an mp3.

    The mp3 is written to ``./source_files/<name>/<name>.mp3``, where
    ``<name>`` is the video's file name with its last extension removed.
    The target directory is created if it does not exist.

    Args:
        video_path: Path to the source video file.

    Returns:
        The path of the mp3 file that was written.

    Raises:
        ValueError: If the video has no audio track.
    """
    # Work out the output location up front (pure string work, nothing opened yet)
    base_name = os.path.basename(video_path)
    base_name_without_ext = re.sub(r"\.[^.]+$", "", base_name)
    directory_path = f"./source_files/{base_name_without_ext}/"
    mp3_filename = f"{directory_path}{base_name_without_ext}.mp3"

    # Load the video file
    video_clip = mp.VideoFileClip(video_path)
    audio_clip = None
    try:
        # A video without an audio track exposes ``audio`` as None;
        # fail with a clear message instead of an AttributeError later on
        audio_clip = video_clip.audio
        if audio_clip is None:
            raise ValueError(f"No audio track found in {video_path}")

        # Only create the directory once we know there is something to write
        os.makedirs(directory_path, exist_ok=True)

        # Save the audio clip as an mp3 file
        audio_clip.write_audiofile(mp3_filename)
    finally:
        # Always release the clips, even when writing fails
        video_clip.close()
        if audio_clip is not None:
            audio_clip.close()

    print(f"Audio extracted and saved as {mp3_filename}")

    return mp3_filename


# Replace the placeholder with the path to the video file; %time reports how long the extraction takes
%time full_audio_path = extract_audio_from_video("INSERT_VIDEO_PATH_HERE")

MoviePy - Writing audio in ./source_files/michael-ad-advisory_2024-03-26/michael-ad-advisory_2024-03-26.mp3


                                                                        

MoviePy - Done.
Audio extracted and saved as ./source_files/michael-ad-advisory_2024-03-26/michael-ad-advisory_2024-03-26.mp3
CPU times: user 7.34 s, sys: 1.04 s, total: 8.38 s
Wall time: 23.7 s




# Create a trimmed version of an audio file for testing purposes


In [4]:
def trim_audio(audio_path, duration=120):
    """Save the first ``duration`` seconds of an audio file as a new file.

    The trimmed file is written to a ``trimmed`` sub-directory next to the
    source file, with ``_trimmed`` inserted before the file extension
    (``talk.mp3`` -> ``trimmed/talk_trimmed.mp3``).

    Args:
        audio_path: Path to the source audio file.
        duration: Length of the trimmed clip in seconds (default 120). Values
            longer than the source are clamped to the source length.

    Returns:
        The path of the trimmed audio file that was written.
    """
    # Generate the directory path for the trimmed file
    directory_path = os.path.dirname(audio_path)
    trimmed_directory = os.path.join(directory_path, "trimmed")

    # Insert '_trimmed' before the real extension only; splitext avoids touching
    # other ".mp3" occurrences in the name and also handles non-mp3 extensions
    stem, extension = os.path.splitext(os.path.basename(audio_path))
    trimmed_filename = f"{stem}_trimmed{extension}"
    trimmed_file_path = os.path.join(trimmed_directory, trimmed_filename)

    # Load the audio file
    audio_clip = mp.AudioFileClip(audio_path)
    trimmed_clip = None
    try:
        # Never request more audio than the source contains
        end_time = duration
        source_length = audio_clip.duration
        if source_length is not None and end_time is not None and end_time > source_length:
            end_time = source_length

        # Trim the audio to the requested leading segment
        trimmed_clip = audio_clip.subclip(0, end_time)

        # Create the trimmed directory if it does not exist
        os.makedirs(trimmed_directory, exist_ok=True)

        # Save the trimmed audio clip
        trimmed_clip.write_audiofile(trimmed_file_path)
    finally:
        # Always release both clips, even when trimming or writing fails
        if trimmed_clip is not None:
            trimmed_clip.close()
        audio_clip.close()

    print(f"Trimmed audio saved as {trimmed_file_path}")

    return trimmed_file_path


# Cut a short test clip (first 120 seconds) from the extracted audio; %time reports how long the call takes
%time trimmed_path = trim_audio(full_audio_path, duration=120)

MoviePy - Writing audio in ./source_files/michael-ad-advisory_2024-03-26/trimmed/michael-ad-advisory_2024-03-26_trimmed.mp3


                                                                      

MoviePy - Done.
Trimmed audio saved as ./source_files/michael-ad-advisory_2024-03-26/trimmed/michael-ad-advisory_2024-03-26_trimmed.mp3
CPU times: user 320 ms, sys: 59.9 ms, total: 380 ms
Wall time: 966 ms




# Run WhisperX on the audio file to generate a transcript


In [5]:
import os
import time

# Retrieve the Hugging Face token from environment variables
hf_token = os.getenv('HUGGINGFACE_TOKEN')

# Choose the path for the audio file to process
# path = trimmed_path  # Use the trimmed audio path
path = full_audio_path  # Use the full audio path (comment out the line above and uncomment this line to use the full path)

# Extract the base directory name from the audio path to create a corresponding output directory
base_dir_name = os.path.basename(os.path.dirname(os.path.dirname(path)))
output_dir = os.path.join('./outputs', base_dir_name, 'trimmed')

# Create the output directory if it does not exist
os.makedirs(output_dir, exist_ok=True)

# Run WhisperX with the specified parameters and output directory and track the time taken
start_time = time.time()

!whisperx $path --compute_type int8 --diarize --min_speakers 2 --max_speakers 3 --language en --hf_token $hf_token --output_dir $output_dir --print_progress True

end_time = time.time()
elapsed_time = end_time - start_time
print(f"WhisperX processing time: {int(elapsed_time // 3600)}h {(int(elapsed_time) % 3600) // 60}m {int(elapsed_time) % 60}s {int((elapsed_time - int(elapsed_time)) * 1000)}ms")


torchvision is not available - cannot save figures
Lightning automatically upgraded your loaded checkpoint from v1.5.4 to v2.2.0. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint ../../../.cache/torch/whisperx-vad-segmentation.bin`
Model was trained with pyannote.audio 0.0.1, yours is 3.1.1. Bad things might happen unless you revert pyannote.audio to 0.x.
Model was trained with torch 1.10.0+cu102, yours is 2.0.0. Bad things might happen unless you revert torch to 1.x.
>>Performing transcription...
Progress: 25.00%...
Progress: 50.00%...
Progress: 75.00%...
Progress: 100.00%...
>>Performing alignment...
Progress: 25.00%...
Progress: 50.00%...
Progress: 75.00%...
Progress: 100.00%...
>>Performing diarization...
WhisperX processing time: 142.14 seconds
