In [None]:
# 1) Mount Google Drive
# Mounts Drive at /content/drive; the example-usage cell reads videos from and
# writes audio/transcripts to folders under "/content/drive/My Drive".
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# 2) Install dependencies (FFmpeg and Whisper)
!sudo apt-get update -qq
!sudo apt-get install -y ffmpeg
!pip install -q git+https://github.com/openai/whisper.git


In [None]:
# 3) Imports and GPU check for Whisper
import os
import subprocess
import torch
import whisper

# Probe CUDA once. `device` is the notebook-level setting that the
# transcription function later passes to whisper.load_model.
gpu_available = torch.cuda.is_available()
if gpu_available:
    device = "cuda"
    gpu_name = torch.cuda.get_device_name(0)
else:
    device = "cpu"
    gpu_name = "No GPU detected"

# Report the detected configuration, one setting per line.
print(
    f"GPU available for Whisper: {gpu_available}",
    f"GPU name: {gpu_name}",
    f"Whisper device: {device}",
    sep="\n",
)


In [None]:
def convert_videos_to_audio(input_folder, output_folder):
    """
    Extract the audio track of every supported video in a folder as M4A (AAC).

    Each file in `input_folder` whose name ends in .mp4, .mov, .avi or .mkv
    (case-insensitive) is passed to the `ffmpeg` command-line tool. The result
    is written to `output_folder` under the same base name with an ".m4a"
    extension, overwriting any existing file. A file that FFmpeg fails to
    convert is reported and skipped; the remaining files are still processed.

    FFmpeg runs on the CPU: GPU acceleration for audio extraction is typically
    not a big performance boost, and can complicate dependencies.

    Note: this notebook defines this function in two cells; when the cells are
    run top to bottom, the later definition is the one in effect.

    Parameters:
    - input_folder (str): folder containing the video files (not searched recursively).
    - output_folder (str): folder for the .m4a files; created if it does not exist.

    Returns:
    - None. Progress and any failures are printed.

    Raises:
    - FileNotFoundError: if `input_folder` does not exist, or if the `ffmpeg`
      executable cannot be found.
    """
    os.makedirs(output_folder, exist_ok=True)

    supported_formats = (".mp4", ".mov", ".avi", ".mkv")
    failures = []  # (file name, reason) for every conversion FFmpeg rejected

    # Sorted so files are processed in a predictable order on every run.
    for file in sorted(os.listdir(input_folder)):
        if not file.lower().endswith(supported_formats):
            continue

        input_path = os.path.join(input_folder, file)
        output_name = os.path.splitext(file)[0] + ".m4a"
        output_path = os.path.join(output_folder, output_name)

        print(f"Extracting audio from: {file}")

        # FFmpeg command (CPU-based)
        cmd = [
            "ffmpeg",
            "-y",              # Overwrite output if exists (global option, so it goes first)
            "-i", input_path,
            "-vn",             # Disable video
            "-acodec", "aac",  # AAC encoder
            "-b:a", "192k",    # Audio bitrate
            output_path,
        ]

        # stdout is discarded, but stderr is captured so a failure can be explained.
        # stdin is detached so FFmpeg can never wait for keyboard input.
        completed = subprocess.run(
            cmd,
            stdin=subprocess.DEVNULL,
            stdout=subprocess.DEVNULL,
            stderr=subprocess.PIPE,
        )

        if completed.returncode != 0:
            stderr_text = (completed.stderr or b"").decode("utf-8", errors="replace")
            stderr_lines = stderr_text.strip().splitlines()
            # The last stderr line is usually the most specific error message.
            reason = stderr_lines[-1] if stderr_lines else f"exit code {completed.returncode}"
            failures.append((file, reason))
            print(f"  ❌ FFmpeg failed for {file}: {reason}")

    if failures:
        failed_names = ", ".join(name for name, _ in failures)
        print(f"\n⚠️ Audio extraction finished with {len(failures)} failure(s): {failed_names}")
    else:
        print("\n✅ Audio extraction completed! Check your output folder.")


In [None]:
def convert_videos_to_audio(input_folder, output_folder):
    """
    Extract the audio track of every supported video in a folder as M4A (AAC).

    Each file in `input_folder` whose name ends in .mp4, .mov, .avi or .mkv
    (case-insensitive) is passed to the `ffmpeg` command-line tool. The result
    is written to `output_folder` under the same base name with an ".m4a"
    extension, overwriting any existing file. A file that FFmpeg fails to
    convert is reported and skipped; the remaining files are still processed.

    FFmpeg runs on the CPU: GPU acceleration for audio extraction is typically
    not a big performance boost, and can complicate dependencies.

    Note: this notebook defines this function in two cells; when the cells are
    run top to bottom, the later definition is the one in effect.

    Parameters:
    - input_folder (str): folder containing the video files (not searched recursively).
    - output_folder (str): folder for the .m4a files; created if it does not exist.

    Returns:
    - None. Progress and any failures are printed.

    Raises:
    - FileNotFoundError: if `input_folder` does not exist, or if the `ffmpeg`
      executable cannot be found.
    """
    os.makedirs(output_folder, exist_ok=True)

    supported_formats = (".mp4", ".mov", ".avi", ".mkv")
    failures = []  # (file name, reason) for every conversion FFmpeg rejected

    # Sorted so files are processed in a predictable order on every run.
    for file in sorted(os.listdir(input_folder)):
        if not file.lower().endswith(supported_formats):
            continue

        input_path = os.path.join(input_folder, file)
        output_name = os.path.splitext(file)[0] + ".m4a"
        output_path = os.path.join(output_folder, output_name)

        print(f"Extracting audio from: {file}")

        # FFmpeg command (CPU-based)
        cmd = [
            "ffmpeg",
            "-y",              # Overwrite output if exists (global option, so it goes first)
            "-i", input_path,
            "-vn",             # Disable video
            "-acodec", "aac",  # AAC encoder
            "-b:a", "192k",    # Audio bitrate
            output_path,
        ]

        # stdout is discarded, but stderr is captured so a failure can be explained.
        # stdin is detached so FFmpeg can never wait for keyboard input.
        completed = subprocess.run(
            cmd,
            stdin=subprocess.DEVNULL,
            stdout=subprocess.DEVNULL,
            stderr=subprocess.PIPE,
        )

        if completed.returncode != 0:
            stderr_text = (completed.stderr or b"").decode("utf-8", errors="replace")
            stderr_lines = stderr_text.strip().splitlines()
            # The last stderr line is usually the most specific error message.
            reason = stderr_lines[-1] if stderr_lines else f"exit code {completed.returncode}"
            failures.append((file, reason))
            print(f"  ❌ FFmpeg failed for {file}: {reason}")

    if failures:
        failed_names = ", ".join(name for name, _ in failures)
        print(f"\n⚠️ Audio extraction finished with {len(failures)} failure(s): {failed_names}")
    else:
        print("\n✅ Audio extraction completed! Check your output folder.")


In [None]:
def transcribe_audios(input_folder, output_folder, whisper_model="turbo", language=None):
    """
    Transcribe every .m4a file in a folder with Whisper and save each transcript as text.

    The model is loaded once, onto the notebook-level `device` ("cuda" or "cpu",
    set by the GPU-check cell), and reused for every file. Each audio file
    "<name>.m4a" (extension matched case-insensitively) produces
    "<name>.txt" in `output_folder`, written as UTF-8 and overwriting any
    existing file.

    Parameters:
    - input_folder (str): folder containing the .m4a files (not searched recursively).
    - output_folder (str): folder for the .txt transcripts; created if it does not exist.
    - whisper_model (str): the model name to load (e.g., 'turbo', 'tiny', 'base', etc.)
    - language (str or None): if you want to force a specific language (e.g. 'en', 'pt').
      When None or empty, no language is passed to Whisper.

    Returns:
    - None. Progress is printed.

    Raises:
    - FileNotFoundError: if `input_folder` does not exist.
    - NameError: if the cell defining `device` has not been run.
    """
    os.makedirs(output_folder, exist_ok=True)

    print("\nLoading Whisper model:", whisper_model)
    model = whisper.load_model(whisper_model, device=device)
    print(f"✅ Whisper '{whisper_model}' model loaded successfully!\n")

    # Only forward `language` when one was requested, so Whisper's own default
    # applies otherwise.
    transcribe_options = {"language": language} if language else {}

    # Sorted so files are processed in a predictable order on every run.
    for file_name in sorted(os.listdir(input_folder)):
        if not file_name.lower().endswith(".m4a"):
            continue

        input_path = os.path.join(input_folder, file_name)
        # splitext swaps only the final extension, whatever its letter case.
        # A plain str.replace(".m4a", ".txt") would leave "TALK.M4A" unchanged
        # and would also rewrite ".m4a" occurring in the middle of a name.
        transcript_name = os.path.splitext(file_name)[0] + ".txt"
        output_text_path = os.path.join(output_folder, transcript_name)

        print(f"Transcribing: {file_name} ...")

        # Runs on the device the model was loaded onto (GPU if available, otherwise CPU)
        result = model.transcribe(input_path, **transcribe_options)

        # Save transcript
        with open(output_text_path, "w", encoding="utf-8") as f:
            f.write(result["text"])

        print(f"Transcription saved to: {output_text_path}")

    print("\n✅ All transcriptions completed successfully!")


In [None]:
# 6) Example usage
# Runs the full pipeline: videos -> .m4a audio -> .txt transcripts.
# All three folders live under the Google Drive mount point, so the
# Drive-mount cell must have been run first.

video_folder = "/content/drive/My Drive/videos"          # Your video files path
audio_folder = "/content/drive/My Drive/audios"          # Where to store extracted audio
transcript_folder = "/content/drive/My Drive/transcripts"  # Where to store transcriptions

# Step A: Convert videos to audio (CPU for FFmpeg)
convert_videos_to_audio(video_folder, audio_folder)

# Step B: Transcribe using Whisper (GPU if available, otherwise CPU)
# The audio folder written by Step A is the input folder here.
transcribe_audios(
    input_folder=audio_folder,
    output_folder=transcript_folder,
    whisper_model="turbo",  # Whisper model name to load (e.g. "turbo", "tiny", "base")
    language="en"           # Language code such as "en" or "pt"; None to not force a language
)
